Python 爬取鏈家淮安經紀人信息
#爬取鏈家房源經紀人信息
#有xpath解析爬取:人名,負責區域
#定義csv保存函數
1、導入模塊
import requests
from lxml import etree
import csv
import time
2、創建頁面抓取主函數def lianjia_spider(list_url):
def lianjia_spider(list_url):
    """Fetch one Lianjia (mobile) agent-list page and persist each agent.

    Parameters
    ----------
    list_url : str
        URL of one paginated agent-list page.

    For every agent found, extracts the name and the region they cover and
    appends the pair to the CSV file via save_csv_writer(). On a failed
    request the page is skipped (after a short back-off) instead of
    crashing — the original code fell through and hit an unbound
    `response` variable.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.26 Safari/537.36 '
                      'Core/1.63.6788.400 QQBrowser/10.3.2727.400'
    }
    try:
        # timeout prevents the crawler from hanging forever on a dead host
        response = requests.get(list_url, headers=headers, timeout=10)
        response.raise_for_status()  # treat HTTP 4xx/5xx as failures too
    except requests.RequestException as exc:
        print('出錯了', exc)
        time.sleep(5)   # brief back-off before the caller tries the next page
        return          # skip this page; do NOT use the unbound `response`

    # Parse the page and locate the <li> element for each agent.
    html = etree.HTML(response.text)
    agent_list = html.xpath('//li[@class="pictext flexbox box_center_v lazyload_ulog"]')
    for agent in agent_list:
        # Relative XPaths into each agent card (name link / region span).
        names = agent.xpath('div/div[2]/div[1]/span[1]/a[1]/text()')
        regions = agent.xpath('div/div[2]/div[2]/span[3]/text()')
        if not names or not regions:
            # Malformed / ad card: skip instead of raising IndexError.
            continue
        info = [names[0], regions[0]]
        print('正在爬取', names[0], regions[0])
        save_csv_writer(info)  # append this agent as one CSV row
4、定義保存函數save_csv_writer(item):
#定義csv保存函數
def save_csv_writer(item):
    """Append one row *item* to lianjia_jingjiren.csv.

    The file is opened in append mode with UTF-8 encoding and
    newline='' so the csv module controls line endings itself.
    """
    with open('lianjia_jingjiren.csv', 'a+', encoding='utf-8', newline='') as out_file:
        csv.writer(out_file).writerow(item)
5、創建主函數並調用定義主函數
# 創建主函數
def main():
    """Crawl the first 100 agent-list pages (15 agents per page)."""
    page_size = 15
    for page in range(100):
        # Pagination is offset-based: page N starts at offset N * 15.
        offset = page * page_size
        page_url = f'https://m.lianjia.com/ha/jingjiren/?page_size=15&_t=1&offset={offset}'
        lianjia_spider(page_url)
        print('正在爬取', page)
6、調用並運行 程序
# Entry point: run the crawler only when executed as a script.
if __name__ =="__main__":
    main()
程序運行結果如下: