# Python3-Spyder-urllib.request抓取搜haohuo平臺信息-保存到csv
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 9 10:15:32 2019
@author: Administrator
"""
import urllib.request as request
import lxml.html as html
import csv
import time
import codecs
import random
# Request headers sent with every fetch (a desktop Chrome User-Agent).
headers = {'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'}

# Proxy pool.  These used to be five entries of ONE dict literal that all
# shared the key 'http', so every entry except the last was silently
# discarded.  They are now kept as a sequence and one is picked per request.
# NOTE(review): only the 'http' scheme is mapped, so https:// URLs (such as
# the search pages below) are presumably fetched without a proxy - confirm
# whether an 'https' mapping is wanted.
PROXIES = (
    '182.35.80.197:9999',
    '117.69.201.81:9999',
    '182.34.34.201:9999',
    '113.120.33.49:9999',
    '222.89.32.187:9999',
)
openers = [
    request.build_opener(request.ProxyHandler({'http': proxy}))
    for proxy in PROXIES
]

# XPath expressions evaluated on a search-results page.
# xpath1 is not used by the code below; it is kept as it was defined.
xpath1 = '//div[@class="clearfix mt20 pb5"]/a/@href'
xpath = '//a[@class="list-item-button list-item-see L"]/@href'

# XPath expressions evaluated on a company page: a primary expression and a
# fallback ("...2") for each of company name, contact name and phone number.
xpathGs = "//p[@class=\"shop-card-cname\"]/a/text()"
xpathXm = "//p[@class=\"shop-card-link\"]/a/text()"
xpathNb = "//p[@class=\"shop-card-num\"]/span/text()"
xpathGs2 = "//span[@class=\"com-name m-btm\"]/a/text()"
xpathXm2 = "//span[@class=\"name\"]/a/text()"
xpathNb2 = "//span[@class=\"phone\"]/text()"

# Inclusive range of search-result pages to crawl.
FIRST_PAGE = 1
LAST_PAGE = 82


def search_url(page):
    """Return the search-results URL for page number *page*."""
    return 'https://s.912688.com/comp/dy/search?kw=%E5%BE%B7%E5%B7%9E&page=' + str(page)


def fetch(url):
    """Download *url* with the shared headers and return the body as text.

    The request goes through an opener chosen at random from ``openers``.
    The body is decoded as UTF-8; network and decoding errors propagate to
    the caller.
    """
    req = request.Request(url=url, headers=headers)
    with random.choice(openers).open(req) as res:
        return res.read().decode("utf-8")


def first_nonempty(query, *xpaths):
    """Return ``query(expr)`` for the first *expr* giving a non-empty result.

    *query* is a callable such as ``dom.xpath``.  If no expression matches,
    the (empty) result of the last one tried is returned.
    """
    result = []
    for expr in xpaths:
        result = query(expr)
        if result:
            break
    return result


def as_cell(values):
    """Join the text values of an XPath result into one CSV cell string.

    Each value is stripped of surrounding whitespace, empty values are
    dropped, and the rest are joined with ``'; '``.  This replaces writing
    the raw Python list (e.g. ``['name']``) into the CSV.
    """
    stripped = (str(value).strip() for value in values)
    return '; '.join(text for text in stripped if text)


def scrape_company(link):
    """Fetch the company page at *link* and return ``[company, contact, phone]``.

    Each field is looked up with its primary XPath and, if that yields
    nothing, with its fallback XPath.  Fields are returned as cell strings.
    """
    dom = html.document_fromstring(fetch(link))
    company = dom.xpath(xpathGs)
    if not company:
        # Primary layout not found: switch to the alternative page layout.
        print("plan B")
        company = dom.xpath(xpathGs2)
    contact = first_nonempty(dom.xpath, xpathXm, xpathXm2)
    phone = first_nonempty(dom.xpath, xpathNb, xpathNb2)
    return [as_cell(company), as_cell(contact), as_cell(phone)]


def main():
    """Crawl every search page and append one CSV row per company link.

    Rows are appended to ``soumeng.csv`` (UTF-8).  The file is managed by a
    ``with`` block so it is closed even if a request fails part-way through.
    """
    # newline='' is what the csv module expects of files it writes to;
    # encoding='utf-8' keeps Chinese text from being garbled.
    with open('soumeng.csv', 'a', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            url = search_url(page)
            print(url)
            print(page)
            # Fetch the results page once.  A second direct urlopen() used to
            # re-download it and overwrite this result.
            dom = html.document_fromstring(fetch(url))
            for link in dom.xpath(xpath):
                row = scrape_company(link)
                csv_writer.writerow(row)
                print(link)
                print('-'.join(row))
            # Pause 0-5 seconds between result pages.
            time.sleep(random.randint(0, 5))
    print("結束")


if __name__ == "__main__":
    main()