前言
好大夫在線網(https://www.haodf.com),是封ip的,當然通過(高匿)代理IP一般就能解決。但噁心的地方在於該站使用了知道創宇的雲攔截,能夠獲取機子的真實ip,使用代理ip並沒有什麼卵用。
爬取源及爬取內容
爬取源是11萬條醫生頁面的url,獲取頁面的一些訪問信息。
目錄結構
代碼
import requests
from lxml import etree
import math
import threading
import random
import time
import datetime
import csv
# Write a string to a file.
def write_file(path_file, mode, write_str):
    """Open path_file with the given mode ('w'/'a'/...) and write write_str to it."""
    with open(path_file, mode) as out:
        out.write(write_str)
# Write one row of data into a CSV file.
def write_csv(path_file, mode, list_row):
    """Append (or write, depending on mode) a single row list_row to the CSV at path_file."""
    with open(path_file, mode, newline='') as fh:
        csv.writer(fh).writerow(list_row)
# Read the url file.
def read_file(path_file):
    """Return every line of path_file as a list (trailing newlines kept)."""
    with open(path_file, 'r') as fh:
        return list(fh)
# Split all the urls into n roughly-equal parts.
def chunks(items, n):
    """Split *items* into n consecutive slices of ceil(len/n) elements each.

    When n does not divide len(items), the last slice is shorter and any
    surplus slices are empty lists, matching the caller's one-chunk-per-thread
    expectation.  (Parameter renamed from ``list`` — it shadowed the builtin;
    all call sites pass it positionally.)
    """
    step = math.ceil(len(items) / n)
    return [items[i * step:(i + 1) * step] for i in range(n)]
# Fetch a page.
def get_page(url):
    """GET *url* with a random User-Agent, retrying while the site answers 403.

    Returns a tuple (html, url, status_code); html is '' when the request
    failed.  Sleeps ``attempts * 3`` seconds between tries (simple back-off)
    and gives up after 30 attempts.

    Bug fixed: the attempt counter was only incremented on a successful
    request, so a connection error that kept raising looped forever.  The
    counter now advances on every iteration, so the ``count > 30`` cut-off
    always triggers.  The bare ``except:`` is narrowed to request errors, and
    a commented-out proxy URL containing an API credential was removed.
    """
    user_agents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)'
    ]
    user_agent = random.choice(user_agents)
    html = ''
    status_code = 403
    count = 0
    while status_code == 403:
        count += 1  # counts every attempt, including failed ones
        try:
            response = requests.get(
                url=url,
                headers={
                    'User-Agent': user_agent
                }
            )
            # The site serves GBK-family content; cp936 decodes it correctly.
            response.encoding = 'cp936'
            html = response.text
            status_code = response.status_code
            # Back off harder the more often we had to retry.
            time.sleep(count * 3)
        except requests.RequestException:
            html = ''
        if count > 30:
            break
    return html, url, status_code
# Extract the statistics from a doctor's page.
def get_info(html, url):
    """Parse the doctor page *html* and return [url] plus up to 12 statistics.

    The 12 fields come from the <ul class="space_statistics"> items, in page
    order: total visits, yesterday's visits, total articles, total patients,
    yesterday's post-visit registrations, WeChat post-visit registrations,
    total post-visit registrations, patient votes, thank-you letters, gifts,
    last time online, and account opening date.

    On any parse failure the list is returned as-is; the caller treats a
    length-1 result (just the url) as a failed crawl.  Bug fixed: the original
    called ``etree.HTML(html)`` *outside* the try block, so an empty html
    string (a failed download) raised and killed the worker thread — parsing
    is now fully guarded.
    """
    info = [url]
    try:
        selector = etree.HTML(html)
        # The 12 statistics are li[1]..li[12] of the same <ul>; iterate
        # instead of twelve copy-pasted xpath blocks.
        for i in range(1, 13):
            xpath = '//ul[@class="space_statistics"]/li[%d]/span/text()' % i
            info.append(selector.xpath(xpath)[0])
    except Exception:
        # Leave whatever was collected; a length-1 list signals failure.
        pass
    return info
# Many worker threads append to the same CSV/log files; serialise the writes
# so rows from different threads cannot interleave mid-line.
_write_lock = threading.Lock()

# index is the thread's ordinal number.
def craw(index, chunks_list, path_log_file):
    """Worker for thread *index*: crawl every url in chunks_list[index].

    Successful pages are appended as a row to the global data CSV
    (``path_data``, defined in the ``__main__`` section — NOTE(review): this
    relies on the module-level global existing before threads start); failed
    urls are appended to *path_log_file* for a later retry pass.
    """
    for url in chunks_list[index]:
        url = url.replace('\n', '')
        html, url, status_code = get_page(url)
        info = get_info(html, url)
        if len(info) == 1:
            # Parsing failed — log the url so a retry pass can pick it up.
            with _write_lock:
                write_file(path_log_file, 'a', url + '\n')
        else:
            # Record when this row was crawled, right after the url column.
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info.insert(1, now_time)
            with _write_lock:
                write_csv(path_file=path_data, mode='a', list_row=info)
            print(info)
def main(path_url_file, n, path_log_file):
    """Crawl every url listed in *path_url_file* using *n* worker threads.

    The url list is split into n chunks, one thread per chunk; failures are
    appended to *path_log_file*.

    Bug fixed: the original never joined its threads, so the second
    ``main()`` call in ``__main__`` (the retry pass over the log file)
    started while the first pass was still writing that log file.  This
    version blocks until every worker has finished.
    """
    # Read the url file and split it into n chunks — one per thread.
    list_url = read_file(path_url_file)
    chunks_list = chunks(list_url, n)
    threads = []
    for index in range(n):
        thread = threading.Thread(target=craw, args=(index, chunks_list, path_log_file))
        thread.start()
        threads.append(thread)
    # Wait for all workers so callers see a complete log/data file on return.
    for thread in threads:
        thread.join()
if __name__ == '__main__':
    path_url = './files/URL.csv'
    path_data = './files/data.csv'
    path_log_temp = './files/log_temp.txt'
    path_log = './files/log.txt'
    # Truncate both log files before the run.
    for path_log_file in (path_log_temp, path_log):
        write_file(path_file=path_log_file, mode='w', write_str='')
    # Write the header row to the data CSV.
    title_row = [
        'URL', '爬取時間', '總訪問', '昨日訪問', '總文章', '總患者', '昨日診後報到患者',
        '微信診後報到患者', '總診後報到患者', '患者投票', '感謝信', '心意禮物', '上次在線', '開通時間'
    ]
    write_csv(path_file=path_data, mode='w', list_row=title_row)
    # First pass: crawl the full url list with 200 threads.
    main(path_url_file=path_url, n=200, path_log_file=path_log_temp)
    # Second pass: retry the urls logged as failures, with 5 threads.
    main(path_url_file=path_log_temp, n=5, path_log_file=path_log)
結果