Scraping haodf.com with Python

Preface

haodf.com (https://www.haodf.com) blocks IPs. Normally a (high-anonymity) proxy IP takes care of that, but the nasty part is that the site uses Knownsec's (知道創宇) cloud interception service, which is able to get hold of the machine's real IP, so a proxy IP by itself is basically useless.
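For reference, a minimal sketch of how a high-anonymity proxy would be plugged into requests; the proxy address is a placeholder, not a working endpoint (the full script below keeps its Crawlera proxy commented out because of exactly this problem):

import requests

# Placeholder proxy credentials/host: substitute a real high-anonymity proxy
proxies = {
    'http': 'http://user:password@proxy.example.com:8010',
    'https': 'http://user:password@proxy.example.com:8010',
}
response = requests.get('https://www.haodf.com', proxies=proxies, timeout=10)
print(response.status_code)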

Crawl source and target data

The crawl source is a set of 110,000 doctor-page urls; for each page we collect its visit statistics.
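To make the extraction step concrete, here is a self-contained run of the same XPath that get_info() uses further down, against a hand-written fragment that imitates the page's space_statistics list (the real markup is assumed, not copied from the site):

from lxml import etree

# Hypothetical fragment shaped like the statistics list on a doctor page
sample = '''
<ul class="space_statistics">
  <li>Total visits <span>1,234,567</span></li>
  <li>Visits yesterday <span>321</span></li>
</ul>
'''
tree = etree.HTML(sample)
print(tree.xpath('//ul[@class="space_statistics"]/li[1]/span/text()'))  # ['1,234,567']
print(tree.xpath('//ul[@class="space_statistics"]/li[2]/span/text()'))  # ['321']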

Directory structure

Everything lives under ./files/: URL.csv (the source urls, one per line), data.csv (the crawled rows), and log_temp.txt / log.txt (urls that needed a retry), matching the paths used in the code below.

Code

import requests
from lxml import etree
import math
import threading
import random
import time
import datetime
import csv


# Write a string to a file
def write_file(path_file, mode, write_str):
    with open(path_file, mode) as file:
        file.write(write_str)


# Append one row to a csv file
def write_csv(path_file, mode, list_row):
    with open(path_file, mode, newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(list_row)


# Read the url file into a list of lines
def read_file(path_file):
    with open(path_file, 'r') as file:
        lines = file.readlines()
    return lines


# Split all the urls into n roughly equal chunks (one per thread)
def chunks(url_list, n):
    chunks_list = []
    step = math.ceil(len(url_list) / n)
    for i in range(0, n):
        chunks_list.append(url_list[i*step:(i+1)*step])
    return chunks_list


# Fetch a page, rotating the User-Agent and retrying while the WAF returns 403
def get_page(url):
    User_Agent = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)'
    ]
    user_agent = random.choice(User_Agent)

    count = 0
    status_code = 403
    html = ''
    # Retry while the WAF answers 403, backing off a little longer each time;
    # give up after 30 attempts
    while status_code == 403:
        try:
            response = requests.get(
                url=url,
                proxies={
                    # 'http': 'http://c4b10796877647f297db63ecf2f92428:@proxy.crawlera.com:8010/',
                },
                headers={
                    'User-Agent': user_agent
                }
            )
            response.encoding = 'cp936'
            html = response.text
            status_code = response.status_code
            # print(response.status_code)
            # print(str(count)+' ' + url)
        except:
            # The request itself failed (timeout, connection error, ...); retry
            html = ''
        count += 1
        time.sleep(count*3)
        if count > 30:
            break

    return html, url, status_code


# Extract the statistics block from a doctor's page
def get_info(html, url):
    info = list()
    info.append(url)
    # The 12 list items are, in order: total visits, visits yesterday,
    # total articles, total patients, post-consultation check-ins yesterday,
    # WeChat post-consultation check-ins, total post-consultation check-ins,
    # patient votes, thank-you letters, gifts, last online, profile opened
    try:
        selector = etree.HTML(html)
        for i in range(1, 13):
            value = selector.xpath('//ul[@class="space_statistics"]/li[%d]/span/text()' % i)[0]
            info.append(value)
    except:
        # Empty or blocked page, or a changed layout: craw() logs the url
        # whenever info ends up containing nothing but the url itself
        pass
        # print('craw failed, try again')
    return info


# index is this thread's slot in chunks_list
def craw(index, chunks_list, path_log_file):
    url_list = chunks_list[index]
    for url in url_list:
        url = url.replace('\n', '')
        html, url, status_code = get_page(url)
        info = get_info(html, url)

        if len(info) == 1:
            # Only the url was extracted: log it for the retry pass
            write_file(path_log_file, 'a', url+'\n')
        else:
            # Insert the crawl time after the url and append the row to the data csv
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info.insert(1, now_time)
            write_csv(path_file=path_data, mode='a', list_row=info)
        print(info)


def main(path_url_file, n, path_log_file):
    # Read the url file and split it into n chunks, one chunk per thread
    list_url = read_file(path_url_file)
    chunks_list = chunks(list_url, n)
    threads = []
    for index in range(0, n):
        thread = threading.Thread(target=craw, args=(index, chunks_list, path_log_file))
        thread.start()
        threads.append(thread)
    # Wait for every thread to finish, so the retry pass below does not
    # read the log file before it has been fully written
    for thread in threads:
        thread.join()


if __name__ == '__main__':
    path_url = './files/URL.csv'
    path_data = './files/data.csv'
    path_log_temp = './files/log_temp.txt'
    path_log = './files/log.txt'

    # Truncate the log files
    write_file(path_file=path_log_temp, mode='w', write_str='')
    write_file(path_file=path_log, mode='w', write_str='')

    # Write the csv header row
    title_row = [
        'URL', 'Crawl time', 'Total visits', 'Visits yesterday', 'Total articles',
        'Total patients', 'Post-consultation check-ins yesterday',
        'WeChat post-consultation check-ins', 'Total post-consultation check-ins',
        'Patient votes', 'Thank-you letters', 'Gifts', 'Last online', 'Profile opened'
    ]
    write_csv(path_file=path_data, mode='w', list_row=title_row)

    # First pass: crawl every url, with 200 threads
    main(path_url_file=path_url, n=200, path_log_file=path_log_temp)
    # Second pass: retry the urls logged as failures, with 5 threads
    main(path_url_file=path_log_temp, n=5, path_log_file=path_log)



Results

The run produces files/data.csv with one row per doctor page (the url, the crawl time, and the twelve statistics listed above); urls that still failed after the retry pass end up in files/log.txt.
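If you want to sanity-check the output, the first few rows of data.csv can be read back with the csv module the script already uses (assuming the run above has finished):

import csv

# Print the header plus the first few data rows of the output file
with open('./files/data.csv', newline='') as csv_file:
    for i, row in enumerate(csv.reader(csv_file)):
        print(row)
        if i >= 5:
            break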
