python 使用lxml requests抓取某網站的幫助文檔-----多線程處理

對於多線程的處理,我們先考慮腳本自身,所有的抓取都是基於categoryid的,所以我們可以考慮做一個任務隊列:生產者負責把id推送到隊列(Queue.put),消費者負責從隊列讀取id(Queue.get()),來進行文件的讀取和下載。

這樣,我們就要改造之前的腳本,先定義一個隊列Q_example=Queue() ,在categoryId函數中,把獲取到的id都put到Q_example中。然後再改造download()函數,去掉循環逐個獲取,改爲把id推送過來。最後的實現代碼如下:

# coding:utf-8
"""
author:@
"""
import requests
import urllib.request as ur
from lxml import etree
import  os
from lxml import etree
import threading
from queue import Queue
# Common request headers: mimic a Chrome browser and ask for JSON so the
# help-site endpoints answer the same way they do for the real web UI.
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
"Accept-Language": "zh-CN,zh;q=0.9",
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
}
"獲取 文檔iD"
# Shared task queue: categoryId() produces document category ids,
# consumer() (run on worker threads) consumes them.
Q_example=Queue()
def categoryId():
    """Fetch the help-site category tree and collect leaf category ids.

    Every id found is also pushed onto the shared queue ``Q_example`` so
    that worker threads can consume them later.

    Returns:
        list: ids of all second-level menu entries under the ERP branch.
    """
    response = requests.get("http://help.tongtool.com/category/buildCategoryTreeHavingChildren")
    tree = response.json()
    # childrenList[0] is the ERP branch; [1] would be the listing branch.
    erp_branch = tree.get('datas')[0].get('childrenList')[0]

    # Second-level menu entries that hold the actual help documents.
    menu_entries = erp_branch.get('childrenList')[0].get('childrenList')
    collected = []
    for entry in menu_entries:
        # Walk the children of each first-level menu item.
        for child in entry.get('childrenList'):
            doc_id = child.get('id')
            collected.append(doc_id)
            Q_example.put(doc_id)
    return collected
def consumer(id):
    """Download one help-document category: save its text and images.

    Fetches the document list for the given category id, writes each
    document's span text to d:\\tmp\\<title>.txt and downloads its images
    into d:\\tmp\\<title>\\.

    Args:
        id: a category id produced by categoryId().
    """
    page = requests.get(
        "http://help.tongtool.com/docs/listDocsPage?categoryId={0}&pageSize=20&pageNum=1".format(id),
        headers=headers)
    result = page.json()
    try:
        contents = result.get('datas').get('list')
    except AttributeError:
        # Unexpected payload shape ('datas' missing/None): report and stop.
        # The original bare except fell through with 'contents' unbound and
        # crashed on the loop below with a NameError.
        print(page.url)
        return
    for content in contents:
        # One entry per document; the title doubles as folder/file name.
        title = content.get('title')
        if not os.path.exists('d://tmp//{0}'.format(title)):
            try:
                os.makedirs('d://tmp//{0}'.format(title))
            except OSError:
                # Title contains characters invalid in a Windows path;
                # keep only the part before the first colon.
                title = title.split(":")[0]
                os.makedirs('d://tmp//{0}'.format(title))

        content_text = content.get('content')
        html = etree.HTML(content_text)
        # Plain-text body: every <span> text node, one per line.
        html_contents = html.xpath("//span/text()")
        file_text = '{0}.txt'.format(title)
        all_contents = ''.join(line + '\n' for line in html_contents)
        # 'with' guarantees the handle is closed (the original leaked it).
        with open('d:\\tmp\\{0}'.format(file_text), 'w', encoding='utf-8') as file:
            file.write(all_contents)
        # Images referenced by the document.
        html_pages = html.xpath("//img/@src")
        for img_src in html_pages:
            filename = img_src.split('/')[-1]
            print('準備下載:' + img_src)
            if 'help' not in img_src:
                print('過濾鏈接:' + img_src)
                continue
            if "http" not in img_src:
                # Protocol-relative URL ("//host/..."): prepend the scheme.
                img_src = "http:" + img_src
            ur.urlretrieve(img_src, os.path.join("d://tmp", title, filename))

def download():
    """Single-threaded variant: process every category id sequentially.

    The original body was a near-verbatim copy of consumer() (minus its
    error guard and with the same unclosed-file leak); delegating to
    consumer() removes the duplication and keeps the two paths consistent.
    """
    categoryIds = categoryId()
    for id in categoryIds:
        consumer(id)

if __name__ == '__main__':
    import time
    # Record wall-clock start/end to compare against the single-threaded run.
    start_time = time.strftime("%H:%M:%S", time.localtime())
    # Bug fix: original used "%H:%m:%S" — %m is the MONTH; minutes are %M.
    print(time.strftime("%H:%M:%S", time.localtime()))
    ids = categoryId()
    print(Q_example.qsize())
    while Q_example.qsize() != 0:
        print('length is ', Q_example.qsize())
        threads = []
        # Drain at most 10 ids per round; on the last round the queue may
        # hold fewer than 10, and a blind range(10) would block forever in
        # Q_example.get().
        q_i = min(Q_example.qsize(), 10)
        for i in range(q_i):
            value = Q_example.get()
            print('value is ', value)
            t = threading.Thread(target=consumer, args=(value,))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    end_time = time.strftime("%H:%M:%S", time.localtime())
    print('start time : ', start_time)
    print('end time : ', end_time)

兩者的運行時間結果對比如下:

多線程:start time :  10:50:03  end time :  10:50:28  共計25S

單線程:start time :  10:36:49 end time :  10:37:56 共計67S

其中代碼的這一部分,最初的寫法是直接 for i in range(10)。後來運行時發現代碼始終不會結束:在最後一輪時,若隊列長度小於10,Queue.get() 取不到數據便會一直阻塞,程序就停在那裏了。

        if q_length>10:
            q_i=10
        else:
            q_i=q_length
        for i in range(q_i):
            value=Q_example.get()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章