懶人暢聽網,有聲小說類目數據採集,多線程速採案例,Python爬蟲120例之23例

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"多線程在 Python 爬蟲學習過程中應用落地,提速,提速,再提速。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"目標站點分析","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"本次要抓取的目標爲懶人暢聽網,其中我隨機選擇了一個分類,有聲小說頻道,其餘頻道可使用相同的辦法抓取,增加遍歷之後,可以對全站進行抓取。","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/6c/6cd8285fc6b464c1025b6cc674b4fc0b.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"列表頁分頁規則如下","attrs":{}},{"type":"text","text":"(見下方示例)。本次依舊只對列表頁數據進行提取,只增加多線程模塊 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"threading","attrs":{}}],"attrs":{}},{"type":"text","text":" 
的應用,提高採集效率。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"txt"},"content":[{"type":"text","text":"http://www.lrts.me/book/category/1/recommend/1/20\nhttp://www.lrts.me/book/category/1/recommend/2/20\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"提取規則模板如下:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"txt"},"content":[{"type":"text","text":"http://www.lrts.me/book/category/1/recommend/頁碼/20\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"全站頁碼數可以直接人眼讀取;如果需要動態獲取,提前讀取一下分頁處的數據即可。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"最終要提取的數據如下圖所示,包括書名、作者、主播三部分內容。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/a2/a23118f028da598b1a4f719a6fad4a57.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"編碼時間","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"本次案例中對於多線程部分,除共享全局變量外,增加信號量機制,即限制線程併發數量。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null
},"content":[{"type":"text","text":"信號量機制的簡單 Demo 如下所示:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"\nimport threading\nimport time\n\n\ndef run(n, semaphore):\n    # 加鎖\n    semaphore.acquire()\n    time.sleep(2)\n    print(f'正在運行線程{n}')\n    # 釋放鎖\n    semaphore.release()\n\n\nif __name__ == '__main__':\n    num = 0\n    # 最多允許 3 個線程同時運行\n    semaphore = threading.BoundedSemaphore(3)\n    for i in range(10):\n        t = threading.Thread(target=run, args=(f'線程號:{i}', semaphore))\n        t.start()\n    while threading.active_count() != 1:\n        pass\n    else:\n        print('所有線程運行完畢')\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"運行代碼,會發現先運行 3 個線程,再運行 3 個線程,當然同時運行的線程之間是沒有先後順序的。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/63/6352d288ed4b4ba6297db95693ae444e.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"信號量,即使用 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"threading","attrs":{}}],"attrs":{}},{"type":"text","text":" 模塊的 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"BoundedSemaphore","attrs":{}}],"attrs":{}},{"type":"text","text":" 
類,該類可以設置允許一定數量的線程更改數據,即最多可同時運行幾個線程。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"代碼完整案例如下所示","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"import threading\nfrom threading import Lock,Thread\nimport random,requests\nfrom lxml import etree\n\ndef get_headers():\n    uas = [\n        \"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)\",\n    ]\n    ua = random.choice(uas)\n    headers = {\n        \"user-agent\": ua,\n        \"referer\": \"https://www.baidu.com/\"\n    }\n    return headers\n\n\ndef run(url,semaphore):\n    headers = get_headers()\n    semaphore.acquire() #加鎖\n    res = requests.get(url,headers=headers,timeout=5)\n    if res:\n        text = res.text\n        element = etree.HTML(text)\n        titles = element.xpath('//a[@class=\"book-item-name\"]/text()')\n        authors = element.xpath('//a[@class=\"author\"]/text()')\n        weakens = element.xpath('//a[@class=\"g-user-shutdown\"]/text()')\n        save(url,titles,authors,weakens)\n\n\n    semaphore.release() #釋放\n\ndef save(url,titles,authors,weakens):\n    data_list = zip(titles,authors,weakens)\n    for item in data_list:\n        with open(\"./data.csv\",\"a+\",encoding=\"utf-8\") as f:\n            f.write(f\"{item[0]},{item[1]},{item[2]}\\n\")\n    print(url,\"該URL地址數據寫入完畢\")\nif __name__== '__main__':\n    lock = Lock()\n    url_format = 'https://www.lrts.me/book/category/1/recommend/{}/20'\n    # 拼接URL,全局共享變量\n    urls = [url_format.format(i) for i in range(1, 1372)]\n    l = []\n    semaphore = threading.BoundedSemaphore(5) # 最多允許5個線程同時運行\n    for url in urls:\n        t = threading.Thread(target=run,args=(url,semaphore))\n        t.start()\n    while threading.active_count() !=1:\n        pass\n    else:\n        print('所有線程運行完畢')\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"代碼中 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"threading.active_count()","attrs":{}}],"attrs":{}},{"type":"text","text":" 部分,用於檢測除主線程外是否還存在活躍線程,如無,程序結束。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"運行代碼,得到如下結果,至此第 23 例已經學習完畢。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/1a/1a936d52a78819d45d0c95621492f293.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"收藏時間","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"代碼倉庫地址:","attrs":{}},{"type":"link","attrs":{"href":"https://codechina.csdn.net/hihell/python120","title":"","type":null},"content":[{"type":"text","text":"https://codechina.csdn.net/hihell/python120","attrs":{}}]},{"type":"text","text":",去給個關注或者 Star 
吧。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https://download.csdn.net/download/hihell/21593682","title":"","type":null}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"來都來了,不發個評論,點個贊,收個藏嗎?","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"今天是持續寫作的第 203/ 365 天。可以關注我,點讚我、評論我、收藏我啦。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":"none"},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"horizontalrule","attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人嗎?請在上方評論欄輸入並點擊發布。
相關文章