懒人畅听网,有声小说类目数据采集,多线程速采案例,Python爬虫120例之第23例

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"多线程在 Python 爬虫学习过程中应用落地,提速,提速,再提速。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"目标站点分析","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"本次要抓取的目标为懒人畅听网,其中我随机选择了一个分类,有声小说频道,其余频道可使用相同的办法抓取,增加遍历之后,可以对全站进行抓取。","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/6c/6cd8285fc6b464c1025b6cc674b4fc0b.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"列表页分页规则如下","attrs":{}},{"type":"text","text":",本次依旧只对列表页数据进行提取,只增加多线程模块 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"threading","attrs":{}}],"attrs":{}},{"type":"text","text":" 
的应用,提高采集效率。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"txt"},"content":[{"type":"text","text":"http://www.lrts.me/book/category/1/recommend/1/20\nhttp://www.lrts.me/book/category/1/recommend/2/20\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"提取规则模板如下:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"txt"},"content":[{"type":"text","text":"http://www.lrts.me/book/category/1/recommend/页码/20\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"全站页码数可以直接人工查看;如果需要动态获取,读取一下分页处的数据即可。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"最终要提取的数据如下图所示,包括书名、作者、主播三部分内容。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/a2/a23118f028da598b1a4f719a6fad4a57.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"编码时间","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"本次案例中对于多线程部分,除共享全局变量外,增加信号量机制,即限制线程并发数量。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null
},"content":[{"type":"text","text":"信号量机制的简单 Demo 如下所示:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"\nimport threading\nimport time\n\n\ndef run(n, semaphore):\n # 加锁\n semaphore.acquire()\n time.sleep(2)\n print(f'正在运行线程{n}')\n # 释放锁\n semaphore.release()\n\n\nif __name__ == '__main__':\n num = 0\n # 最多允许 3 个线程同时运行\n semaphore = threading.BoundedSemaphore(3)\n for i in range(10):\n t = threading.Thread(target=run, args=(f'线程号:{i}', semaphore))\n t.start()\n while threading.active_count() != 1:\n pass\n else:\n print('所有线程运行完毕')\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"运行代码,会发现先运行 3 个线程,再运行 3 个线程,当然同时运行的线程之间是没有先后顺序的。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/63/6352d288ed4b4ba6297db95693ae444e.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"信号量,即使用 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"threading","attrs":{}}],"attrs":{}},{"type":"text","text":" 模块的 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"BoundedSemaphore","attrs":{}}],"attrs":{}},{"type":"text","text":" 
类,该类可以设置允许一定数量的线程更改数据,即最多可同时运行几个线程。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"代码完整案例如下所示","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"import threading\nfrom threading import Lock,Thread\nimport random,requests\nfrom lxml import etree\n\ndef get_headers():\n uas = [\n \"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)\",\n ]\n ua = random.choice(uas)\n headers = {\n \"user-agent\": ua,\n \"referer\": \"https://www.baidu.com/\"\n }\n return headers\n\n\ndef run(url,semaphore):\n headers = get_headers()\n semaphore.acquire() #加锁\n res = requests.get(url,headers=headers,timeout=5)\n if res:\n text = res.text\n element = etree.HTML(text)\n titles = element.xpath('//a[@class=\"book-item-name\"]/text()')\n authors = element.xpath('//a[@class=\"author\"]/text()')\n weakens = element.xpath('//a[@class=\"g-user-shutdown\"]/text()')\n save(url,titles,authors,weakens)\n\n\n semaphore.release() #释放\n\ndef save(url,titles,authors,weakens):\n data_list = zip(titles,authors,weakens)\n for item in data_list:\n with open(\"./data.csv\",\"a+\",encoding=\"utf-8\") as f:\n f.write(f\"{item[0]},{item[1]},{item[2]}\\n\")\n print(url,\"该URL地址数据写入完毕\")\nif __name__== '__main__':\n lock = Lock()\n url_format = 'https://www.lrts.me/book/category/1/recommend/{}/20'\n # 拼接URL,全局共享变量\n urls = [url_format.format(i) for i in range(1, 1372)]\n l = []\n semaphore = threading.BoundedSemaphore(5) # 最多允许5个线程同时运行\n for url in urls:\n t = threading.Thread(target=run,args=(url,semaphore))\n t.start()\n while threading.active_count() !=1:\n pass\n else:\n 
print('所有线程运行完毕')\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"代码中 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"threading.active_count()","attrs":{}}],"attrs":{}},{"type":"text","text":" 部分,用于检测除主线程外是否还存在活跃线程,如无,程序结束。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"运行代码,得到如下结果,至此第 23 例已经学习完毕。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/1a/1a936d52a78819d45d0c95621492f293.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"收藏时间","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"代码仓库地址:","attrs":{}},{"type":"link","attrs":{"href":"https://codechina.csdn.net/hihell/python120","title":"","type":null},"content":[{"type":"text","text":"https://codechina.csdn.net/hihell/python120","attrs":{}}]},{"type":"text","text":",去给个关注或者 Star 
吧。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https://download.csdn.net/download/hihell/21593682","title":"","type":null},"content":[{"type":"text","text":"https://download.csdn.net/download/hihell/21593682","attrs":{}}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"来都来了,不发个评论、点个赞、收个藏吗?","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"blockquote","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"今天是持续写作的第 203 / 365 天。可以关注我、点赞我、评论我、收藏我啦。","attrs":{}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":"none"},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"horizontalrule","attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。
相關文章