誰有粉?就爬誰!他粉多,就爬他!Python 多線程採集 260000+ 粉絲數據

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"今天你想爬誰的粉呢?誰粉多,就爬誰。那誰有粉?沉默王二有粉。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"今天咱們繼續學習 Python 爬蟲,從本篇博客開始進行短暫的(15 篇)","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"多線程爬蟲學習","attrs":{}},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"第一篇就要採集 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"@沉默王二","attrs":{}}],"attrs":{}},{"type":"text","text":" 的粉絲,坐擁 27W+ 讀者,屬實讓人羨慕。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"目標數據源分析","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"本次要抓取的數據源是 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"https://blog.csdn.net/qing_gee?type=sub&subType=fans","attrs":{}}],"attrs":{}},{"type":"text","text":",其中的 ID 可以切換爲你希望採集的 ID,當然包括你自己的 ID。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"該頁面下滑刷新會自動請求一個 API 接口,即 
","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"https://blog.csdn.net/community/home-api/v1/get-fans-list?page=3&size=20&noMore=false&blogUsername=qing_gee","attrs":{}}],"attrs":{}},{"type":"text","text":",其中參數如下:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"page","attrs":{}}],"attrs":{}},{"type":"text","text":":頁碼,根據目標人粉絲總數 / 20 計算獲取即可;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"size","attrs":{}}],"attrs":{}},{"type":"text","text":":每頁數據,默認值 20;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"noMore","attrs":{}}],"attrs":{}},{"type":"text","text":":無用;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"blogUsername","attrs":{}}],"attrs":{}},{"type":"text","text":":博客用戶名","attrs":{}}]}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"同時在測試接口過程中,接口會返回異常數據,實測增加一個延時控制,可以大幅度提高接口數據返回穩定性。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"{'code': 400, 'message': 'fail', 'data': 
None}\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"正常接口數據返回如下圖所示:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/37/37b36335ea01bd668dbdb46d02e02c1a.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"使用技術點說明","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"本次採用 Python 多線程實現數據的採集,編碼使用 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"threading","attrs":{}}],"attrs":{}},{"type":"text","text":" 模塊進行多線程控制,本系列專欄從最簡單的多線程開始進行學習,例如本例,一次性發起 5(可自定義)個請求。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"完整代碼如下所示,代碼說明請參考註釋部分與尾部說明","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"import threading\nfrom threading import Lock, Thread\nimport time\nimport os\nimport requests\nimport random\n\n\nclass MyThread(threading.Thread):\n def __init__(self, name):\n super(MyThread, self).__init__()\n self.name = name\n\n def run(self):\n global urls\n lock.acquire()\n one_url = urls.pop()\n print(\"正在爬取:\", one_url)\n lock.release()\n print(\"任意線程等待隨機時間\")\n time.sleep(random.randint(1,3))\n res = requests.get(one_url, headers=self.get_headers(), timeout=5)\n\n if res.json()[\"code\"] != 400:\n data = res.json()[\"data\"][\"list\"]\n for user in data:\n 
name = user['username']\n nickname = self.remove_character(user['nickname'])\n userAvatar = user['userAvatar']\n blogUrl = user['blogUrl']\n blogExpert = user['blogExpert']\n briefIntroduction = self.remove_character(\n user['briefIntroduction'])\n\n with open('./qing_gee_data.csv', 'a+', encoding='utf-8') as f:\n print(f'{name},{nickname},{userAvatar},{blogUrl},{blogExpert},{briefIntroduction}')\n f.write(f\"{name},{nickname},{userAvatar},{blogUrl},{blogExpert},{briefIntroduction}\\n\")\n else:\n print(res.json())\n print(\"異常數據\", one_url)\n with open('./error.txt', 'a+', encoding='utf-8') as f:\n f.write(one_url+\"\\n\")\n # 去除特殊字符\n\n def remove_character(self, origin_str):\n if origin_str is None:\n return\n origin_str = origin_str.replace('\\n', '')\n origin_str = origin_str.replace(',', ',')\n return origin_str\n # 獲取隨機UA請求頭\n def get_headers(self):\n uas = [\n \"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)\"\n ]\n ua = random.choice(uas)\n # 特別注意下述 cookie 部分,需要手動從開發者工具中進行復制,否則抓取到的數據,缺少nikename 與個人簡介部分\n headers = {\n \"user-agent\": ua,\n 'cookie': 'UserName=你的ID; UserInfo=你的UserInfo; UserToken=你的UserToken;',\n \"referer\": \"https://blog.csdn.net/qing_gee?type=sub&subType=fans\"\n }\n return headers\n\n\nif __name__ == '__main__':\n lock = Lock()\n url_format = 'https://blog.csdn.net/community/home-api/v1/get-fans-list?page={}&size=20&noMore=false&blogUsername=qing_gee'\n urls = [url_format.format(i) for i in range(1, 13300)]\n l = []\n while len(urls) > 0:\n print(len(urls))\n for i in range(5):\n p = MyThread(\"t\"+str(i))\n l.append(p)\n p.start()\n for p in l:\n 
p.join()\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"代碼運行結果如下圖所示:","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/27/2767954f3f5d4b06ded5d8866d01c740.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"上述代碼用到了多線程,也同時用到了線程鎖,簡單的多線程代碼可以抽象爲下述內容。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"簡單的多線程代碼:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"import threading\nimport time\n\ndef run(n):\n print('task', n)\n time.sleep(3)\n\nif __name__ == '__main__':\n t1 = threading.Thread(target=run, args=('t1',))\n t2 = threading.Thread(target=run, args=('t2',))\n t1.start()\n t2.start()\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"其中比較核心的代碼是 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"threading.Thread","attrs":{}}],"attrs":{}},{"type":"text","text":",參數 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"target","attrs":{}}],"attrs":{}},{"type":"text","text":" 後面的值是函數名,","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"args","attrs":{}}],"attrs":{}},{"type":"text","text":" 
是傳遞的參數,注意需要傳入元組(或列表),只有一個參數時要寫成 ('t1',) 的形式,逗號不能省略。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"爬蟲代碼還是用了共享全局變量,簡化代碼如下所示,其中重點學習 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"lock=Lock()","attrs":{}}],"attrs":{}},{"type":"text","text":" 部分代碼,以及在使用全局變量前後的 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"lock.acquire()","attrs":{}}],"attrs":{}},{"type":"text","text":" 和 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"lock.release()","attrs":{}}],"attrs":{}},{"type":"text","text":"。其中還用到了線程的 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"join","attrs":{}}],"attrs":{}},{"type":"text","text":" 方法,該方法主要是爲了讓主線程等待子線程執行完畢後再繼續往下運行。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"import threading\nfrom threading import Lock,Thread\nimport time,os\n\ndef work():\n global urls\n lock.acquire()\n # 獲取一個 url\n one_url = urls.pop()\n lock.release()\n\n print(\"得到的 URL 爲\",one_url)\n\n\nif __name__ == '__main__':\n lock = Lock()\n url_format = 'https://blog.csdn.net/community/home-api/v1/get-fans-list?page={}&size=20&noMore=false&blogUsername=qing_gee'\n # 拼接URL,全局共享變量\n urls = [url_format.format(i) for i in range(1, 13300)]\n l = []\n # 開啓線程數量\n for i in range(3):\n p = Thread(target=work)\n l.append(p)\n p.start()\n for p in l:\n 
p.join()\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"拿到這些數據,可以針對性的去描述一個作者的用戶畫像了,本部分在後續的博客中爲大家單獨開一篇詳細介紹。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"代碼在數據清理部分,還有優化的空間,由於設置了 13300 頁數據,故最終抓取到 26W+數據,查詢了一下,存在 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"夢想橡皮擦","attrs":{}}],"attrs":{}},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/01/0113338f8834c202e644bdece8171a0c.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"關注者中至少有 83 位博客專家,可以看到博客專家的個人簡介寫的都比較清楚,同時發現 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"jiangtao","attrs":{}}],"attrs":{}},{"type":"text","text":"(CSDN 
創始人)也在關注者之列。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/08/08d6cd09bb4b85c75f28504c4df92510.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"收藏時間","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"代碼下載地址:","attrs":{}},{"type":"link","attrs":{"href":"https://codechina.csdn.net/hihell/python120","title":"","type":null},"content":[{"type":"text","text":"https://codechina.csdn.net/hihell/python120","attrs":{}}]},{"type":"text","text":",可否給個 Star。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"來都來了,不發個評論,點個贊,收個藏嗎?","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布。
相關文章