谁有粉?就爬谁!他粉多,就爬他!Python 多线程采集 260000+ 粉丝数据

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"今天你想爬谁的粉呢?谁粉多,就爬谁。那谁有粉?沉默王二有粉。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"今天咱们继续学习 Python 爬虫,从本篇博客开始进行短暂的(15 篇)","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"多线程爬虫学习","attrs":{}},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"第一篇就要采集 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"@沉默王二","attrs":{}}],"attrs":{}},{"type":"text","text":" 的粉丝,坐拥 27W+ 读者,属实让人羡慕。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"目标数据源分析","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"本次要抓取的数据源是 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"https://blog.csdn.net/qing_gee?type=sub&subType=fans","attrs":{}}],"attrs":{}},{"type":"text","text":",其中的 ID 可以切换为你希望采集的 ID,当然包括你自己的 ID。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"该页面下滑刷新会自动请求一个 API 接口,即 
","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"https://blog.csdn.net/community/home-api/v1/get-fans-list?page=3&size=20&noMore=false&blogUsername=qing_gee","attrs":{}}],"attrs":{}},{"type":"text","text":",其中参数如下:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"page","attrs":{}}],"attrs":{}},{"type":"text","text":":页码,根据目标人粉丝总数 / 20 计算获取即可;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"size","attrs":{}}],"attrs":{}},{"type":"text","text":":每页数据,默认值 20;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"noMore","attrs":{}}],"attrs":{}},{"type":"text","text":":无用;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"codeinline","content":[{"type":"text","text":"blogUsername","attrs":{}}],"attrs":{}},{"type":"text","text":":博客用户名","attrs":{}}]}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"同时在测试接口过程中,接口会返回异常数据,实测增加一个延时控制,可以大幅度提高接口数据返回稳定性。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"{'code': 400, 'message': 'fail', 'data': 
None}\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"正常接口数据返回如下图所示:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/37/37b36335ea01bd668dbdb46d02e02c1a.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"使用技术点说明","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"本次采用 Python 多线程实现数据的采集,编码使用 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"threading","attrs":{}}],"attrs":{}},{"type":"text","text":" 模块进行多线程控制,本系列专栏从最简单的多线程开始进行学习,例如本例,一次性发起 5(可自定义)个请求。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"完整代码如下所示,代码说明请参考注释部分与尾部说明","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"import threading\nfrom threading import Lock, Thread\nimport time\nimport os\nimport requests\nimport random\n\n\nclass MyThread(threading.Thread):\n def __init__(self, name):\n super(MyThread, self).__init__()\n self.name = name\n\n def run(self):\n global urls\n lock.acquire()\n one_url = urls.pop()\n print(\"正在爬取:\", one_url)\n lock.release()\n print(\"任意线程等待随机时间\")\n time.sleep(random.randint(1,3))\n res = requests.get(one_url, headers=self.get_headers(), timeout=5)\n\n if res.json()[\"code\"] != 400:\n data = res.json()[\"data\"][\"list\"]\n for user in data:\n 
name = user['username']\n nickname = self.remove_character(user['nickname'])\n userAvatar = user['userAvatar']\n blogUrl = user['blogUrl']\n blogExpert = user['blogExpert']\n briefIntroduction = self.remove_character(\n user['briefIntroduction'])\n\n with open('./qing_gee_data.csv', 'a+', encoding='utf-8') as f:\n print(f'{name},{nickname},{userAvatar},{blogUrl},{blogExpert},{briefIntroduction}')\n f.write(f\"{name},{nickname},{userAvatar},{blogUrl},{blogExpert},{briefIntroduction}\\n\")\n else:\n print(res.json())\n print(\"异常数据\", one_url)\n with open('./error.txt', 'a+', encoding='utf-8') as f:\n f.write(one_url+\"\\n\")\n # 去除特殊字符\n\n def remove_character(self, origin_str):\n if origin_str is None:\n return\n origin_str = origin_str.replace('\\n', '')\n origin_str = origin_str.replace(',', ',')\n return origin_str\n # 获取随机UA请求头\n def get_headers(self):\n uas = [\n \"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)\"\n ]\n ua = random.choice(uas)\n # 特别注意下述 cookie 部分,需要手动从开发者工具中进行复制,否则抓取到的数据,缺少nikename 与个人简介部分\n headers = {\n \"user-agent\": ua,\n 'cookie': 'UserName=你的ID; UserInfo=你的UserInfo; UserToken=你的UserToken;',\n \"referer\": \"https://blog.csdn.net/qing_gee?type=sub&subType=fans\"\n }\n return headers\n\n\nif __name__ == '__main__':\n lock = Lock()\n url_format = 'https://blog.csdn.net/community/home-api/v1/get-fans-list?page={}&size=20&noMore=false&blogUsername=qing_gee'\n urls = [url_format.format(i) for i in range(1, 13300)]\n l = []\n while len(urls) > 0:\n print(len(urls))\n for i in range(5):\n p = MyThread(\"t\"+str(i))\n l.append(p)\n p.start()\n for p in l:\n 
p.join()\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"代码运行结果如下图所示:","attrs":{}}]},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/27/2767954f3f5d4b06ded5d8866d01c740.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"上述代码用到了多线程,也同时用到了线程锁,简单的多线程代码可以抽象为下述内容。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"简单的多线程代码:","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"import threading\nimport time\n\ndef run(n):\n print('task', n)\n time.sleep(3)\n\nif __name__ == '__main__':\n t1 = threading.Thread(target=run, args=('t1',))\n t2 = threading.Thread(target=run, args=('t2',))\n t1.start()\n t2.start()\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"其中比较核心的代码是 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"threading.Thread","attrs":{}}],"attrs":{}},{"type":"text","text":",参数 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"target","attrs":{}}],"attrs":{}},{"type":"text","text":" 后面的值是函数名,","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"args","attrs":{}}],"attrs":{}},{"type":"text","text":" 
是传递的参数,注意必须为元组类型。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"爬虫代码还是用了共享全局变量,简化代码如下所示,其中重点学习 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"lock=Lock()","attrs":{}}],"attrs":{}},{"type":"text","text":" 部分代码,以及在使用全局变量前后的 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"lock.acquire()","attrs":{}}],"attrs":{}},{"type":"text","text":" 和 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"lock.release()","attrs":{}}],"attrs":{}},{"type":"text","text":"。其中还用到了线程的 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"join","attrs":{}}],"attrs":{}},{"type":"text","text":" 方法,该方法主要是为了让主线程等待子线程执行。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":"python"},"content":[{"type":"text","text":"import threading\nfrom threading import Lock,Thread\nimport time,os\n\ndef work():\n global urls\n lock.acquire()\n # 获取一个 url\n one_url = urls.pop()\n lock.release()\n\n print(\"得到的 URL 为\",one_url)\n\n\nif __name__ == '__main__':\n lock = Lock()\n url_format = 'https://blog.csdn.net/community/home-api/v1/get-fans-list?page={}&size=20&noMore=false&blogUsername=qing_gee'\n # 拼接URL,全局共享变量\n urls = [url_format.format(i) for i in range(1, 13300)]\n l = []\n # 开启线程数量\n for i in range(3):\n p = Thread(target=work)\n l.append(p)\n p.start()\n for p in l:\n 
p.join()\n","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"拿到这些数据,可以针对性的去描述一个作者的用户画像了,本部分在后续的博客中为大家单独开一篇详细介绍。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"代码在数据清理部分,还有优化的空间,由于设置了 13300 页数据,故最终抓取到 26W+数据,查询了一下,存在 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"梦想橡皮擦","attrs":{}}],"attrs":{}},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/01/0113338f8834c202e644bdece8171a0c.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"关注者中至少有 83 位博客专家,可以看到博客专家的个人简介写的都比较清楚,同时发现 ","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"jiangtao","attrs":{}}],"attrs":{}},{"type":"text","text":"(CSDN 
创始人)也在其中。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/08/08d6cd09bb4b85c75f28504c4df92510.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"收藏时间","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"代码下载地址:","attrs":{}},{"type":"link","attrs":{"href":"https://codechina.csdn.net/hihell/python120","title":"","type":null},"content":[{"type":"text","text":"https://codechina.csdn.net/hihell/python120","attrs":{}}]},{"type":"text","text":",可否给个 Star?","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"来都来了,不发个评论,点个赞,收个藏吗?","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。
相關文章