寫在前面的話,還處於爬蟲初期,很多東西一知半解,邊學邊記錄,邊學邊做。代碼寫的自己都看不下去了。。。。
本期重點,美團網商鋪數據,暫只抓了美食商家數據。先上戰果,暫只抓了10萬條,一小時左右,未對數據去重。大概思路如下,先抓取各個省份城市,然後獲取其經緯度,最後構造參數,翻頁拿取數據。抓取結果
- 獲取各個城市名字,id。第一個地址(https://www.meituan.com/ptapi/getprovincecityinfo/)
- 獲取各個城市經緯度,構造參數。第二個地址(https://apis.map.qq.com/jsapi?qt=poi&wd=西安&pn=0&rn=10&rich_source=qipao&rich=web&nj=0&c=1&key=FBOBZ-VODWU-C7SVF-B2BDI-UK3JE-YBFUS&output=jsonp&pf=jsapi&ref=jsapi&cb=qq.maps._svcb3.search_service_0&)
- 構建請求參數,翻頁獲取數據。第三個地址(https://www.meituan.com/meishi/api/poi/getNearPoiList?offset=0&limit=10&cityId=1&lat=39.950256&lng=116.34784)此地址瀏覽器拿不到數據,需要用到postman測試,是個坑
抓包的查找接口的過程如下
再切換城市欄目可以通過chrome抓包拿到所有城市的id,name,便於我們下一步去獲取經緯度參數,地址見第一條
然後到首頁點擊美食,隨便點進一個商家 ,抓包拿到下面信息。
postman測試參數如下圖。
有個小坑需要注意,headers請求頭如下圖
返回數據如下圖
至此我們已經知曉大概流程。以及關鍵參數,下面就是代碼實現模塊。代碼中涉及到的數據庫連接方式。查詢、插入操作都需要重寫,代碼中爲自己封裝的一些默認連接。
import requests
import json
import re
import time
import logging
logging.captureWarnings(True)
from multiprocessing import Pool
class MeiTuan:
    """Crawler helpers for Meituan food-shop data.

    Workflow: fetch all province/city ids from meituan.com, then resolve each
    city's longitude/latitude through the Tencent map JSONP API, storing the
    intermediate results in MongoDB (project-local ``Mongo`` wrapper).
    """

    def __init__(self):
        # Headers meituan.com requires; without Referer/User-Agent the API
        # rejects the request (discovered via postman, see write-up above).
        self.headers = {
            "Content-Type": "application/json;charset=utf-8",
            "Host": "www.meituan.com",
            "Referer": "https://www.meituan.com/meishi/4813791/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
        }
        self.re_w_c = re.compile(r'"city":(.*?)"area')
        # Project-local Mongo wrapper -- adjust the connection for your setup.
        self.collect = Mongo(db_name='meituan', collecttion_name='shop')

    def all_response(self, url):
        """GET *url* with the shared headers and return the raw response.

        TLS verification is deliberately disabled; the resulting urllib3
        warnings are silenced by logging.captureWarnings at module import.
        """
        return requests.get(url=url, headers=self.headers, verify=False)

    def get_city_id(self):
        """Fetch every province's city list and store it in mongo (meituan/shop).

        Each stored document looks like
        ``{'provinceName': ..., 'city': [{name: id}, ...]}`` with both the
        city name and id kept as strings.
        """
        url = "https://www.meituan.com/ptapi/getprovincecityinfo/"
        provinces_list = json.loads(self.all_response(url=url).text)
        for province in provinces_list:
            city_info = {
                'provinceName': province['provinceName'],
                'city': [{str(city['name']): str(city['id'])}
                         for city in province['cityInfoList']],
            }
            # Project-local insert API -- rewrite for your own storage layer.
            self.collect.collect.insert_one(city_info)

    def get_city_weidu(self):
        """Resolve each stored city's lng/lat via the Tencent map JSONP API.

        Reads the documents written by get_city_id, enriches every
        ``{name: id}`` entry with 'lng'/'lat', and writes the result to the
        ``new_shop`` collection. Cities the map API cannot resolve are
        skipped instead of aborting the whole run.
        """
        url = 'https://apis.map.qq.com/jsapi'
        # The response is JSONP: "callback && callback({...})"; strip the
        # wrapper before parsing.
        jsonp_prefix = 'qq.maps._svcb3.search_service_0 && qq.maps._svcb3.search_service_0('
        collect = Mongo(db_name='meituan', collecttion_name='new_shop')
        for city_doc in self.collect.collect.find():  # project-local query API
            print(city_doc)
            city_info = {'provinceName': city_doc['provinceName'], 'city': []}
            for name in city_doc['city']:
                new_name = list(name.keys())[0]
                params = {
                    "qt": "poi",
                    "wd": new_name,
                    "pn": "0",
                    "rn": "10",
                    "rich_source": "qipao",
                    "rich": "web",
                    "nj": "0",
                    "c": "1",
                    "key": "FBOBZ-VODWU-C7SVF-B2BDI-UK3JE-YBFUS",
                    "output": "jsonp",
                    "pf": "jsapi",
                    "ref": "jsapi",
                    "cb": "qq.maps._svcb3.search_service_0",
                }
                response = requests.get(url=url, params=params)
                payload = response.text.replace(jsonp_prefix, "")
                try:
                    result = json.loads(payload[:-1])  # drop trailing ')'
                    name.update({
                        'lng': result['detail']['city']['pointx'],
                        'lat': result['detail']['city']['pointy'],
                    })
                    city_info['city'].append(name)
                except (ValueError, KeyError, TypeError):
                    # Unresolvable city or malformed JSONP -- skip this entry.
                    pass
            print(city_info)
            collect.collect.insert_one(city_info)
然後我們用多進程,測試10個進程,一小時10萬條數據,只爬了很少一部分,未封ip
def ceshi(data):
    """Crawl the near-poi shop list for every city of one province document.

    *data* is a mongo document produced by MeiTuan.get_city_weidu:
    ``{'provinceName': ..., 'city': [{name: id, 'lat': ..., 'lng': ...}, ...]}``.

    A redis set named "old" is used as a resume marker keyed by
    province + city + page number, so the crawl can be stopped and restarted
    without refetching pages already seen.
    """
    con = redis_r()  # project-local redis connection
    # The client MUST be created inside the worker: the original code relied
    # on a module-level `client` that only exists with fork-based
    # multiprocessing and raises NameError under the spawn start method.
    client = Mongo(db_name='meituan', collecttion_name='shop_info')
    headers = {
        "Content-Type": "application/json;charset=utf-8",
        "Host": "www.meituan.com",
        "Referer": "https://www.meituan.com/meishi/4813791/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    start_url = 'https://www.meituan.com/meishi/api/poi/getNearPoiList'
    for city in data['city']:
        city_name = list(city.keys())[0]
        city_id = city[city_name]
        lat = city['lat']
        lng = city['lng']
        num = 0
        while True:
            params = {
                "offset": str(num),
                "limit": "10",
                "cityId": str(city_id),
                "lat": str(lat),
                "lng": str(lng),
            }
            # sadd returns 1 only for a flag not seen before -> pages already
            # crawled in a previous run are skipped (resume support).
            flag = "%s%s%s" % (data['provinceName'], city_name, num)
            if con.sadd("old", flag) == 1:
                response = requests.get(url=start_url, params=params, headers=headers)
                print(response.url, "---------")
                time.sleep(2)  # throttle: no proxy pool, avoid getting banned
                try:
                    print(response.text)
                    datas = json.loads(response.text)
                    # Empty (or null) data means this city is exhausted.
                    if not datas['data']:
                        break
                    for shop in datas['data']:
                        record = {'provinceName': data['provinceName'], "city_name": city_name}
                        record.update(shop)
                        print(record)
                        print("當前省份%s當前城市%s目前第%s頁" % (data['provinceName'], city_name, num))
                        client.collect.insert_one(record)
                except (ValueError, KeyError):
                    # Malformed/blocked response: skip the page; the redis
                    # flag stays set so we do not retry it forever.
                    pass
            num += 1
if __name__ == '__main__':
    # Load every province document (cities with lng/lat) written by
    # MeiTuan.get_city_weidu; materialize before forking so each worker
    # receives a plain dict rather than a live cursor.
    client = Mongo(db_name='meituan', collecttion_name='new_shop')
    provinces_list = list(client.collect.find())
    # One async task per province, 10 worker processes.
    # NOTE: the unused `mt = MeiTuan()` from the original was dropped -- it
    # only opened a second DB connection without being referenced.
    pool = Pool(10)
    for provinces in provinces_list:
        pool.apply_async(ceshi, args=(provinces,))
    pool.close()
    pool.join()
至此,美團的爬蟲已經完結