Python 爬蟲實戰(三) 獲取百度地圖搜索結果

百度地圖其實有個 API 給開發者使用: http://lbsyun.baidu.com/index.php?title=webapi/place-suggestion-api

但是呢,有一些缺點

1、需要申請ak碼才能使用

2、搜索結果比百度地圖的搜索結果少,有一些地址在 API 中根本搜索不到

所以呢,還是自己重新寫了一個百度地圖爬蟲

map_web.py 爬蟲

# -*- coding: utf-8 -*-
"""
Created on Tue Aug 20 15:22:02 2019

@author: Eric
"""
import requests
import re
from map_class import Address

def search_params(query, city, page=0):
    """Build the query-string parameters for one map search request.

    Args:
        query: Search keyword, sent as ``wd``.
        city: City/area code, sent as ``c``.
        page: Zero-based page index, sent as ``pn``; ``nn`` is sent as
            ``page * 10``.

    Returns:
        dict: The parameter mapping for the request.
    """
    return {
        'newmap': 1,
        'reqflag': 'pcmap',
        'biz': 1,
        # The original literal listed 'from' twice with the same value;
        # a dict keeps only one entry, so it is written once here.
        'from': 'webmap',
        'da_par': 'direct',
        'pcevaname': 'pc4.1',
        'qt': 's',
        'da_src': 'searchBox.button',
        'wd': query,
        'wd2': '',
        'c': city,
        'src': 0,
        'pn': page,
        'sug': 0,
        'db': 0,
        # 'l': '11' was left disabled in the original and is still not sent.
        'addr': 0,
        # NOTE(review): this value is a dict, not a JSON string -- confirm
        # how the HTTP client serialises it into the query string.
        'biz_forward': {"scaler": 1, "styles": "pl"},
        # NOTE(review): 'auth' and 't' are hard-coded; presumably copied from
        # a browser session -- confirm whether they expire.
        'auth': '2dZB4vFJNWZ8@9fL6v99La95@FOJRvx=uxHLLBNVLLztComRB199Ay1uVt1GgvPUDZYOYIZuEt2gz4yYxGccZcuVtPWv3GuRBtR9KxXwUvhgMZSguxzBEHLNRTVtcEWe1GD8zv7ucvY1SGpuxxti0XEI=1mDLYClnDjnCENRRHN@Z@EBfiKKvCMuGllhIQT',
        'device_ratio': 1,
        'tn': 'B_NORMAL_MAP',
        'nn': page * 10,
        'ie': 'utf-8',
        't': '1566370557403',
    }

def reduce(stri):
    """Extract the first brace-delimited chunk from *stri*.

    Scanning starts at the first ``{`` and stops at the ``}`` that brings
    the nesting depth back to zero. Braces are counted literally; no
    attention is paid to quoting.

    Args:
        stri: Text to scan.

    Returns:
        The substring from the first ``{`` through its matching ``}``.
        If the braces never balance, everything from the first ``{`` to
        the end of *stri* is returned. ``False`` if *stri* has no ``{``.
    """
    start = stri.find('{')
    if start == -1:
        return False

    depth = 0
    for index, char in enumerate(stri[start:], start):
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # Matching close brace found: slice once instead of
                # accumulating the result one character at a time.
                return stri[start:index + 1]
    # Unbalanced input: keep the tail.
    return stri[start:]

_BAIDU_MAP_URL = 'http://map.baidu.com/'
_RESULTS_PER_PAGE = 10   # search_params sends 'nn' as page*10
_REQUEST_TIMEOUT = 10    # seconds; without it a stalled request blocks forever


def _fetch_page_text(query, city, page):
    """Download one result page and return its text with every space removed."""
    response = requests.get(_BAIDU_MAP_URL,
                            params=search_params(query, city, page),
                            timeout=_REQUEST_TIMEOUT)
    response.encoding = 'unicode_escape'    # decode the escaped payload
    return response.text.replace(' ', '')


def _split_results(text):
    """Return the top-level ``{...}`` chunks of the page's content section."""
    # Keep what lies between the last ',"content":' and the following
    # ',"current_city"' marker.
    info = text.split(',"content":')[-1].split(',"current_city"')[0]
    chunks = []
    chunk = reduce(info)
    while chunk:
        chunks.append(chunk)
        # `chunk` begins at the first '{' of `info`, so this drops exactly
        # the chunk just taken (plus any prefix) and leaves later text,
        # including identical chunks, untouched.
        info = info.partition(chunk)[2]
        chunk = reduce(info)
    return chunks


def search(query, city, findall=False, debug=False):
    """Search the map site and return the results as ``Address`` objects.

    Args:
        query: Search keyword.
        city: City/area code.
        findall: ``True`` fetches every result page; ``False`` fetches only
            the first page.
        debug: When true, print each parsed address, and print the error
            if the search fails.

    Returns:
        list: One ``Address`` per result chunk, or ``[]`` if anything goes
        wrong (best-effort behaviour).
    """
    try:
        first_text = _fetch_page_text(query, city, 0)

        # Number of results reported by the server.
        match = re.search('total":(.*?),', first_text)
        if match is None:
            return []
        total = int(match.group(1))

        if findall:
            # Ceiling division: an exact multiple of the page size must not
            # add a trailing empty page. At least one page is always parsed.
            page_count = max(1, -(-total // _RESULTS_PER_PAGE))
        else:
            page_count = 1

        # Page 0 is already downloaded; reuse it instead of fetching again.
        results_list = _split_results(first_text)
        for page in range(1, page_count):
            results_list.extend(
                _split_results(_fetch_page_text(query, city, page)))

        addresses = []
        for result in results_list:
            address = Address()
            address.fill(result)
            addresses.append(address)
            if debug:
                print(address)
    except Exception as exc:
        # Still best-effort, but KeyboardInterrupt/SystemExit now propagate.
        if debug:
            print('search failed:', exc)
        return []
    return addresses

results_list = search('文峯', 289) # city is the area code; it can be looked up through Baidu search

        


map_class.py 通過正則匹配需要的信息 

# -*- coding: utf-8 -*-
"""
Created on Tue Aug 20 16:01:41 2019

@author: win 10
"""
import re


class Address():
    """One search result, filled from the raw text of a result chunk."""

    def __init__(self):
        self.name = ''  # place name
        self.addr = ''  # address
        self.area = ''  # district
        self.city = ''  # city
        self.tag = ''   # tag
        self.prov = ''  # province

    @staticmethod
    def _first(pattern, info):
        """Return group 1 of the first match of *pattern* in *info*, else None."""
        match = re.search(pattern, info)
        return None if match is None else match.group(1)

    def fill(self, info):
        """Populate the attributes from *info* using regular expressions.

        An attribute keeps its current value when its pattern does not match.

        Args:
            info: Raw text of one result chunk.
        """
        name = self._first(
            r'"geo_type":.*,"name":"(.*?)","navi_update_time', info)
        if name is not None:
            self.name = name

        # Prefer a non-empty std_tag; otherwise fall back to di_tag.
        std_tag = self._first(r'std_tag":"(.*?)"', info)
        if std_tag:
            self.tag = std_tag
        else:
            di_tag = self._first(r'di_tag":"(.*?)"', info)
            if di_tag is not None:
                self.tag = di_tag

        addr = self._first(r'addr":"(.*?)"', info)
        if addr is not None:
            self.addr = addr

        area = self._first(r'area_name":"(.*?)","city_id', info)
        if area is not None:
            self.area = area

        city = self._first(r'city_name":"(.*?)"', info)
        if city is not None:
            self.city = city

        # Text between the first '[' and the next '(' (a later '|' must
        # follow). Raw string: '\[', '\(' and '\|' are invalid escapes in a
        # normal string literal.
        prov = self._first(r'\[(.*?)\(.*\|', info)
        if prov is not None:
            self.prov = prov

    def __str__(self):
        """Return name, address, prov+city+area and tag on four lines."""
        return '\n'.join([
            self.name,
            self.addr,
            self.prov + self.city + self.area,
            self.tag,
        ])
    

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章