python代理ip抓取大衆點評

抓大衆點評才抓了幾頁就被屏蔽,找到如下方法解決。

第一步:獲取代理ip

在 http://www.xicidaili.com/nn 獲取代理,將腳本命名爲 proxy_ip.py,代碼如下:

# coding:utf-8
import requests
from bs4 import BeautifulSoup
import re
import os.path

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)'
headers = {'User-Agent': user_agent}

def getListProxies():
    """Scrape candidate HTTP proxies from xicidaili.com and keep only the
    ones that can actually complete a test request.

    Returns:
        list[dict]: up to 50 proxy dicts of the form
        {"http": "http://<ip>:<port>"}, directly usable as the
        ``proxies=`` argument of ``requests``.
    """
    session = requests.session()
    page = session.get("http://www.xicidaili.com/nn", headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')

    proxyList = []
    # Listing rows alternate class="odd" / class="" — the regex matches both.
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    for trtag in taglist:
        tdlist = trtag.find_all('td')
        # Column 1 is the IP, column 2 the port.
        proxy = {'http': 'http://' + tdlist[1].string + ':' + tdlist[2].string}
        url = "http://ip.chinaz.com/getip.aspx"  # probe URL to test the proxy
        try:
            # Keep only proxies that answer the probe within 5 seconds.
            session.get(url, proxies=proxy, timeout=5)
            proxyList.append(proxy)
            if len(proxyList) == 50:  # cap the number of collected proxies
                break
        except requests.RequestException:
            # Fix: `except Exception, e` is Python-2-only syntax and far too
            # broad; a dead/slow proxy raises a RequestException — skip it.
            continue

    return proxyList

if __name__ == "__main__":
    proxy_list = getListProxies()
    # Fix: open the file ONCE. The original re-opened it in "w" mode inside
    # the loop, truncating it every iteration so only the last proxy was kept.
    with open("proxy_ip.txt", "w") as fw:
        for proxy in proxy_list:
            fw.write(proxy["http"] + "\n")


部分結果如下:

http://61.135.217.7:80
http://222.182.53.69:8118
http://116.249.222.96:8118
http://122.114.31.177:808
http://222.76.187.20:8118
http://115.46.151.140:8123
http://123.185.131.236:8118
http://112.114.95.43:8118
http://171.37.156.139:8123
http://115.55.158.113:8118
http://112.114.93.73:8118
http://113.221.46.141:8888
http://112.114.94.42:8118
http://180.115.12.214:28471
http://112.114.99.32:8118


第二步:利用代理ip抓取大衆點評某個城市的所有美食商鋪的評分

# coding:utf-8
import codecs
import json
import time
import re
import urllib2
import random
import requests
from collections import Counter


# Load the proxy pool produced by proxy_ip.py — one "http://ip:port" per line.
proxy_ip_list = []
with codecs.open("proxy_ip.txt", "r", "utf-8") as fr:
    for line in fr:
        line = line.strip()
        # Fix: skip blank lines — they previously became unusable
        # {"http": ""} proxies in the pool.
        if line:
            proxy_ip_list.append({"http": line})

def proxy_random():
    """Return a proxy dict picked uniformly at random from the module-level
    pool ``proxy_ip_list``."""
    # random.choice is the idiomatic form of randint-based indexing; the
    # `global` statement was unnecessary for a read-only access.
    return random.choice(proxy_ip_list)

def crawl_page_proxy(url, proxy):
    """Fetch one Dianping listing page through ``proxy`` and return the
    star-rating CSS class strings found on it.

    Args:
        url: listing-page URL, e.g. ".../search/category/35/10/p3".
        proxy: requests-style proxy dict, e.g. {"http": "http://ip:port"}.

    Returns:
        list[str]: matches such as 'class="sml-rank-stars sml-str45"'.

    Raises:
        requests.RequestException: on any network/proxy failure.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    # BUG FIX: the original overwrote `url` with a hard-coded page-1 URL,
    # so every "page" the caller requested fetched the exact same page.
    web_data = requests.get(url, headers=headers, proxies=proxy)
    # Extract the rating badge class names (e.g. sml-str45 == 4.5 stars).
    return re.findall(r'class\=\"sml\-rank\-stars sml\-str\d+\"', web_data.text)

def run(data_file):
    """Crawl star ratings for every city listed in ``data_file``.

    Input file format — one JSON object per line:
    {"city": "延安", "url":"http://www.dianping.com/search/category/78/10/","max_pages":50,"min_pages":1}
    {"city": "太原", "url":"http://www.dianping.com/search/category/35/10/","max_pages":50,"min_pages":1}

    For each city, pages min_pages..max_pages are fetched through randomly
    chosen proxies (up to 10 retries per page); the collected rating strings
    are written to "<city>.txt" as "<city>\\t<list>".
    """
    # BUG FIX: the original ignored `data_file` and always opened "data.txt".
    with codecs.open(data_file, "r", "utf-8") as fr:
        for line in fr:
            line = line.strip()
            data_json = json.loads(line)
            city = data_json["city"]
            main_url = data_json["url"]
            max_page = data_json["max_pages"]
            min_page = data_json["min_pages"]
            all_stars = []  # every rating string collected for this city
            for page in range(min_page, max_page + 1):
                url = main_url + "p" + str(page)
                print("pages ==== ", city, url)
                # BUG FIX: initialize before the retry loop — the original
                # raised NameError on `stars_list` when all 10 retries failed.
                stars_list = []
                for _attempt in range(10):  # retry with a fresh proxy each time
                    proxy_ip = proxy_random()
                    try:
                        stars_list = crawl_page_proxy(url, proxy_ip)
                        print(proxy_ip, "OK")
                        break
                    except requests.RequestException:
                        # Fix: bare `except:` swallowed everything, including
                        # bugs; only network/proxy errors warrant a retry.
                        print(proxy_ip, "ERROR")
                print("\n")
                all_stars += stars_list
                time.sleep(random.uniform(3, 10))  # polite random delay
            # Fix: the inner write loop used to shadow `city`; a single-key
            # dict was unnecessary — write the one city's results directly.
            with codecs.open(city + ".txt", "w", "utf-8") as fw:
                fw.write(city + "\t" + str(all_stars) + "\n")
            time.sleep(30)  # pause between cities to avoid being blocked

if __name__ == "__main__":
    # Guard the entry point so importing this module does not start a crawl.
    run("data.txt")

完成,沒有被屏蔽了。


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章