Python Scraper Project: A Hands-on Roundup of Scraping Negative Reviews of Guizhou Attractions from Ctrip, Dianping, and Mafengwo

I put this scraping project together for Mr. Shu, who teaches in the tourism management program at the business school's graduate school. It is really three scrapers in one, collecting negative reviews — and only negative reviews — from Ctrip, Dianping, and Mafengwo for use in a research paper. I once had the honor of being invited by him to teach the graduate students how to scrape with Python, but anti-scraping countermeasures and data processing are not skills a beginner can pick up in a day or two. All three of these sites deploy some form of anti-scraping; Dianping in particular uses font obfuscation, which off-the-shelf scraping tools can barely touch — at best they come away with the obfuscated characters missing. For the sake of development speed I did not reverse-engineer the AJAX loading on Ctrip or Mafengwo, and went straight for the bluntest tool available: Selenium automation. Even then, clicking page elements took some effort; I had scraped these sites before, but their countermeasures are updated quickly, so everything had to be rebuilt. I improved a lot over the course of this project, and it left me more convinced than ever that serious scraping demands solid JavaScript. Go China, go Wuhan, and go me~ ~ ~
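One aside before the code: the node-clicking trick used throughout the scripts below is to dispatch the click from JavaScript rather than calling .click() on the element, which sidesteps the "element not clickable" errors that overlays cause on these sites. Here is a minimal sketch of that pattern with an explicit wait added — the js_click helper and the WebDriverWait are my own illustration, not part of the original scripts:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def js_click(driver, xpath, timeout=10):
    """Wait until the node exists, then click it from JavaScript.

    A JS-dispatched click still works when another element overlaps the
    target, which is what usually breaks element.click() on these pages.
    """
    button = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath)))
    driver.execute_script("arguments[0].click()", button)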

Download the target spreadsheet and run results: https://www.lanzous.com/i9f3xwh

  • The target spreadsheet (screenshot of the spreadsheet):
    Task: scrape the link in each row of the spreadsheet and save the scraped reviews into a txt file named after the corresponding attraction; rows whose review count is 0 need not be scraped (a sketch of this row-filtering loop follows).
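The driving logic is the same for all three sites: read the spreadsheet, skip zero-count rows, and derive one output txt per attraction. A minimal sketch of that loop, assuming the column names used throughout the scripts below ('高級別景區', '攜程差評', '攜程'):

import pandas as pd

data = pd.read_excel('./差評統計.xlsx')
for i in range(len(data)):
    count = int(data['攜程差評'][i])  # negative-review count for this attraction
    if count == 0:
        continue  # nothing to scrape for this row
    file_name = './景點差評/' + str(data['高級別景區'][i]) + '.txt'  # one txt per attraction
    url = data['攜程'][i]  # the link whose reviews go into file_name
    print(file_name, url)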

1. Ctrip

1.1 PC-site negative-review code

import requests,json,time,random
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from lxml import etree
from lxml import html
import pandas as pd

startTime = time.time() # record the start time
chrome_option = webdriver.ChromeOptions()
chrome_option.add_argument('--proxy-server=112.84.55.122:9999') # route requests through a proxy IP ('--proxy--server' was a typo)
driver = webdriver.Chrome(options=chrome_option) # the options must be passed in, or the proxy is never applied
driver.implicitly_wait(5)

def you_ctrip(file_name, xc_url):
    driver.get(xc_url)
    driver.implicitly_wait(10)
    driver.execute_script("window.scrollBy(0,1600)")
    driver.implicitly_wait(5)
    # click the "negative reviews" tab
    button = driver.find_element_by_xpath('//*[@id="weiboCom1"]/div[2]/ul/li[5]/a')
    driver.execute_script("$(arguments[0]).click()",button) # JS click avoids "element not clickable" errors
    driver.implicitly_wait(5)
    driver.execute_script("window.scrollBy(0,1000)")
    try:
        PageNumber = driver.find_element_by_xpath('//div[@class="pager_v1"]/span/b').text
    except:
        PageNumber = False
    source = driver.page_source
    you_ctrip_spider(source, file_name)
    
    if PageNumber:
        print ("PageNumber = ", PageNumber)
        for i in range(2, int(PageNumber)+1):
            time.sleep(2)
            print ("@"*50)
            search = driver.find_element_by_id('gopagetext') # locate the page-jump input box
            search.send_keys(i) # type the page number
            search.send_keys(Keys.ENTER) # press Enter to jump
            driver.execute_script("window.scrollBy(0,10000)")
            time.sleep(1)
            source = driver.page_source
            you_ctrip_spider(source, file_name)
            
    # click the next review tab and repeat the same paging loop
    button = driver.find_element_by_xpath('//*[@id="weiboCom1"]/div[2]/ul/li[6]/a')
    driver.execute_script("$(arguments[0]).click()",button)
    driver.implicitly_wait(5)
    try:
        PageNumber = driver.find_element_by_xpath('//div[@class="pager_v1"]/span/b').text
    except:
        PageNumber = False
    # grab the page source and parse it
    source = driver.page_source
    you_ctrip_spider(source, file_name)
        
    if PageNumber:
        print ("PageNumber = ", PageNumber)
        for i in range(2, int(PageNumber)+1):
            time.sleep(2)
            print ("@"*50)
            search = driver.find_element_by_id('gopagetext') # locate the page-jump input box
            search.send_keys(i) # type the page number
            search.send_keys(Keys.ENTER) # press Enter to jump
            driver.execute_script("window.scrollBy(0,10000)")
            time.sleep(1)
            # grab the page source and parse it
            source = driver.page_source
            you_ctrip_spider(source, file_name)
            
def you_ctrip_spider(source, file_name):
    xc_html = html.fromstring(source)
    # extract all review text
    xc_user_comments = xc_html.xpath('//li[@class="main_con"]/span/text()')
    xc_user_comment = "".join(xc_user_comments)
    print ("xc_user_comment = ", xc_user_comment)
    with open(file_name, "a", encoding="utf-8") as f:
        f.write(xc_user_comment+"\n") # the with-block closes the file automatically
        
def main():
    file_name = './景點差評測試.txt'
    max_comment = 41
    if int(max_comment) != 0:
        maxSlide = int(max_comment / 10) # computed but unused here; the PC flow paginates by page number
        xc_url = "https://you.ctrip.com/sight/guiding120451/145654.html"
        if "sight" in xc_url:            
            you_ctrip(file_name, xc_url)

if __name__ == '__main__':
    main()   

1.2 Mobile-site negative-review code

import requests,json,time,random
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
from lxml import html
import pandas as pd

# read the spreadsheet
data = pd.read_excel('./差評統計.xlsx')

startTime = time.time() # record the start time
chrome_option = webdriver.ChromeOptions()
chrome_option.add_argument('--proxy-server=112.84.55.122:9999') # route requests through a proxy IP
driver = webdriver.Chrome(options=chrome_option) # options must be passed in, or the proxy is never applied
driver.implicitly_wait(5)

def spider_xiecheng(maxSlide, file_name, xc_url):
    driver.get(xc_url+ '&tag=-12') # &tag=-12 selects the negative-review tab on the page
#     print ("maxSlide===========================", maxSlide)
    # scroll down with Selenium to trigger lazy loading
    if int(maxSlide) > 0:
#         print ("$"*50)
        for i in range(0, int(maxSlide)):
            driver.execute_script("window.scrollTo(0,10000)")
            time.sleep(1) # pause one second per scroll
    # grab the page source and parse it
    time.sleep(2)
    source = driver.page_source
    xc_html = html.fromstring(source)
    # extract all review text
    xc_user_comments = xc_html.xpath('//*[@id="c_gs_comments_commentdetail"]//text()')
    xc_user_comment = "".join(xc_user_comments)
#     print ("xc_user_comment = ", xc_user_comment)
    # extract all seller replies
    seller_replys = xc_html.xpath('//div[@class="seller-reply"]//text()')
    seller_reply = "".join(seller_replys)
#     print ("seller_reply = ", seller_reply)
    # save the data
    with open(file_name, "a", encoding="utf-8") as f:
        f.write(xc_user_comment+"\n")
        f.write(seller_reply+"\n")

def main():
    for i in range(0,96):
#         print (data['高級別景區'][i], data['攜程差評'][i], data['攜程'][i])
        file_name = './景點差評/'+str(data['高級別景區'][i]) + '.txt'
        max_comment = int(data['攜程差評'][i])
        if int(max_comment) != 0:
            maxSlide = int(max_comment / 10) # roughly one scroll per ten reviews
            xc_url = data['攜程'][i]
            spider_xiecheng(maxSlide, file_name, xc_url)
        else:
            print ("Ctrip: no negative reviews for 《%s》"%(data['高級別景區'][i]))
        print ("Scraping target %s of 96"%(i+1))
if __name__ == '__main__':
    main()

1.3 Running the PC and mobile scrapers together

Note: the Ctrip links in the spreadsheet are not uniform — some point to the mobile site and some to the PC site — so the two kinds must be dispatched to separate scrapers (a small sketch follows).
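As an illustration, the dispatch can be made a little more robust than a substring test by parsing the hostname; the urlparse version below is my own sketch, while the script itself simply checks for "you.ctrip" / "m.ctrip" inside the URL string:

from urllib.parse import urlparse

def ctrip_link_kind(url):
    """Classify a Ctrip link as 'pc', 'mobile', or 'unknown' by its hostname."""
    host = urlparse(str(url)).netloc
    if host.startswith('you.ctrip'):
        return 'pc'
    if host.startswith('m.ctrip'):
        return 'mobile'
    return 'unknown'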

import requests,json,time,random
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.keys import Keys # needed for Keys.ENTER below; missing in the original
from selenium.webdriver.chrome.options import Options
from lxml import etree
from lxml import html
import pandas as pd

data = pd.read_excel('./差評統計.xlsx')

startTime = time.time() # record the start time
chrome_option = webdriver.ChromeOptions()
chrome_option.add_argument('--proxy-server=112.84.55.122:9999') # route requests through a proxy IP
driver = webdriver.Chrome(options=chrome_option) # options must be passed in, or the proxy is never applied
driver.implicitly_wait(5)

def you_ctrip(file_name, xc_url):
    driver.get(xc_url)
    driver.implicitly_wait(10)
    driver.execute_script("window.scrollBy(0,1600)")
    driver.implicitly_wait(5)
    # click the "negative reviews" tab
    button = driver.find_element_by_xpath('//*[@id="weiboCom1"]/div[2]/ul/li[5]/a')
    driver.execute_script("$(arguments[0]).click()",button) # JS click avoids "element not clickable" errors
    driver.implicitly_wait(5)
    driver.execute_script("window.scrollBy(0,1000)")
    try:
        PageNumber = driver.find_element_by_xpath('//div[@class="pager_v1"]/span/b').text
    except:
        PageNumber = False
    source = driver.page_source
    you_ctrip_spider(source, file_name)
    
    if PageNumber:
#         print ("PageNumber = ", PageNumber)
        for i in range(2, int(PageNumber)+1):
            time.sleep(2)
#             print ("@"*50)
            search = driver.find_element_by_id('gopagetext') # locate the page-jump input box
            search.send_keys(i) # type the page number
            search.send_keys(Keys.ENTER) # press Enter to jump
            driver.execute_script("window.scrollBy(0,10000)")
            time.sleep(1)
            source = driver.page_source
            you_ctrip_spider(source, file_name)
            
    button = driver.find_element_by_xpath('//*[@id="weiboCom1"]/div[2]/ul/li[6]/a')
    driver.execute_script("$(arguments[0]).click()",button)
    driver.implicitly_wait(5)
    try:
        PageNumber = driver.find_element_by_xpath('//div[@class="pager_v1"]/span/b').text
    except:
        PageNumber = False
    # grab the page source and parse it
    source = driver.page_source
    you_ctrip_spider(source, file_name)
        
    if PageNumber:
#         print ("PageNumber = ", PageNumber)
        for i in range(2, int(PageNumber)+1):
            time.sleep(2)
#             print ("@"*50)
            search = driver.find_element_by_id('gopagetext') # locate the page-jump input box
            search.send_keys(i) # type the page number
            search.send_keys(Keys.ENTER) # press Enter to jump
            driver.execute_script("window.scrollBy(0,10000)")
            time.sleep(1)
            # grab the page source and parse it
            source = driver.page_source
            you_ctrip_spider(source, file_name)
            
def you_ctrip_spider(source, file_name):
    xc_html = html.fromstring(source)
    # extract all review text
    xc_user_comments = xc_html.xpath('//li[@class="main_con"]/span/text()')
    xc_user_comment = "".join(xc_user_comments)
#     print ("xc_user_comment = ", xc_user_comment)
    with open(file_name, "a", encoding="utf-8") as f:
        f.write(xc_user_comment+"\n") # the with-block closes the file automatically

def spider_xiecheng(maxSlide, file_name, xc_url):
    driver.get(xc_url+ '&tag=-12') # &tag=-12 selects the negative-review tab
#     print ("maxSlide===========================", maxSlide)
    # scroll down with Selenium to trigger lazy loading
    if int(maxSlide) > 0:
        for i in range(0, int(maxSlide)):
            js = "window.scrollTo(0, document.body.scrollHeight)"
            driver.execute_script(js)
            time.sleep(1) # pause one second per scroll
    # grab the page source and parse it
    time.sleep(2)
    source = driver.page_source
    xc_html = html.fromstring(source)
    # extract all review text
    xc_user_comments = xc_html.xpath('//*[@id="c_gs_comments_commentdetail"]//text()')
    xc_user_comment = "".join(xc_user_comments)
#     print ("xc_user_comment = ", xc_user_comment)
    # extract all seller replies
    seller_replys = xc_html.xpath('//div[@class="seller-reply"]//text()')
    seller_reply = "".join(seller_replys)
#     print ("seller_reply = ", seller_reply)
    # save the data
    with open(file_name, "a", encoding="utf-8") as f:
        f.write(xc_user_comment+"\n")
        f.write(seller_reply+"\n")

def main():
    for i in range(55, 96): # resume from row 55; use range(0, 96) for a full run
#         print (data['高級別景區'][i], data['攜程差評'][i], data['攜程'][i])
        file_name = './評論/'+str(data['高級別景區'][i]) + '.txt' # or 景點差評/
        max_comment = int(data['攜程差評'][i])
        if int(max_comment) != 0:
            maxSlide = int(max_comment / 10)
            xc_url = data['攜程'][i]
            print (data['高級別景區'][i], xc_url)
            if "you.ctrip" in xc_url:
                you_ctrip(file_name, xc_url) # PC-site link
            if "m.ctrip" in xc_url:
                spider_xiecheng(maxSlide, file_name, xc_url) # mobile-site link
        else:
            print ("Ctrip: no negative reviews for 《%s》"%(data['高級別景區'][i]))
        print ("Scraping target %s of 96"%(i+1))
        
if __name__ == '__main__':
    main()

Screenshot of the PC and mobile scrapers running together:
(screenshot)

2. Dianping code

This code was found online. I really admire its author — it feels like an epic of a scraper. There are still parts I haven't fully figured out, so studying it is my next goal! (A short sketch of the core font de-obfuscation idea follows before the full code.)
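For readers new to this flavor of anti-scraping: Dianping hides characters behind empty <svgmtsi class="..."> tags, a CSS file maps each class name to an (x, y) background offset, and an SVG sprite holds the real glyphs at those offsets. Once a class-to-character dict has been recovered (that is what _get_font_dict below does), de-obfuscation reduces to string substitution. A minimal sketch of that final step, with a hypothetical two-entry font_dict:

import re

# hypothetical mapping recovered from the CSS offsets and the SVG sprite
font_dict = {'abcde': '好', 'fghij': '吃'}

html = '味道很<svgmtsi class="abcde"></svgmtsi><svgmtsi class="fghij"></svgmtsi>'
for class_name, char in font_dict.items():
    html = re.sub('<svgmtsi class="%s"></svgmtsi>' % class_name, char, html)
print(html)  # -> 味道很好吃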

#!/usr/bin/env python
# coding: utf-8

import datetime
import random
import time
import re
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import pymongo
from lxml import etree
import requests
from pyquery import PyQuery as pq

client = pymongo.MongoClient('localhost',27017)
shidai = client['gongyuan']
comments = shidai['comments']

path_one = r'./chromedriver.exe'

COOKIES ='_lxsdk_cuid=16a3e5550cac8-0328ac989f3a72-3c644d0e-100200-16a3e5550cbc8; _lxsdk=16a3e5550cac8-0328ac989f3a72-3c644d0e-100200-16a3e5550cbc8; _hc.v=b108378a-8f67-0f82-24be-f6bd59936218.1555823941; s_ViewType=10; ua=zeroing; ctu=66a794ac79d236ecce433a9dd7bbb8bf29eff0bc049590703a72f844379eb7c5; dper=56648ebad0a12bed853d89482e9f3c35c89ef2504f07d5388fd0dfead6018398ae8c14a81efb6f9e42cb7e1f46473489252facff635921c09c106e3b36b311bafcd118a3e618fff67b5758b9bd5afca901c01dc9ec74027240ac50819479e9fc; ll=7fd06e815b796be3df069dec7836c3df; _lx_utm=utm_source%3Dgoogle%26utm_medium%3Dorganic; cy=2; cye=beijing; _lxsdk_s=16b84e44244-3d8-afd-795%7C1393851569%7C2'
f = open('./大衆點評之白雲圖書館.txt','wb+')

class DianpingComment:
    font_size = 14
    start_y = 23
    def __init__(self, shop_id, cookies, delay=7, handle_ban=True,comments =comments):
        self.shop_id = shop_id
        self._delay = delay
        self.num = 1
        self.db =comments
        self._cookies = self._format_cookies(cookies)
        self._css_headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        self._default_headers = {
            'Connection': 'keep-alive',
            'Host': 'www.dianping.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            'Cookie':'_lxsdk_cuid=17047b1dca7c8-043c82f4977c0c-313f69-144000-17047b1dca8c8; _lxsdk=17047b1dca7c8-043c82f4977c0c-313f69-144000-17047b1dca8c8; _hc.v=8153ac08-3810-a1ae-e4a2-008446a9d6de.1581750804; dplet=0eb3ace34c81cdffbb2f525361af2dfe; dper=25d6344a89e2764310f17c768a342af6e26a0b970e352c4b7e1af7d055dd4f0fe27238f776757d0692e2b2057163d865dbac58eaaa644cd70bc1add585e3887a57646c5450f2ac8de9999ddbbb0b420dac991ff387517e3bab3bea6092fb494b; ua=dpuser_2307823987; ctu=60e486a44aca8c99b326d5acbeed5a4a2c97a82d1f07352d412ad90603dacb2b; cy=258; cye=guiyang; s_ViewType=10; ll=7fd06e815b796be3df069dec7836c3df; _lxsdk_s=1704bedf674-3f2-00a-b95%7C%7C26' }
        self._cur_request_url ='http://www.dianping.com/shop/{}/review_all?queryType=reviewGrade&queryVal=bad'.format(self.shop_id)
        self.sub_url ='http://www.dianping.com'

    def run(self):
        self._css_link = self._get_css_link(self._cur_request_url)
        self._font_dict = self._get_font_dict(self._css_link)
        self._get_conment_page()



    def _delay_func(self):
        delay_time = random.randint((self._delay - 2) * 10, (self._delay + 2) * 10) * 0.1
        time.sleep(delay_time)

    def _init_browser(self):
        """
            初始化遊覽器
        """
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        browser = webdriver.Chrome(chrome_options=chrome_options,executable_path=path_one)
        browser.get(self._cur_request_url)
        for name, value in self._cookies.items():
            browser.add_cookie({'name': name, 'value': value})
        browser.refresh()
        return browser

    def _handle_ban(self):
        """
            爬取速度過快,出現異常時處理驗證
        """
        try:
            self._browser.refresh()
            time.sleep(1)
            button = self._browser.find_element_by_id('yodaBox')
            move_x_offset = self._browser.find_element_by_id('yodaBoxWrapper').size['width']
            webdriver.ActionChains(self._browser).drag_and_drop_by_offset(
                button, move_x_offset, 0).perform()
        except:
            pass

    def _format_cookies(self, cookies):
        '''
        Parse the cookie header string into a dict
        :param cookies:
        :return:
        '''
        cookies = {cookie.split('=')[0]: cookie.split('=')[1]
                   for cookie in cookies.replace(' ', '').split(';')}
        return cookies

    def _get_conment_page(self):
        """
            請求評論頁,並將<span></span>樣式替換成文字;
        """
        while self._cur_request_url:
            self._delay_func()
            print('[{now_time}] {msg}'.format(now_time=datetime.datetime.now(), msg=self._cur_request_url))
            res = requests.get(self._cur_request_url, headers=self._default_headers, cookies=self._cookies)
            while res.status_code !=200:
                # COOKIES is a single string, so random.choice would pick one
                # character; wrap it in a list to pick a whole cookie string
                cookie = random.choice([COOKIES])
                cookies = self._format_cookies(cookie)
                res = requests.get(self._cur_request_url, headers=self._default_headers, cookies=cookies)
                if res.status_code == 200:
                    break
            html = res.text
            class_set = []
            for span in re.findall(r'<svgmtsi class="([a-zA-Z0-9]{5,6})"></svgmtsi>', html):
                class_set.append(span)
            for class_name in class_set:
                try:
                    html = re.sub('<svgmtsi class="%s"></svgmtsi>' % class_name, self._font_dict[class_name], html)
                    print('replaced class {} with "{}"'.format(class_name, self._font_dict[class_name]))
                except:
                    html = re.sub('<svgmtsi class="%s"></svgmtsi>' % class_name, '', html)
                    print('no mapping for class {}; dropped it'.format(class_name))
            doc = pq(html)
            self._parse_comment_page(html)
            if  doc('.NextPage').attr('href'):
                self._default_headers['Referer'] = self._cur_request_url
                next_page_url1 = doc('.NextPage').attr('href')
                next_page_url =self.sub_url +  str(next_page_url1)
                print('next_url:{}'.format(next_page_url))
            else:
                next_page_url = None
            print('next_page_url:{}'.format(next_page_url))
            self._cur_request_url = next_page_url



    def _data_pipeline(self, data):
        """
            處理數據
        """
        print(data)

    def _parse_comment_page(self, html):
        """
            解析評論頁並提取數據,把數據寫入文件中;;
        """
        doc =pq(html)
        for li in doc('div.review-list-main > div.reviews-wrapper > div.reviews-items > ul > li'):

                doc_text =pq(li)
                if doc_text('.dper-info .name').text():
                    name = doc_text('.dper-info .name').text()
                else:
                    name = None
                try:
                    star = doc_text('.review-rank .sml-rank-stars').attr('class')

                except IndexError:
                    star = None
                if doc_text('div.misc-info.clearfix > .time').text():
                    date_time =doc_text('div.misc-info.clearfix > .time').text()
                else:
                    date_time=None
                if doc_text('.main-review .review-words').text():
                    comment =doc_text('.main-review .review-words').text()
                else:
                    comment=None

#                 data = {
#                     'name': name,
#                     'date_time':date_time,
#                     'star': star,
#                     'comment':comment
#                 }
                print(comment)
                if comment: # guard: comment is None when the review text is missing
                    f.write(comment.replace("收起評論","\n").encode('utf-8')) # strip the "collapse review" label
                    print('review written')


    def _get_css_link(self, url):
        """
            請求評論首頁,獲取css樣式文件
        """
        try:
            print(url)
            res = requests.get(url, headers=self._default_headers, cookies = self._cookies)
            html = res.text
            css_link = re.search(r'<link re.*?css.*?href="(.*?svgtextcss.*?)">', html)
            print(css_link)
            assert css_link
            css_link = 'http:' + css_link[1]
            return css_link
        except:
            return None # the original fell through with a bare `None`; make the intent explicit

    def _get_font_dict(self, url):
        """
            獲取css樣式對應文字的字典
        """
        res = requests.get(url, headers=self._css_headers)
        html = res.text

        background_image_link = re.findall(r'background-image:.*?\((.*?svg)\)', html)
        print(background_image_link)
        background_image_link_list =[]
        for i in background_image_link:
            url ='http:'+i
            background_image_link_list.append(url)

        print(background_image_link_list)

        html = re.sub(r'span.*?\}', '', html)
        group_offset_list = re.findall(r'\.([a-zA-Z0-9]{5,6}).*?round:(.*?)px (.*?)px;', html)
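        # Each matched CSS rule looks like `.abcde{background:-42.0px -23.0px;}`:
        # a class name plus an (x, y) offset into the SVG sprite. The y offset
        # selects the text row, and x selects the character within that row
        # (one glyph every font_size pixels).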
        # merge the offset dicts from every SVG sprite file into one
        font_dict_by_offset_list ={}
        for i in background_image_link_list:

            font_dict_by_offset_list.update(self._get_font_dict_by_offset(i))

        font_dict_by_offset = font_dict_by_offset_list
        print(font_dict_by_offset)
        font_dict = {}
        for class_name, x_offset, y_offset in group_offset_list:
            x_offset = x_offset.replace('.0', '')
            y_offset = y_offset.replace('.0', '')
            try:
                font_dict[class_name] = font_dict_by_offset[int(y_offset)][int(x_offset)]

            except:
                font_dict[class_name] = ''
        return font_dict

    def _get_font_dict_by_offset(self, url):
        """
            獲取座標偏移的文字字典, 會有最少兩種形式的svg文件(目前只遇到兩種)
        """
        res = requests.get(url, headers=self._css_headers)
        html = res.text
        font_dict = {}
        y_list = re.findall(r'd="M0 (\d+?) ', html)
        if y_list:
            font_list = re.findall(r'<textPath .*?>(.*?)<', html)
            for i, string in enumerate(font_list):
                y_offset = self.start_y - int(y_list[i])

                sub_font_dict = {}
                for j, font in enumerate(string):
                    x_offset = -j * self.font_size
                    sub_font_dict[x_offset] = font
                font_dict[y_offset] = sub_font_dict
        else:
            font_list = re.findall(r'<text.*?y="(.*?)">(.*?)<', html)
            for y, string in font_list:
                y_offset = self.start_y - int(y)
                sub_font_dict = {}
                for j, font in enumerate(string):
                    x_offset = -j * self.font_size
                    sub_font_dict[x_offset] = font
                font_dict[y_offset] = sub_font_dict
        return font_dict

class Customer(DianpingComment):
    def _data_pipeline(self, data):
        print(data)

if __name__ == "__main__":
    # the shop ID from the end of the shop URL
    dianping = Customer('22062688', cookies=COOKIES)
    dianping.run()
    f.close()

PS: this code is for study purposes only; if it infringes on your rights, say so in the comments below and it will be taken down!

I changed a few small things — it now reads the spreadsheet, extracts the shop ID from each Dianping URL, and opens one output file per attraction — so it can run straight off the spreadsheet data:

#!/usr/bin/env python
# coding: utf-8

import datetime
import random
import time
import re
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import pymongo
from lxml import etree
import requests
import pandas as pd # needed for pd.read_excel below; missing in the original
from pyquery import PyQuery as pq

startTime = time.time()
client = pymongo.MongoClient('localhost',27017)
shidai = client['gongyuan']
comments = shidai['comments']

path_one = r'./chromedriver.exe'

COOKIES ='_lxsdk_cuid=16a3e5550cac8-0328ac989f3a72-3c644d0e-100200-16a3e5550cbc8; _lxsdk=16a3e5550cac8-0328ac989f3a72-3c644d0e-100200-16a3e5550cbc8; _hc.v=b108378a-8f67-0f82-24be-f6bd59936218.1555823941; s_ViewType=10; ua=zeroing; ctu=66a794ac79d236ecce433a9dd7bbb8bf29eff0bc049590703a72f844379eb7c5; dper=56648ebad0a12bed853d89482e9f3c35c89ef2504f07d5388fd0dfead6018398ae8c14a81efb6f9e42cb7e1f46473489252facff635921c09c106e3b36b311bafcd118a3e618fff67b5758b9bd5afca901c01dc9ec74027240ac50819479e9fc; ll=7fd06e815b796be3df069dec7836c3df; _lx_utm=utm_source%3Dgoogle%26utm_medium%3Dorganic; cy=2; cye=beijing; _lxsdk_s=16b84e44244-3d8-afd-795%7C1393851569%7C2'

class DianpingComment:
    font_size = 14
    start_y = 23
    def __init__(self, shop_id, cookies, delay=7, handle_ban=True,comments =comments):
        self.shop_id = shop_id
        self._delay = delay
        self.num = 1
        self.db =comments
        self._cookies = self._format_cookies(cookies)
        self._css_headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        self._default_headers = {
            'Connection': 'keep-alive',
            'Host': 'www.dianping.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            'Cookie':'_lxsdk_cuid=17047b1dca7c8-043c82f4977c0c-313f69-144000-17047b1dca8c8; _lxsdk=17047b1dca7c8-043c82f4977c0c-313f69-144000-17047b1dca8c8; _hc.v=8153ac08-3810-a1ae-e4a2-008446a9d6de.1581750804; dplet=0eb3ace34c81cdffbb2f525361af2dfe; dper=25d6344a89e2764310f17c768a342af6e26a0b970e352c4b7e1af7d055dd4f0fe27238f776757d0692e2b2057163d865dbac58eaaa644cd70bc1add585e3887a57646c5450f2ac8de9999ddbbb0b420dac991ff387517e3bab3bea6092fb494b; ua=dpuser_2307823987; ctu=60e486a44aca8c99b326d5acbeed5a4a2c97a82d1f07352d412ad90603dacb2b; cy=258; cye=guiyang; s_ViewType=10; ll=7fd06e815b796be3df069dec7836c3df; _lxsdk_s=1704bedf674-3f2-00a-b95%7C%7C26' }
        self._cur_request_url ='http://www.dianping.com/shop/{}/review_all?queryType=reviewGrade&queryVal=bad'.format(self.shop_id)
        self.sub_url ='http://www.dianping.com'

    def run(self):
        self._css_link = self._get_css_link(self._cur_request_url)
        self._font_dict = self._get_font_dict(self._css_link)
        self._get_conment_page()

    def _delay_func(self):
        delay_time = random.randint((self._delay - 2) * 10, (self._delay + 2) * 10) * 0.1
        time.sleep(delay_time)

    def _init_browser(self):
        """
            初始化遊覽器
        """
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        browser = webdriver.Chrome(chrome_options=chrome_options,executable_path=path_one)
        browser.get(self._cur_request_url)
        for name, value in self._cookies.items():
            browser.add_cookie({'name': name, 'value': value})
        browser.refresh()
        return browser

    def _handle_ban(self):
        """
            爬取速度過快,出現異常時處理驗證
        """
        try:
            self._browser.refresh()
            time.sleep(1)
            button = self._browser.find_element_by_id('yodaBox')
            move_x_offset = self._browser.find_element_by_id('yodaBoxWrapper').size['width']
            webdriver.ActionChains(self._browser).drag_and_drop_by_offset(
                button, move_x_offset, 0).perform()
        except:
            pass

    def _format_cookies(self, cookies):
        '''
        Parse the cookie header string into a dict
        :param cookies:
        :return:
        '''
        cookies = {cookie.split('=')[0]: cookie.split('=')[1]
                   for cookie in cookies.replace(' ', '').split(';')}
        return cookies

    def _get_conment_page(self):
        """
            請求評論頁,並將<span></span>樣式替換成文字;
        """
        while self._cur_request_url:
            self._delay_func()
#             print('[{now_time}] {msg}'.format(now_time=datetime.datetime.now(), msg=self._cur_request_url))
            res = requests.get(self._cur_request_url, headers=self._default_headers, cookies=self._cookies)
            while res.status_code !=200:
                # COOKIES is a single string, so random.choice would pick one
                # character; wrap it in a list to pick a whole cookie string
                cookie = random.choice([COOKIES])
                cookies = self._format_cookies(cookie)
                res = requests.get(self._cur_request_url, headers=self._default_headers, cookies=cookies)
                if res.status_code == 200:
                    break
            html = res.text
            class_set = []
            for span in re.findall(r'<svgmtsi class="([a-zA-Z0-9]{5,6})"></svgmtsi>', html):
                class_set.append(span)
            for class_name in class_set:
                try:
                    html = re.sub('<svgmtsi class="%s"></svgmtsi>' % class_name, self._font_dict[class_name], html)
#                     print('replaced class {} with "{}"'.format(class_name, self._font_dict[class_name]))
                except:
                    html = re.sub('<svgmtsi class="%s"></svgmtsi>' % class_name, '', html)
#                     print('no mapping for class {}; dropped it'.format(class_name))
            doc = pq(html)
            self._parse_comment_page(html)
            if  doc('.NextPage').attr('href'):
                self._default_headers['Referer'] = self._cur_request_url
                next_page_url1 = doc('.NextPage').attr('href')
                next_page_url =self.sub_url +  str(next_page_url1)
#                 print('next_url:{}'.format(next_page_url))
            else:
                next_page_url = None
#             print('next_page_url:{}'.format(next_page_url))
            self._cur_request_url = next_page_url

    def _data_pipeline(self, data):
        """
            處理數據
        """
#         print(data)
    def _parse_comment_page(self, html):
        """
            解析評論頁並提取數據,把數據寫入文件中;;
        """
        doc =pq(html)
        for li in doc('div.review-list-main > div.reviews-wrapper > div.reviews-items > ul > li'):

                doc_text =pq(li)
                if doc_text('.dper-info .name').text():
                    name = doc_text('.dper-info .name').text()
                else:
                    name = None
                try:
                    star = doc_text('.review-rank .sml-rank-stars').attr('class')

                except IndexError:
                    star = None
                if doc_text('div.misc-info.clearfix > .time').text():
                    date_time =doc_text('div.misc-info.clearfix > .time').text()
                else:
                    date_time=None
                if doc_text('.main-review .review-words').text():
                    comment =doc_text('.main-review .review-words').text()
                else:
                    comment=None

#                 data = {
#                     'name': name,
#                     'date_time':date_time,
#                     'star': star,
#                     'comment':comment
#                 }
                print(comment)
                if comment: # guard: comment is None when the review text is missing
                    f.write(comment.replace("收起評論","\n").encode('utf-8')) # strip the "collapse review" label
#                 print('review written:', comment)


    def _get_css_link(self, url):
        """
            請求評論首頁,獲取css樣式文件
        """
        try:
#             print(url)
            res = requests.get(url, headers=self._default_headers, cookies = self._cookies)
            html = res.text
            css_link = re.search(r'<link re.*?css.*?href="(.*?svgtextcss.*?)">', html)
#             print(css_link)
            assert css_link
            css_link = 'http:' + css_link[1]
            return css_link
        except:
            return None # the original fell through with a bare `None`; make the intent explicit

    def _get_font_dict(self, url):
        """
            獲取css樣式對應文字的字典
        """
        res = requests.get(url, headers=self._css_headers)
        html = res.text

        background_image_link = re.findall(r'background-image:.*?\((.*?svg)\)', html)
#         print(background_image_link)
        background_image_link_list =[]
        for i in background_image_link:
            url ='http:'+i
            background_image_link_list.append(url)

#         print(background_image_link_list)

        html = re.sub(r'span.*?\}', '', html)
        group_offset_list = re.findall(r'\.([a-zA-Z0-9]{5,6}).*?round:(.*?)px (.*?)px;', html)
        # merge the offset dicts from every SVG sprite file into one
        font_dict_by_offset_list ={}
        for i in background_image_link_list:

            font_dict_by_offset_list.update(self._get_font_dict_by_offset(i))

        font_dict_by_offset = font_dict_by_offset_list
#         print(font_dict_by_offset)
        font_dict = {}
        for class_name, x_offset, y_offset in group_offset_list:
            x_offset = x_offset.replace('.0', '')
            y_offset = y_offset.replace('.0', '')
            try:
                font_dict[class_name] = font_dict_by_offset[int(y_offset)][int(x_offset)]

            except:
                font_dict[class_name] = ''
        return font_dict

    def _get_font_dict_by_offset(self, url):
        """
            獲取座標偏移的文字字典, 會有最少兩種形式的svg文件(目前只遇到兩種)
        """
        res = requests.get(url, headers=self._css_headers)
        html = res.text
        font_dict = {}
        y_list = re.findall(r'd="M0 (\d+?) ', html)
        if y_list:
            font_list = re.findall(r'<textPath .*?>(.*?)<', html)
            for i, string in enumerate(font_list):
                y_offset = self.start_y - int(y_list[i])

                sub_font_dict = {}
                for j, font in enumerate(string):
                    x_offset = -j * self.font_size
                    sub_font_dict[x_offset] = font
                font_dict[y_offset] = sub_font_dict
        else:
            font_list = re.findall(r'<text.*?y="(.*?)">(.*?)<', html)
            for y, string in font_list:
                y_offset = self.start_y - int(y)
                sub_font_dict = {}
                for j, font in enumerate(string):
                    x_offset = -j * self.font_size
                    sub_font_dict[x_offset] = font
                font_dict[y_offset] = sub_font_dict
        return font_dict

class Customer(DianpingComment):
    def _data_pipeline(self, data):
        print(data)
if __name__ == "__main__":
    data = pd.read_excel('./差評統計.xlsx')
    for i in range(40, 96):#0,96
        file_name = './景點差評/'+str(data['高級別景區'][i]) + '.txt' # 景點差評/
        max_comment = int(data['大衆點評差評'][i])
        if int(max_comment) != 0:
            print (i, data['高級別景區'][i], data['大衆點評'][i])
            dzdp_id = str(data['大衆點評'][i]).split("/shop/")[1]
            f = open(file_name, 'wb+')            
            dianping = Customer(dzdp_id, cookies=COOKIES)
            dianping.run()
            f.close()
    endTime = time.time()
    sum_time = (endTime-startTime) / 60
    print ("獲取該信息一共用了%s分鐘"%sum_time)

Dianping screenshot:
(screenshot)

3. Mafengwo negative-review code

from selenium import webdriver
import time
import requests
from lxml import etree
from lxml import html
import pandas as pd

startTime = time.time() # record the start time
chrome_option = webdriver.ChromeOptions()
chrome_option.add_argument('--proxy-server=112.84.55.122:9999') # route requests through a proxy IP
driver = webdriver.Chrome(options=chrome_option) # options must be passed in, or the proxy is never applied
driver.implicitly_wait(5)

def spider_mafengwo(file_name, mfw_url):
    driver.get(mfw_url) # open the Mafengwo page
    driver.implicitly_wait(6) # wait up to six seconds for the page to load
    driver.find_element_by_xpath('//*[@id="poi-navbar"]/ul/li[3]/a').click() # click the reviews tab
    time.sleep(3)
    driver.find_element_by_xpath('//div[@class="review-nav"]/ul/li[5]/a/span[1]').click() # click "negative reviews"
    time.sleep(0.5)
    # grab the page source and parse it
    driver.execute_script("window.scrollBy(0,3000)")
    try:
        maxPage = driver.find_element_by_xpath('//span[@class="count"]/span').text # read the last page number
    except:
        maxPage = False
    if maxPage:
#         print ("maxPage = ", maxPage)
        get_mafengwo_txt(file_name) # scrape page 1 first; the original skipped it
        for i in range(1, int(maxPage)):
            time.sleep(1.5)
            driver.find_element_by_link_text("後一頁").click() # click "next page" ("後一頁" is the site's link text)
            time.sleep(1)
            get_mafengwo_txt(file_name)
    else:
        get_mafengwo_txt(file_name)
def get_mafengwo_txt(file_name):
    time.sleep(1)
    mfw_source = driver.page_source
    mfw_html = html.fromstring(mfw_source)
    # scrape the summary text
#     describes = mfw_html.xpath('//div[@class="summary"]//text()')
#     describe = "".join(describes).replace(" ","").replace("\n","")
#     print ("describe = ", describe)
    # extract all review text
    mfw_user_comments = mfw_html.xpath('//*[@id="c_gs_comments_commentdetail"]//text()')
    mfw_user_comment = "".join(mfw_user_comments)
#     print ("mfw_user_comment = ", mfw_user_comment)

    # scrape the replies
    rev_txts = mfw_html.xpath('//li[@class="rev-item comment-item clearfix"]//p//text()')
    rev_txt = "".join(rev_txts)
#     print ("rev_txt = ", rev_txt)

    # scrape the comments under each reply
    comment_lists = mfw_html.xpath('//ul[@class="more_reply_box comment_list"]/li/text()')
    comment_list = "".join(comment_lists).replace(" ","").replace("\n","")
#     print ("comment_list = ", comment_list)
    with open(file_name, 'a', encoding='utf-8') as f:
#         f.write(describe+"\n")
        f.write(mfw_user_comment+"\n")
        f.write(rev_txt+"\n")
        f.write(comment_list+"\n")
        
def main():
    data = pd.read_excel('./差評統計.xlsx')
    for i in range(0, 96):
        file_name = './景點差評/'+str(data['高級別景區'][i]) + '.txt'
        max_comment = int(data['馬蜂窩差評'][i])
        if int(max_comment) != 0:
            mfw_url = data['馬蜂窩'][i]
            print (i, data['高級別景區'][i], mfw_url)
            spider_mafengwo(file_name, mfw_url)
        else:
            print ("Mafengwo: no negative reviews for 《%s》"%(data['高級別景區'][i]))
        print ("Scraping target %s of 96"%(i+1))

if __name__ == '__main__':
    main()
    endTime = time.time()
    sum_time = (endTime-startTime) / 60
    print ("Finished in %s minutes"%sum_time)

Mafengwo run screenshot:
(screenshot)

4. Results

Negative reviews being scraped:
(screenshot)

Summary of the results:
(screenshot)
(1) Merging all results into a single txt file:

import os
path = './景點差評/'
f = open('./貴州景點差評彙總.txt', 'a', encoding='utf-8') # open the summary file; 'a' appends
for fileName in os.listdir(path):
    openFile = open(path+fileName,'r', encoding='utf-8') # open each per-attraction file for reading
    print (fileName)
    txt = openFile.read() # read the whole file
    f.write(txt) # append it to the summary
    openFile.close()
f.close()

(2) Splitting the summary by attraction grade (5A vs 4A) into separate txt files:
(screenshot)

import os
path = './景點差評/'
file_5A = open('./貴州景點差評彙總_5A.txt', 'a', encoding='utf-8')
file_4A = open('./貴州景點差評彙總_4A.txt', 'a', encoding='utf-8')
# the seven 5A attractions; every other file in the folder is 4A
five_A_keywords = ["黃果樹", "青巖古鎮", "龍宮", "梵淨山", "百里杜鵑", "荔波漳江", "鎮遠古鎮"]
for fileName in os.listdir(path):
    openFile = open(path+fileName,'r', encoding='utf-8')
    txt = openFile.read()
    print (fileName)
    if any(keyword in fileName for keyword in five_A_keywords):
        file_5A.write(txt)
    else:
        file_4A.write(txt)
    openFile.close() # close each file inside the loop; the original only closed the last one
file_5A.close()
file_4A.close()