Web Scraper for Dianping: Collecting Restaurant Review Tags and Recommended Dishes into a Local Excel File

1. Dianping's anti-scraping measures are strict (for example, a valid Cookie must be included in the request header). My IP was banned during testing and I had to switch IPs to continue, so I later added a small crash-protection measure to the scraper. A sketch of attaching the Cookie is shown after this list.

2. Do not scrape too fast; the number of requests allowed per IP is limited.

3. Many scraping methods posted online have stopped working or never worked well; it took a lot of time to get this running.
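
As a rough illustration of note 1, here is a minimal sketch of attaching a Cookie to the Selenium session before scraping. It assumes you have copied a cookie string from a logged-in browser session; the name=value pairs below are placeholders, not real Dianping cookies.

from selenium import webdriver

driver = webdriver.Chrome()
# Selenium only accepts cookies for the domain currently loaded,
# so visit the site once before calling add_cookie()
driver.get('http://www.dianping.com')
raw_cookie = 'placeholder_name=placeholder_value; another_name=another_value'  # paste your own cookie string
for pair in raw_cookie.split('; '):
    name, _, value = pair.partition('=')
    driver.add_cookie({'name': name, 'value': value, 'domain': '.dianping.com'})
driver.refresh()  # reload the page with the cookies attached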


First, scrape each food shop's basic information (name, link, star rating, price, address, etc.) and write it to a CSV file:

import csv
import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC


# Disable image and JavaScript loading up front to speed up page loads,
# and run Chrome headless; both settings go on the same options object
chrome_options = Options()
chrome_options.add_argument('--headless')
prefs = {
    'profile.default_content_setting_values': {
        'images': 2,
        'javascript': 2
    }
}
chrome_options.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(options=chrome_options)

wait = WebDriverWait(driver, 10)

def insert_csv(output_list):
    with open('restaurant_list.csv', 'a+', newline='', encoding='UTF-8') as csvfile:
        spamwriter = csv.writer(csvfile, dialect='excel')
        spamwriter.writerows(output_list)

def page_search(i, output_list):
    try:
        print('Page ' + str(i))
        driver.get('http://www.dianping.com/shanghai/ch10/p' + str(i))
        # Wait until the shop list has rendered before grabbing the page source
        wait.until(EC.presence_of_element_located((By.ID, 'shop-all-list')))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return outputOneCommentResult(i, soup, output_list)
    except Exception as e:
        print('Error:', e)
        time.sleep(random.randint(2, 3) + random.random())
        return page_search(i, output_list)  # retry the same page after a pause

def outputOneCommentResult(page_id, soup, output_list):
    # Each field falls back to '' when the element is missing from the listing
    for item in soup.find(id='shop-all-list').ul.find_all('li'):
        try:
            title = item.find(class_='tit').a.text.strip()
        except:
            title = ''
        try:
            # Store the bare shop link; the detail script appends '/review_all' itself
            link = item.find(class_='tit').a['href']
        except:
            link = ''
        try:
            star = item.find(class_='comment').span['title']
        except:
            star = ''
        try:
            comment_link = item.find(class_='review-num')['href']
        except:
            comment_link = ''
        try:
            comment = item.find(class_='review-num').b.text.strip()
        except:
            comment = ''
        try:
            price = item.find(class_='mean-price').b.text.strip()
        except:
            price = ''
        try:
            tag = item.find(class_='tag').text.strip()
        except:
            tag = ''
        try:
            addr = item.find(class_='addr').text.strip()
        except:
            addr = ''

        if title != '':
            output_list.append([str(page_id), title, link, star, comment_link, comment, price, tag, addr])

    return output_list


if __name__ == '__main__':
    print('Scraping restaurant data from dianping.com:')
    for i in range(1, 3):
        output_list = page_search(i, [])
        if output_list:
            insert_csv(output_list)
        time.sleep(random.randint(2, 3) + random.random())

    driver.quit()

The result is two pages of collected data in restaurant_list.csv.

Next, read each shop's link from restaurant_list.csv, then parse each shop page to extract its details:

import csv
import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC

# Disable image and JavaScript loading up front to speed up page loads,
# and run Chrome headless; both settings go on the same options object
chrome_options = Options()
chrome_options.add_argument('--headless')
prefs = {
    'profile.default_content_setting_values': {
        'images': 2,
        'javascript': 2
    }
}
chrome_options.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(options=chrome_options)
driver.implicitly_wait(6)
wait = WebDriverWait(driver, 10)

def get_csv():
    # Each row of restaurant_list.csv: page, name, link, star, ...
    with open('restaurant_list.csv', encoding='UTF-8') as f:
        return [[row[1], row[2], row[3]] for row in csv.reader(f)]

def insert_csv(output_list):
    with open('restaurant_detail.csv', 'a+', newline='', encoding='UTF-8') as csvfile:
        spamwriter = csv.writer(csvfile, dialect='excel')
        spamwriter.writerows(output_list)

def dishes_detail(eachone, dishes_list):
    try:
        dishes_list.append(eachone.text)
    except Exception as e:
        print('Dishes Error:', e)
    return dishes_list  # always return the list so the caller never gets None

def comment_detail(eachone, comment_list):
    try:
        comment_list.append(eachone.text.strip().replace(' ', '').replace('\n', ''))
    except Exception as e:
        print('Comment Error:', e)
    return comment_list

def link_detail(eachone, output_list):
    name = eachone[0]
    star = eachone[2]  # renamed from 'str', which shadowed the built-in
    print('Scraping:', name, star)

    # Open the shop's full-review page
    link = eachone[1] + '/review_all'
    driver.get(link)

    # Wait for the review-tag block and the dish list to render,
    # then parse the page source once
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#review-list > div.review-list-container > div.review-list-main > div.reviews-wrapper > div.reviews-tags > div.content')))
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#review-list > div.review-list-container > div.review-list-aside > div.shop-dish > div.dish-list.clearfix')))
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Collect the highlighted review tags
    comment_list = []
    for ea in soup.select('.tag'):
        comment_list = comment_detail(ea, comment_list)
    comment_list = '|'.join(comment_list)

    # Collect the recommended dishes
    dishes_list = []
    for ea in soup.select('.dish-list .dish-name'):
        dishes_list = dishes_detail(ea, dishes_list)
    dishes_list = '|'.join(dishes_list)

    output_list.append([name, eachone[1], dishes_list, comment_list])
    return output_list

if __name__ == '__main__':
    link_list = get_csv()
    print('Scraping popular dianping.com restaurants:')
    for eachone in link_list:
        try:
            insert_csv(link_detail(eachone, []))
        except Exception as e:
            # Small crash guard: skip a shop that fails instead of aborting the run
            print('Error:', e)
        time.sleep(random.randint(2, 3) + random.random())
    driver.quit()

Finally, convert the CSV files to XLSX format so they open cleanly in Excel:
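
A minimal sketch of the conversion using pandas (assuming pandas and openpyxl are installed; the column names here are my own labels for the fields the scripts write, since the CSV files themselves have no header row):

import pandas as pd

columns = ['page', 'name', 'link', 'star', 'comment_link',
           'review_count', 'mean_price', 'tag', 'addr']
df = pd.read_csv('restaurant_list.csv', names=columns, encoding='UTF-8')
df.to_excel('restaurant_list.xlsx', index=False)

detail_columns = ['name', 'link', 'recommended_dishes', 'review_tags']
df = pd.read_csv('restaurant_detail.csv', names=detail_columns, encoding='UTF-8')
df.to_excel('restaurant_detail.xlsx', index=False)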


Scraping complete.
