概述
- 使用firefox瀏覽器
- 使用selenium時,只能使用掃碼登錄,不能使用用戶名密碼登錄
- 使用用戶名密碼登錄時會提示"哎呀,出錯了,點擊刷新再來一次(error:9PiETg)"
- 掃碼登錄後可以將cookie保存,下一次可以使用cookie登錄,避免每次運行都要掃碼
- 保存的cookie會有失效時間
- 支持淘寶鏈接,天貓鏈接和短鏈接
- 爬取時還有獲取頁面元素不穩定的問題,暫時沒找到好辦法,只能多試幾次
- 商品評論的頁數只能到99頁,多於99頁會提示錯誤
- 翻頁太快會導致出現驗證,並且驗證時會提示"哎呀,出錯了,點擊刷新再來一次(error:9PiETg)"
效果
實現
源碼文件
| 文件 | 介紹 |
|---|---|
main.py | 爬蟲入口,創建保存目錄,掃碼登錄或cookie登錄,開啓爬蟲 |
core.py | 爬蟲判斷,根據url創建TMall或者Taobao爬蟲 |
spider文件夾 taobaoSpider.py和tmallSpider.py | 實現了淘寶商品評論爬取和天貓商品評論爬取 |
browser.py | WebDriver FireFox抽象 |
util.py | 工具類 |
settings.py | 配置如掃碼登錄還是cookie,圖片存儲目錄,url等 |
main.py
# coding=utf-8
"""Crawler entry point.

Creates the image store directory, logs in to Taobao (QR-code scan or a
previously saved cookie file), then crawls the reviews of every configured
product url, retrying failed urls once.
"""
import json
import os

import settings
import util
from browser import FirefoxBrowser
from core import Crawler

# Single place for the cookie-file name (was duplicated three times).
COOKIE_FILE = "cookies_tao.json"

# Create the directory tree where review images will be stored.
util.mkStorePath(settings.STORE_PATH)

firefox = FirefoxBrowser()
try:
    if settings.QRCODE:
        # Log in by scanning the QR code, then persist the session cookies
        # so later runs can skip the scan (cookies expire eventually).
        cookies = firefox.get_cookies(settings.LOGIN_URL)
        with open(COOKIE_FILE, "w") as fp:
            json.dump(cookies, fp)
        print("cookie file done")
    else:
        # Reuse previously saved cookies, if any.
        firefox.get(settings.LOGIN_URL)
        if os.path.exists(COOKIE_FILE):
            with open(COOKIE_FILE, "r", encoding="utf8") as fp:
                cookies = json.load(fp)
            firefox.set_cookies(cookies)

    # First pass: crawl every configured product page, remembering failures.
    # Anything other than an explicit True (False, or None for an url no
    # spider matched) counts as a failure.
    failed_urls = [url for url in settings.URLS
                   if Crawler(url, firefox).start() is not True]
    # Second pass: element lookup on the page is flaky, so retry each
    # failed url once.
    for url in failed_urls:
        Crawler(url, firefox).start()
finally:
    # Always release the browser, even if a crawl raised.
    firefox.close()
core.py
# coding=utf-8
import os
from time import sleep
from spider.tmallSpider import TmallSpider
from spider.taobaoSpider import TaobaoSpider
import settings
class Crawler(object):
    """Dispatches a product url to the matching spider (Tmall or Taobao).

    Short links (m.tb.cn) are opened first and allowed to redirect; the
    resolved url then decides which spider runs.
    """

    def __init__(self, target_url, firefoxBrowser):
        """
        :param target_url: Tmall / Taobao item url, or an m.tb.cn short link.
        :param firefoxBrowser: shared FirefoxBrowser wrapper instance.
        :raises ValueError: when the url matches none of the supported hosts.
        """
        self._firefox = firefoxBrowser
        # _type: 1 = Tmall item, 2 = Taobao item, 0 = short link.
        if 'detail.tmall.com' in target_url:
            self._type = 1
        elif 'item.taobao.com' in target_url:
            self._type = 2
        elif 'm.tb.cn' in target_url:
            self._type = 0
        else:
            # Fail fast on unsupported urls instead of crashing below with
            # an AttributeError on the never-assigned self._type.
            raise ValueError('unsupported url: ' + target_url)
        self._firefox.get(target_url)
        if self._type == 0:
            # Wait (up to 300s) for the short link to redirect to the
            # real item page before reading the resolved url.
            self._firefox._wait_url(target_url, 300)
        self._url = self._firefox.driver().current_url

    def start(self):
        """Run the spider that matches the resolved url.

        :return: True on success, False on failure (including short links
            that resolved to a page no spider can handle).
        """
        if 'detail.tmall.com' in self._url:
            return TmallSpider(self._firefox).start()
        if 'item.taobao.com' in self._url:
            return TaobaoSpider(self._firefox).start()
        # Previously fell through returning None, which the caller's
        # failure check (`== False`) silently missed.
        return False
spider/tmallSpider.py
# coding=utf-8
import time
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import settings
import util
from util import ElementFilter
class TmallSpider(object):
    """Crawls the picture reviews of a single Tmall product page.

    Expects the browser to already be on the product page. Collects per-page
    review text, dates and sku info, and downloads every review image into
    the per-product directory.
    """

    def __init__(self, firefoxBrowser):
        self._firefox = firefoxBrowser
        self._surl = self._firefox.driver().current_url
        # Product id extracted from the url; also creates the image dir.
        self._sid = util.getIdAndMkdir(self._surl, settings.STORE_PATH)
        self._item = {}
        self._rate = []

    def start(self):
        """Crawl all picture-review pages of the product.

        :return: True when crawling finished, False when a required page
            element could not be located (the page layout is flaky).
        """
        print('start tmallSpider ' + str(self._sid))
        # Product title.
        print('get Title')
        self._item['title'] = self._firefox.get_element(
            ElementFilter.tm_dict['Title']).text
        # Scroll the review tab container into view so it gets rendered.
        print('get JTabBarBox')
        element = self._firefox.get_element(ElementFilter.tm_dict['JTabBarBox'])
        self._firefox.driver().execute_script(
            "arguments[0].scrollIntoView()", element)
        # Open the reviews tab.
        print('get JTabbar')
        jtabbar = self._firefox.get_element_without_wait(
            ElementFilter.tm_dict['JTabbar'])
        if jtabbar is None:
            print('JTabbar not found')
            return False
        jtabbar.click()
        time.sleep(5)
        # Restrict the listing to reviews that contain pictures.
        print('get JReviews')
        jreviews = self._firefox.get_element_without_wait(
            ElementFilter.tm_dict['JReviews'])
        if jreviews is None:
            print('JReviews not found')
            return False
        jreviews.click()
        time.sleep(5)
        # Walk the paginated reviews; Tmall caps the listing at 99 pages
        # (fix: range(1, 99) only visited 98 of them).
        for num in range(1, 100):
            self._rate.append(self.parse(self._firefox.driver().page_source))
            print('page' + str(num))
            if self._firefox.get_next_page_tmall('下一頁>>') is False:
                break
            # Throttle: paging too fast triggers the captcha page.
            time.sleep(5)
        self._item['rates'] = self._rate
        return True

    def parse(self, html):
        """Parse one review-list page into a list of review dicts."""
        soup = BeautifulSoup(html, "html.parser")
        div_rate = soup.find("div", class_="rate-grid")
        items = []
        # One <tr> per review.
        for tr in div_rate.select('tr'):
            item = {}
            self._parse_author(tr, item)
            self._parse_review(tr, item)
            self._parse_sku(tr, item)
            items.append(item)
        return items

    def _parse_author(self, tr, item):
        """Extract the reviewer name (masked and concatenated forms)."""
        contents = tr.select_one('td.col-author') \
                     .select_one('div.rate-user-info').contents
        # The name node is "first <tag> last"; contents[0]/[2] are the
        # text pieces around the masking element.
        item['author'] = contents[0].strip() + "***" + contents[2].strip()
        item['rauthor'] = contents[0].strip() + contents[2].strip()

    @staticmethod
    def _fulltxt(node):
        """Return the review text inside *node*'s div.tm-rate-fulltxt."""
        contents = node.select_one('div.tm-rate-fulltxt').contents
        if len(contents) > 1:
            return contents[1].strip()
        return contents[0].strip()

    def _collect_images(self, container, author, datasrc):
        """Download each non-.png review image under *container* and append
        its full-size url to *datasrc*."""
        for li in container.select('li'):
            src = li.attrs['data-src']
            # .png entries are page decorations, not review photos.
            if src.endswith(".png"):
                continue
            datasrc.append(self.parseImg(src, author))

    def _parse_review(self, tr, item):
        """Extract review text, date and images.

        Handles both plain reviews (div.tm-rate-content) and reviews with a
        follow-up (div.tm-rate-premiere + div.tm-rate-append).
        """
        td1 = tr.select_one('td.tm-col-master')
        premiere = td1.select_one('div.tm-rate-premiere')
        if premiere is not None:
            print('premiere')
            # Initial review text. (Fix: this used to be clobbered first by
            # the raw contents list and then by the follow-up text.)
            item['tm-rate-fulltxt'] = self._fulltxt(premiere)
            item['tm-rate-date'] = \
                premiere.select_one('div.tm-rate-date').contents[0].strip()
            datasrc = []
            self._collect_images(premiere, item['rauthor'], datasrc)
            # Follow-up review text and images.
            append = td1.select_one('div.tm-rate-append')
            item['append-rate-fulltxt'] = self._fulltxt(append)
            self._collect_images(append, item['rauthor'], datasrc)
            item['tm-m-photos'] = datasrc
        else:
            content = td1.select_one('div.tm-rate-content')
            item['tm-rate-fulltxt'] = self._fulltxt(content)
            datasrc = []
            self._collect_images(content, item['rauthor'], datasrc)
            item['tm-m-photos'] = datasrc
            item['tm-rate-date'] = \
                td1.select_one('div.tm-rate-date').contents[0].strip()

    def _parse_sku(self, tr, item):
        """Extract the purchased colour and size from the sku column."""
        ps = tr.select_one('td.col-meta div.rate-sku').select('p')
        item['color'] = ps[0]['title']
        item['size'] = ps[1]['title']

    def parseImg(self, picUrl, author):
        """Download the full-size version of one review image.

        The data-src urls carry a 12-character suffix (presumably a
        thumbnail size marker such as "_400x400.jpg" — verify against live
        pages); stripping it yields the original image url.

        :return: the full-size image url that was downloaded.
        """
        picTemp = picUrl.rpartition('/')[2]
        picDes = settings.STORE_PATH + '/' + self._sid + "/" \
            + author + '_' + picTemp[:-12]
        picAll = "http:" + picUrl[:-12]
        urlretrieve(picAll, picDes)
        return picAll
參考
selenium文檔
Beautiful Soup 4.4.0 文檔
CSS 選擇器參考手冊
selenium爬取淘寶評論信息
python +Selenium 爬取淘寶商品評論