Python爬取京東的機器學習類圖書的信息
一,配置搜索關鍵字和頁數
本例是搜索“機器學習”,頁數我配了100頁沒封號。大概爬下來三千條圖書。用時沒有留意,大概就幾分鐘吧,很快的。
if __name__ == '__main__':
# 測試, 只爬取兩頁搜索頁與兩頁評論
test = CrawlDog('機器學習')
test.main(2)
test.store_xsl()
二,查找用到的三個URL的過程
1. 搜索圖書的URL
https://search.jd.com/Search?keyword=%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0&enc=utf-8&suggest=1.his.0.0&wq=&pvid=d73028f8cf3e46deb44d843ef082fcef
2. 評論總數,差評數,好評數的URL
F12 --> NetWork , 在機器學習搜索頁面往下拖鼠標,你會看到有好多圖片的加載或Json請求,
其中就有異步評論數的請求。
https://club.jd.com/comment/productCommentSummaries.action?referenceIds=69957954609,33316347153,20445809140,11166079878,40853170920&callback=jQuery4865124&_=1593055042908
在url裏把 &callback=jQuery4865124去掉,因爲加上的話就會有jQuery4865124這個字出現,
https://club.jd.com/comment/productCommentSummaries.action?referenceIds=69957954609,33316347153,20445809140,11166079878,40853170920&_=1593055042908
3. 當前價格與打折前價格URL
查找步驟和上面評論總數那個URL的查找步驟一樣。
三,代碼分析
主要包括四個方法
crawl_message() 爬取商品的基本信息,爬完後調用comments()和prices()方法
comments() 爬取( 總評數,平均得分,好評數,默認好評,好評率,追評數,視頻曬單數,差評數,中評數)
prices() 爬取當前商品的價格,打折前的商品價格
store_xsl() 爬完後 存放到 csv 文件裏
四,完整代碼
import json
import threading
from concurrent import futures

import pandas as pd
import requests
from lxml import etree
class CrawlDog:
    """Crawl JD.com book search results for a keyword.

    Three data sources are combined into one DataFrame (``self.data``):
      * the HTML search page (title, price, shop, url, author, ...),
      * the productCommentSummaries JSON endpoint (comment statistics),
      * the p.3.cn mgets JSON endpoint (pre-discount price).
    Call ``main(n)`` to crawl ``n`` search pages, then ``store_xsl()`` to
    write the result to disk.
    """

    # Shared headers for the comment/price endpoints; the Referer imitates a
    # product detail page so the requests look like normal browser traffic.
    comment_headers = {
        'Referer': 'https://item.jd.com/%s.html' % 12615065,
        'Accept-Charset': 'utf-8',
        'accept-language': 'zh,en-US;q=0.9,en;q=0.8,zh-TW;q=0.7,zh-CN;q=0.6',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/74.0.3729.169 Safari/537.36'
    }

    # Comment-statistics columns: total, average score, good, default-good,
    # good rate, follow-up, video, poor, neutral. Initialised to 0 per item
    # and overwritten later by comments().
    _COMMENT_COLUMNS = ('commentCount', 'averageScore', 'goodCount',
                        'defaultGoodCount', 'goodRate', 'afterCount',
                        'videoCount', 'poorCount', 'generalCount')

    def __init__(self, keyword):
        """
        :param keyword: the search keyword, e.g. '機器學習'
        """
        self.keyword = keyword
        self.data = pd.DataFrame()
        # main() runs crawl_message() on several worker threads and all of
        # them mutate self.data, so every DataFrame update takes this lock.
        self._lock = threading.Lock()

    @staticmethod
    def _element_text(element):
        """Render an lxml element to plain text, stripping layout whitespace."""
        text = etree.tostring(element, method='text', encoding='unicode')
        return text.replace('\r', '').replace('\n', '').replace('\t', '')

    def crawl_message(self, page):
        """
        Crawl one search-result page, store the items found, then fetch the
        comment statistics and prices for those items.
        :param page: 1-based search page number
        """
        url = 'https://search.jd.com/Search?keyword={}&enc=utf-8&page={}&s={}'.format(
            self.keyword, page, (page - 1) * 30 + 1)
        index_headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                      'application/signed-exchange;v=b3',
            'accept-encoding': 'gzip, deflate, br',
            'Accept-Charset': 'utf-8',
            'accept-language': 'zh,en-US;q=0.9,en;q=0.8,zh-TW;q=0.7,zh-CN;q=0.6',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/74.0.3729.169 Safari/537.36'
        }
        rsp = requests.get(url=url, headers=index_headers).content.decode()
        print(url)
        html = etree.HTML(rsp)
        items = html.xpath('//li[contains(@class, "gl-item")]')
        ids = []
        rows = []
        for item in items:
            try:
                info = dict()
                p_name = item.xpath('.//div[@class="p-name"]/a/em')
                info['title'] = self._element_text(p_name[0])
                info['price'] = item.xpath('.//div[@class="p-price"]//i/text()')[0]
                info['shop'] = item.xpath('.//div[@class="p-shopnum"]//a/text()')[0]
                info['icon'] = item.xpath('.//div[@class="p-icons"]//i/text()')
                info['url'] = 'https:' + item.xpath('.//div[@class="p-name"]/a/@href')[0]
                # '.../12615065.html' -> '12615065'
                info['item_id'] = info.get('url').split('/')[-1][:-5]
                book_details = item.xpath('.//div[@class="p-bookdetails"]//span/a')
                info['author'] = self._element_text(book_details[0])
                info['publish_date'] = item.xpath(
                    './/div[@class="p-bookdetails"]//span[@class="p-bi-date"]/text()')
                # Placeholders; comments() and prices() fill in real values.
                info['old_price'] = 0
                for column in self._COMMENT_COLUMNS:
                    info[column] = 0
                ids.append(info['item_id'])
                rows.append(info)
            except IndexError:
                # Ads mixed into the results miss some of the fields above;
                # the failed xpath lookup raises IndexError -- skip the item.
                print('item信息不全, drop!')
                continue
        print(len(ids))
        if rows:
            # DataFrame.append() was removed in pandas 2.0 -- use concat.
            with self._lock:
                self.data = pd.concat([self.data, pd.DataFrame(rows)],
                                      ignore_index=True)
        if ids:
            self.comments(ids)
            self.prices(ids)

    def comments(self, ids):
        """
        Fetch comment statistics for the given sku ids and merge them into
        self.data, matching rows by item_id.
        :param ids: iterable of sku id strings
        """
        id_param = ','.join(str(sku) for sku in ids)
        url = ('https://club.jd.com/comment/productCommentSummaries.action'
               '?my=pinglun&referenceIds={}'.format(id_param))
        summary = requests.get(url=url, headers=self.comment_headers).json()
        columns = list(self._COMMENT_COLUMNS)
        for comment in summary["CommentsCount"]:
            values = [comment["CommentCount"], comment["AverageScore"],
                      comment["GoodCount"], comment["DefaultGoodCount"],
                      comment["GoodRate"], comment["AfterCount"],
                      comment["VideoCount"], comment["PoorCount"],
                      comment["GeneralCount"]]
            with self._lock:
                self.data.loc[self.data.item_id == str(comment['SkuId']), columns] = values

    def prices(self, ids):
        """
        Fetch the pre-discount price (the 'm' field) for the given sku ids
        and store it in the old_price column.
        :param ids: iterable of sku id strings
        """
        # The endpoint expects ids like 'J_123,J_456'. The original code also
        # hard-coded 'J_%s' in the URL template, doubling the prefix of the
        # first sku ('J_J_123') so its price never matched -- fixed here.
        sku_param = ','.join('J_' + str(sku) for sku in ids)
        url = ("https://p.3.cn/prices/mgets?ext=11000000&pin=&type=1&area=1_72_4137_0"
               "&skuIds=%s&pdbp=0&pdtk=&pdpin=&pduid=15229474889041156750382"
               "&source=list_pc_front" % sku_param)
        for price in requests.get(url, headers=self.comment_headers).json():
            # price['id'] looks like 'J_12615065'; strip the 'J_' prefix.
            with self._lock:
                self.data.loc[self.data.item_id == price['id'][2:], 'old_price'] = price.get("m")

    def main(self, index_pn):
        """
        Crawl search pages 1..index_pn with a small thread pool.
        :param index_pn: total number of search pages to crawl
        """
        pages = [page + 1 for page in range(index_pn)]
        with futures.ThreadPoolExecutor(3) as executor:
            # list() forces the lazy map iterator so worker exceptions
            # surface here instead of being silently dropped.
            list(executor.map(self.crawl_message, pages))

    def store_xsl(self):
        """Write the crawled data to data.csv (the 'xsl' name is historical)."""
        self.data.to_csv('data.csv', encoding='utf-8', index=False)
if __name__ == '__main__':
    # Demo run: crawl two search pages (plus their comment stats and
    # prices) and dump everything to disk.
    crawler = CrawlDog('機器學習')
    crawler.main(2)
    crawler.store_xsl()