Scraping the covers on the recommendation page doesn't require handling JavaScript; the covers are already in the static HTML, so you can locate the <img> elements directly with XPath.
Recommendation page URL: https://www.bilibili.com/list/recommend/1.html. Page x is simply x.html.
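As a quick illustration (a minimal sketch, the page count of 5 here is arbitrary), the page URLs can be generated from the page number:

# Build the recommendation-page URLs for pages 1..5
page_urls = ['https://www.bilibili.com/list/recommend/{}.html'.format(i) for i in range(1, 6)]
print(page_urls[0])   # https://www.bilibili.com/list/recommend/1.html
print(page_urls[-1])  # https://www.bilibili.com/list/recommend/5.html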
To grab a cover, locate the src attribute inside the <img> element, request that src, and save the response to a local file.
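A minimal download sketch, assuming img_url is one src value taken from the page (the URL and the output file name cover.jpg here are hypothetical):

import requests

img_url = 'http://i0.hdslb.com/example-cover.jpg'  # hypothetical src value, for illustration only
res = requests.get(img_url)
if res.ok:
    # Write the raw bytes of the response to a local file
    with open('cover.jpg', 'wb') as f:
        f.write(res.content)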
Use XPath to extract the src paths:
"//div[@class='zr_recomd']/ul/li/div/a/img/@src"
Full code:
# Scrape video covers from the Bilibili recommendation pages
import requests
from lxml import etree

header = {'User-Agent': 'chrome'}  # request header
pic_save_ad = 'F:\\pyCharm\\spider\\pic_dirs'  # local directory for downloaded covers


def save_to_disk(pic):
    if not pic:
        return None
    for pic_item in pic:
        # Derive a file name from the image URL: strip the scheme characters, drop dots
        # and slashes, then re-append a .jpg extension (crude, but keeps names unique)
        pic_name = pic_item.lstrip('http://').replace('.', '').replace('/', '').rstrip('jpg') + '.jpg'
        file_path = "{}\\{}".format(pic_save_ad, pic_name)
        print(file_path)
        try:
            # Requesting with the headers gave a 502 while omitting them worked; perhaps
            # this User-Agent had already been flagged by the anti-crawling mechanism?
            res = requests.get(pic_item)
            if res.ok:
                img = res.content
                with open(file_path, 'wb') as f1:
                    f1.write(img)
        except Exception:
            print('Failed to load this img !')


def solve(page):
    # Build the URL of every recommendation page from 1 to `page`
    urls = [u'https://www.bilibili.com/list/recommend/{}.html'.format(str(i)) for i in range(1, page + 1)]
    for url in urls:
        text = requests.get(url, headers=header).text
        html = etree.HTML(text)
        # The XPath returns the src attribute of every cover <img> on the page
        links = html.xpath("//div[@class='zr_recomd']/ul/li/div/a/img/@src")
        save_to_disk(links)


if __name__ == '__main__':
    k = int(input('Input the number of pages to scrape: '))
    solve(k)
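As a side note, the pic_name logic above relies on lstrip/rstrip, which strip character sets rather than exact prefixes/suffixes and can chop extra letters off the name. A safer sketch (assuming the src ends with an ordinary file name; the example URL is hypothetical) derives the name from the URL path instead:

import os
from urllib.parse import urlparse

def name_from_src(src):
    # Keep only the last path component of the URL, e.g. 'abc123.jpg'
    return os.path.basename(urlparse(src).path)

print(name_from_src('http://i0.hdslb.com/bfs/archive/abc123.jpg'))  # abc123.jpg (hypothetical URL)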