使用requests庫爬取B站視頻彈幕和評論
爬取目標
明星大偵探巨想談戀愛(上)、(下)和頭號玩家Ⅱ(上)、(下)
# Video pages to crawl; the labels are the episode titles from the text above.
urls = [
    # "巨想談戀愛" episode
    'https://www.bilibili.com/video/BV1St411S7GP',
    'https://www.bilibili.com/video/BV1Ut411i7dk',
    # "頭號玩家" episode
    'https://www.bilibili.com/video/BV14t411a7aN',
    'https://www.bilibili.com/video/BV1xt41187ti',
]
主要有三個獲取數據的URL
1、爬取目標視頻的url:
例如:https://www.bilibili.com/video/BV14t411a7aN
- 根據url獲取BV號
- 從返回的請求頁面中獲取av號
- 根據BV號獲取cid(cid是獲取彈幕url中的參數)
URL爲:https://api.bilibili.com/x/player/pagelist?bvid=BV14t411a7aN&jsonp=jsonp
返回結果如下(JSON格式,cid位於 data[0]['cid']):
2、爬取目標視頻彈幕的URL:
請求的URL:http://comment.bilibili.com/72211064.xml
返回結果如下:
可以看到是xml格式的數據,之後解析即可,具體解析代碼:
response = requests.get(danmu_url, headers=headers)
# Create the list up front so it exists even when the request does not succeed.
danmu_list = []
if response.status_code == 200:
    # Dump the raw XML to disk, then parse it from that file.
    with open('bilibili.xml', 'wb') as fp:
        fp.write(response.content)
    html_comment = etree.parse('bilibili.xml', etree.HTMLParser())
    for d in html_comment.xpath('//d'):
        # The "p" attribute of each <d> element is a comma-separated record:
        #   p[0] time at which the danmaku appears in the video
        #   p[1] mode: 1-3 scrolling, 4 bottom, 5 top, 6 reverse,
        #        7 precisely positioned, 8 advanced
        #   p[2] font size
        #   p[3] font colour
        #   p[4] send time (Unix timestamp)
        #   p[5] pool: 0 normal, 1 subtitle, 2 special
        #        (the special pool is currently reserved for advanced danmaku)
        #   p[6] sender id
        #   p[7] id of the danmaku in the danmaku database
        p = d.xpath('./@p')[0].split(',')
        # A <d> element without a text node yields an empty list here.
        texts = d.xpath('./text()')
        comment = {
            'danmu_current_time': p[0],
            'danmu_send_time': timestamp_datetime(int(p[4])),
            'danmu_send_id': p[6],
            'text': texts[0] if texts else ''
        }
        danmu_list.append(comment)
3、爬取目標視頻的評論URL:
URL:https://api.bilibili.com/x/v2/reply?pn=1&type=1&oid=41111008&sort=1
其中oid就是之前獲取到的av號
返回結果如下(JSON格式,評論總數位於 data['page']['count'],評論列表位於 data['replies']):
實現代碼
'''
'''
@Description: 爬取B站彈幕和評論
@Author: sikaozhifu
@Date: 2020-06-24 11:29:58
@LastEditTime: 2020-06-26 10:20:37
@LastEditors: Please set LastEditors
'''
import requests
from lxml import etree
import re
import time
import pymongo
import math
# Browser-like request headers passed to the requests.get calls in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.97 Safari/537.36'
    ),
}
def timestamp_datetime(value):
    """Convert a Unix timestamp into a ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        value: Seconds since the epoch as an integer, e.g. ``1332888820``.

    Returns:
        The timestamp formatted in the machine's local time zone. For
        example ``1332888820`` becomes ``'2012-03-28 06:53:40'`` on a
        UTC+8 machine.
    """
    # localtime() expands the timestamp into a struct_time in local time;
    # strftime() then renders that struct as text.
    local_struct = time.localtime(value)
    return time.strftime(r'%Y-%m-%d %H:%M:%S', local_struct)
# Seconds to wait for a server; requests waits indefinitely without a timeout.
_REQUEST_TIMEOUT = 10
# Page size used to turn the total comment count into a page count.
_COMMENTS_PER_PAGE = 20


def _get_json(url):
    """GET *url* with the shared headers and return the decoded JSON body."""
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def _fetch_danmu(cid):
    """Download and parse the danmaku XML for *cid*.

    Returns a list of dicts with the keys ``danmu_current_time``,
    ``danmu_send_time``, ``danmu_send_id`` and ``text``. The list is empty
    when the server does not answer with HTTP 200.
    """
    danmu_url = 'http://comment.bilibili.com/%s.xml' % cid
    response = requests.get(danmu_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    if response.status_code != 200:
        return []
    # Dump the raw XML to disk, then parse it from that file.
    with open('bilibili.xml', 'wb') as fp:
        fp.write(response.content)
    html_comment = etree.parse('bilibili.xml', etree.HTMLParser())
    danmu_list = []
    for d in html_comment.xpath('//d'):
        # The "p" attribute is a comma-separated record:
        #   p[0] time at which the danmaku appears in the video
        #   p[1] mode: 1-3 scrolling, 4 bottom, 5 top, 6 reverse,
        #        7 precisely positioned, 8 advanced
        #   p[2] font size, p[3] font colour
        #   p[4] send time (Unix timestamp)
        #   p[5] pool: 0 normal, 1 subtitle, 2 special
        #   p[6] sender id, p[7] id of the danmaku in the danmaku database
        p = d.xpath('./@p')[0].split(',')
        # A <d> element without a text node yields an empty list here.
        texts = d.xpath('./text()')
        danmu_list.append({
            'danmu_current_time': p[0],
            'danmu_send_time': timestamp_datetime(int(p[4])),
            'danmu_send_id': p[6],
            'text': texts[0] if texts else ''
        })
    return danmu_list


def _fetch_comments(av_id):
    """Collect every page of top-level comments for the video *av_id*.

    Returns a list of dicts with the keys ``reply_id``, ``reply_name``,
    ``reply_sex``, ``reply_time``, ``reply_like`` and ``reply_content``.
    """
    url_template = 'https://api.bilibili.com/x/v2/reply?pn=%s&type=1&oid=%s&sort=1'
    first_page = _get_json(url_template % (1, av_id))
    count = first_page['data']['page']['count']  # total number of comments
    page_count = math.ceil(int(count) / _COMMENTS_PER_PAGE)
    comment_list = []
    for pn in range(1, page_count + 1):
        # Page 1 was already downloaded to read the count; reuse it.
        payload = first_page if pn == 1 else _get_json(url_template % (pn, av_id))
        # NOTE(review): "replies" appears to be null on an empty page;
        # treat that as "no comments" rather than failing on iteration.
        for reply in payload['data']['replies'] or []:
            member = reply['member']
            comment_list.append({
                'reply_id': member['mid'],
                'reply_name': member['uname'],
                'reply_sex': member['sex'],
                'reply_time': timestamp_datetime(int(reply['ctime'])),
                'reply_like': reply['like'],
                'reply_content': reply['content']['message']
            })
    return comment_list


def main():
    """Crawl danmaku and comments for each target video into MongoDB.

    For every URL the function scrapes the video page for the title,
    uploader and av id, resolves the cid through the pagelist API,
    downloads the danmaku and all comment pages, prints the assembled
    record and inserts it into the ``comments`` collection of the
    ``bilibili`` database on ``127.0.0.1:27017``.
    """
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    try:
        com = client.bilibili.comments
        prefix = 'https:'
        urls = [
            'https://www.bilibili.com/video/BV1St411S7GP',
            'https://www.bilibili.com/video/BV1Ut411i7dk',
            'https://www.bilibili.com/video/BV14t411a7aN',
            'https://www.bilibili.com/video/BV1xt41187ti'
        ]
        for ID, url in enumerate(urls):
            response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
            html = etree.HTML(response.text)
            title = html.xpath('//h1[@class = "video-title"]/@title')[0]
            # The uploader link is scheme-relative, so prepend "https:".
            up_url = prefix + html.xpath('//div[@class = "name"]/a/@href')[0]
            up_id = re.search(r'\d+', up_url).group(0)
            up_name = html.xpath('//div[@class = "name"]/a/text()')[0]
            av_id_url = html.xpath('//meta[@itemprop = "url"]/@content')[0]
            av_id = re.search(r'\d+', av_id_url).group(0)
            bvid = re.search(r'video/(.*)', url).group(1)

            # The cid is the parameter that the danmaku URL needs.
            pagelist = _get_json(
                'https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp' % bvid)
            cid = pagelist['data'][0]['cid']

            time.sleep(1)
            danmu_list = _fetch_danmu(cid)
            comment_list = _fetch_comments(av_id)

            info = {
                'ID': ID,
                'video_title': title,
                'up_id': up_id,
                'up_name': up_name,
                'danmu_list': danmu_list,
                'comment_list': comment_list
            }
            print(info)
            com.insert_one(info)
            time.sleep(1)
    finally:
        # Release the MongoDB connection even if a request or parse fails.
        client.close()


if __name__ == "__main__":
    main()
運行結果
存到MongoDB裏面
在MongoDB Compass裏面展示如下: