使用requests库爬取B站视频弹幕和评论
爬取目标
明星大侦探巨想谈恋爱(上)、(下)和头号玩家Ⅱ(上)、(下)
# Crawl targets: two episodes, each published as two separate videos.
urls = [
    # "巨想谈恋爱" (both parts)
    'https://www.bilibili.com/video/BV1St411S7GP',
    'https://www.bilibili.com/video/BV1Ut411i7dk',
    # "头号玩家" (both parts)
    'https://www.bilibili.com/video/BV14t411a7aN',
    'https://www.bilibili.com/video/BV1xt41187ti',
]
主要有三个获取数据的URL
1、爬取目标视频的url:
例如:https://www.bilibili.com/video/BV14t411a7aN
- 从url中提取BV号
- 从请求返回的页面中获取av号
- 根据BV号获取cid(cid是弹幕URL中的参数)
URL为:https://api.bilibili.com/x/player/pagelist?bvid=BV14t411a7aN&jsonp=jsonp
返回结果如下:
2、爬取目标视频弹幕的URL:
请求的URL:http://comment.bilibili.com/72211064.xml
返回结果如下:
可以看到是xml格式的数据,之后解析即可,具体解析代码:
# Start from an empty list so danmu_list is always defined, even when the
# request below does not return HTTP 200.
danmu_list = []
response = requests.get(danmu_url, headers=headers, timeout=10)
if response.status_code == 200:
    # Save the raw XML to disk, then parse it with lxml's lenient HTML parser.
    with open('bilibili.xml', 'wb') as fp:
        fp.write(response.content)
    html_comment = etree.parse('bilibili.xml', etree.HTMLParser())
    for d in html_comment.xpath('//d'):
        # The "p" attribute of each <d> element is a comma-separated record:
        #   p[0] time at which the danmaku appears in the video
        #   p[1] mode: 1..3 scrolling, 4 bottom, 5 top, 6 reverse,
        #        7 precise positioning, 8 advanced
        #   p[2] font size
        #   p[3] font colour
        #   p[4] send time (Unix timestamp)
        #   p[5] pool: 0 normal, 1 subtitle, 2 special (advanced danmaku only)
        #   p[6] sender id
        #   p[7] id of the danmaku in the danmaku database
        p = d.xpath('./@p')[0].split(',')
        # An empty <d></d> has no text node; fall back to '' instead of
        # raising IndexError.
        texts = d.xpath('./text()')
        comment = {
            'danmu_current_time': p[0],
            'danmu_send_time': timestamp_datetime(int(p[4])),
            'danmu_send_id': p[6],
            'text': texts[0] if texts else ''
        }
        danmu_list.append(comment)
3、爬取目标视频的评论URL:
URL:https://api.bilibili.com/x/v2/reply?pn=1&type=1&oid=41111008&sort=1
其中oid就是之前获取到的av号
返回结果如下:
实现代码
'''
'''
@Description: 爬取B站弹幕和评论
@Author: sikaozhifu
@Date: 2020-06-24 11:29:58
@LastEditTime: 2020-06-26 10:20:37
@LastEditors: Please set LastEditors
'''
import requests
from lxml import etree
import re
import time
import pymongo
import math
# Desktop Chrome User-Agent string, passed as the request headers of every
# HTTP call made by this script.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}
def timestamp_datetime(value):
    """Convert a Unix timestamp into a ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        value: Seconds since the epoch as an integer, e.g. ``1332888820``.

    Returns:
        The timestamp formatted in the machine's local time zone, e.g.
        ``'2012-03-28 06:53:40'`` (the exact text depends on that zone).
    """
    # Called ``fmt`` rather than ``format`` so the builtin is not shadowed.
    fmt = r'%Y-%m-%d %H:%M:%S'
    # localtime() expands the timestamp into a struct_time in the local zone;
    # strftime() then renders that struct_time with the pattern above.
    return time.strftime(fmt, time.localtime(value))
# Seconds to wait for a single HTTP response; without a timeout a stalled
# connection would block the crawl indefinitely.
_REQUEST_TIMEOUT = 10

# Divisor used to turn the total comment count into a number of pages.
# NOTE(review): assumes the reply API serves 20 comments per page - confirm.
_COMMENTS_PER_PAGE = 20


def _fetch_video_meta(url):
    """Scrape ``(title, up_id, up_name, av_id)`` from a video page."""
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    html = etree.HTML(response.text)
    title = html.xpath('//h1[@class = "video-title"]/@title')[0]
    # The uploader id is the first run of digits in the uploader link.
    up_href = html.xpath('//div[@class = "name"]/a/@href')[0]
    up_id = re.search(r'\d+', up_href).group(0)
    up_name = html.xpath('//div[@class = "name"]/a/text()')[0]
    # The av id is the first run of digits in the page's canonical URL.
    # NOTE(review): assumes that URL is in "av<digits>" form - confirm.
    av_id_url = html.xpath('//meta[@itemprop = "url"]/@content')[0]
    av_id = re.search(r'\d+', av_id_url).group(0)
    return title, up_id, up_name, av_id


def _fetch_cid(url):
    """Look up the cid of the first entry in the video's page list."""
    bvid = re.search(r'video/(.*)', url).group(1)
    bvid_url = 'https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp' % bvid
    response = requests.get(bvid_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    return response.json()['data'][0]['cid']


def _fetch_danmu_list(cid):
    """Download and parse the danmaku XML for *cid*.

    Returns a list of dicts; the list is empty when the server does not
    answer with HTTP 200.
    """
    danmu_url = 'http://comment.bilibili.com/%s.xml' % cid
    response = requests.get(danmu_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    if response.status_code != 200:
        return []
    # Save the raw XML to disk, then parse it with lxml's lenient HTML parser.
    with open('bilibili.xml', 'wb') as fp:
        fp.write(response.content)
    html_comment = etree.parse('bilibili.xml', etree.HTMLParser())
    danmu_list = []
    for d in html_comment.xpath('//d'):
        # "p" is a comma-separated record: p[0] time the danmaku appears,
        # p[1] mode, p[2] font size, p[3] font colour, p[4] send time
        # (Unix timestamp), p[5] pool, p[6] sender id, p[7] danmaku id.
        p = d.xpath('./@p')[0].split(',')
        # An empty <d></d> has no text node; store '' instead of raising
        # IndexError.
        texts = d.xpath('./text()')
        danmu_list.append({
            'danmu_current_time': p[0],
            'danmu_send_time': timestamp_datetime(int(p[4])),
            'danmu_send_id': p[6],
            'text': texts[0] if texts else ''
        })
    return danmu_list


def _fetch_comment_list(av_id):
    """Collect the comments of the video identified by *av_id*, page by page."""
    comment_url = 'https://api.bilibili.com/x/v2/reply?pn=1&type=1&oid=%s&sort=1' % av_id
    response = requests.get(comment_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    count = response.json()['data']['page']['count']  # total number of comments
    page_count = math.ceil(int(count) / _COMMENTS_PER_PAGE)
    comment_list = []
    for pn in range(1, page_count + 1):
        comment_url = 'https://api.bilibili.com/x/v2/reply?pn=%s&type=1&oid=%s&sort=1' % (pn, av_id)
        response = requests.get(comment_url, headers=headers, timeout=_REQUEST_TIMEOUT)
        # A page may carry a null "replies" value; treat it as an empty page
        # rather than failing on iteration.
        replies = response.json()['data']['replies'] or []
        for reply in replies:
            member = reply['member']
            comment_list.append({
                'reply_id': member['mid'],        # commenter id
                'reply_name': member['uname'],    # commenter nickname
                'reply_sex': member['sex'],       # commenter gender
                'reply_time': timestamp_datetime(int(reply['ctime'])),
                'reply_like': reply['like'],      # number of likes
                'reply_content': reply['content']['message']
            })
    return comment_list


def main():
    """Crawl danmaku and comments for each target video and store them.

    One document per video is printed and inserted into the ``comments``
    collection of the ``bilibili`` database on the local MongoDB server.
    """
    urls = [
        # "巨想谈恋爱" (both parts)
        'https://www.bilibili.com/video/BV1St411S7GP',
        'https://www.bilibili.com/video/BV1Ut411i7dk',
        # "头号玩家" (both parts)
        'https://www.bilibili.com/video/BV14t411a7aN',
        'https://www.bilibili.com/video/BV1xt41187ti'
    ]
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    try:
        com = client.bilibili.comments
        for video_index, url in enumerate(urls):
            title, up_id, up_name, av_id = _fetch_video_meta(url)
            cid = _fetch_cid(url)
            time.sleep(1)  # pause between requests
            danmu_list = _fetch_danmu_list(cid)
            comment_list = _fetch_comment_list(av_id)
            info = {
                'ID': video_index,
                'video_title': title,
                'up_id': up_id,
                'up_name': up_name,
                'danmu_list': danmu_list,
                'comment_list': comment_list
            }
            print(info)
            com.insert_one(info)
            time.sleep(1)  # pause before moving on to the next video
    finally:
        # Release the MongoDB connection even if a request or parse fails.
        client.close()


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
运行结果
存到MongoDB里面
在MongoDB Compass里面展示如下: