Python實戰---爬取B站彈幕和評論

使用requests庫爬取B站視頻彈幕和評論

爬取目標

明星大偵探巨想談戀愛(上)、(下)和頭號玩家Ⅱ(上)、(下)

# Video pages to crawl: two episodes, each split into two parts.
urls = [
    # Episode "Really Want to Fall in Love", parts 1 and 2
    'https://www.bilibili.com/video/BV1St411S7GP',
    'https://www.bilibili.com/video/BV1Ut411i7dk',
    # Episode "Ready Player One II", parts 1 and 2
    'https://www.bilibili.com/video/BV14t411a7aN',
    'https://www.bilibili.com/video/BV1xt41187ti',
]

主要有三個獲取數據的URL
1、爬取目標視頻的url:
例如:https://www.bilibili.com/video/BV14t411a7aN

  • 根據url獲取BV號
  • 從請求返回的頁面中獲取av號(即之後評論URL中的oid參數)
  • 根據BV號獲取cid(cid是獲取彈幕url中的參數)
    URL爲:https://api.bilibili.com/x/player/pagelist?bvid=BV14t411a7aN&jsonp=jsonp
    返回結果如下(JSON格式,cid位於 data[0].cid):
    在這裏插入圖片描述
2、爬取目標視頻彈幕的URL:
請求的URL:http://comment.bilibili.com/72211064.xml(URL中的數字即上一步獲取到的cid)
返回結果如下:
在這裏插入圖片描述
可以看到是xml格式的數據,之後解析即可,具體解析代碼如下:
# Download the danmaku XML and turn every <d> element into a dict.
# `danmu_url`, `headers` and `timestamp_datetime` come from the full listing
# further down in this article.
danmu_list = []
response = requests.get(danmu_url, headers=headers)
if response.status_code == 200:
    with open('bilibili.xml', 'wb') as fp:
        fp.write(response.content)
    # Parse only after a successful download, so a failed request can never
    # pick up a bilibili.xml left over from an earlier run.
    html_comment = etree.parse('bilibili.xml', etree.HTMLParser())
    ds = html_comment.xpath('//d')
    for d in ds:
        # The "p" attribute is a comma-separated list of metadata fields.
        p = d.xpath('./@p')[0].split(',')
        danmu_current_time = p[0]  # time at which the danmaku appears
        danmu_mode = p[1]  # mode: 1..3 scrolling, 4 bottom, 5 top, 6 reverse, 7 precisely positioned, 8 advanced
        danmu_font_size = p[2]  # font size
        danmu_font_color = p[3]  # font colour
        danmu_send_time = p[4]  # time the danmaku was sent (Unix timestamp)
        danmu_send_time = timestamp_datetime(int(danmu_send_time))
        danmu_pool = p[5]  # pool: 0 normal, 1 subtitle, 2 special (currently reserved for advanced danmaku)
        danmu_send_id = p[6]  # id of the sender
        danmu_id = p[7]  # id of the danmaku in the danmaku database
        # An element without a text node yields an empty list; fall back to
        # '' instead of raising IndexError.
        texts = d.xpath('./text()')
        text = texts[0] if texts else ''
        comment = {
            'danmu_current_time': danmu_current_time,
            'danmu_send_time': danmu_send_time,
            'danmu_send_id': danmu_send_id,
            'text': text
        }
        # Append inside the loop so every danmaku is kept, not just the last.
        danmu_list.append(comment)

3、爬取目標視頻的評論URL:
URL:https://api.bilibili.com/x/v2/reply?pn=1&type=1&oid=41111008&sort=1
其中oid就是之前獲取到的av號
返回結果如下(JSON格式,評論總數位於 data.page.count,評論列表位於 data.replies):
在這裏插入圖片描述

實現代碼

'''
@Description: 爬取B站彈幕和評論
@Author: sikaozhifu
@Date: 2020-06-24 11:29:58
@LastEditTime: 2020-06-26 10:20:37
@LastEditors: Please set LastEditors
'''
import requests
from lxml import etree
import re
import time
import pymongo
import math
# Browser-like User-Agent sent with every HTTP request made by this script.
# The value is assembled from adjacent string literals purely to keep the
# source lines short; the resulting header is a single string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.97 Safari/537.36'
    ),
}

def timestamp_datetime(value):
    """Convert a Unix timestamp into a ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        value: Seconds since the epoch (an integer such as ``1332888820``).

    Returns:
        The formatted date-time string, expressed in the local time zone of
        the machine running the script (``time.localtime`` is used).
    """
    # Named ``fmt`` rather than ``format`` so the builtin is not shadowed.
    fmt = r'%Y-%m-%d %H:%M:%S'
    # localtime() yields a struct_time, which strftime() renders as text.
    return time.strftime(fmt, time.localtime(value))


_REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever
_COMMENTS_PER_PAGE = 20  # page size used for the page-count arithmetic


def _fetch_danmu(cid):
    """Download and parse the danmaku (bullet comments) for one ``cid``.

    Args:
        cid: The ``cid`` obtained from the pagelist API; it is the numeric
            part of the danmaku XML URL.

    Returns:
        A list of dicts with the keys ``danmu_current_time``,
        ``danmu_send_time``, ``danmu_send_id`` and ``text``. The list is
        empty when the download does not succeed.
    """
    danmu_url = 'http://comment.bilibili.com/%s.xml' % cid
    response = requests.get(danmu_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    if response.status_code != 200 or not response.content:
        # Never fall through to parsing: bilibili.xml on disk would belong to
        # the previous video (or not exist at all on the first one).
        return []
    with open('bilibili.xml', 'wb') as fp:
        fp.write(response.content)
    document = etree.parse('bilibili.xml', etree.HTMLParser())

    danmu_list = []
    for d in document.xpath('//d'):
        # "p" is a comma-separated metadata list: index 0 is the time the
        # danmaku appears, index 4 the send time (Unix timestamp) and
        # index 6 the sender id. The other fields are not stored.
        p = (d.get('p') or '').split(',')
        if len(p) < 7:
            continue  # malformed entry; skip it rather than abort the crawl
        texts = d.xpath('./text()')
        danmu_list.append({
            'danmu_current_time': p[0],
            'danmu_send_time': timestamp_datetime(int(p[4])),
            'danmu_send_id': p[6],
            # An empty danmaku has no text node; store '' instead of raising.
            'text': texts[0] if texts else ''
        })
    return danmu_list


def _fetch_comments(av_id):
    """Collect every top-level comment of a video, page by page.

    Args:
        av_id: The av number of the video, sent as the ``oid`` parameter.

    Returns:
        A list of dicts with the keys ``reply_id``, ``reply_name``,
        ``reply_sex``, ``reply_time``, ``reply_like`` and ``reply_content``.
    """
    comment_list = []
    page_count = 1  # corrected as soon as the first page reports the total
    pn = 1
    while pn <= page_count:
        comment_url = 'https://api.bilibili.com/x/v2/reply?pn=%s&type=1&oid=%s&sort=1' % (pn, av_id)
        response = requests.get(comment_url, headers=headers, timeout=_REQUEST_TIMEOUT)
        # "data" may be missing or null; treat that as "no comments".
        data = response.json().get('data') or {}
        if pn == 1:
            # The first page doubles as the source of the total count, so it
            # is no longer requested twice.
            count = (data.get('page') or {}).get('count') or 0
            page_count = math.ceil(int(count) / _COMMENTS_PER_PAGE)
        # "replies" is null (not an empty list) when a page has no comments.
        for reply in data.get('replies') or []:
            member = reply['member']
            comment_list.append({
                'reply_id': member['mid'],  # commenter id
                'reply_name': member['uname'],  # commenter nickname
                'reply_sex': member['sex'],  # commenter gender
                'reply_time': timestamp_datetime(int(reply['ctime'])),  # comment time
                'reply_like': reply['like'],  # number of likes
                'reply_content': reply['content']['message']  # comment text
            })
        pn += 1
    return comment_list


def _scrape_video(video_index, url):
    """Gather metadata, danmaku and comments for one video page.

    Args:
        video_index: Sequential number stored under the ``ID`` key.
        url: Address of the video page.

    Returns:
        The document to store: ``ID``, ``video_title``, ``up_id``,
        ``up_name``, ``danmu_list`` and ``comment_list``.
    """
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    html = etree.HTML(response.text)
    title = html.xpath('//h1[@class = "video-title"]/@title')[0]  # video title
    up_href = html.xpath('//div[@class = "name"]/a/@href')[0]
    up_id = re.search(r'\d+', up_href).group(0)  # first run of digits in the uploader link
    up_name = html.xpath('//div[@class = "name"]/a/text()')[0]
    av_id_url = html.xpath('//meta[@itemprop = "url"]/@content')[0]
    av_id = re.search(r'\d+', av_id_url).group(0)
    # Stop at '/', '?' or '#' so a trailing slash or query string is not
    # mistaken for part of the BV id.
    bvid = re.search(r'video/([^/?#]+)', url).group(1)

    bvid_url = 'https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp' % bvid
    response = requests.get(bvid_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    cid = response.json()['data'][0]['cid']

    time.sleep(1)  # pause between requests
    danmu_list = _fetch_danmu(cid)
    comment_list = _fetch_comments(av_id)
    return {
        'ID': video_index,
        'video_title': title,
        'up_id': up_id,
        'up_name': up_name,
        'danmu_list': danmu_list,
        'comment_list': comment_list
    }


def main():
    """Crawl the configured videos and store one document per video.

    Each document is printed and inserted into the ``comments`` collection
    of the ``bilibili`` database on the MongoDB server at 127.0.0.1:27017.
    """
    urls = [
        # Episode "Really Want to Fall in Love", parts 1 and 2
        'https://www.bilibili.com/video/BV1St411S7GP',
        'https://www.bilibili.com/video/BV1Ut411i7dk',
        # Episode "Ready Player One II", parts 1 and 2
        'https://www.bilibili.com/video/BV14t411a7aN',
        'https://www.bilibili.com/video/BV1xt41187ti'
    ]
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    try:
        com = client.bilibili.comments
        for ID, url in enumerate(urls):
            info = _scrape_video(ID, url)
            print(info)
            com.insert_one(info)
            time.sleep(1)  # pause before moving on to the next video
    finally:
        # Release the connection even when a request or parse step fails.
        client.close()


# Start the crawl only when the file is executed as a script, not on import.
if '__main__' == __name__:
    main()

運行結果

存到Mongodb裏面
在Mongodb compass裏面展示如下:
在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章