用Python爬取鬥魚各區的主播信息，並製作熱度排行榜

原創

New_boy25

2020-07-08 04:34

本次編程主要是爲了練習爬蟲編程和數據分析：對鬥魚直播網站進行爬取，按分區獲取主播信息，用pandas進行數據處理，再用matplotlib進行繪圖。

用到的模塊有：requests（發送HTTP請求）、threading（多線程）、pandas（數據處理）、queue（隊列）、lxml（HTML解析）、matplotlib（繪圖）、time（計時）。

另外一篇記錄主播熱度的文章可以瀏覽一下：
https://blog.csdn.net/New_boy25/article/details/101067531

思路如下：

先從鬥魚主頁（https://www.douyu.com/directory/all）中爬取分區的名稱以及網址
分別向每個分區發送請求並獲取響應
用xpath從每個響應中獲取每個分區首頁的主播名稱及其熱度
用pandas對獲取到的主播信息按熱度進行排序
將排序好的數據用matplotlib呈現出來（條形圖）
（另外使用到了隊列和多線程進行工作）

代碼如下，歡迎學習交流：

# coding=utf-8
import requests
import threading
import pandas as pd
import time
from queue import Queue
from lxml import etree
from matplotlib import font_manager, use
use('Agg')
from matplotlib import pyplot as plt


class DouyuSpider:
    """Crawl Douyu category pages and save one popularity bar chart per category.

    The work is organised as a pipeline of queues, each drained by one or
    more daemon threads started in :meth:`run`:

    ``module_queue`` -> ``module_content_queue`` -> ``module_th_queue``
    -> ``main_info_queue`` -> ``plot_info_queue``

    Every stage puts its result on the next queue *before* marking its own
    item done, so joining the queues in pipeline order waits for all work.
    """

    # Prefix that the category hrefs scraped from the index page are appended to.
    BASE_URL = "https://www.douyu.com/"
    # Seconds to wait for an HTTP response; without a timeout a stalled
    # connection would block a worker (and therefore run()) forever.
    REQUEST_TIMEOUT = 10
    # Font file used to render the Chinese labels (Windows path). Raw string
    # so the backslashes are not treated as escape sequences.
    FONT_PATH = r"C:\Windows\Fonts\msyh.ttc"
    # Number of top streamers kept for each chart.
    TOP_N = 20
    # Unit suffixes that may follow a popularity figure, with their multipliers.
    # NOTE(review): the site presumably serves the simplified forms; both the
    # traditional and simplified characters are accepted - confirm against a
    # live page.
    _HOT_UNITS = {"萬": 10000, "万": 10000, "億": 100000000, "亿": 100000000}

    def __init__(self):
        self.headers = {
            "user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.3", }
        self.Index = "https://www.douyu.com/directory/all"
        # One queue per pipeline stage (see class docstring for the order).
        self.module_queue = Queue()
        self.module_content_queue = Queue()
        self.module_th_queue = Queue()
        self.main_info_queue = Queue()
        self.plot_info_queue = Queue()

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _first(values, default=None):
        """Return the first element of *values*, or *default* when it is empty."""
        return values[0] if values else default

    @classmethod
    def _parse_hot(cls, text):
        """Convert a popularity string such as ``"1.5萬"`` or ``"321"`` to a number.

        Returns a float when a unit suffix is present, an int otherwise, and
        ``0`` when the text cannot be parsed; ``deal_info`` drops zero-heat
        rows, so an unparseable entry is skipped instead of crashing the
        worker thread.
        """
        text = text.strip()
        multiplier = cls._HOT_UNITS.get(text[-1:])
        try:
            if multiplier is not None:
                return float(text[:-1]) * multiplier
            return int(text)
        except ValueError:
            return 0

    def _consume(self, source, handler):
        """Feed every item taken from *source* to *handler*, forever.

        ``task_done()`` is always called, even when *handler* raises;
        otherwise a single failing item would kill the thread and leave
        ``source.join()`` in :meth:`run` blocked forever.
        """
        while True:
            item = source.get()
            try:
                handler(item)
            except Exception as exc:  # worker boundary: report and keep going
                print("{} failed: {!r}".format(handler.__name__, exc))
            finally:
                source.task_done()

    # ------------------------------------------------------------------
    # pipeline stages
    # ------------------------------------------------------------------
    def parse_url(self, url):
        """Request *url* and return the response body decoded as text.

        Raises ``requests.RequestException`` on a timeout, a connection
        error or a non-2xx status code.
        """
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.content.decode()

    def get_module(self, index_html):
        """Find every category link in the index page and queue it for processing."""
        html = etree.HTML(index_html)
        module_list = html.xpath('''//a[@class="Aside-menu-item"]''')
        for node in module_list:
            self.module_queue.put(node)
        print(len(module_list))

    def get_module_content(self):
        """Worker loop: turn each queued category link into ``[title, href]``."""
        self._consume(self.module_queue, self._extract_module_content)

    def _extract_module_content(self, module):
        """Read the title and href attributes of one category link element."""
        title = self._first(module.xpath('''./@title'''))
        href = self._first(module.xpath('''./@href'''))
        self.module_content_queue.put([title, href])

    def parse_module(self):
        """Worker loop: download the first page of each category."""
        self._consume(self.module_content_queue, self._fetch_module)

    def _fetch_module(self, module_content):
        """Download one category page and queue ``{"title", "content"}``."""
        title = module_content[0]
        href = module_content[1]
        if not href:
            # A link without an href cannot be fetched; skip it.
            print("skip module {!r}: no href".format(title))
            return
        # Strip leading slashes so joining with BASE_URL never yields "//".
        page = self.parse_url(self.BASE_URL + href.lstrip("/"))
        self.module_th_queue.put({"title": title, "content": page})

    def get_main_info(self):
        """Worker loop: extract streamer names and popularity from each page."""
        self._consume(self.module_th_queue, self._extract_main_info)

    def _extract_main_info(self, th):
        """Parse one category page into parallel name / popularity lists."""
        html = etree.HTML(th["content"])
        name_list = []
        hot_list = []
        for card in html.xpath('''//div[@class="DyListCover-info"]'''):
            # Streamer name (None when the element is missing).
            name = card.xpath('''./h2[@class="DyListCover-user is-template"]//text()''')
            name_list.append(self._first(name))
            # Popularity text converted to a number ("0" when missing).
            hot = card.xpath('''./span[@class="DyListCover-hot is-template"]/text()''')
            hot_list.append(self._parse_hot(self._first(hot, "0")))
        self.main_info_queue.put({
            "title": th["title"],
            "name_list": name_list,
            "hot_list": hot_list,
        })

    def deal_info(self):
        """Worker loop: rank the streamers of each category with pandas."""
        self._consume(self.main_info_queue, self._rank_info)

    def _rank_info(self, info):
        """Drop zero-heat rows, sort by heat descending and keep the top rows."""
        frame = pd.DataFrame({"name": info["name_list"], "hot": info["hot_list"]})
        frame = frame[frame["hot"] != 0].set_index("name")
        frame = frame.sort_values(by="hot", ascending=False).head(self.TOP_N)
        # x: streamer names (the index); y: 1-D array of their heat values.
        self.plot_info_queue.put({
            "x": frame.index,
            "y": frame["hot"].to_numpy(),
            "title": info["title"],
        })

    def plot_and_save(self):
        """Worker loop: draw and save one bar chart per ranked category.

        Prints the category title and a running count after each chart that
        was saved successfully.
        """
        saved = 0
        while True:
            plot_info = self.plot_info_queue.get()
            try:
                self._render_chart(plot_info)
                saved += 1
                print(plot_info["title"], saved)
            except Exception as exc:  # worker boundary: report and keep going
                print("plot_and_save failed: {!r}".format(exc))
            finally:
                # Always acknowledge the item so run() cannot hang on join().
                self.plot_info_queue.task_done()

    def _render_chart(self, plot_info):
        """Draw the bar chart for one category and write it to a PNG file."""
        title = plot_info["title"]
        names = plot_info["x"]
        heats = plot_info["y"]
        label_font = font_manager.FontProperties(fname=self.FONT_PATH, size=18)
        tick_font = font_manager.FontProperties(fname=self.FONT_PATH, size=10)
        figure = plt.figure(figsize=(20, 8), dpi=80)
        try:
            plt.xlabel('主播名稱', fontproperties=label_font)
            plt.ylabel('主播熱度', fontproperties=label_font)
            plt.grid(alpha=0.3)
            positions = range(len(names))
            plt.bar(positions, heats, width=0.5, color="orange")
            plt.xticks(positions, list(names), fontproperties=tick_font, rotation=20)
            # Zero-padded local wall-clock time, e.g. "04:05:09".
            time_str = time.strftime("%H:%M:%S")
            plt.title("鬥魚：{}區主播熱度排行榜--{}".format(title, time_str), fontproperties=label_font)
            plt.savefig("鬥魚{}區-熱度排行榜.png".format(title))
        finally:
            # Release the figure; pyplot otherwise keeps every figure alive.
            plt.close(figure)

    def run(self):
        """Fetch the index page, start all worker threads and wait for completion."""
        # 1. Request the index page.
        index_html = self.parse_url(self.Index)
        # 2. Queue every category link found on it.
        self.get_module(index_html)
        # 3. Start the workers: (stage, number of threads). Plotting stays on
        #    a single thread, as in the original design.
        stages = [
            (self.get_module_content, 2),
            (self.parse_module, 10),
            (self.get_main_info, 5),
            (self.deal_info, 5),
            (self.plot_and_save, 1),
        ]
        for target, count in stages:
            for _ in range(count):
                # Daemon threads: the endless worker loops must not keep the
                # process alive once the main thread is finished.
                threading.Thread(target=target, daemon=True).start()

        # 4. Block until every queue has been fully processed, in pipeline order.
        for q in (self.module_queue, self.module_content_queue, self.module_th_queue,
                  self.main_info_queue, self.plot_info_queue):
            q.join()


if __name__ == "__main__":
    # Run the whole crawl once and report the wall-clock time it took.
    started_at = time.time()
    DouyuSpider().run()
    elapsed = time.time() - started_at
    print("花費{}s".format(elapsed))

結果如下：

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

用Python爬取鬥魚各區的主播信息，並製作熱度排行榜

本次編程主要是爲了練習爬蟲編程和數據分析，對鬥魚直播進行爬蟲，按區域劃分獲取主播信息並用pandas進行數據處理，用matplotlib進行繪圖。

高效率使用windows

今天做一個背單詞（程序員）的小遊戲，用到了redis來儲存單詞

Django的分頁功能——Paginator

Python爬蟲爬取太平洋汽車網的汽車信息，將信息存進MongoDB數據庫

將redis設置爲遠程可訪問

今日份Mysql學習——增刪查改

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結