Python——爬取直播網站房間名及熱度

鬥魚直播

# coding=utf-8
'''
    爬取鬥魚直播房間名和人氣值
'''
# 導入selenium工具
import time
from selenium import webdriver
from lxml import etree

class Douyu(object):
    """Scrape room names and popularity values from Douyu's live directory.

    The instance keeps two running totals that grow across every page
    visited: ``room_count`` (rooms seen) and ``hot_count`` (sum of the
    popularity values of those rooms).
    """

    # Multiplier for the unit character that may end a popularity string.
    # Both Simplified and Traditional spellings are accepted so the parser
    # works regardless of which variant the page serves.
    _HOT_UNITS = {'万': 10000, '萬': 10000, '亿': 100000000, '億': 100000000}

    def __init__(self):
        # Load web pages through a browser driver.
        # NOTE(review): PhantomJS support and the find_element_by_* helpers
        # belong to older Selenium releases - confirm the installed version.
        self.driver = webdriver.PhantomJS()
        # Running totals.
        self.room_count = 0   # number of rooms seen
        self.hot_count = 0    # accumulated popularity

    @classmethod
    def _parse_hot(cls, text: str) -> int:
        """Convert a popularity string such as ``'1.5万'`` or ``'321'`` to an int.

        Surrounding whitespace and thousands separators are ignored.

        Raises:
            ValueError: if the numeric part cannot be parsed.
        """
        cleaned = text.strip().replace(',', '')
        multiplier = cls._HOT_UNITS.get(cleaned[-1:], 1)
        if multiplier != 1:
            cleaned = cleaned[:-1]
        # round() instead of truncation: float products such as
        # 1.1 * 10000 are not always exact.
        return int(round(float(cleaned) * multiplier))

    def run(self) -> None:
        """Parse the page currently loaded in the driver and update the totals.

        Prints one line per room followed by the running totals. Rooms whose
        name or popularity text is missing are skipped instead of raising
        ``IndexError``.
        """
        content = etree.HTML(self.driver.page_source)
        rooms = content.xpath('//li[@class="layout-Cover-item"]/div[@class="DyListCover HeaderCell is-href"]/a/div[@class="DyListCover-content"]')
        for room in rooms:
            names = room.xpath('./div[@class="DyListCover-info"]/h3[@class="DyListCover-intro"]/text()')
            hots = room.xpath('./div[@class="DyListCover-info"]/span[@class="DyListCover-hot"]/text()')
            if not names or not hots:
                # Incomplete list item; nothing reliable to count.
                continue
            roomname = names[0]
            hot = hots[0]
            print('人氣:'+hot+';房間:'+roomname)
            self.room_count += 1
            try:
                self.hot_count += self._parse_hot(hot)
            except ValueError:
                # Keep the room in the count but leave the total untouched.
                print('無法解析人氣值:'+hot)
        # Report the running totals.
        print('當前直播房間總量:',self.room_count)
        print('當前人氣總數:',self.hot_count)

    def test(self) -> None:
        """Open the directory and scrape every page, following the next button.

        Each page, including the last one, is scraped before the pagination
        control is inspected. The walk stops when the next button is missing
        or its ``aria-disabled`` attribute is anything other than ``'false'``.
        """
        # Imported here so the exception type is available without touching
        # the file-level imports.
        from selenium.common.exceptions import NoSuchElementException

        self.driver.get('https://www.douyu.com/directory/all')
        page = 0
        while True:
            # Give the page time to render before reading it.
            time.sleep(5)
            page += 1
            print('-'*30+'第' + str(page) + '頁'+'-'*30)
            self.run()

            try:
                next_button = self.driver.find_element_by_class_name('dy-Pagination-next')
            except NoSuchElementException:
                next_button = None

            state = None
            if next_button is not None:
                state = next_button.get_attribute("aria-disabled")
            has_next = state is not None and state.lower() == 'false'

            if not has_next:
                print('-'*30+'最後一頁'+'-'*30)
                break
            next_button.click()

if __name__ == '__main__':
    # Script entry point: scrape every Douyu directory page.
    dy = Douyu()
    try:
        dy.test()
    finally:
        # Always shut the browser process down, even if scraping fails.
        dy.driver.quit()


虎牙直播

# coding=utf-8
'''
    爬取虎牙直播房間名和人氣值
'''
# 導入selenium工具
import time
from selenium import webdriver
from lxml import etree

class Huya(object):
    """Scrape room names and popularity values from Huya's live directory.

    The instance keeps two running totals that grow across every page
    visited: ``room_count`` (rooms seen) and ``hot_count`` (sum of the
    popularity values of those rooms).
    """

    # Multiplier for the unit character that may end a popularity string.
    # Both Simplified and Traditional spellings are accepted so the parser
    # works regardless of which variant the page serves.
    _HOT_UNITS = {'万': 10000, '萬': 10000, '亿': 100000000, '億': 100000000}

    def __init__(self):
        # Load web pages through a browser driver.
        # NOTE(review): PhantomJS support and the find_element_by_* helpers
        # belong to older Selenium releases - confirm the installed version.
        self.driver = webdriver.PhantomJS()
        # Running totals.
        self.room_count = 0   # number of rooms seen
        self.hot_count = 0    # accumulated popularity

    @classmethod
    def _parse_hot(cls, text: str) -> int:
        """Convert a popularity string such as ``'1.5万'`` or ``'321'`` to an int.

        Surrounding whitespace and thousands separators are ignored.

        Raises:
            ValueError: if the numeric part cannot be parsed.
        """
        cleaned = text.strip().replace(',', '')
        multiplier = cls._HOT_UNITS.get(cleaned[-1:], 1)
        if multiplier != 1:
            cleaned = cleaned[:-1]
        # round() instead of truncation: float products such as
        # 1.1 * 10000 are not always exact.
        return int(round(float(cleaned) * multiplier))

    def run(self) -> None:
        """Parse the page currently loaded in the driver and update the totals.

        Prints one line per room followed by the running totals. Rooms whose
        name or popularity text is missing are skipped instead of raising
        ``IndexError``.
        """
        content = etree.HTML(self.driver.page_source)
        rooms = content.xpath('//li[@class="game-live-item"]')
        for room in rooms:
            names = room.xpath('./a[@class="title new-clickstat"]/text()')
            hots = room.xpath('./span[@class="txt"]/span[@class="num"]/i[@class="js-num"]/text()')
            if not names or not hots:
                # Incomplete list item; nothing reliable to count.
                continue
            roomname = names[0]
            hot = hots[0]
            print('房間:'+roomname+'; 人氣:'+str(hot))
            self.room_count += 1
            try:
                self.hot_count += self._parse_hot(hot)
            except ValueError:
                # Keep the room in the count but leave the total untouched.
                print('無法解析人氣值:'+str(hot))
        # Report the running totals.
        print('當前直播房間總量:',self.room_count)
        print('當前人氣總數:',self.hot_count)

    def test(self) -> None:
        """Open the directory and scrape every page, following ``laypage_next``.

        Each page, including the last one, is scraped before the pagination
        control is looked for. The walk stops when the page source no longer
        contains ``laypage_next``.
        """
        self.driver.get('https://www.huya.com/l')
        page = 0
        while True:
            # Give the page time to render before reading it.
            time.sleep(5)
            page += 1
            print('-'*30+'第' + str(page) + '頁'+'-'*30)
            self.run()

            # No next-page control in the markup means this was the last page.
            if 'laypage_next' not in self.driver.page_source:
                print('-'*30+'最後一頁'+'-'*30)
                break
            self.driver.find_element_by_class_name('laypage_next').click()

if __name__ == '__main__':
    # Script entry point: scrape every Huya directory page.
    huya = Huya()
    try:
        huya.test()
    finally:
        # Always shut the browser process down, even if scraping fails.
        huya.driver.quit()

總結

  1. xpath 要填寫正確
  2. 每個網站的翻頁方式不同
  3. 靈活使用 find_element_by_class_name 方法以及 get_attribute 方法
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章