鬥魚直播
'''
爬取鬥魚直播房間名和人氣值
'''
import time
from selenium import webdriver
from lxml import etree
class Douyu(object):
    """Scrape room names and popularity values from the Douyu directory.

    The counters ``room_count`` and ``hot_count`` accumulate over every page
    visited by ``test``.
    """

    def __init__(self):
        # NOTE(review): PhantomJS and the find_element_by_* helpers used below
        # belong to the Selenium 3 API; confirm against the installed version.
        self.driver = webdriver.PhantomJS()
        self.room_count = 0  # number of rooms printed so far
        self.hot_count = 0   # sum of the parsed popularity values so far

    @staticmethod
    def _parse_hot(text):
        """Convert a popularity label such as ``'1.5萬'`` or ``'986'`` to an int.

        A trailing '萬' (traditional) or '万' (simplified) multiplies the
        number by 10000. Surrounding whitespace is ignored.

        Raises:
            ValueError: if the remaining text is not a number.
        """
        value = text.strip()
        if value.endswith(('萬', '万')):
            # round() rather than int(): float products can land just below
            # the intended integer, and int() would truncate them.
            return int(round(float(value[:-1]) * 10000))
        return int(value)

    def run(self):
        """Print and tally every room found on the currently loaded page.

        Rooms whose title or popularity node is missing are skipped instead
        of aborting the whole crawl with an IndexError.
        """
        content = etree.HTML(self.driver.page_source)
        rooms = content.xpath('//li[@class="layout-Cover-item"]/div[@class="DyListCover HeaderCell is-href"]/a/div[@class="DyListCover-content"]')
        for room in rooms:
            names = room.xpath('./div[@class="DyListCover-info"]/h3[@class="DyListCover-intro"]/text()')
            hots = room.xpath('./div[@class="DyListCover-info"]/span[@class="DyListCover-hot"]/text()')
            if not names or not hots:
                continue
            roomname = names[0]
            hot = hots[0]
            print('人氣:'+hot+';房間:'+roomname)
            self.room_count += 1
            self.hot_count += self._parse_hot(hot)
        print('當前直播房間總量:',self.room_count)
        print('當前人氣總數:',self.hot_count)

    def test(self):
        """Walk through every directory page, calling ``run`` on each one.

        The last page (the one whose "next" button is no longer enabled) is
        scraped as well before the loop stops.
        """
        self.driver.get('https://www.douyu.com/directory/all')
        page = 0
        while True:
            # Fixed wait before reading the page; presumably gives the
            # listing time to render after navigation or a click.
            time.sleep(5)
            page += 1
            disabled = self.driver.find_element_by_class_name('dy-Pagination-next').get_attribute("aria-disabled")
            # A missing attribute (None) is treated like a disabled button so
            # the crawl stops instead of raising AttributeError.
            has_next = disabled is not None and disabled.lower() == 'false'
            if has_next:
                print('-'*30+'第' + str(page) + '頁'+'-'*30)
            else:
                print('-'*30+'最後一頁'+'-'*30)
            self.run()
            if not has_next:
                break
            self.driver.find_element_by_class_name('dy-Pagination-next').click()
if __name__ == '__main__':
    dy = Douyu()
    try:
        dy.test()
    finally:
        # Always shut the browser down, even if the crawl raised, so the
        # driver process started in __init__ is not left running.
        dy.driver.quit()
虎牙直播
'''
爬取虎牙直播房間名和人氣值
'''
import time
from selenium import webdriver
from lxml import etree
class Huya(object):
    """Scrape room names and popularity values from the Huya live listing.

    The counters ``room_count`` and ``hot_count`` accumulate over every page
    visited by ``test``.
    """

    def __init__(self):
        # NOTE(review): PhantomJS and the find_element_by_* helpers used below
        # belong to the Selenium 3 API; confirm against the installed version.
        self.driver = webdriver.PhantomJS()
        self.room_count = 0  # number of rooms printed so far
        self.hot_count = 0   # sum of the parsed popularity values so far

    @staticmethod
    def _parse_hot(text):
        """Convert a popularity label such as ``'1.5萬'`` or ``'986'`` to an int.

        A trailing '萬' (traditional) or '万' (simplified) multiplies the
        number by 10000. Surrounding whitespace is ignored.

        Raises:
            ValueError: if the remaining text is not a number.
        """
        value = text.strip()
        if value.endswith(('萬', '万')):
            # round() rather than int(): float products can land just below
            # the intended integer, and int() would truncate them.
            return int(round(float(value[:-1]) * 10000))
        return int(value)

    def run(self):
        """Print and tally every room found on the currently loaded page.

        Rooms whose title or popularity node is missing are skipped instead
        of aborting the whole crawl with an IndexError.
        """
        content = etree.HTML(self.driver.page_source)
        rooms = content.xpath('//li[@class="game-live-item"]')
        for room in rooms:
            names = room.xpath('./a[@class="title new-clickstat"]/text()')
            hots = room.xpath('./span[@class="txt"]/span[@class="num"]/i[@class="js-num"]/text()')
            if not names or not hots:
                continue
            roomname = names[0]
            hot = hots[0]
            print('房間:'+roomname+'; 人氣:'+str(hot))
            self.room_count += 1
            self.hot_count += self._parse_hot(hot)
        print('當前直播房間總量:',self.room_count)
        print('當前人氣總數:',self.hot_count)

    def test(self):
        """Walk through every listing page, calling ``run`` on each one.

        A page counts as the last one when its source no longer contains
        'laypage_next'; that page is scraped too before the loop stops.
        """
        self.driver.get('https://www.huya.com/l')
        page = 0
        while True:
            # Fixed wait before reading the page; presumably gives the
            # listing time to render after navigation or a click.
            time.sleep(5)
            page += 1
            has_next = 'laypage_next' in self.driver.page_source
            if has_next:
                print('-'*30+'第' + str(page) + '頁'+'-'*30)
            else:
                # The original multiplied '-' by a string here, which raised
                # TypeError as soon as the last page was reached.
                print('-'*30+'最後一頁'+'-'*30)
            self.run()
            if not has_next:
                break
            self.driver.find_element_by_class_name('laypage_next').click()
if __name__ == '__main__':
    huya = Huya()
    try:
        huya.test()
    finally:
        # Always shut the browser down, even if the crawl raised, so the
        # driver process started in __init__ is not left running.
        huya.driver.quit()
總結
- xpath 要填寫正確
- 每個網站的翻頁方式不同:鬥魚以「下一頁」按鈕的 aria-disabled 屬性判斷是否還有下一頁,虎牙則以頁面原始碼中是否仍有 laypage_next 來判斷
- 靈活使用 find_element_by_class_name 方法以及 get_attribute 方法