- 網易新聞
- 爬蟲
- python
註釋挺詳細了,直接上全部代碼,歡迎各位大佬批評指正。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from time import sleep
from lxml import etree
import os
import requests
import csv
# Configure a headless Chrome browser for rendering JS-driven pages.
chrome_options = Options()
# Run without a visible browser window.
chrome_options.add_argument('--headless')
# Needed on Windows for headless mode to work reliably.
chrome_options.add_argument('--disable-gpu')
# BUG FIX: Selenium 4 removed the `chrome_options=` keyword; `options=` is the
# supported spelling (consistent with the By-based find_elements API used below).
browser = webdriver.Chrome(options=chrome_options)
# Implicitly wait up to 10 seconds for elements to appear.
browser.implicitly_wait(10)
# 使用谷歌無頭瀏覽器來加載動態js
def start_get(url, news_type):
    """Load a news-list page in the headless browser, trigger lazy loading,
    then hand the rendered HTML to parse_page.

    :param url: news-list page URL
    :param news_type: category label recorded with each article
    """
    browser.get(url)
    sleep(1)
    # Scroll to the bottom so lazy-loaded items render.
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(1)
    # Click "load more" if the button is present.
    more_btn = browser.find_elements(By.CSS_SELECTOR, '.load_more_btn')
    if more_btn:
        try:
            more_btn[0].click()
        except Exception as e:
            print(e)
            print('繼續....')
    sleep(1)
    # Scroll to the bottom again to render the newly loaded items.
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    source = browser.page_source
    # BUG FIX: news_type was not passed through before, so parse_page hit a
    # NameError on an undefined global (silently swallowed per article).
    parse_page(source, news_type)


def parse_page(html, news_type=''):
    """Extract (title, link) pairs from a rendered news-list page and persist
    each article via write_in.

    :param html: rendered page source
    :param news_type: category label; default '' keeps old call sites working
    """
    tree = etree.HTML(html)
    news_nodes = tree.xpath('//div[@class="news_title"]//a')
    for node in news_nodes:
        title = node.xpath('./text()')[0]
        link = node.xpath('./@href')[0]
        try:
            write_in(title, link, news_type)
        except Exception as e:
            # Best-effort: log and continue with the next article.
            print(e)
# 將其寫入到文件
def write_in(title, link, news_type):
    """Open one article page, extract its fields, and append a row to 網易.csv.

    Row layout: [news_type, title, link, content, post_time, source,
    comment_count, participant_count].

    :param title: article title from the list page
    :param link: article URL
    :param news_type: category label
    :raises: selenium/lxml/IndexError propagate to the caller, which logs them
    """
    alist = []
    print('開始寫入新聞{}'.format(title))
    browser.get(link)
    sleep(1)
    # Scroll to the bottom so the comment widgets render.
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    source = browser.page_source
    tree = etree.HTML(source)
    alist.append(news_type)
    alist.append(title)
    alist.append(link)
    # Article body: concatenate the non-empty <p> texts, one per line.
    content_lst = tree.xpath('//div[@class="post_text"]//p')
    con = ''
    if content_lst:
        for one_content in content_lst:
            if one_content.text:
                con = con + '\n' + one_content.text.strip()
    alist.append(con)
    # The header reads "<time> 來源:<source>"; the part before 來源: is the time.
    post_time_source = tree.xpath('//div[@class="post_time_source"]')[0].text
    if "來源:" in post_time_source:
        post_time = post_time_source.split("來源:")[0]
    else:
        # BUG FIX: post_time_source is already a str; the old code did
        # post_time_source[0].text, i.e. `.text` on a one-character string,
        # which raised AttributeError whenever this branch was taken.
        post_time = post_time_source
    alist.append(post_time)
    post_source = tree.xpath('//div[@class="post_time_source"]/a[@id="ne_article_source"]')[0].text
    alist.append(post_source)
    tiecount = tree.xpath('//a[@class="js-tiecount js-tielink"]')[0].text
    alist.append(tiecount)
    tiejoincount = tree.xpath('//a[@class="js-tiejoincount js-tielink"]')[0].text
    alist.append(tiejoincount)
    print(alist)
    # BUG FIX: use a context manager so the CSV file is closed even if the
    # write raises (the old open/close pair leaked the handle on error).
    with open('網易.csv', 'a+', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(alist)
if __name__ == '__main__':
    # NOTE(review): this assumes the browser is already on the NetEase front
    # page — there is no initial browser.get() visible here; confirm against
    # the full script.
    # BUG FIX: find_elements_by_xpath was removed in Selenium 4; use the
    # By-based API already used in start_get.
    tabs = browser.find_elements(By.XPATH, '//a[@ne-role="tab-nav"]')
    # BUG FIX: read href AND text up front. The old code read urls[one].text
    # inside the loop, after start_get had navigated away, so every tab after
    # the first raised StaleElementReferenceException.
    channels = [(tab.get_attribute('href'), tab.text) for tab in tabs]
    for link, news_type in channels:
        if 'http' in link:
            # Ensure the output directory exists (created once, reused after).
            if not os.path.exists('new'):
                os.mkdir('new')
            start_get(link, news_type)
    browser.quit()
結果如下:
注:本文僅用於技術交流,不得用於商業用途。不遵守者,與本文作者無關。