A Python crawler for NetEase News (concise version)

  • NetEase News
  • web crawler
  • python

The comments are fairly detailed, so here is the full code. Critiques and corrections are welcome.
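
Before running the script, Selenium, lxml, and a ChromeDriver binary matching the local Chrome build need to be installed; a minimal setup sketch, assuming pip and Chrome are already present:

pip install selenium lxml

ChromeDriver must also be on the PATH, or its location passed explicitly when constructing webdriver.Chrome.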

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from time import sleep
from lxml import etree
import os
import csv

# Create a headless browser instance
chrome_options = Options()
# Run Chrome in headless mode (no visible window)
chrome_options.add_argument('--headless')
# Needed when running on Windows
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=chrome_options)
# Set a 10-second implicit wait
browser.implicitly_wait(10)

# Use the headless Chrome browser to load the dynamic JS
def start_get(url, news_type):
    browser.get(url)
    sleep(1)
    # Scroll to the bottom of the page
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(1)
    # Click "load more" if the button is present
    more_btn = browser.find_elements(By.CSS_SELECTOR, '.load_more_btn')
    if more_btn:
        try:
            more_btn[0].click()
        except Exception as e:
            print(e)
    print('Continuing....')
    sleep(1)
    # Scroll to the bottom again
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    # Grab the rendered page source
    source = browser.page_source
    # Pass the channel name along so it can be written with each article
    parse_page(source, news_type)
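
The fixed sleep(1) pauses work, but they are brittle on a slow connection. Selenium's explicit waits are a sturdier alternative; a minimal sketch, reusing the .load_more_btn selector from the function above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the "load more" button to become clickable,
# rather than sleeping for a fixed interval and hoping the page is ready
wait = WebDriverWait(browser, 10)
more_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.load_more_btn')))
more_btn.click()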

# Parse the news list page
def parse_page(html, news_type):
    # Build an etree object from the page source
    tree = etree.HTML(html)
    new_lst = tree.xpath('//div[@class="news_title"]//a')
    for one_new in new_lst:
        title = one_new.xpath('./text()')[0]
        link = one_new.xpath('./@href')[0]
        try:
            write_in(title, link, news_type)
        except Exception as e:
            print(e)
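
parse_page assumes every extracted @href is already an absolute URL (the main block later filters on 'http'). If the list page ever yields relative links, urllib can normalize them; a sketch of what the link line inside the loop could become, where base_url is a stand-in for whichever channel URL was actually loaded:

from urllib.parse import urljoin

base_url = 'https://news.163.com/'  # hypothetical base; substitute the channel URL being crawled
link = urljoin(base_url, one_new.xpath('./@href')[0])  # absolute hrefs pass through, relative ones get resolved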

# Write one article out to the CSV file
def write_in(title, link, news_type):
    alist = []
    print('Writing news item: {}'.format(title))
    browser.get(link)
    sleep(1)
    # Scroll to the bottom of the article page
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    # Grab the rendered page source
    source = browser.page_source
    tree = etree.HTML(source)

    alist.append(news_type)
    alist.append(title)

    con_link = link
    alist.append(con_link)

    content_lst = tree.xpath('//div[@class="post_text"]//p')
    con = ''
    if content_lst:
        for one_content in content_lst:
            if one_content.text:
                con = con + '\n' + one_content.text.strip()
        alist.append(con)

        # The node text holds the timestamp, optionally followed by a 來源: (source) label
        post_time_source = tree.xpath('//div[@class="post_time_source"]')[0].text
        if "來源:" in post_time_source:
            post_time = post_time_source.split("來源:")[0].strip()
        else:
            post_time = post_time_source.strip()
        alist.append(post_time)

        post_source = tree.xpath('//div[@class="post_time_source"]/a[@id="ne_article_source"]')[0].text
        alist.append(post_source)

        # Comment count and participant count of the article's discussion
        tiecount = tree.xpath('//a[@class="js-tiecount js-tielink"]')[0].text
        alist.append(tiecount)

        tiejoincount = tree.xpath('//a[@class="js-tiejoincount js-tielink"]')[0].text
        alist.append(tiejoincount)

        # Append the row to the CSV file
        print(alist)
        with open('網易.csv', 'a+', encoding='utf-8', newline='') as f:
            csv_writer = csv.writer(f)
            # A header row could be written here, but in append mode it would
            # repeat on every article; see the note after this function
            csv_writer.writerow(alist)
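
Because write_in opens 網易.csv in append mode, any header written inside it would be duplicated on every article. One option is to write the header once before the crawl starts; a sketch, where the column names are my own labels for the alist order (type, title, link, content, time, source, comment count, participant count):

# Run once before crawling; 'w' truncates any previous file
with open('網易.csv', 'w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerow(['type', 'title', 'link', 'content', 'time', 'source', 'comment_count', 'participant_count'])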
   
if __name__ == '__main__':
    # Open the NetEase News front page so the channel tabs can be located
    # (https://news.163.com/ is assumed as the entry point)
    browser.get('https://news.163.com/')
    urls = browser.find_elements(By.XPATH, '//a[@ne-role="tab-nav"]')
    # Collect hrefs and tab names up front: the WebElements go stale
    # as soon as the browser navigates away from this page
    links = []
    for one in urls:
        links.append((one.get_attribute('href'), one.text))

    if not os.path.exists('new'):
        os.mkdir('new')

    for url, news_type in links:
        if url and 'http' in url:
            start_get(url, news_type)

    browser.quit()
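
Once a run finishes, the rows can be spot-checked straight from Python; a quick sketch:

import csv

with open('網易.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row[0], row[1])  # channel type and title of each saved article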

The results are as follows:
[screenshot of the resulting CSV omitted]
Note: this article is for technical exchange only and must not be used for commercial purposes. The author bears no responsibility for any violation.
