A Python crawler for NetEase News (concise version)

  • NetEase News
  • web crawler
  • python

The comments are fairly detailed, so here is the full code. Critiques and corrections are welcome.
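
Before running the script, Selenium, lxml, and a ChromeDriver binary matching the local Chrome build need to be installed; a minimal setup sketch, assuming pip and Chrome are already present:

pip install selenium lxml

ChromeDriver must also be on the PATH, or its location passed explicitly when constructing webdriver.Chrome.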

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from time import sleep
from lxml import etree
import os
import csv

# Create a headless browser instance
chrome_options = Options()
# Run Chrome in headless mode (no visible window)
chrome_options.add_argument('--headless')
# Needed when running on Windows
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(options=chrome_options)
# Set a 10-second implicit wait
browser.implicitly_wait(10)

# Use the headless Chrome browser to load the dynamic JS
def start_get(url, news_type):
    browser.get(url)
    sleep(1)
    # Scroll to the bottom of the page
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(1)
    # Click "load more" if the button is present
    more_btn = browser.find_elements(By.CSS_SELECTOR, '.load_more_btn')
    if more_btn:
        try:
            more_btn[0].click()
        except Exception as e:
            print(e)
    print('Continuing....')
    sleep(1)
    # Scroll to the bottom again
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    # Grab the rendered page source
    source = browser.page_source
    # Pass the channel name along so it can be written with each article
    parse_page(source, news_type)
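
The fixed sleep(1) pauses work, but they are brittle on a slow connection. Selenium's explicit waits are a sturdier alternative; a minimal sketch, reusing the .load_more_btn selector from the function above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the "load more" button to become clickable,
# rather than sleeping for a fixed interval and hoping the page is ready
wait = WebDriverWait(browser, 10)
more_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.load_more_btn')))
more_btn.click()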

# Parse the news list page
def parse_page(html, news_type):
    # Build an etree object from the page source
    tree = etree.HTML(html)
    new_lst = tree.xpath('//div[@class="news_title"]//a')
    for one_new in new_lst:
        title = one_new.xpath('./text()')[0]
        link = one_new.xpath('./@href')[0]
        try:
            write_in(title, link, news_type)
        except Exception as e:
            print(e)
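
parse_page assumes every extracted @href is already an absolute URL (the main block later filters on 'http'). If the list page ever yields relative links, urllib can normalize them; a sketch of what the link line inside the loop could become, where base_url is a stand-in for whichever channel URL was actually loaded:

from urllib.parse import urljoin

base_url = 'https://news.163.com/'  # hypothetical base; substitute the channel URL being crawled
link = urljoin(base_url, one_new.xpath('./@href')[0])  # absolute hrefs pass through, relative ones get resolved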

# Write one article out to the CSV file
def write_in(title, link, news_type):
    alist = []
    print('Writing news item: {}'.format(title))
    browser.get(link)
    sleep(1)
    # Scroll to the bottom of the article page
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    # Grab the rendered page source
    source = browser.page_source
    tree = etree.HTML(source)

    alist.append(news_type)
    alist.append(title)

    con_link = link
    alist.append(con_link)

    content_lst = tree.xpath('//div[@class="post_text"]//p')
    con = ''
    if content_lst:
        for one_content in content_lst:
            if one_content.text:
                con = con + '\n' + one_content.text.strip()
        alist.append(con)

        # The node text holds the timestamp, optionally followed by a 來源: (source) label
        post_time_source = tree.xpath('//div[@class="post_time_source"]')[0].text
        if "來源:" in post_time_source:
            post_time = post_time_source.split("來源:")[0].strip()
        else:
            post_time = post_time_source.strip()
        alist.append(post_time)

        post_source = tree.xpath('//div[@class="post_time_source"]/a[@id="ne_article_source"]')[0].text
        alist.append(post_source)

        # Comment count and participant count of the article's discussion
        tiecount = tree.xpath('//a[@class="js-tiecount js-tielink"]')[0].text
        alist.append(tiecount)

        tiejoincount = tree.xpath('//a[@class="js-tiejoincount js-tielink"]')[0].text
        alist.append(tiejoincount)

        # Append the row to the CSV file
        print(alist)
        with open('網易.csv', 'a+', encoding='utf-8', newline='') as f:
            csv_writer = csv.writer(f)
            # A header row could be written here, but in append mode it would
            # repeat on every article; see the note after this function
            csv_writer.writerow(alist)
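
Because write_in opens 網易.csv in append mode, any header written inside it would be duplicated on every article. One option is to write the header once before the crawl starts; a sketch, where the column names are my own labels for the alist order (type, title, link, content, time, source, comment count, participant count):

# Run once before crawling; 'w' truncates any previous file
with open('網易.csv', 'w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerow(['type', 'title', 'link', 'content', 'time', 'source', 'comment_count', 'participant_count'])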
   
if __name__ == '__main__':
    # Open the NetEase News front page so the channel tabs can be located
    # (https://news.163.com/ is assumed as the entry point)
    browser.get('https://news.163.com/')
    urls = browser.find_elements(By.XPATH, '//a[@ne-role="tab-nav"]')
    # Collect hrefs and tab names up front: the WebElements go stale
    # as soon as the browser navigates away from this page
    links = []
    for one in urls:
        links.append((one.get_attribute('href'), one.text))

    if not os.path.exists('new'):
        os.mkdir('new')

    for url, news_type in links:
        if url and 'http' in url:
            start_get(url, news_type)

    browser.quit()
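
Once a run finishes, the rows can be spot-checked straight from Python; a quick sketch:

import csv

with open('網易.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row[0], row[1])  # channel type and title of each saved article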

The results are as follows:
[screenshot of the resulting CSV omitted]
Note: this article is for technical exchange only and must not be used for commercial purposes. The author bears no responsibility for any violation.
