A Python Crawler for Huanqiu.com (環球網), Concise Version

The comments are fairly detailed, so here is the complete code. Criticism and corrections are welcome.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
from lxml import etree
import os
import requests
import csv

# Create a headless Chrome instance
chrome_options = Options()
# Run without a visible browser window
chrome_options.add_argument('--headless')
# This line is needed when running on Windows
chrome_options.add_argument('--disable-gpu')
# Note: on Selenium 4+ the keyword is options=, not the removed chrome_options=
browser = webdriver.Chrome(options=chrome_options)

# Set a 10-second implicit wait (applies to every element lookup on this driver)
browser.implicitly_wait(10)

# Use headless Chrome to render the JavaScript-driven list page
def start_get(url, news_type):
    browser.get(url)
    sleep(1)
    for _ in range(30):
        # Scroll to the bottom to trigger lazy loading of more articles
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        sleep(1)
    # Grab the fully rendered page source
    source = browser.page_source
    parse_page(url, source, news_type)
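# A possible refinement (my assumption, not part of the original code):
# instead of a fixed 30 scrolls, stop once document.body.scrollHeight
# stops growing. The helper name scroll_to_end is hypothetical.
# def scroll_to_end(driver, pause=1.0):
#     last_height = driver.execute_script('return document.body.scrollHeight')
#     while True:
#         driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
#         sleep(pause)
#         new_height = driver.execute_script('return document.body.scrollHeight')
#         if new_height == last_height:
#             break
#         last_height = new_height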
 
# Parse the news list page
def parse_page(url, html, news_type):
    # Build an etree object from the rendered HTML
    tree = etree.HTML(html)
    new_lst = tree.xpath('//ul[@id="recommend"]//a')
    for one_new in new_lst:
        title = one_new.xpath('.//h4/text()')[0]
        link = url + one_new.xpath('./@href')[0]
        try:
            write_in(title, link, news_type)
        except Exception as e:
            print(e)

# Fetch each article page and write one row to the CSV file
def write_in(title, link, news_type):
    alist = []
    print('Writing article: {}'.format(title))
    # response = requests.get(url=link)
    browser.get(link)
    sleep(1)
    # Scroll to the bottom again so the article body is fully rendered
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    # Grab the rendered page source
    source = browser.page_source
    tree = etree.HTML(source)

    alist.append(news_type)
    # title = title.replace('?', '')
    alist.append(title)
    alist.append(link)

    # Extract the article body paragraphs
    content_lst = tree.xpath('//section[@data-type="rtext"]/p')
    con = ''
    if content_lst:
        for one_content in content_lst:
            if one_content.text:
                con = con + '\n' + one_content.text.strip()
        alist.append(con)

        # post_time_source = tree.xpath('//div[@class="left-t"]')[0].text

        post_time = tree.xpath('//div[@class="metadata-info"]//p[@class="time"]')[0].text
        alist.append(post_time)

        # The source is sometimes a link and sometimes a plain span
        post_source = tree.xpath('//div[@class="metadata-info"]//span[@class="source"]//a')
        if post_source:
            post_source = post_source[0].text
        else:
            post_source = tree.xpath('//div[@class="metadata-info"]//span[@class="source"]//span')[0].text
        alist.append(post_source)

        # Open the CSV file in append mode and write the finished row
        print(alist)
        with open('環球網n.csv', 'a+', encoding='utf-8', newline='') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(alist)
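
# Optional (my addition, not in the original): write a header row once before
# scraping, so the CSV columns are self-describing. The column names here are
# illustrative only:
# with open('環球網n.csv', 'w', encoding='utf-8', newline='') as f:
#     csv.writer(f).writerow(['type', 'title', 'link', 'content', 'time', 'source'])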

if __name__ == '__main__':
    urls = ['https://world.huanqiu.com/', 'https://china.huanqiu.com/', 'https://mil.huanqiu.com/', 'https://finance.huanqiu.com/', 'https://sports.huanqiu.com/', 'https://ent.huanqiu.com/']
    # Category labels, in the same order as urls
    news_types = ["國際", "國內", "軍事", "財經", "體育", "娛樂"]
    if not os.path.exists('new'):
        os.mkdir('new')
    # headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
    for url, news_type in zip(urls, news_types):
        start_get(url, news_type)
    # Quit the browser once all sections have been scraped
    browser.quit()
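
To sanity-check the output, here is a minimal sketch (my addition; it assumes the six-column row layout written above) that reads the CSV back and prints each row's category, title, and link:

import csv

with open('環球網n.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        news_type, title, link, content, post_time, source = row
        print(news_type, title, link)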

The result looks like this:
[screenshot of the scraped results]
Note: this article is for technical exchange only and must not be used for commercial purposes. The author bears no responsibility for any violations.
