A Python Crawler for Huanqiu.com (環球網), Concise Version

The comments are fairly detailed, so here is the complete code. Criticism and corrections are welcome.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
from lxml import etree
import os
import requests
import csv

# Create a headless Chrome instance
chrome_options = Options()
# Run without a visible browser window
chrome_options.add_argument('--headless')
# This line is needed when running on Windows
chrome_options.add_argument('--disable-gpu')
# Note: on Selenium 4+ the keyword is options=, not the removed chrome_options=
browser = webdriver.Chrome(options=chrome_options)

# Set a 10-second implicit wait (applies to every element lookup on this driver)
browser.implicitly_wait(10)

# Use headless Chrome to render the JavaScript-driven list page
def start_get(url, news_type):
    browser.get(url)
    sleep(1)
    for _ in range(30):
        # Scroll to the bottom to trigger lazy loading of more articles
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        sleep(1)
    # Grab the fully rendered page source
    source = browser.page_source
    parse_page(url, source, news_type)
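# A possible refinement (my assumption, not part of the original code):
# instead of a fixed 30 scrolls, stop once document.body.scrollHeight
# stops growing. The helper name scroll_to_end is hypothetical.
# def scroll_to_end(driver, pause=1.0):
#     last_height = driver.execute_script('return document.body.scrollHeight')
#     while True:
#         driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
#         sleep(pause)
#         new_height = driver.execute_script('return document.body.scrollHeight')
#         if new_height == last_height:
#             break
#         last_height = new_height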
 
# Parse the news list page
def parse_page(url, html, news_type):
    # Build an etree object from the rendered HTML
    tree = etree.HTML(html)
    new_lst = tree.xpath('//ul[@id="recommend"]//a')
    for one_new in new_lst:
        title = one_new.xpath('.//h4/text()')[0]
        link = url + one_new.xpath('./@href')[0]
        try:
            write_in(title, link, news_type)
        except Exception as e:
            print(e)

# Fetch each article page and write one row to the CSV file
def write_in(title, link, news_type):
    alist = []
    print('Writing article: {}'.format(title))
    # response = requests.get(url=link)
    browser.get(link)
    sleep(1)
    # Scroll to the bottom again so the article body is fully rendered
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    # Grab the rendered page source
    source = browser.page_source
    tree = etree.HTML(source)

    alist.append(news_type)
    # title = title.replace('?', '')
    alist.append(title)
    alist.append(link)

    # Extract the article body paragraphs
    content_lst = tree.xpath('//section[@data-type="rtext"]/p')
    con = ''
    if content_lst:
        for one_content in content_lst:
            if one_content.text:
                con = con + '\n' + one_content.text.strip()
        alist.append(con)

        # post_time_source = tree.xpath('//div[@class="left-t"]')[0].text

        post_time = tree.xpath('//div[@class="metadata-info"]//p[@class="time"]')[0].text
        alist.append(post_time)

        # The source is sometimes a link and sometimes a plain span
        post_source = tree.xpath('//div[@class="metadata-info"]//span[@class="source"]//a')
        if post_source:
            post_source = post_source[0].text
        else:
            post_source = tree.xpath('//div[@class="metadata-info"]//span[@class="source"]//span')[0].text
        alist.append(post_source)

        # Open the CSV file in append mode and write the finished row
        print(alist)
        with open('環球網n.csv', 'a+', encoding='utf-8', newline='') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(alist)
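
# Optional (my addition, not in the original): write a header row once before
# scraping, so the CSV columns are self-describing. The column names here are
# illustrative only:
# with open('環球網n.csv', 'w', encoding='utf-8', newline='') as f:
#     csv.writer(f).writerow(['type', 'title', 'link', 'content', 'time', 'source'])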

if __name__ == '__main__':
    urls = ['https://world.huanqiu.com/', 'https://china.huanqiu.com/', 'https://mil.huanqiu.com/', 'https://finance.huanqiu.com/', 'https://sports.huanqiu.com/', 'https://ent.huanqiu.com/']
    # Category labels, in the same order as urls
    news_types = ["國際", "國內", "軍事", "財經", "體育", "娛樂"]
    if not os.path.exists('new'):
        os.mkdir('new')
    # headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
    for url, news_type in zip(urls, news_types):
        start_get(url, news_type)
    # Quit the browser once all sections have been scraped
    browser.quit()
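
To sanity-check the output, here is a minimal sketch (my addition; it assumes the six-column row layout written above) that reads the CSV back and prints each row's category, title, and link:

import csv

with open('環球網n.csv', encoding='utf-8') as f:
    for row in csv.reader(f):
        news_type, title, link, content, post_time, source = row
        print(news_type, title, link)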

The result looks like this:
[screenshot of the scraped results]
Note: this article is for technical exchange only and must not be used for commercial purposes. The author bears no responsibility for any violations.
