Selenium自動翻頁爬取證券公司公告信息——以中國平安爲例

預期效果

該公司共有198頁公告,每頁包含若干條公告。通過代碼逐頁爬取每條公告的標題、日期等信息,並根據關鍵字判斷是否爲所需要的信息,對符合條件的公告記錄其所在位置(第幾頁第幾條)。
在這裏插入圖片描述
在這裏插入圖片描述

代碼實現

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import csv
import re
import time
# from bs4 import BeautifulSoup

# One Chrome session shared by every function in this script.
driver = webdriver.Chrome()

# Explicit-wait helper bound to that session: each condition is given up to
# 30 seconds and is re-checked once per second.
wait = WebDriverWait(driver, timeout=30, poll_frequency=1)

def search(i, max_attempts=5):
    """Click the "next page" link of the notice list.

    Each attempt pauses briefly, waits for the link to become clickable and
    clicks it. A timed-out wait is retried, but only up to ``max_attempts``
    times, so a link that never becomes clickable cannot cause unbounded
    retries.

    Args:
        i: Page number used in the progress messages.
        max_attempts: Maximum number of waits for the link before giving up.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        TimeoutException: If the link is still not clickable on the final
            attempt.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    for attempt in range(1, max_attempts + 1):
        print('準備翻第{}頁'.format(i))
        try:
            # Fixed pause before looking for the link, kept from the original.
            time.sleep(1.5)
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#pg_noticelist > a.nextPage.page")))
            submit.click()
        except TimeoutException:
            if attempt == max_attempts:
                # Out of retries: surface the failure instead of looping on.
                raise
            continue
        print('翻完第{}頁'.format(i))
        return

def get_products(i):
    """Scan the notices in the browser's current page and record keyword hits.

    The current page source is parsed with PyQuery and every ``li`` under
    ``#module-content`` is inspected. For each keyword contained in an item's
    title, the title and its position are printed and one row
    ``[date, index, key, title]`` is written through the module-level
    ``writer``; a title containing several keywords therefore yields several
    rows.

    Args:
        i: Page number used in the printed message and in the ``index`` field.
    """
    keywords = ('調研', '蒞臨', '訪', '研討', '視察', '接見', '召開', '指導')
    page = pq(driver.page_source, parser="html")

    for position, entry in enumerate(page('#module-content li').items(), start=1):
        headline = entry('li .l-title').text()
        # Text of every <span> inside the item; presumably just the date
        # (the page markup is not visible here).
        published = entry('li span').text()

        for keyword in keywords:
            if keyword not in headline:
                continue
            print(headline)
            print('查閱第{}頁,第{}條新聞,關鍵字是{}'.format(i, position, keyword))
            location = '第{}頁第{}條新聞'.format(i, position)
            writer.writerow([published, location, keyword, headline])

def _listing_text():
    """Return the combined text of the notice items in the current page source."""
    return pq(driver.page_source, parser="html")('#module-content li').text()


def _wait_for_listing(previous=''):
    """Block until the notice list is non-empty and differs from ``previous``."""
    wait.until(lambda _driver: _listing_text() not in ('', previous))


def main(total_pages=198):
    """Crawl the notice list page by page and write keyword hits to a CSV file.

    The output file is opened in a ``with`` block and the browser is quit in
    a ``finally`` clause, so both are released even when the crawl fails.
    Each page is scraped before the "next page" link is clicked, so the
    landing page is recorded as page 1 and the last page is scraped as well.

    Args:
        total_pages: Number of list pages to scrape, starting from the
            landing page.

    Raises:
        TimeoutException: If the notice list does not appear, or does not
            change after a page turn, within the wait timeout.
    """
    global writer
    try:
        with open('平安證券news', 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['date', 'index', 'key', 'title'])

            driver.get('https://stock.pingan.com/static/info/notice/noticelist.html?noticeType=0')
            # Make sure the first list has been rendered before reading it.
            _wait_for_listing()

            for i in range(1, total_pages + 1):
                print('準備執行第{}頁'.format(i))
                get_products(i)
                if i < total_pages:
                    shown = _listing_text()
                    search(i)
                    # Do not scrape until the list content has been replaced;
                    # otherwise the page just left could be read a second time.
                    _wait_for_listing(shown)
                print('執行完第{}頁'.format(i))
    finally:
        # Close the browser so no Chrome/driver process is left behind.
        driver.quit()

# Start the crawl only when this file is executed directly as a script.
if __name__=='__main__':
    main()

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章