預期效果
該公司有198頁公告,每一頁有若干則公告。通過代碼爬取每一則公告的標題、日期等信息,並根據關鍵字判斷是否爲所需要的信息,對所需要的公告位置作標記。
代碼實現
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import csv
import re
import time
# from bs4 import BeautifulSoup
# One Chrome session, created at import time and shared by every function below.
driver = webdriver.Chrome()
# Explicit-wait helper bound to that session: each condition is given up to
# 30 seconds and is polled once per second.
wait = WebDriverWait(driver, timeout=30, poll_frequency=1)
def search(i, max_retries=None):
    """Click the "next page" link of the notice list, retrying on timeout.

    Each attempt pauses 1.5 seconds, waits through the module-level ``wait``
    until the ``#pg_noticelist > a.nextPage.page`` element is clickable, and
    clicks it. A ``TimeoutException`` triggers another attempt. Retries run
    in a loop rather than by recursion, so a link that stays unavailable for
    a long time cannot exhaust the interpreter's recursion limit.

    Args:
        i: Page number; used only in the progress messages.
        max_retries: Maximum number of extra attempts after a timeout.
            ``None`` (the default) keeps retrying indefinitely.

    Returns:
        None.

    Raises:
        TimeoutException: Only when ``max_retries`` is given and every
            attempt timed out.
    """
    retries_used = 0
    while True:
        print('準備翻第{}頁'.format(i))
        try:
            # Fixed pause before each attempt.
            # NOTE(review): presumably lets the page settle after the
            # previous navigation -- confirm before shortening it.
            time.sleep(1.5)
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#pg_noticelist > a.nextPage.page")
                )
            )
            submit.click()
            print('翻完第{}頁'.format(i))
            return
        except TimeoutException:
            # Give up only when the caller asked for a bounded retry count.
            if max_retries is not None and retries_used >= max_retries:
                raise
            retries_used += 1
def get_products(i):
    """Scan the notice list currently shown in the browser for keyword hits.

    Parses ``driver.page_source`` with pyquery and walks every
    ``#module-content li`` entry. For each keyword found in an entry's
    title, the title and a hit message are printed and one
    ``[date, index, key, title]`` row is written through the module-level
    ``writer``. A title containing several keywords yields one row per
    keyword.

    Args:
        i: Page number; used only in the printed message and in the
            ``index`` label of each row.
    """
    keywords = ('調研', '蒞臨', '訪', '研討', '視察', '接見', '召開', '指導')
    page = pq(driver.page_source, parser="html")
    entries = page('#module-content li').items()
    # Positions are 1-based so the label matches what a reader counts on the page.
    for position, entry in enumerate(entries, start=1):
        title = entry('li .l-title').text()
        date = entry('li span').text()
        for key in keywords:
            if re.search(key, title) is None:
                continue
            print(title)
            print('查閱第{}頁,第{}條新聞,關鍵字是{}'.format(i, position, key))
            index = '第{}頁第{}條新聞'.format(i, position)
            writer.writerow([date, index, key, title])
def main(total_pages=198):
    """Scrape every page of the notice list into the CSV file ``平安證券news``.

    Opens the output file, publishes a ``csv.writer`` for it as the
    module-level ``writer`` (which ``get_products`` writes through), writes
    the header row, loads the notice list and then, for each page, scrapes
    it and advances to the next one.

    Args:
        total_pages: Number of list pages to scrape. Defaults to 198.

    Raises:
        TimeoutException: If no list entry appears after the initial load
            within the timeout configured on the module-level ``wait``.
    """
    global writer
    # The ``with`` block guarantees the file is flushed and closed, even if
    # scraping fails part-way through.
    with open('平安證券news', 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['date', 'index', 'key', 'title'])
        driver.get('https://stock.pingan.com/static/info/notice/noticelist.html?noticeType=0')
        # Make sure the first list has rendered before reading it.
        # NOTE(review): assumes the list is filled in by script after the
        # initial page load -- confirm against the site.
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#module-content li'))
        )
        for i in range(1, total_pages + 1):
            print('準備執行第{}頁'.format(i))
            # Scrape before paging: the browser already shows page 1 after
            # the initial load, so clicking "next" first would skip page 1
            # and label every later page one too low.
            get_products(i)
            # There is no further page to move to after the last one.
            # NOTE(review): the next scrape starts right after this click;
            # if the list updates asynchronously it may still show the
            # previous page -- confirm.
            if i < total_pages:
                search(i)
            print('執行完第{}頁'.format(i))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()