《Python金融大數據挖掘與分析全流程詳解》實戰 筆記整理

1、獲取新浪財經實時股票數據

# =============================================================================
# 9.1 新浪股票實時數據挖掘實戰 by 王宇韜
# =============================================================================

from selenium import webdriver
import re
# Load the Sina Finance quote page for sh000001 in headless Chrome and print
# the text found inside the page's price element.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
# `options=` is the supported keyword; the older `chrome_options=` spelling
# was deprecated by Selenium and is rejected by current releases.
browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get('http://finance.sina.com.cn/realstock/company/sh000001/nc.shtml')
    data = browser.page_source
finally:
    # Always shut the browser down, even when the page load fails, so that
    # no driver process is left running.
    browser.quit()

# Regular expression capturing the content of the <div id="price"> element.
p_price = r'<div id="price" class=".*?">(.*?)</div>'
price = re.findall(p_price, data)
print(price)

2、東方財富網數據挖掘實戰

# =============================================================================
# 9.2 東方財富網數據挖掘實戰 by 王宇韜
# =============================================================================

from selenium import webdriver
import re


def dongfang(company):
    """Print Eastmoney news search results for *company*.

    Loads the Eastmoney news search page for the keyword in headless Chrome,
    extracts each result's title, link and date with regular expressions and
    prints them as a numbered list.

    Args:
        company: Search keyword (raw, not percent-encoded text) that is
            appended to the search URL.

    Returns:
        None. The results are written to standard output.
    """
    # Function-scope import so this block does not depend on file-level imports.
    from urllib.parse import quote

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    # `options=` is the supported keyword; `chrome_options=` was deprecated
    # by Selenium and is rejected by current releases.
    browser = webdriver.Chrome(options=chrome_options)
    try:
        # Percent-encode the keyword so characters such as '&', '#' or
        # spaces cannot corrupt the query string.
        browser.get('http://so.eastmoney.com/news/s?keyword=' + quote(company))
        # Keep the rendered page markup for the regex extraction below.
        data = browser.page_source
    finally:
        # Release the browser even if loading the page raised.
        browser.quit()

    p_title = r'<div class="news-item"><h3><a href=".*?">(.*?)</a>'
    p_href = r'<div class="news-item"><h3><a href="(.*?)">.*?</a>'
    p_date = r'<p class="news-desc">(.*?)</p>'
    # Pull titles, links and dates out of the page markup.
    titles = re.findall(p_title, data)
    hrefs = re.findall(p_href, data)
    dates = re.findall(p_date, data, re.S)  # re.S lets '.' span line breaks

    # zip() stops at the shortest list, so a page where the three patterns
    # match a different number of times no longer raises IndexError.
    for index, (title, href, date) in enumerate(zip(titles, hrefs, dates), start=1):
        title = re.sub(r'<.*?>', '', title)  # drop any HTML tags left in the title
        date = date.split(' ')[0]  # keep only the text before the first space
        # Combined output line: "<n>.<title> - <date>", followed by the link.
        print(str(index) + '.' + title + ' - ' + date)
        print(href)


# Search Eastmoney for each company in turn. A failure for one company is
# reported and the loop continues with the next one.
companys = ['華能信託', '阿里巴巴', '騰訊', '京東', '萬科']
for company in companys:
    try:
        dongfang(company)
    except Exception as exc:
        # `except Exception` (not a bare `except:`) lets Ctrl-C and
        # SystemExit still stop the script; the cause is printed so that a
        # failure can be diagnosed.
        print(company + '該公司東方財富網爬取失敗')
        print(repr(exc))
    else:
        print(company + '該公司東方財富網爬取成功')


3、裁判文書網:自動在網頁上搜索

# =============================================================================
# 9.3 裁判文書網數據挖掘實戰 by 王宇韜
# =============================================================================

from selenium import webdriver
import time
# Open the court-judgment site in Chrome, type a search term into the search
# box, click the search button, wait, and print the resulting page source.
from selenium.webdriver.common.by import By  # locator strategy constants

# XPath locators for the search input and the search button.
SEARCH_INPUT_XPATH = '//*[@id="_view_1540966814000"]/div/div/div[2]/input'
SEARCH_BUTTON_XPATH = '//*[@id="_view_1540966814000"]/div/div/div[3]'

browser = webdriver.Chrome()
try:
    browser.get('http://wenshu.court.gov.cn/')
    browser.maximize_window()

    # find_element(By.XPATH, ...) replaces find_element_by_xpath(), which
    # current Selenium releases no longer provide. The input is located once
    # and reused for both actions.
    search_input = browser.find_element(By.XPATH, SEARCH_INPUT_XPATH)
    search_input.clear()  # empty the search box first
    search_input.send_keys('房地產')  # type the search term
    browser.find_element(By.XPATH, SEARCH_BUTTON_XPATH).click()  # start the search

    # Fixed pause before reading the page; lengthen it if the expected
    # content is still missing from the output.
    time.sleep(10)
    data = browser.page_source
finally:
    # Close the browser even if a lookup or the page load raised.
    browser.quit()
print(data)

4、巨潮資訊網:多個指定關鍵詞的公告信息批量爬取

# =============================================================================
# 9.4 巨潮資訊網數據挖掘實戰 by 王宇韜
# =============================================================================

from selenium import webdriver
import re

def juchao(keyword):
    """Print cninfo full-text search results for *keyword*.

    Loads the cninfo full-text search page for the keyword in Chrome,
    extracts each announcement's title, link and date with regular
    expressions and prints them as a numbered list.

    Args:
        keyword: Search keyword (raw, not percent-encoded text) that is
            appended to the search URL.

    Returns:
        None. The results are written to standard output.
    """
    # Function-scope imports so this block does not depend on file-level imports.
    from html import unescape
    from urllib.parse import quote

    browser = webdriver.Chrome()
    try:
        # Percent-encode the keyword so characters such as '&', '#' or
        # spaces cannot corrupt the query string.
        browser.get(
            'http://www.cninfo.com.cn/new/fulltextSearch?notautosubmit=&keyWord='
            + quote(keyword)
        )
        data = browser.page_source
    finally:
        # Release the browser even if loading the page raised.
        browser.quit()

    p_title = r'<td class="sub-title"><a href=".*?" target="_blank">(.*?)</td>'
    p_href = r'<td class="sub-title"><a href="(.*?)" target="_blank">.*?</td>'
    p_date = r'<div class="sub-time-time">(.*?)</div>'
    titles = re.findall(p_title, data)
    hrefs = re.findall(p_href, data)
    dates = re.findall(p_date, data)

    # zip() stops at the shortest list, so a page where the three patterns
    # match a different number of times no longer raises IndexError.
    for index, (title, href, date) in enumerate(zip(titles, hrefs, dates), start=1):
        title = re.sub(r'<.*?>', '', title)  # drop any HTML tags left in the title
        # The captured link is site-relative and HTML-escaped. unescape()
        # decodes '&amp;' into '&' without deleting a literal 'amp;' that
        # happens to occur elsewhere in the link.
        href = 'http://www.cninfo.com.cn' + unescape(href)
        date = date.split(' ')[0]  # keep only the text before the first space
        print(str(index) + '.' + title + ' - ' + date)
        print(href)

# Run the cninfo search for every keyword. Each keyword is guarded
# separately so that one failing search does not abort the rest of the batch.
keywords = ['理財', '現金管理', '紓困']
for keyword in keywords:
    try:
        juchao(keyword)
    except Exception as exc:
        # Report the failure together with its cause and continue.
        print(keyword + ' 巨潮資訊網爬取失敗: ' + repr(exc))

 

發佈了27 篇原創文章 · 獲贊 9 · 訪問量 1萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章