# 1. 獲取新浪財經實時股票數據 (Fetch real-time stock data from Sina Finance)
# =============================================================================
# 9.1 新浪股票實時數據挖掘實戰 by 王宇韜
# =============================================================================
from selenium import webdriver
import re
# Load the Sina Finance quote page for sh000001 in headless Chrome and print
# whatever the price regex finds in the rendered page source.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# Pass the options via `options=`; the older `chrome_options=` keyword is
# deprecated and is no longer accepted by newer Selenium releases.
browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get('http://finance.sina.com.cn/realstock/company/sh000001/nc.shtml')
    data = browser.page_source
finally:
    # Always shut the browser down, even when the page load fails, so no
    # headless Chrome process is left running.
    browser.quit()

# Regular expression capturing the inner text of the <div id="price"> element.
p_price = '<div id="price" class=".*?">(.*?)</div>'
price = re.findall(p_price, data)
print(price)
# 2. 東方財富網數據挖掘實戰 (Eastmoney news scraping)
# =============================================================================
# 9.2 東方財富網數據挖掘實戰 by 王宇韜
# =============================================================================
from selenium import webdriver
import re
def dongfang(company):
    """Print Eastmoney news search results for *company*.

    Opens the Eastmoney news search page for the keyword in headless Chrome,
    pulls each result's title, link and date out of the page source with
    regular expressions, and prints them as a numbered list (title and date
    on one line, link on the next).

    Args:
        company: Search keyword; it is appended to the search URL as-is.

    Returns:
        None. Results are written to standard output.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    # `options=` is the supported keyword; `chrome_options=` is deprecated and
    # rejected by newer Selenium releases.
    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get('http://so.eastmoney.com/news/s?keyword=' + company)
        # Keep the rendered page source for regex extraction below.
        data = browser.page_source
    finally:
        # Close the browser even if loading the page raised.
        browser.quit()

    p_title = '<div class="news-item"><h3><a href=".*?">(.*?)</a>'
    p_href = '<div class="news-item"><h3><a href="(.*?)">.*?</a>'
    p_date = '<p class="news-desc">(.*?)</p>'

    # Extract titles, links and dates from the page source.
    titles = re.findall(p_title, data)
    hrefs = re.findall(p_href, data)
    dates = re.findall(p_date, data, re.S)

    # zip() stops at the shortest list, so a page where the three patterns
    # match a different number of times no longer raises IndexError.
    for number, (title, href, date) in enumerate(zip(titles, hrefs, dates), start=1):
        # Drop any inline markup left inside the title text.
        title = re.sub('<.*?>', '', title)
        # Keep only the part of the description before the first space.
        date = date.split(' ')[0]
        print(str(number) + '.' + title + ' - ' + date)
        print(href)
# Run the Eastmoney scrape for every company and report success or failure
# per keyword, so one failing lookup does not stop the rest of the batch.
companys = ['華能信託', '阿里巴巴', '騰訊', '京東', '萬科']
for company in companys:
    try:
        dongfang(company)
    except Exception:
        # Catch Exception rather than using a bare except, so Ctrl-C and
        # SystemExit still stop the run.
        print(company + '該公司東方財富網爬取失敗')
    else:
        print(company + '該公司東方財富網爬取成功')
# 3. 裁判文書網:自動在網頁上搜索 (China Judgements Online: automated on-page search)
# =============================================================================
# 9.3 裁判文書網數據挖掘實戰 by 王宇韜
# =============================================================================
from selenium import webdriver
import time
# Drive the wenshu.court.gov.cn search box with a keyword, wait for the
# results, and print the resulting page source.
browser = webdriver.Chrome()
try:
    browser.get('http://wenshu.court.gov.cn/')
    browser.maximize_window()
    # find_element('xpath', ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3; this form works on both old and new releases.
    search_box = browser.find_element('xpath', '//*[@id="_view_1540966814000"]/div/div/div[2]/input')
    search_box.clear()  # empty the search box first
    search_box.send_keys('房地產')  # type the search keyword into the box
    browser.find_element('xpath', '//*[@id="_view_1540966814000"]/div/div/div[3]').click()  # click the search button
    # Fixed wait for the results to load; lengthen it if the expected content
    # is still missing from the page source.
    time.sleep(10)
    data = browser.page_source
finally:
    # Close the browser even if a lookup or the page load failed.
    browser.quit()
print(data)
# 4. 巨潮資訊網:多個指定關鍵詞的公告信息批量爬取 (cninfo: batch scraping of announcements for several keywords)
# =============================================================================
# 9.4 巨潮資訊網數據挖掘實戰 by 王宇韜
# =============================================================================
from selenium import webdriver
import re
def juchao(keyword):
    """Print cninfo full-text search results for *keyword*.

    Opens the cninfo full-text search page for the keyword in Chrome, pulls
    each announcement's title, link and date out of the page source with
    regular expressions, and prints them as a numbered list (title and date
    on one line, absolute link on the next).

    Args:
        keyword: Search keyword; it is appended to the search URL as-is.

    Returns:
        None. Results are written to standard output.
    """
    browser = webdriver.Chrome()
    try:
        browser.get('http://www.cninfo.com.cn/new/fulltextSearch?notautosubmit=&keyWord=' + keyword)
        data = browser.page_source
    finally:
        # Close the browser even if loading the page raised.
        browser.quit()

    p_title = '<td class="sub-title"><a href=".*?" target="_blank">(.*?)</td>'
    p_href = '<td class="sub-title"><a href="(.*?)" target="_blank">.*?</td>'
    p_date = '<div class="sub-time-time">(.*?)</div>'

    titles = re.findall(p_title, data)
    hrefs = re.findall(p_href, data)
    dates = re.findall(p_date, data)

    # zip() stops at the shortest list, so a page where the three patterns
    # match a different number of times no longer raises IndexError.
    for number, (title, href, date) in enumerate(zip(titles, hrefs, dates), start=1):
        # The title capture runs up to </td>, so strip the tags it contains.
        title = re.sub(r'<.*?>', '', title)
        # Captured links are site-relative; prefix the host and drop the
        # 'amp;' left over from '&amp;' in the page source.
        href = ('http://www.cninfo.com.cn' + href).replace('amp;', '')
        # Keep only the part of the timestamp before the first space.
        date = date.split(' ')[0]
        print(str(number) + '.' + title + ' - ' + date)
        print(href)
# Keywords to look up on cninfo; each one triggers its own search and printout.
keywords = ['理財', '現金管理', '紓困']
for keyword in keywords:
    juchao(keyword)