#導入各種需要用到的庫
import requests
from bs4 import BeautifulSoup
import traceback
import re
# 定義第一個函數,獲取網頁的數據
def getHTMLText(url):
    """Fetch *url* and return the decoded page text.

    Returns "" on any network/HTTP failure so callers can use a simple
    empty-string check (getStockInfo relies on this) instead of having to
    recognize an error-message string.
    """
    try:
        r = requests.get(url, timeout=30)
        # Raise for 4xx/5xx responses instead of silently parsing an error page.
        r.raise_for_status()
        # Use the encoding sniffed from the body; many of these pages
        # mis-declare their charset in the headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrow catch: only network/HTTP errors. Return an empty string so
        # the caller's `html == ''` guard actually works (the old version
        # returned a human-readable message that parsed as "valid" HTML).
        return ""
# 定義第二個函數,對獲取到的網頁數據處理,循環遍歷
def getStockList(lst, stockURL):
    """Scrape *stockURL* and append every stock code found to *lst* (in place).

    A stock code is an ``sh``/``sz`` prefix followed by six digits, extracted
    from the ``href`` attribute of each ``<a>`` tag on the page.
    """
    html = getHTMLText(stockURL)
    # BeautifulSoup parses the page into a navigable tag tree.
    soup = BeautifulSoup(html, 'html.parser')
    # Hoist the compiled pattern out of the loop.
    code_pattern = re.compile(r"[s][hz]\d{6}")
    for anchor in soup.find_all('a'):
        try:
            href = anchor.attrs['href']
            # First match wins; IndexError below means "no code in this link".
            lst.append(code_pattern.findall(href)[0])
        except (KeyError, IndexError):
            # KeyError: <a> without an href; IndexError: href without a stock
            # code. Both are expected for non-stock links — skip them.
            continue
# 定義第三個函數
def getStockInfo(lst, stockURL, fpath):
    """For each stock code in *lst*, fetch its detail page from *stockURL*
    and append the parsed info dict (one per line, as ``str(dict)``) to the
    file at *fpath*.

    Prints a percentage progress indicator to the console (``\\r`` rewrite,
    visible in a terminal only).
    """
    # BUG FIX: `count` was never initialized in the original, so the very
    # first `count = count + 1` raised NameError (and the except handler
    # then crashed on the same name). Initialize it before the loop.
    count = 0
    total = len(lst)
    for stock in lst:
        # Detail page for this stock code.
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == '':
                # Fetch failed — nothing to parse for this stock.
                count += 1
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            # The page's data lives in the first <div class="stock-bets">.
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            if stockInfo is None:
                # Page layout changed or page is an error shell — skip.
                count += 1
                continue
            # First element with class 'bets-name' holds the stock name.
            # .text (not .string) is used because the tag has child nodes,
            # for which .string would return None.
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict['股票名稱'] = name.text.split()[0]
            # <dt> tags are field labels, <dd> tags are the matching values.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text
            # Append this stock's record as one line.
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
            count += 1
            print('\r當前速度:{:.2f}%'.format(count * 100 / total), end='')
        except Exception:
            # Keep crawling the remaining stocks, but surface the traceback
            # so parsing failures are not silently lost.
            count += 1
            print('\r當前速度:{:.2f}%'.format(count * 100 / total), end='')
            traceback.print_exc()
            continue
def main():
    """Entry point: collect the stock code list, then crawl each stock's
    detail page and append the results to the output file."""
    # NOTE(review): both URLs are empty placeholders — fill in the list page
    # and the detail-page prefix before running.
    stock_list_url = ""
    stock_info_url = ""
    # Raw string: the original literal contained invalid escape sequences
    # (\p, \S) that emit SyntaxWarning on modern Python.
    output_file = r'E:\pycharmProjects\pachonglianxi\StockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)
# Guard the entry point so importing this module does not start the crawl.
if __name__ == "__main__":
    main()