from __future__ import print_function
import requests, re, json, time, os
import random
from bs4 import BeautifulSoup
import pandas as pd
import cx_Oracle
class Cwzb(object):
    """Collect financial indicators for Shenzhen and Shanghai A-share stocks.

    The stock lists are downloaded from the two exchanges, the per-stock
    indicator row is scraped from quote.eastmoney.com, every record is
    appended to a per-day, pipe-delimited record file (so an interrupted
    run can resume), and the record file is finally loaded into the Oracle
    table ``tb_cwzb``.
    """

    def __init__(self):
        """Initialise URLs, in-memory caches, the record file name and headers."""
        self.Url = 'http://quote.eastmoney.com/stocklist.html'
        # Record lines collected so far (without trailing newline).
        self.BaseData = []
        # Symbols already present in the record file / collected in this run.
        self.symbolList = []
        # Header line of the record file; also defines the field count.
        self.fileColumn = "A股代碼|A股簡稱|總市值|淨資產|淨利潤|市盈率|市淨率|毛利率|淨利率|ROE|A股上市日期|A股總股本|A股流通股本"
        self.Date = time.strftime('%Y%m%d')
        # One record file per calendar day.
        self.RecordLogFile = 'basedata' + self.Date
        self.headers = {
            'user-agent': 'Mozilla / 5.0(Windows NT 10.0; WOW64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 53.0.2785.104Safari / 537.36Core / 1.53.4882.400QQBrowser / 9.7.13059.400'
        }

    def write_data_to_file(self, text):
        """Append one record line to the record file.

        When the file does not exist yet (or is empty) the column header is
        written first, followed by the record itself, so the first record
        is never lost.

        :param text: pipe-delimited record without a trailing newline.
        """
        needs_header = (not os.path.exists(self.RecordLogFile)
                        or os.path.getsize(self.RecordLogFile) == 0)
        with open(self.RecordLogFile, 'a') as f:
            if needs_header:
                f.write(self.fileColumn + '\n')
            f.write(text + '\n')

    def read_data_from_file(self):
        """Load previously collected records from the record file.

        The first field of every record is appended to ``symbolList`` and
        the whole line (newline stripped) to ``BaseData``.
        """
        with open(self.RecordLogFile, 'r') as f:
            # Skip the header line; the default avoids StopIteration on an
            # empty file.
            next(f, None)
            for line in f:
                line = line.rstrip('\r\n')
                if not line:
                    continue
                self.symbolList.append(line.split('|')[0])
                self.BaseData.append(line)

    def readLog(self):
        """Resume from today's record file when it exists."""
        if os.path.exists(self.RecordLogFile):
            print('record exist...')
            self.read_data_from_file()
        else:
            print('get data again...')

    def downExcelSZ(self):
        """Download the Shenzhen A-share list and return it as a DataFrame.

        The workbook is saved to ``深圳A股.xlsx`` in the working directory
        and then parsed from there.

        :raises requests.HTTPError: if the server answers with an error status.
        """
        url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1110&TABKEY=tab1'
        rst = requests.get(url, timeout=60, headers=self.headers)
        # Fail early instead of handing an error page to the Excel parser.
        rst.raise_for_status()
        with open('深圳A股.xlsx', 'wb') as f:
            f.write(rst.content)
        # Codes are read as text so leading zeros are preserved.
        data = pd.read_excel('深圳A股.xlsx', sheet_name='A股列表', converters={'公司代碼': str, 'A股代碼': str})
        return data

    def downExcelSH(self):
        """Download the Shanghai A-share list and return it as a DataFrame.

        The raw response is saved to ``上海A股.txt``; it is decoded as GBK
        and parsed as tab-separated text whose first line is a header.
        Only the first seven fields of every line are kept.  The DataFrame
        index is the 1-based line number of each record in the download.

        :raises requests.HTTPError: if the server answers with an error status.
        """
        url = 'http://query.sse.com.cn/security/stock/downloadStockListFile.do?csrcCode=&stockCode=&areaName=&stockType=1'
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/'
        }
        rst = requests.get(url, timeout=60, headers=headers)
        print(rst.status_code)
        rst.raise_for_status()
        with open('上海A股.txt', 'wb') as f:
            f.write(rst.content)

        columns = ["公司代碼", "公司簡稱", "A股代碼", "A股簡稱", "A股上市日期", "A股總股本", "A股流通股本"]
        line_numbers = []
        records = []
        lines = rst.content.decode('GBK').split("\n")
        for row, line in enumerate(lines, 1):
            if row == 1:
                continue  # skip the column header on the first line
            if not line.strip():
                continue
            fields = [s.strip() for s in line.split("\t")[:len(columns)]]
            if len(fields) != len(columns):
                # A short line cannot be mapped onto the columns; skip it
                # rather than abort the whole download.
                print('skip malformed line %d: %r' % (row, line))
                continue
            line_numbers.append(row)
            records.append(fields)
        # Build the frame once instead of growing it row by row.
        return pd.DataFrame(records, index=line_numbers, columns=columns)

    def get_detail(self, in_symbol):
        """Scrape the financial-indicator row of one stock from eastmoney.

        Sleeps a random 1-3 seconds before the request.

        :param in_symbol: stock code; codes starting with "6" are looked up
            under the ``sh`` prefix, everything else under ``sz``.
        :return: the indicator values joined with ``|`` (the page's first
            column, the stock code, is dropped), or ``None`` when the page
            could not be fetched or parsed.
        """
        if in_symbol.startswith("6"):
            cwzburl = 'http://quote.eastmoney.com/sh%s.html' % in_symbol
        else:
            cwzburl = 'http://quote.eastmoney.com/sz%s.html' % in_symbol
        # Send the request.
        try:
            rdm = random.randint(1, 3)
            print(' %d秒後開始採集%s' % (rdm, in_symbol))
            time.sleep(rdm)
            rst = requests.get(cwzburl, timeout=60, headers=self.headers)
            cwzbhtml = rst.content
            cwzbsoup = BeautifulSoup(cwzbhtml, 'html.parser', from_encoding="GBK")
            cwzb_list = cwzbsoup.find('div', class_='cwzb').tbody.tr.get_text().split()
            # Drop the first column (stock code): the page value can be
            # wrong, so the exchange's code is used instead.
            outPut = "|".join(str(i) for i in cwzb_list[1:])
            return outPut
        except Exception as e:
            # Best effort: report the failure and let the caller decide.
            print('perhaps timeout:', e, cwzburl)
            return None

    def write_file(self, symbol, cwzb):
        """Record one collected stock in memory and in the record file.

        :param symbol: stock code remembered in ``symbolList``.
        :param cwzb: complete pipe-delimited record line.
        """
        self.BaseData.append(cwzb)
        self.write_data_to_file(cwzb)
        self.symbolList.append(symbol)
        print(" 寫文件成功-%s" % symbol)

    def InsertOracle(self):
        """Replace today's rows in ``tb_cwzb`` with the record file contents.

        Rows dated today are deleted, every well-formed record of the
        record file is inserted, the transaction is committed, and the row
        count per ``data_date`` is printed.  The cursor and connection are
        always closed, also when a statement fails.
        """
        # A record must supply one value per header column / bind variable.
        expected_fields = len(self.fileColumn.split('|'))
        list_param = []
        with open(self.RecordLogFile, 'r') as f:
            next(f, None)  # skip the header line
            for line in f:
                # Strip the line ending so it does not end up in the last
                # bind value.
                line = line.rstrip('\r\n')
                if not line:
                    continue
                t = tuple(line.split("|"))
                if len(t) != expected_fields:
                    print('skip malformed record:', line)
                    continue
                list_param.append(t)

        # Connect as user "gw" to the "xe" service on localhost.
        # NOTE(review): credentials are hard-coded here.
        connection = cx_Oracle.connect("gw", "gw", "localhost/xe")
        try:
            cursor = connection.cursor()
            try:
                cursor.execute(""" delete from tb_cwzb where data_date=trunc(sysdate)""")
                sql = "INSERT INTO tb_cwzb(data_date,symbol, name,total_market_value,net_asset,net_income,pe,pb,gross_profit_ratio,net_profit_ratio,ROE,list_date,total_shares,cir_shares) VALUES (trunc(sysdate),:1, :2, :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13)"
                if list_param:
                    cursor.executemany(sql, list_param)
                connection.commit()
                cursor.execute("""select data_date,count(*) cnt from tb_cwzb group by data_date""")
                for trad_date, count in cursor:
                    print("日期:", trad_date, "記錄數量:", count)
            finally:
                cursor.close()
        finally:
            connection.close()
def _sz_share_text(value):
    """Return a Shenzhen share count (already in shares) without thousands separators."""
    return value.replace(",", "")


def _sh_share_text(value):
    """Convert a Shanghai share count from units of 10,000 shares to shares."""
    return str(round(float(value) * 10000))


def _collect_market(stock, frame, label, share_text):
    """Collect and record the indicators of every stock in one market list.

    :param stock: collector object providing ``symbolList``, ``get_detail``
        and ``write_file``.
    :param frame: DataFrame of the market's A-share list.
    :param label: market name used in the progress message.
    :param share_text: callable turning a raw share-count cell into text.
    """
    total = frame.shape[0]
    for cnt, (_, row) in enumerate(frame.iterrows(), 1):
        smb = row["公司代碼"]
        print('%s-處理進度%d/%d' % (label, cnt, total))
        # NOTE(review): on resume, symbolList is rebuilt from the record
        # file's first field (A股代碼) while this test uses 公司代碼; this
        # assumes the two codes are equal -- confirm.
        if smb in stock.symbolList:
            print('%s已經採集過:' % smb)
            continue
        cwzb = stock.get_detail(smb)
        if not cwzb:
            # get_detail yields nothing usable when the page could not be
            # fetched or parsed; skip the stock (it stays unrecorded, so a
            # later run retries it) instead of crashing on None.
            print('%s skipped: no detail data' % smb)
            continue
        record = "|".join([
            row["A股代碼"],
            row["A股簡稱"],
            cwzb,
            row["A股上市日期"],
            share_text(row["A股總股本"]),
            share_text(row["A股流通股本"]),
        ])
        stock.write_file(smb, record)


def main():
    """Download both market lists, collect missing stocks, load into Oracle."""
    stock = Cwzb()
    # DataFrame of the Shenzhen market.
    dfsz = stock.downExcelSZ()
    # DataFrame of the Shanghai market.
    dfsh = stock.downExcelSH()
    # Pick up symbols already collected today so they are not fetched again.
    stock.readLog()
    # Shenzhen raw data: total / circulating A-shares are given in shares.
    _collect_market(stock, dfsz, '深圳', _sz_share_text)
    # Shanghai raw data: total / circulating A-shares are given in units
    # of 10,000 shares.
    _collect_market(stock, dfsh, '上海', _sh_share_text)
    stock.InsertOracle()
# Run the collection only when executed as a script, not on import.
if __name__ == '__main__':
    main()