Python爬取網頁數據入庫到Oracle

from __future__ import print_function
import requests, re, json, time, os
import random
from bs4 import BeautifulSoup
import pandas as pd

import cx_Oracle

class Cwzb(object):
    """Collect per-stock financial indicators and load them into Oracle.

    The workflow visible in this class is:

    1. download the Shenzhen / Shanghai A-share listings
       (``downExcelSZ`` / ``downExcelSH``);
    2. scrape one indicator row per stock (``get_detail``);
    3. append every record to a dated, pipe-delimited record file so an
       interrupted run can resume (``write_file`` / ``readLog``);
    4. bulk-insert the record file into ``tb_cwzb`` (``InsertOracle``).
    """

    def __init__(self):
        self.Url = 'http://quote.eastmoney.com/stocklist.html'
        # Records (pipe-delimited strings, no trailing newline) seen so far.
        self.BaseData = []
        # Stock codes that already have a record in the record file.
        self.symbolList = []
        # Header line of the record file; also defines the field count.
        self.fileColumn="A股代碼|A股簡稱|總市值|淨資產|淨利潤|市盈率|市淨率|毛利率|淨利率|ROE|A股上市日期|A股總股本|A股流通股本"
        self.Date = time.strftime('%Y%m%d')
        # One record file per calendar day.
        self.RecordLogFile = 'basedata' + self.Date
        self.headers = {
            'user-agent': 'Mozilla / 5.0(Windows NT 10.0; WOW64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 53.0.2785.104Safari / 537.36Core / 1.53.4882.400QQBrowser / 9.7.13059.400'
        }

    def write_data_to_file(self, text):
        """Append one record line to the record file.

        When the file does not exist yet, the column header is written
        first and the record follows it, so the first record is no longer
        lost.

        Args:
            text: Pipe-delimited record without a trailing newline.
        """
        is_new_file = not os.path.exists(self.RecordLogFile)
        # Explicit UTF-8 so the Chinese names do not depend on the locale.
        with open(self.RecordLogFile, 'a', encoding='utf-8') as f:
            if is_new_file:
                f.write(self.fileColumn + '\n')
            f.write(text + '\n')

    def _read_records(self):
        """Return the record lines of the record file.

        The header line is skipped, line terminators are removed and blank
        lines are ignored. An empty file yields an empty list.
        """
        records = []
        with open(self.RecordLogFile, 'r', encoding='utf-8') as f:
            next(f, None)  # skip the header line; tolerate an empty file
            for line in f:
                line = line.rstrip('\r\n')
                if line:
                    records.append(line)
        return records

    def read_data_from_file(self):
        """Load previously collected records into ``symbolList``/``BaseData``.

        The first field of each record is stored as the stock code; the
        record itself is stored without its line terminator, matching what
        ``write_file`` appends.
        """
        for line in self._read_records():
            self.symbolList.append(line.split('|')[0])
            self.BaseData.append(line)

    def readLog(self):
        """Resume from today's record file when it exists."""
        if os.path.exists(self.RecordLogFile):
            print('record exist...')
            self.read_data_from_file()
        else:
            print('get data again...')

    def downExcelSZ(self):
        """Download the Shenzhen A-share listing and return it as a DataFrame.

        The workbook is saved to ``深圳A股.xlsx`` in the working directory
        and then read back with the code columns forced to ``str`` so that
        leading zeros survive.

        NOTE(review): the converter keys must match the sheet's column
        headers exactly -- confirm against a current download.
        """
        url = 'http://www.szse.cn/api/report/ShowReport?SHOWTYPE=xlsx&CATALOGID=1110&TABKEY=tab1'
        rst = requests.get(url, timeout=60, headers=self.headers)
        with open('深圳A股.xlsx', 'wb') as f:
            f.write(rst.content)
        data = pd.read_excel('深圳A股.xlsx', sheet_name='A股列表', converters={'公司代碼': str,'A股代碼': str})
        return data

    @staticmethod
    def _parse_sh_listing(text):
        """Parse the tab-separated Shanghai listing text into a DataFrame.

        The first line (field header) is skipped. Only the first seven
        fields of each line are kept and stripped. Blank lines and lines
        with fewer than seven fields are skipped instead of raising. The
        DataFrame index is the 1-based line number of each kept line.
        """
        columns = ["公司代碼", "公司簡稱", "A股代碼", "A股簡稱", "A股上市日期", "A股總股本", "A股流通股本"]
        rows = []
        index = []
        for row, line in enumerate(text.split("\n"), 1):
            if row == 1:
                continue  # skip the field header on the first line
            if not line.strip():
                continue  # e.g. a trailing "\r" left over from "\r\n"
            ls = [s.strip() for s in line.split("\t")[:7]]
            if len(ls) != len(columns):
                print('skip malformed listing line %d' % row)
                continue
            rows.append(ls)
            index.append(row)
        return pd.DataFrame(rows, columns=columns, index=index)

    def downExcelSH(self):
        """Download the Shanghai A-share listing and return it as a DataFrame.

        The raw response is saved to ``上海A股.txt`` and decoded as GBK
        before being parsed; all values in the result are strings.
        """
        url = 'http://query.sse.com.cn/security/stock/downloadStockListFile.do?csrcCode=&stockCode=&areaName=&stockType=1'

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/'
        }
        rst = requests.get(url, timeout=60, headers=headers)
        print(rst.status_code)
        with open('上海A股.txt', 'wb') as f:
            f.write(rst.content)

        return self._parse_sh_listing(rst.content.decode('GBK'))

    def get_detail(self, in_symbol):
        """Scrape the indicator row for one stock.

        Codes starting with ``6`` are requested under the ``sh`` prefix,
        all others under ``sz``. The request is preceded by a random
        1-3 second pause.

        Args:
            in_symbol: Stock code as a string.

        Returns:
            The indicator values joined with ``|`` (the page's own first
            column is dropped), or ``None`` when the request or the
            parsing fails; the failure is printed, not raised.
        """
        market = 'sh' if in_symbol.startswith("6") else 'sz'
        cwzburl = 'http://quote.eastmoney.com/%s%s.html' % (market, in_symbol)
        # Deliberately broad: any network or page-layout problem for a
        # single stock is reported and signalled to the caller as None.
        try:
            rdm = random.randint(1, 3)
            print('  %d秒後開始採集%s' % (rdm, in_symbol))
            time.sleep(rdm)
            rst = requests.get(cwzburl, timeout=60, headers=self.headers)
            cwzbsoup = BeautifulSoup(rst.content, 'html.parser', from_encoding="GBK")
            cwzb_list = cwzbsoup.find('div', class_='cwzb').tbody.tr.get_text().split()
            # Drop the first column (stock code): the page's value can be
            # wrong, so the code from the exchange listing is used instead.
            return "|".join(str(i) for i in cwzb_list[1:])
        except Exception as e:
            print('perhaps timeout:', e, cwzburl)
            return None

    def write_file(self, symbol, cwzb):
        """Record one collected stock in memory and in the record file.

        Args:
            symbol: Stock code to mark as collected.
            cwzb: Complete pipe-delimited record for that stock.
        """
        self.BaseData.append(cwzb)
        self.write_data_to_file(cwzb)
        self.symbolList.append(symbol)
        print("    寫文件成功-%s" % symbol)

    def InsertOracle(self, user="gw", password="gw", dsn="localhost/xe"):
        """Replace today's rows in ``tb_cwzb`` with the record file contents.

        Rows for ``trunc(sysdate)`` are deleted and the records are
        inserted in one batch, then committed; afterwards the row count
        per ``data_date`` is printed. Records whose field count differs
        from the header are reported and skipped. When there is nothing to
        insert the table is left unchanged. The connection is always
        closed, which discards any uncommitted work after a failure.

        Args:
            user: Oracle user name.
            password: Oracle password.
            dsn: Oracle connect string.
        """
        expected = len(self.fileColumn.split("|"))
        list_param = []
        for line in self._read_records():
            fields = tuple(line.split("|"))
            if len(fields) != expected:
                print('skip malformed record (%d fields): %s' % (len(fields), line))
                continue
            list_param.append(fields)

        sql = "INSERT INTO tb_cwzb(data_date,symbol, name,total_market_value,net_asset,net_income,pe,pb,gross_profit_ratio,net_profit_ratio,ROE,list_date,total_shares,cir_shares) VALUES (trunc(sysdate),:1, :2, :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13)"

        connection = cx_Oracle.connect(user, password, dsn)
        try:
            cursor = connection.cursor()
            if list_param:
                cursor.execute(""" delete from   tb_cwzb where data_date=trunc(sysdate)""")
                cursor.executemany(sql, list_param)
                connection.commit()
            else:
                print('no records to load, tb_cwzb left unchanged')

            cursor.execute("""select data_date,count(*) cnt  from  tb_cwzb group by data_date""")
            for trad_date, count in cursor:
                print("日期:", trad_date, "記錄數量:", count)
        finally:
            connection.close()
def _collect_market(stock, frame, label, to_shares):
    """Scrape and record every not-yet-collected stock of one market.

    Args:
        stock: Collector providing ``symbolList``, ``get_detail`` and
            ``write_file``.
        frame: Listing DataFrame for the market.
        label: Market name used in the progress message.
        to_shares: Callable converting the listing's share-count text into
            a plain number-of-shares string.
    """
    total = frame.shape[0]
    for cnt, (_, row) in enumerate(frame.iterrows(), 1):
        smb = row["公司代碼"]
        print('%s-處理進度%d/%d' % (label, cnt, total))
        if smb in stock.symbolList:
            print('%s已經採集過:' % smb)
            continue
        cwzb = stock.get_detail(smb)
        if cwzb is None:
            # get_detail already printed the failure. Skipping keeps the
            # run alive, and the symbol stays unrecorded so a later run
            # retries it.
            print('  skip %s: no detail data' % smb)
            continue
        record = "|".join([
            row["A股代碼"],
            row["A股簡稱"],
            cwzb,
            row["A股上市日期"],
            to_shares(row["A股總股本"]),
            to_shares(row["A股流通股本"]),
        ])
        stock.write_file(smb, record)


def main():
    """Download both listings, collect the missing stocks, load Oracle."""
    stock = Cwzb()
    # Shenzhen market listing as a DataFrame.
    dfsz = stock.downExcelSZ()
    # Shanghai market listing as a DataFrame.
    dfsh = stock.downExcelSH()
    stock.readLog()
    # Shenzhen share counts are in shares, written with thousands separators.
    _collect_market(stock, dfsz, '深圳', lambda v: v.replace(",", ""))
    # Shanghai share counts are in units of 10,000 shares.
    _collect_market(stock, dfsh, '上海', lambda v: str(round(float(v) * 10000)))
    stock.InsertOracle()

# Run the collection pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章