1.背景
之前寫的抓取A股所有上市公司信息的小程序在上交所網站改版後,需要同步修改
運行環境:Python 2.7.9
2.分析過程
以抓取宇通客車【600066】信息爲例
打開網址http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE=600066
紅框中的內容是需要抓取的信息,查看網頁源碼
可以看到公司信息並沒有直接寫到html中,使用chrome “開發者工具”(快捷鍵F12),查看瀏覽器與服務器的交互過程(在這一步走了彎路,使用selenium+phantomjs模擬瀏覽器然後分析html,以及使用ghost.py+beautifulsoup,都沒有成功)
可以在標紅線的url上看到返回的公司信息,剩下的就是模擬瀏覽器請求這個url了。request header中的Referer一定不能省略,不然服務器會返回403
返回的信息是json格式的,可以使用python自帶的json庫轉換爲dict,可以參考searchJ.js來獲得想要的信息
具體見github網址https://github.com/shenyanf/AShareListedCompanyList
# -*- coding: utf-8 -*-
'''
Created on 2016年4月19日
@author: a
'''
import json
import urllib2
import zlib
from time import sleep
class JSONObject:
    '''Attribute-style view over a decoded JSON object.

    The dict handed to the constructor becomes the instance's attribute
    dictionary itself (no copy is made), so every key can be read as an
    attribute. Intended for use as the ``object_hook`` of ``json.loads``.
    '''

    def __init__(self, d):
        # Rebind the attribute dictionary to the very dict that was passed in.
        setattr(self, '__dict__', d)
class AchieveSSEStockInfo:
    '''Fetch listed-company information from the Shanghai Stock Exchange.

    Every getter reads from one of the JSONP endpoints on query.sse.com.cn.
    The basic-information and capital-structure payloads are downloaded
    lazily on first use and cached on the instance; the two IPO-date getters
    hit their own endpoint on every call.

    Values that cannot be obtained (network failure, empty result, missing
    field) are reported as the placeholder string '-'.
    '''

    # Getter names in output-column order. The order is relied upon by
    # callers -- do not rearrange.
    __public__ = ['getCompanyCode', 'getCompanyShortName', 'getCompanyName', 'getCompanyEnlishName', 'getIpoAddress', 'getASharesCode',
                  'getASharesShortName', 'getASharesIPODate', 'getASharesTotalCapital', 'getASharesOutstandingCaptial', 'getBSharesCode',
                  'getBSharesShortName', 'getBSharesIPODate', 'getBSharesTotalCapital', 'getBSharesOutstandingCaptial', 'getArea', 'getProvince', 'getCity', 'getTrade', 'getWebsite']

    # Field names present in the basic-information payload (basicURLA).
    achieveIndexFromURLA = ['CHANGEABLE_BOND_ABBR', 'OFFICE_ZIP', 'AREA_NAME_DESC', 'FULL_NAME_IN_ENGLISH', 'COMPANY_CODE', 'CSRC_MIDDLE_CODE_DESC', 'SECURITY_ABBR_A', 'COMPANY_ADDRESS', 'SECURITY_CODE_A', 'SECURITY_CODE_B', 'SECURITY_30_DESC', 'COMPANY_ABBR', 'OFFICE_ADDRESS', 'CHANGEABLE_BOND_CODE', 'ENGLISH_ABBR', 'LEGAL_REPRESENTATIVE', 'REPR_PHONE', 'E_MAIL_ADDRESS', 'FOREIGN_LISTING_ADDRESS', 'STATE_CODE_A_DESC', 'SSE_CODE_DESC', 'FOREIGN_LISTING_DESC', 'SECURITY_CODE_A_SZ', 'CSRC_GREAT_CODE_DESC', 'WWW_ADDRESS', 'CSRC_CODE_DESC', 'STATE_CODE_B_DESC', 'FULLNAME']

    # Indicators exposed by the getters:
    #   companyCode                company code
    #   companyShortName           company short name
    #   companyName                company full name
    #   companyEnlishName          English name
    #   ipoAddress                 registered address
    #   aSharesCode                A-share code
    #   aSharesShortName           A-share short name
    #   aSharesIPODate             A-share listing date
    #   aSharesTotalCapital        A-share total capital
    #   aSharesOutstandingCaptial  A-share outstanding capital
    #   bSharesCode                B-share code
    #   bSharesShortName           B-share short name
    #   bSharesIPODate             B-share listing date
    #   bSharesTotalCapital        B-share total capital
    #   bSharesOutstandingCaptial  B-share outstanding capital
    #   area                       region
    #   province                   province
    #   city                       city
    #   trade                      industry
    #   website                    company website
    #   status                     A-share status / B-share status

    def __init__(self, stockCode):
        '''Build the query URLs for ``stockCode``; performs no network I/O.'''
        self.stockCode = stockCode
        self.basicURLA = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_C', stockCode)
        self.basicURLB = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_AGSSR_C', stockCode)
        self.basicURLC = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_BGSSR_C', stockCode)
        self.basicURLD = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_MSXX_C', stockCode)
        self.basicURLE = r'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback46644&isPagination=true&stockCode=' + str(stockCode) + '&tradeBeginDate=19700101&tradeEndDate=20161001&order=tradeBeginDate%7Cdesc&sqlId=PL_SCRL_SCRLB&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&pageHelp.pageSize=5&_=1475720975596'
        self.capitalURL = 'http://query.sse.com.cn/security/stock/queryCompanyStockStruct.do?jsonCallBack=jsonpCallback86976&isPagination=false&companyCode=' + str(stockCode) + '&_=1475732919742'
        # Lazily filled caches for the two main payloads.
        self.stockBasicInfo = None
        self.stockCapitalInfo = None

    # ------------------------------------------------------------------
    # Public getters
    # ------------------------------------------------------------------
    def getCompanyCode(self):
        '''Return the company code.'''
        return self.__getBasicValue('COMPANY_CODE')

    def getStatus(self):
        '''Return False when neither share class has a status or the stock is delisted, else True.'''
        v = self.__getBasicValue('STATE_CODE_A_DESC') + '/' + self.__getBasicValue('STATE_CODE_B_DESC')
        # The literal below is the exchange's status text for "delisted".
        return not (v == '-/-' or u'摘牌' in v)

    def getCompanyShortName(self):
        '''Return "<Chinese abbreviation>/<English abbreviation>".'''
        return self.__getBasicValue('COMPANY_ABBR') + '/' + self.__getBasicValue('ENGLISH_ABBR')

    def getCompanyName(self):
        '''Return the company's full name.'''
        return self.__getBasicValue('FULLNAME')

    def getCompanyEnlishName(self):
        '''Return the company's full English name.'''
        return self.__getBasicValue('FULL_NAME_IN_ENGLISH')

    def getIpoAddress(self):
        '''Return the registered company address.'''
        return self.__getBasicValue('COMPANY_ADDRESS')

    def getASharesCode(self):
        '''Return the A-share security code.'''
        return self.__getBasicValue('SECURITY_CODE_A')

    def getASharesShortName(self):
        '''Return the A-share short name (same value as the company short name).'''
        return self.getCompanyShortName()

    def getASharesIPODate(self):
        '''Return the A-share listing date, or '-' when unavailable.'''
        return self._getListingDate(self.basicURLB, 'LISTINGDATEA')

    def getTotalCapital(self):
        '''Return the ``totalShares`` field of the capital-structure payload.'''
        return self.__getCapitalValue('totalShares')

    def getASharesTotalCapital(self):
        '''Return non-tradable plus outstanding A shares as a float.

        Components that are missing, empty or '-' count as zero.
        '''
        total = 0.0
        for part in (self.__getCapitalValue('totalNonFlowShare'),
                     self.getASharesOutstandingCaptial()):
            if part != '-' and part:
                total += float(part)
        return total

    def getASharesOutstandingCaptial(self):
        '''Return the outstanding A-share capital.'''
        return self.__getCapitalValue('AShares')

    def getBSharesTotalCapital(self):
        '''Return the B-share total capital (reported as the outstanding B-share capital).'''
        return self.getBSharesOutstandingCaptial()

    def getBSharesOutstandingCaptial(self):
        '''Return the outstanding B-share capital.'''
        return self.__getCapitalValue('BShares')

    def getBSharesCode(self):
        '''Return the B-share security code ('-' when the company has none).'''
        return self.__getBasicValue('SECURITY_CODE_B')

    def getBSharesShortName(self):
        '''Return the short name for B shares, or '' when there is no B-share code.'''
        if '-' in self.getBSharesCode():
            return ''
        return self.getASharesShortName()

    def getBSharesIPODate(self):
        '''Return the B-share listing date, or '-' when unavailable.'''
        return self._getListingDate(self.basicURLC, 'LISTINGDATEB')

    def getArea(self):
        '''Return the region description.'''
        return self.__getBasicValue('AREA_NAME_DESC')

    def getProvince(self):
        '''Return the province (the payload only carries the region, so that is returned).'''
        return self.getArea()

    def getCity(self):
        '''Return the city (the payload only carries the region, so that is returned).'''
        return self.getArea()

    def getTrade(self):
        '''Return the SSE industry description.'''
        # Alternative not used here: the CSRC classification
        # (category/major/middle) could be built from CSRC_CODE_DESC,
        # CSRC_GREAT_CODE_DESC and CSRC_MIDDLE_CODE_DESC joined with '/'.
        return self.__getBasicValue('SSE_CODE_DESC')

    def getWebsite(self):
        '''Return the company website address.'''
        return self.__getBasicValue('WWW_ADDRESS')

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _stripJsonp(text):
        '''Return the JSON text wrapped by a JSONP call ``callback(<json>)``.

        The slice runs from the first '(' to the LAST ')', so parentheses
        inside the payload (common in company names) do not truncate it.
        Raises ValueError when ``text`` is not wrapped in parentheses.
        '''
        start = text.index('(') + 1
        end = text.rindex(')')
        return text[start:end]

    @staticmethod
    def _decodeBody(raw, contentEncoding):
        '''Undo the HTTP Content-Encoding of ``raw`` and return the plain bytes.

        urllib2 does not decompress responses on its own, so a body sent as
        gzip or deflate has to be inflated here. Unknown or absent encodings
        are returned unchanged.
        '''
        encoding = (contentEncoding or '').strip().lower()
        if encoding in ('gzip', 'x-gzip'):
            # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
            return zlib.decompress(raw, 16 + zlib.MAX_WBITS)
        if encoding == 'deflate':
            try:
                return zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without zlib header.
                return zlib.decompress(raw, -zlib.MAX_WBITS)
        return raw

    @staticmethod
    def _toDict(obj):
        '''Return the non-dunder attributes of ``obj`` as a plain dict.'''
        return dict((name, value) for name, value in vars(obj).items()
                    if not name.startswith('__'))

    def _getListingDate(self, url, field):
        '''Fetch ``url`` and return ``field`` of its first row, or '-' on any failure.'''
        try:
            rows = self.__getDatas(url)
            if rows == '-' or rows is None:
                return '-'
            value = self._toDict(rows[0]).get(field)
        except Exception:
            # Best effort: a malformed payload is reported as "no data".
            return '-'
        return '-' if value is None else value

    def __getDatas(self, url, basicInfo=True):
        '''Download a JSONP endpoint and return the ``result`` member of its payload.

        url       -- endpoint to request.
        basicInfo -- selects which page is sent as Referer: the company page
                     when true, the capital page otherwise. The server
                     rejects requests without a Referer (HTTP 403).

        Returns the decoded ``result`` value, '-' when it is empty or absent,
        or None when all attempts to reach the server failed. Decoding and
        JSON errors propagate to the caller.
        '''
        request = urllib2.Request(url)
        request.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
        # Only advertise encodings that _decodeBody can actually undo.
        request.add_header('Accept-Encoding', 'gzip, deflate')
        request.add_header('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6')
        request.add_header('Cache-Control', 'max-age=0')
        request.add_header('Connection', 'keep-alive')
        request.add_header('Host', 'query.sse.com.cn')
        request.add_header('Upgrade-Insecure-Requests', '1')
        if basicInfo:
            request.add_header('Referer', 'http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE=' + str(self.stockCode))
        else:
            request.add_header('Referer', 'http://www.sse.com.cn/assortment/stock/list/info/capital/index.shtml?COMPANY_CODE=' + str(self.stockCode))
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36')

        # Try up to five times; give up with a message and None if all fail.
        maxNum = 5
        raw = None
        contentEncoding = None
        for _ in range(maxNum):
            try:
                response = urllib2.urlopen(request, timeout=15)
                try:
                    raw = response.read()
                    contentEncoding = response.info().get('Content-Encoding')
                finally:
                    response.close()
            except Exception:
                # Network-level failure (timeout, refused, bad status line...):
                # retry. KeyboardInterrupt/SystemExit are deliberately not caught.
                continue
            # Throttle after a successful request so the server does not block us.
            sleep(5)
            break
        else:
            print('URLError: <urlopen error timed out> All times is failed ')
            return None

        text = self._decodeBody(raw, contentEncoding).decode('utf-8')
        payload = json.loads(self._stripJsonp(text), object_hook=JSONObject)
        data = getattr(payload, 'result', None)
        if not data:
            return '-'
        return data

    def __getBasicValue(self, key):
        '''Return field ``key`` of the company's basic information, or '-' if unavailable.'''
        if self.stockBasicInfo is None:
            # First use: download and cache the payload.
            try:
                rows = self.__getDatas(self.basicURLA)
                if rows == '-' or rows is None:
                    return '-'
                self.stockBasicInfo = self._toDict(rows[0])
            except Exception:
                return '-'
        value = self.stockBasicInfo.get(key)
        # A missing/null field must not leak None into string concatenations.
        return '-' if value is None else value

    def __getCapitalValue(self, key):
        '''Return field ``key`` of the company's capital structure, or '-' if unavailable.'''
        if self.stockCapitalInfo is None:
            # First use: download and cache the payload. Unlike the basic
            # endpoint, ``result`` here is a single object, not a list.
            try:
                data = self.__getDatas(self.capitalURL, basicInfo=False)
                if data == '-' or data is None:
                    return '-'
                self.stockCapitalInfo = self._toDict(data)
            except Exception:
                return '-'
        value = self.stockCapitalInfo.get(key)
        return '-' if value is None else value

    def __mergeBasicURL(self, sqlId, stockCode):
        '''Return the commonQuery.do URL for the given ``sqlId`` and ``stockCode``.'''
        return 'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback12345&isPagination=false&sqlId=' + sqlId + '&productid=' + str(stockCode) + '&_=14555555555552'
if __name__ == '__main__':
    # Demo: print every public indicator for each stock code in the range.
    # The loop variable is now actually used; previously one hard-coded code
    # was fetched on every iteration regardless of the range.
    for stockCode in range(600001, 600003):
        info = AchieveSSEStockInfo(stockCode)
        for methodName in info.__public__:
            getter = getattr(info, methodName)
            print('%s %s' % (methodName, getter()))
附錄:
1.使用requests庫抓取頁面的時候的編碼問題 https://segmentfault.com/q/1010000000341014
2.openpyxl參考手冊 http://openpyxl.readthedocs.io/en/default/ http://openpyxl.readthedocs.io/en/default/usage.html
3.urllib2使用 http://zhuoqiang.me/python-urllib2-usage.html#http
4.讀寫json數據 http://python3-cookbook.readthedocs.io/zh_CN/latest/c06/p02_read-write_json_data.html
5.python中 class 或對象屬性轉化成dict 、dict轉換成對象 http://blog.csdn.net/chenyulancn/article/details/8203763
6.【原創】說說JSON和JSONP,也許你會豁然開朗,含jQuery用例 http://www.cnblogs.com/dowinning/archive/2012/04/19/json-jsonp-jquery.html
7.Applying borders to a cell in OpenPyxl http://stackoverflow.com/questions/24917201/applying-borders-to-a-cell-in-openpyxl
後記:
目前上交所已經提供A股上市公司xls的下載了,雖然信息不太完整,連接地址http://query.sse.com.cn/security/stock/downloadStockListFile.do?csrcCode=&stockCode=&areaName=&stockType=1