# -*- coding: GB2312 -*-
import requests
from bs4 import BeautifulSoup
import csv
import time
import codecs
# Fetch data
def getHTML(url):
    """Fetch *url* and return the response body as text.

    The endpoint serves GB2312-encoded pages (``apparent_encoding``
    reports GB2312) but does not reliably declare it in the headers,
    so we explicitly adopt the detected encoding before reading
    ``.text`` — otherwise requests falls back to ISO-8859-1 and the
    Chinese text is garbled.

    :param url: URL to fetch
    :return: decoded response body (str)
    """
    response = requests.get(url)
    # e.g. GB2312 for this endpoint; trust content detection over headers
    response.encoding = response.apparent_encoding
    return response.text
# Process data
def writeFile(writer, res):
    """Extract the record array from *res* and append one CSV row per record.

    *res* looks like ``var TbrNdpvg={pages:401,data:["f1,f2,...", ...]}``;
    each double-quoted element of ``data`` is one comma-separated record.

    :param writer: an open ``csv.writer`` that rows are appended to
    :param res: raw response text containing the bracketed data array
    """
    start_idx = res.find("[")
    end_idx = res.find("]")
    # Slice out the array interior.  The original code used ``end_idx - 1``,
    # which also drops the closing quote of the last record — harmless,
    # since all quotes are stripped below; kept for identical behavior.
    body = res[start_idx + 1:end_idx - 1]
    # Split on the quote+comma record separator, then strip remaining quotes.
    records = [item.replace('"', '') for item in body.split('",')]
    for record in records:
        fields = record.split(',')
        # Trailing tab stops spreadsheet apps from showing the first
        # column (stock code) in scientific notation.
        fields[0] = fields[0] + '\t'
        writer.writerow(fields)
# Fetch 40 pages of holder-reduction data and append every row to one CSV.
# The file is opened ONCE with a context manager (the original re-opened it
# in append mode on every iteration and never closed it — a handle leak).
with codecs.open('d:/Aa.csv', 'a', encoding='utf8') as csvFile:  # codecs.open() prevents mojibake on write
    writer = csv.writer(csvFile)
    for page in range(40):
        # NOTE(review): the original URL literally contained '¶m=' — the
        # HTML entity for '&para' swallowed from '&param='; restored here.
        # NOTE(review): pages start at 0 here; confirm the API is not 1-based.
        url = ("http://data.eastmoney.com/DataCenter_V3/gdzjc.ashx"
               "?pagesize=500&page=" + str(page) +
               "&js=var%20UWExJjvK&param=&sortRule=-1&sortType=BDJZ"
               "&tabid=jjc&code=&name=&rt=50815994")
        print(url)
        res = getHTML(url)
        time.sleep(1)  # throttle: one request per second
        writeFile(writer, res)