簡單的數據處理過程:數據收集,數據整理,數據描述,數據分析
便捷數據獲取:
- 抓取:urllib內建模塊-urllib.request, requests第三方庫, scrapy框架
- 解析:BeautifulSoup庫,re模塊
- 數據獲取例子:
import pandas as pd
import requests
quotesdf = pd.read_csv('axp.csv')
print(quotesdf)
r = requests.get("https://")
r.text
數據準備
- 給djidf加列索引:使用.columns屬性
import requests
import re
import pandas as pd
def retrieve_dji_list():
    ...
    return dji_list
dji_list = retrieve_dji_list()
djidf = pd.DataFrame(dji_list)
cols = ['code', 'name', 'lasttrade']
djidf.columns = cols
print(djidf)
將時間轉換成普通時間:date.fromtimestamp(quotes[i]['date']),date.strftime(x,'%Y-%m-%d')
from datetime import date
def retrieve_quotes_historical(stock_code):
    ...
    return [item for item in quotes if 'type' not in item]
quotes = retrieve_quotes_historical('AXP')
list1 = []
for i in range(len(quotes)):
    #轉換成常規時間
    x = date.fromtimestamp(quotes[i]['date'])
    #轉換成固定格式的時間字符串
    y = date.strftime(x, '%Y-%m-%d')
    list1.append(y)
quotesdf_ori = pd.DataFrame(quotes, index=list1)
quotesdf_m = quotesdf_ori.drop(['unadjclose'], axis=1)
quotesdf = quotesdf_m.drop(['date'], axis=1)
print(quotesdf)
數據顯示:
list(djidf.index)  #顯示行索引
list(djidf.columns)  #顯示列索引
djidf.values  #顯示數據的值
djidf.describe()  #顯示數據的描述
djidf.lasttrade  #顯示lasttrade列的數據(含數據類型)
數據選擇:相似的選擇
djidf.loc[1:6, ['code', 'lasttrade']]  #按標籤選擇(包含結束標籤6)
djidf.iloc[1:6, [0, 2]]  #按位置選擇(不包含位置6)
#條件篩選
quotesdf[(quotesdf.index > '$value')]
簡單的統計與數據篩選:一般可以在值的裏面加篩選條件 [ ]
djidf.lasttrade.mean()
djidf[djidf.lasttrade >= 180].name  #條件篩選
GROUPING&MERGE
#APPEND---加行到DataFrame(注意:DataFrame.append在pandas 2.0中已移除,新版請改用pd.concat([data, data1]))
data.append(data1)
#CONCAT---連接pandas對象(pieceData為由多個pandas對象組成的列表)
pd.concat(pieceData)
#JOIN---SQL類型的連接
data.merge(sqlData)
高級數據可視化及其常用庫: 聚類分析,MATPLOTLIB繪圖應用和屬性控制,PANDAS作圖
#KMEANS通過SKLEARN和SCIPY這兩個庫中的包:
#sklearn.cluster和scipy.cluster.vq
import numpy as np
from scipy.cluster.vq import vq, kmeans, whiten
data = np.array([list1, list2, list3, list4, list5])
whitened = whiten(data)
centroids, _ = kmeans(whitened, 2)
result, _ = vq(whitened, centroids)
print(result)
import numpy as np
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2).fit(X)
pred = kmeans.predict(X)
#MATPLOTLIB繪圖應用和繪圖基礎:
#繪圖API-pyplot模塊和集成庫--pylab模塊(包含Numpy和pyplot中的常用函數)
import matplotlib.pyplot as plt
plt.plot(x, y, "colorAndStyle")
plt.bar(x, y, color='g')  #bar不接受格式字符串,需用color等關鍵字參數設置樣式
#添加文字和標題
plt.title('Stock Statistics of Coca-Cola')
plt.xlabel('Month')
plt.ylabel('Average Close Price')
#多圖
plt.plot(x, y, color='green', marker='o')
plt.plot(x1, y1, color='r', marker='o')
plt.savefig('1.jpg')
#pandas繪圖
data.plot(kind='bar', stacked=True)
data.plot(kind='pie', subplots=True, autopct='%.2f')
數據存取:csv和excel
#csv格式存取
data = pd.DataFrame(quotes)
data.to_csv('stockAXP.csv')
data = pd.read_csv('stockAXP.csv')
#excel的存取
data.to_excel('stockAXP.xlsx', sheet_name='AXP')
data = pd.read_excel('stockAXP.xlsx')
python的理工科應用和人文社科應用
#常見python的圖像處理庫:Pillow(PIL),OPENCV,Skimage
#常見的生物學工具:Biopython
#python人文社科類工具:NLTK語料庫(包含brown詞典,網絡和聊天腳本,路透社等)