筆記 Data Processing Using Python 4(基本數據統計和數據可視化)

簡單的數據處理過程:數據收集,數據整理,數據描述,數據分析。

便捷數據獲取:

  1. 抓取:urllib內建模塊-urllib.request, requests第三方庫, scrapy框架
  2. 解析:BeautifulSoup庫,re模塊
  3. 數據獲取例子:

 

import pandas as pd
quotesdf = pd.read_csv('axp.csv')
print(quotesdf)
r = requests.get("https://")
r.text

數據準備

  1. 給djidf加列索引:使用關鍵字 .columns
import requests
import re
import pandas as pd
def retrieve_dji_list():
    ...
    return dji_list
dji_list = retrieve_dji_list()
djidf = pd.DataFrame(dji_list)
cols = ['code','name','lasttrade']
djidf.columns = cols
print(djidf)

將時間轉換成普通時間:date.fromtimestamp(quotes[i]['date']),date.strftime(x,'%Y-%m-%d')

 

def retrieve_quotes_historical(stock_code):
    ...
    return [item for item in quotes if not 'type' in item]
quotes = retrieve_quotes_historical('AXP')
list1 = []
for i in range(len(quotes)):
    #轉換成常規時間
    x = date.fromtimestamp(quotes[i]['date'])
    #轉換成固定格式的時間字符串
    y = date.strftime(x, '%Y-%m-%d')
    list1.append(y)
quotesdf_ori = pd.DataFrame(quotes, index=list1)
quotesdf_m = quotesdf_ori.drop(['unadjclose'], axis=1)
quotesdf = quotesdf_m.drop(['date'], axis=1)
print(quotesdf)

 

數據顯示:

 

list(djidf.index)    #顯示行索引
list(djidf.columns)  #顯示列索引
djidf.values         #顯示數據的值
djidf.describe()     #顯示數據的描述
djidf.lasttrade      #顯示數據的格式

數據選擇:相似的選擇

 

djidf.loc[1:6, ['code','lasttrade']]
djidf.iloc[1:6, [0,2]]
#條件篩選
quotesdf[(quotesdf.index > '$value')]

簡單的統計與數據篩選:一般可以在值的裏面加篩選條件 [ ]

 

djidf.lasttrade.mean()
djidf[djidf.lasttrade >= 180].name  #條件篩選

GROUPING&MERGE

 

#APPEND---加行到DataFrame(注意:DataFrame.append 在 pandas 2.0 已移除,新版請改用 pd.concat)
data.append(data1)
#Concat---連接pandas對象
pd.concat([data, pieceData])
#JOIN---SQL類型的連接
data.merge(sqlData)

高級數據可視化及其常用庫: 聚類分析,Matplotlib繪圖應用和屬性控制,pandas作圖

 

#KMEANS通過SKLEARN和SCIPY這兩個庫中的包:
    #sklearn.cluster和scipy.cluster.vq
import numpy as np
from scipy.cluster.vq import vq, kmeans, whiten
data = np.array([list1, list2, list3, list4, list5])
whitened = whiten(data)
centroids, _ = kmeans(whitened, 2)
result, _ = vq(whitened, centroids)
print(result)
import numpy as np
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2).fit(X)
pred = kmeans.predict(X)
#MATPLOTLIB繪圖應用和繪圖基礎:
#繪圖API-pyplot模塊和集成庫--pylab模塊(包含Numpy和pyplot中的常用函數)
plt.plot(x,y,"colorAndstyle")
plt.bar(x,y,"colorAndStyle")
#添加文字和標題
plt.title('Stock Statistics of Coca-Cola')
plt.xlabel('Month')
plt.ylabel('Average Close Price')
#多圖
plt.plot(x,y,color='green',marker='o')
plt.plot(x1,y1,color='r',marker='o')
plt.savefig('1.jpg')
#pandas繪圖
data.plot(kind='bar',stacked=True)
data.plot(kind='pie',subplots=True,autopct='%.2f')

數據存取:csv和excel

 

#csv格式存取
data = pd.DataFrame(quotes)
data.to_csv('stockAXP.csv')
data = pd.read_csv('stockAXP.csv')
#excel的存取
data.to_excel('stockAXP.xlsx', sheet_name='AXP')
data = pd.read_excel('stockAXP.xlsx')

python的理工科應用和人文社科應用

 

#常見python的圖像處理庫:Pillow(PIL),OPENCV,Skimage
#常見的生物學工具:Biopython
#python人文社科類工具:NLTK語料庫(包含brown詞典,網絡和聊天腳本,路透社等)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章