代碼 | merge(
# --- merge ---------------------------------------------------------------
# Two frames sharing a "key" column. With no `on=` argument pd.merge joins
# on the columns the frames have in common, and `how` defaults to 'inner'.
df1 = DataFrame({'key': ['b', 'd', 'b', 'g', 'd', 'u', 'a'], 'data1': range(7)})
df2 = DataFrame({'key': ['b', 'd', 's'], 'data2': range(3)})
df3 = pd.merge(df1, df2)

# Outer join: union of the keys from both frames.
print(pd.merge(df1, df2, how='outer'))
# Left join: keep every row of df1.
print(pd.merge(df1, df2, how='left'))
# Right join: keep every row of df2.
print(pd.merge(df1, df2, how='right'))

# --- join (a left join by default) ----------------------------------------
# NOTE(review): this cell is labelled "join", but the code below repeats the
# pd.merge calls from above (with separators added); DataFrame.join itself
# is never called here.
df1 = DataFrame({'key': ['b', 'd', 'b', 'g', 'd', 'u', 'a'], 'data1': range(7)})
df2 = DataFrame({'key': ['b', 'd', 's'], 'data2': range(3)})
df3 = pd.merge(df1, df2)
print('*******************我是分隔符************************')
# Outer join: union of the keys.
print(pd.merge(df1, df2, how='outer'))
# Left join.
print('*******************我是分隔符************************')
print(pd.merge(df1, df2, how='left'))
# Right join.
print('*******************我是分隔符************************')
print(pd.merge(df1, df2, how='right'))

# --- concat (axis-wise concatenation) -------------------------------------
# Author's note: when there is no index, concat does not look at column
# names and simply glues the data together; it can append below or to the
# right. axis=0 appends below, axis=1 appends to the right. The left-hand
# structure is unchanged; what changes is the right-hand one.
# NOTE(review): pandas.concat aligns on the labels of the other axis, so the
# "ignores column names" claim should be confirmed against the pandas docs.
# --- removing duplicate rows ------------------------------------------------
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [1, 1, 2, 3, 3, 4, 4]})
print(data)
# Boolean Series: True for each row that repeats an earlier row.
print(data.duplicated())

# Add a column with distinct values, then de-duplicate on 'k1' only.
data['k3'] = range(7)
# Keeps the first row seen for each 'k1' value.
print(data.drop_duplicates(subset='k1'))
# keep='last' keeps the last row seen for each 'k1' value instead.
print(data.drop_duplicates(subset='k1', keep='last'))
# --- replacing values ---------------------------------------------------------
s1 = Series([1, 999, -1000, 2, 999])
print(s1)

# Replace a single value.
print("*************repalce替換一個值*************")
print(s1.replace(999, np.nan))

# Replace several values, each with its own substitute (paired by position).
print("*************repalce對不同的值進行不同的替換*************")
print(s1.replace([999, -1000], [np.nan, 'None']))

# Same idea, with the mapping given as a dict {old: new}.
print("*************repalce參數形式爲字典*************")
print(s1.replace({999: np.nan, -1000: '空值啦'}))
# --- renaming axis labels -----------------------------------------------------
data = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'NewYork'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)

print('**************此種方式是就地修改原始數據********************')
# Assigning to .index changes `data` itself.
data.index = data.index.map(str.upper)
print(data)

print('**************這種是通過創建數據的轉換版********************')
# rename() returns a transformed copy; pass inplace=True to modify `data`
# in place instead.
print(data.rename(index=str.title, columns=str.upper))

print('**************還是那個原始數據********************')
# `data` is unaffected by the rename() call above.
print(data)

print('**************結合字典類型對象實現對部分軸標籤的更新********************')
# Dict arguments update only the labels that appear as keys.
print(data.rename(index={'OHIO': 'Beijing', 'COLORADO': 'Shanghai'},
                  columns={'one': 'first', 'two': 'second'}))
# --- discretization and binning -------------------------------------------------
ages = [20, 22, 25, 27, 31, 21, 23, 37, 61, 41, 32]
# Bin edges; named `bins` so the builtin bin() is not shadowed.
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print('****************數據劃分***********************')
print(cats)
print('****************以年齡數據進行分組標號******************')
# Integer code of the bin each age falls into.
print(cats.codes)
print('****************統計分組組次和頻數******************')
# Series.value_counts replaces the deprecated top-level pd.value_counts.
print(pd.Series(cats).value_counts())

group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
named_cats = pd.cut(ages, bins, labels=group_names)
print('**************設置自己的面元(分組)名稱***************')
print(named_cats)
print('**********統計設置自己的面元(分組)名稱後的組次和頻數***********')
print(pd.Series(named_cats).value_counts())

print('**************cut中傳入面元數量而不是面元名稱************')
# np.random.rand draws from the uniform distribution on [0, 1)
# (not a normal distribution).
data = np.random.rand(20)
# An integer second argument asks for that many equal-width bins.
data_cut = pd.cut(data, 4, precision=5)
print(data_cut)
print('****************統計分組組次和頻數******************')
print(pd.Series(data_cut).value_counts())

# np.random.randn draws from the standard normal distribution.
data1 = np.random.randn(1000)
# qcut bins by sample quantiles; 4 means quartiles.
data1_cut = pd.qcut(data1, 4)
print('****************按四分位數進行切割******************')
print(data1_cut)
print('****************統計分組組次和頻數******************')
print(pd.Series(data1_cut).value_counts())

print('****************設置自定義分位數******************')
# Custom quantile edges (values between 0 and 1, inclusive).
data1_cut1 = pd.qcut(data1, [0, 0.1, 0.5, 0.9, 1])
print(data1_cut1)
print('****************統計分組組次和頻數******************')
print(pd.Series(data1_cut1).value_counts())