Python data processing

#------------------------- Python data analysis: feature preprocessing code examples ----------------------------
import pandas as pd
import scipy.stats as ss
import numpy as np

from sklearn.feature_selection import SelectKBest,RFE,SelectFromModel
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

#-------------------- Feature selection -------------------
def main1():
    df=pd.DataFrame({'A':ss.norm.rvs(size=10),'B':ss.norm.rvs(size=10),
                     'C':ss.norm.rvs(size=10),'D':np.random.randint(low=0,high=2,size=10)})
    #print(df)
    X=df.loc[:,['A','B','C']]
    Y=df.loc[:,'D']
    print('X',X)
    print('Y',Y)

    # Feature selection:
    # 1) Reduce the number of features (dimensionality reduction) so the model
    #    generalizes better and overfits less;
    # 2) Make the relationship between the features and the target easier to understand.

    # Keep the two highest-scoring features (the default score function is f_classif)
    skb=SelectKBest(k=2)
    skb.fit(X,Y)
    print(skb.transform(X))
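    # A small sketch (assuming the default score_func=f_classif) of how to see
    # why these two columns were kept: the per-feature scores and the selection mask.
    print(skb.scores_)        # ANOVA F-scores for A, B, C
    print(skb.get_support())  # True for the two selected columns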

    # Recursive feature elimination (RFE): repeatedly fit a model (SVM, regression, ...),
    # rank the features, remove the least important one, and repeat on the remaining
    # features until the requested number is left; the order in which features are
    # eliminated gives the feature ranking.
    rfe=RFE(estimator=SVR(kernel='linear'),n_features_to_select=2,step=1)
    print(rfe.fit_transform(X,Y))
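    # Sketch: inspect the elimination order RFE produced; selected features get
    # rank 1, and a larger rank means the feature was eliminated earlier.
    print(rfe.ranking_)
    print(rfe.support_)  # boolean mask of the two features that were kept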

    # Fit a model, compare each feature's importance with the threshold,
    # and drop the features whose importance falls below it
    sfm=SelectFromModel(estimator=DecisionTreeRegressor(),threshold=0.01)
    print(sfm.fit_transform(X,Y))
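    # Sketch: SelectFromModel keeps features whose importance in the fitted tree
    # is >= threshold; the fitted estimator is available for inspection.
    print(sfm.estimator_.feature_importances_)
    print(sfm.get_support())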


#-------------------- Removing missing values, duplicates and outliers ------------------
def main2():
    df=pd.DataFrame({'A':['a0','a1','a1','a2','a3','a4'],
                     'B':['b0','b1','b2','b2','b3',None],
                     'C':[1,2,None,3,4,5],
                     'D':[0.1, 10.2, 11.4, 8.9, 9.1, 12],
                     'E':[10, 19, 32, 25, 8, None],
                     'F':["f0", "f1", "g2", "f3", "f4", "f5"]})
    print(df.isnull())# True where a value is missing (NaN/None), False otherwise
    print(df.dropna(subset=['B','C']))# Drop rows that have a missing value in column B or C
    print(df.duplicated(['A'],keep='first'))# Mark duplicates in column A: the first occurrence is False, later repeats are True
    print(df.drop_duplicates(['A','B'],keep='first',inplace=False))# Drop rows duplicated on the (A,B) pair, keeping the first occurrence
    print(df['B'].fillna('b*'))# Fill missing values in B with the placeholder 'b*'
    print(df['E'].fillna(df['E'].mean()))# Fill the missing value in E with the column mean
    print(df['E'].interpolate(method='spline',order=3))# Fill the missing value in E by cubic spline interpolation (needs scipy)
    # Keep only rows whose D value lies inside the Tukey fences Q1 - 1.5*IQR < D < Q3 + 1.5*IQR
    print(df[(df['D']<df['D'].quantile(0.75)+1.5*(df['D'].quantile(0.75)-df['D'].quantile(0.25)))
             &(df['D']>df['D'].quantile(0.25)-1.5*(df['D'].quantile(0.75)-df['D'].quantile(0.25)))])
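    # The same IQR rule as a sketch with named intermediates, equivalent to the expression above:
    q1, q3 = df['D'].quantile(0.25), df['D'].quantile(0.75)
    iqr = q3 - q1
    print(df[(df['D'] > q1 - 1.5 * iqr) & (df['D'] < q3 + 1.5 * iqr)])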
    print(df[[True if item.startswith('f') else False for item in list(df['F'].values)]])# Keep rows whose F value starts with 'f'
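    # A more idiomatic equivalent of the filter above (assumes F has no missing values):
    print(df[df['F'].str.startswith('f')])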

#----------------------------- Data processing -------------------------
def main3():

    # Discretization (binning)
    lst=[6,8,10,15,16,24,25,40,67]
    binings,bins=pd.qcut(lst,q=3,retbins=True)# Equal-frequency ("equal-depth") binning into 3 bins
    print(list(bins))
    # Result: [6.0, 13.333333333333332, 24.333333333333332, 67.0]
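    # Sketch: confirm the binning is equal-frequency -- each of the 3 bins holds 3 of the 9 values.
    print(pd.Series(binings).value_counts())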
    print(pd.cut(lst,bins=3))# Equal-width binning into 3 bins
    #[(5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333],
    # (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (26.333, 46.667], (46.667, 67.0]]
    # Categories (3, interval[float64]): [(5.939, 26.333] < (26.333, 46.667] < (46.667, 67.0]]
    print(pd.cut(lst,bins=4,labels=['low','medium','high','very high']))
    #[low, low, low, low, low, medium, medium, high, very high]
    # Categories (4, object): [low < medium < high < very high]

    # Normalization (min-max) and standardization (z-score)
    from sklearn.preprocessing import MinMaxScaler,StandardScaler
    print(MinMaxScaler().fit_transform(np.array([1.4,10,15,21]).reshape(-1,1)))# Scale the column to [0, 1]
    print(StandardScaler().fit_transform(np.array([1,1,1,1,0,0,0,0]).reshape(-1,1)))# Zero mean, unit variance
    print(StandardScaler().fit_transform(np.array([1,0,0,0,0,0,0,0]).reshape(-1,1)))
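    # Sketch of the formulas behind the two scalers, applied to the first column above:
    # min-max maps x to (x - min) / (max - min); z-score maps x to (x - mean) / std.
    x = np.array([1.4, 10, 15, 21])
    print((x - x.min()) / (x.max() - x.min()))  # same values as MinMaxScaler produces
    print((x - x.mean()) / x.std())             # population std (ddof=0), as StandardScaler uses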

    # One-hot encoding
    from sklearn.preprocessing import LabelEncoder,OneHotEncoder
    print(LabelEncoder().fit_transform(np.array(["Down", "Down", "Up", "Down", "Up"])))# LabelEncoder expects a 1-D label array
    print(LabelEncoder().fit_transform(np.array(["Low", "Medium", "Low", "High", "Medium"])))
    # Result:
        #[0 0 1 0 1]
        #[1 2 1 0 2]
    # Encode the colour labels as integers first, then one-hot encode those integers
    lb_encoder = LabelEncoder()
    lb_encoder = lb_encoder.fit(np.array(["Red", "Yellow", "Blue", "Green"]))
    lb_trans_f = lb_encoder.transform(np.array(["Red", "Yellow", "Blue", "Green"]))
    oht_encoder = OneHotEncoder().fit(lb_trans_f.reshape(-1, 1))
    print(oht_encoder.transform(lb_encoder.transform(np.array(["Red", "Blue"])).reshape(-1, 1)).toarray())
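    # Sketch: in scikit-learn >= 0.20 OneHotEncoder can encode string categories
    # directly, so the LabelEncoder detour above is optional.
    colour_enc = OneHotEncoder().fit(np.array(["Red", "Yellow", "Blue", "Green"]).reshape(-1, 1))
    print(colour_enc.transform(np.array([["Red"], ["Blue"]])).toarray())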

    # Normalization (scale each sample/row vector)
    from sklearn.preprocessing import Normalizer
    print(Normalizer(norm='l1').fit_transform(np.array([[1, 1, 3, -1, 2]])))# Normalizer expects a 2-D array of samples
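    # Sketch: for norm='l1' each sample (row) is divided by the sum of its absolute
    # values, so the transformed row's absolute values sum to 1.
    v = np.array([1, 1, 3, -1, 2], dtype=float)
    print(v / np.abs(v).sum())  # matches the Normalizer output above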

    # LDA (Linear Discriminant Analysis) for dimensionality reduction / classification
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    y = np.array([0, 0, 0, 1, 1, 1])
    clf = LinearDiscriminantAnalysis()
    clf.fit(X, y)
    print(clf.predict([[-0.8, -1]]))
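    # Sketch: LDA can also project the data onto at most n_classes - 1 axes;
    # with two classes that is a single discriminant component.
    lda = LinearDiscriminantAnalysis(n_components=1)
    print(lda.fit_transform(X, y))  # the 6 samples reduced to 1 dimension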
    
if __name__=='__main__':
    main1()
    main2()
    main3()

 
