#-------------------------python數據分析特徵預處理代碼示例----------------------------
import pandas as pd
import scipy.stats as ss
import numpy as np
from sklearn.feature_selection import SelectKBest,RFE,SelectFromModel
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
#--------------------特徵選擇-------------------
def main1():
    """Demonstrate three sklearn feature-selection strategies on toy data.

    Feature selection:
    1) fewer features -> lower dimensionality, stronger generalization,
       less overfitting;
    2) emphasizes the relationship between features and the target.
    """
    # Toy frame: three continuous features (A, B, C) and a binary target D.
    frame = pd.DataFrame({
        'A': ss.norm.rvs(size=10),
        'B': ss.norm.rvs(size=10),
        'C': ss.norm.rvs(size=10),
        'D': np.random.randint(low=0, high=2, size=10),
    })
    X = frame.loc[:, ['A', 'B', 'C']]
    Y = frame.loc[:, 'D']
    print('X', X)
    print('Y', Y)
    # Filter method: keep the two highest-scoring features.
    selector = SelectKBest(k=2)
    selector.fit(X, Y)
    print(selector.transform(X))
    # Wrapper method: recursive feature elimination — repeatedly fit a model
    # (here a linear SVR), drop the weakest feature, and refit on the rest;
    # the elimination order yields a feature ranking.
    eliminator = RFE(estimator=SVR(kernel='linear'), n_features_to_select=2, step=1)
    print(eliminator.fit_transform(X, Y))
    # Embedded method: drop features whose model importance is below threshold.
    embedded = SelectFromModel(estimator=DecisionTreeRegressor(), threshold=0.01)
    print(embedded.fit_transform(X, Y))
#--------------------除去異常值------------------
def main2():
    """Demonstrate missing-value, duplicate, and outlier handling with pandas."""
    df = pd.DataFrame({
        'A': ['a0', 'a1', 'a1', 'a2', 'a3', 'a4'],
        'B': ['b0', 'b1', 'b2', 'b2', 'b3', None],
        'C': [1, 2, None, 3, 4, 5],
        'D': [0.1, 10.2, 11.4, 8.9, 9.1, 12],
        'E': [10, 19, 32, 25, 8, None],
        'F': ["f0", "f1", "g2", "f3", "f4", "f5"],
    })
    print(df.isnull())  # True where a cell is missing, False otherwise
    print(df.dropna(subset=['B', 'C']))  # drop rows with NaN in column B or C
    # Mark duplicated values in A: the first occurrence is False, repeats are True.
    print(df.duplicated(['A'], keep='first'))
    # Drop rows that repeat the same (A, B) pair, keeping the first occurrence.
    print(df.drop_duplicates(['A', 'B'], keep='first', inplace=False))
    print(df['B'].fillna('b*'))  # fill missing B with a placeholder label
    print(df['B'].fillna(df['E'].mean()))  # fill missing B with the mean of E
    # Fill missing E by cubic spline interpolation/extrapolation.
    print(df['E'].interpolate(method='spline', order=3))
    # Tukey-fence outlier filter on D.
    # BUG FIX: the original used chained boolean indexing (df[m1][m2]) where
    # the second mask was built from the FULL frame; its index need not align
    # with the already-filtered frame, and modern pandas raises IndexingError
    # for unalignable boolean indexers. Combine both conditions with & instead.
    q1 = df['D'].quantile(0.25)
    q3 = df['D'].quantile(0.75)
    iqr = q3 - q1
    print(df[(df['D'] < q3 + 1.5 * iqr) & (df['D'] > q1 - 1.5 * iqr)])
    # Keep only rows whose F value starts with 'f'.
    print(df[[item.startswith('f') for item in df['F'].values]])
#-----------------------------數據處理-------------------------
def main3():
    """Demonstrate discretization, scaling, encoding, normalization, and LDA."""
    # ----- Discretization -----
    lst = [6, 8, 10, 15, 16, 24, 25, 40, 67]
    # Equal-frequency (quantile) binning; retbins=True also returns the edges.
    _, bins = pd.qcut(lst, q=3, retbins=True)
    print(list(bins))
    # e.g. [6.0, 13.333333333333332, 24.333333333333332, 67.0]
    # Equal-width binning into 3 intervals.
    print(pd.cut(lst, bins=3))
    # [(5.939, 26.333], ..., (26.333, 46.667], (46.667, 67.0]]
    # Equal-width binning with ordinal labels.
    print(pd.cut(lst, bins=4, labels=['low', 'medium', 'high', 'very high']))
    # [low, low, low, low, low, medium, medium, high, very high]
    # ----- Min-max scaling and standardization -----
    from sklearn.preprocessing import MinMaxScaler, StandardScaler
    print(MinMaxScaler().fit_transform(np.array([1.4, 10, 15, 21]).reshape(-1, 1)))
    print(StandardScaler().fit_transform(np.array([1, 1, 1, 1, 0, 0, 0, 0]).reshape(-1, 1)))
    print(StandardScaler().fit_transform(np.array([1, 0, 0, 0, 0, 0, 0, 0]).reshape(-1, 1)))
    # ----- Label / one-hot encoding -----
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    # BUG FIX: LabelEncoder expects a 1-D label array; the original passed an
    # (n, 1) column via reshape(-1, 1), which modern sklearn warns on/rejects.
    print(LabelEncoder().fit_transform(np.array(["Down", "Down", "Up", "Down", "Up"])))
    print(LabelEncoder().fit_transform(np.array(["Low", "Medium", "Low", "High", "Medium"])))
    # Results: [0 0 1 0 1] and [1 2 1 0 2]
    lb_encoder = LabelEncoder()
    lb_encoder = lb_encoder.fit(np.array(["Red", "Yellow", "Blue", "Green"]))
    lb_trans_f = lb_encoder.transform(np.array(["Red", "Yellow", "Blue", "Green"]))
    # OneHotEncoder does need a 2-D (n_samples, n_features) input.
    oht_enoder = OneHotEncoder().fit(lb_trans_f.reshape(-1, 1))
    print(oht_enoder.transform(lb_encoder.transform(np.array(["Red", "Blue"])).reshape(-1, 1)).toarray())
    # ----- Normalization (per-sample L1) -----
    from sklearn.preprocessing import Normalizer
    # BUG FIX: Normalizer requires a 2-D array (one row per sample); a flat
    # list raises ValueError in sklearn, so wrap the vector as a single row.
    print(Normalizer(norm='l1').fit_transform([[1, 1, 3, -1, 2]]))
    # ----- LDA dimensionality reduction / classification -----
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    y = np.array([0, 0, 0, 1, 1, 1])
    clf = LinearDiscriminantAnalysis()
    clf.fit(X, y)
    print(clf.predict([[-0.8, -1]]))
if __name__ == '__main__':
    # Run each demo section in order.
    for demo in (main1, main2, main3):
        demo()