sklearn 常用 API

特徵工程

1.標準化

from sklearn.preprocessing import StandardScaler

# Zero-mean, unit-variance scaling of each feature column.
scaler = StandardScaler()
data = scaler.fit_transform(data)

2.區間縮放

from sklearn.preprocessing import MinMaxScaler

# Rescale each feature into the [0, 1] interval.
mm_scaler = MinMaxScaler()
data = mm_scaler.fit_transform(data)
3.歸一化,便於計算梯度下降
from sklearn.preprocessing import Normalizer

# Scale each sample (row) to unit norm — convenient for gradient descent.
normalizer = Normalizer()
data = normalizer.fit_transform(data)
4.定量特徵二值化(大於 epsilon 的爲 1, 小於等於 epsilon 的爲 0)
from sklearn.preprocessing import Binarizer

# Values greater than `epsilon` become 1, the rest become 0.
binarizer = Binarizer(threshold=epsilon)
data = binarizer.fit_transform(data)
5.類別特徵轉換成數值特徵
from sklearn.feature_extraction import DictVectorizer

# Turn categorical features into numeric one-hot columns.
vec = DictVectorizer(sparse=False)
# orient="records" yields one dict per row, the format DictVectorizer expects.
# (Original had curly quotes and the typo "recoed", both of which break at runtime.)
data = vec.fit_transform(data.to_dict(orient="records"))
6.卡方檢驗,選擇最好的特徵
# SelectKBest and chi2 live in sklearn.feature_selection,
# not sklearn.feature_extraction as originally written.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the 10 features with the highest chi-squared scores.
skb = SelectKBest(chi2, k=10).fit(X, Y)
x_train = skb.transform(x_train)
7.互信息法
from sklearn.feature_selection import SelectKBest
from minepy import MINE  # original said "iNE"; the estimator class is MINE


def mic(x, y):
    """Return (MIC score, placeholder p-value) for one feature column.

    SelectKBest's score_func contract expects (score, p-value) pairs;
    MINE provides no p-value, so 0.5 is used as a neutral placeholder.
    """
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)

選擇K個最好的特徵,返回特徵選擇後的數據

import numpy as np

# Select the 2 best features ranked by MIC. The score_func must return
# (scores, pvalues); mic() yields one (score, pvalue) pair per column,
# so stack the pairs and transpose. (Original line had a "lambd" typo,
# mismatched parentheses, and undefined lowercase x/y.)
data = SelectKBest(
    lambda X, Y: np.array([mic(x, Y) for x in X.T]).T, k=2
).fit_transform(X, Y)

8.主成分分析

from sklearn.decomposition import PCA

# Project the data onto its first two principal components.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_data)

學習算法

1.劃分數據集和測試集

from sklearn.model_selection import train_test_split

# Hold out 25% of the samples as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25
)

2.交叉驗證集
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation; returns one score per fold.
scores = cross_val_score(clf, X, y=Y, cv=10)

3.訓練

# Generic training template: "learnAlgorithm" is a placeholder —
# substitute the estimator class you actually use (SVC, DecisionTreeClassifier, ...).
from sklearn import learnAlgorithm #placeholder: name of the chosen learning algorithm
la = learnAlgorithm()
la.fit(X_train, Y_train)
# score() reports the estimator's default metric (accuracy for classifiers).
score = la.score(X_test, Y_test)
4.隨機梯度下降
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor

# Classification trained by stochastic gradient descent.
clf = SGDClassifier()
clf.fit(X, y)

# Regression counterpart.
rlf = SGDRegressor()
rlf.fit(X, y)
5.支持向量機,分類和迴歸
from sklearn.svm import SVC

# Support vector machines for classification and regression.
# (Original used curly quotes around "linear", which is a syntax error.)
svc_linear = SVC(kernel="linear")  # kernel can also be "rbf", "poly", ...
from sklearn.svm import SVR
svm_linear = SVR(kernel="linear")
6.樸素貝葉斯
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes — suited to count features such as word counts.
mnb = MultinomialNB()
7.決策樹
from sklearn.tree import DecisionTreeClassifier

# Cap tree depth and minimum leaf size to limit overfitting.
# (Original used curly quotes around "entropy", which is a syntax error.)
clf = DecisionTreeClassifier(
    criterion="entropy", max_depth=4, min_samples_leaf=5
)
8.隨機森林
# Class name was misspelled "RandonForestClassifier" in the original.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=3, min_samples_leaf=5)
9.梯度提升樹
from sklearn.ensemble import GradientBoostingClassifier

# Boosted trees; shallow depth plus a minimum leaf size curb overfitting.
gbc = GradientBoostingClassifier(max_depth=3, min_samples_leaf=5)
10.極限迴歸森林
from sklearn.ensemble import ExtraTreesRegressor

模型評估

1.獲取精確率,召回率等等

from sklearn import metrics

# Overall accuracy on the test set.
accuracy_rate = metrics.accuracy_score(y_test, y_predict)
# Per-class precision, recall, F1, support. classification_report returns a
# string; the original discarded it, so bind it for printing/logging.
report = metrics.classification_report(
    y_test, y_predict, target_names=data.target_names
)

2.交叉驗證

from sklearn.model_selection import cross_val_score

# Evaluate the classifier with 10-fold cross-validation (one score per fold).
scores = cross_val_score(estimator=clf, X=X, y=Y, cv=10)

X, Y 爲 ndarray 類型, csv 讀取進來的數據可以用 df.values 來轉。

Pipeline 機制

Pipeline 機制實現了對全部步驟的流式化封裝和管理, 應用於參數集在數據集上的重複使用。Pipeline 對象接受二元 tuple 構成的 list: 第一個元素爲自定義名稱, 第二個元素爲 sklearn 中的 transformer 或 estimator, 即處理特徵和用於學習的方法。以樸素貝葉斯爲例, 根據處理特徵的不同方法有以下代碼:

# Each Pipeline chains a text vectorizer with multinomial naive Bayes.
# (Original used curly quotes around the step names — a syntax error.)
clf_1 = Pipeline([("count_vec", CountVectorizer()), ("mnb", MultinomialNB())])
# NOTE(review): non_negative was removed from HashingVectorizer in modern
# scikit-learn; on recent versions use alternate_sign=False instead.
clf_2 = Pipeline([("hash_vec", HashingVectorizer(non_negative=True)), ("mnb", MultinomialNB())])
clf_3 = Pipeline([("tfidf_vec", TfidfVectorizer()), ("mnb", MultinomialNB())])

特徵選擇

from sklearn import feature_selection

# Keep the top `per` percent of features ranked by chi-squared score.
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=per)
# Original passed "Xtrain", an undefined name; the variable is X_train.
X_train = fs.fit_transform(X_train, Y_train)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章