這幾天在看 sklearn 的文檔,發現它的分類器有很多,這裏做一些簡略的記錄。
大致可以將這些分類器分成兩類: 1)單一分類器,2)集成分類器
一、單一分類器
下面這個例子對一些常用分類器的效果做了比較(爲了便於對照,其中也包含了 RandomForest、AdaBoost 等 sklearn.ensemble 中的集成模型)
# Compare a set of scikit-learn classifiers on a synthetic blob dataset using
# cross-validated accuracy.

# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed from scikit-learn.
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs

# meta-estimator
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Short label -> configured estimator.
# The two SVC variants need distinct keys: with a repeated 'SVC' key the
# second entry silently replaces the first and the linear SVC is never run.
classifiers = {
    'KN': KNeighborsClassifier(3),
    'SVC-linear': SVC(kernel="linear", C=0.025),
    'SVC-rbf': SVC(gamma=2, C=1),
    'DT': DecisionTreeClassifier(max_depth=5),
    # clf.feature_importances_
    'RF': RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1),
    # clf.feature_importances_
    'ET': ExtraTreesClassifier(n_estimators=10, max_depth=None),
    'AB': AdaBoostClassifier(n_estimators=100),
    # clf.feature_importances_
    'GB': GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                     max_depth=1, random_state=0),
    'GNB': GaussianNB(),
    'LD': LinearDiscriminantAnalysis(),
    'QD': QuadraticDiscriminantAnalysis(),
}

# Synthetic data: 10000 samples, 10 features, 100 cluster centers (classes).
X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)

# Report the mean cross-validation score of every classifier.
for name, clf in classifiers.items():
    scores = cross_val_score(clf, X, y)
    print(name, '\t--> ', scores.mean())
下圖是效果圖:
二、集成分類器
這裏介紹四種元估計器(meta-estimator):Bagging, Voting, GridSearch, PipeLine。嚴格來說,只有 Bagging 和 Voting 是集成分類器;GridSearch 是超參數搜索,最後一個PipeLine其實是管道技術
1.Bagging
# Wrap a k-nearest-neighbours classifier in a bagging meta-estimator.
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Base estimator that each bagged member is built from.
meta_clf = KNeighborsClassifier()

# Each member is given max_samples=0.5 and max_features=0.5.
bagging_options = {"max_samples": 0.5, "max_features": 0.5}
bg_clf = BaggingClassifier(meta_clf, **bagging_options)
2.Voting
# Compare three individual classifiers against a weighted hard-voting
# ensemble of the same three, using 5-fold cross-validated accuracy on iris.
from sklearn import datasets
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed from scikit-learn.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
# Keep only feature columns 1 and 2.
X, y = iris.data[:, 1:3], iris.target

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

# Hard (majority) voting; votes are weighted 2:1:2 for lr:rf:gnb.
eclf = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    voting='hard',
    weights=[2, 1, 2],
)

labels = ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']
for clf, label in zip([clf1, clf2, clf3, eclf], labels):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
3.GridSearch
# Tune a random forest on the digits dataset with both a randomized search
# and an exhaustive grid search, timing each.

# `time` and `sp_randint` are used below and must be imported explicitly.
from time import time

import numpy as np
from scipy.stats import randint as sp_randint
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
# The search classes live in sklearn.model_selection; the old
# sklearn.grid_search module was removed from scikit-learn.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Load the data
digits = load_digits()
X, y = digits.data, digits.target

# Base classifier whose hyper-parameters are searched
meta_clf = RandomForestClassifier(n_estimators=20)

# =================================================================
# Parameter distributions to sample from.
# min_samples_split starts at 2: an integer value of 1 is rejected by
# the tree estimators.
param_dist = {
    "max_depth": [3, None],
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

# Run the randomized search
n_iter_search = 20
rs_clf = RandomizedSearchCV(meta_clf, param_distributions=param_dist,
                            n_iter=n_iter_search)

start = time()
rs_clf.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# cv_results_ replaces the removed grid_scores_ attribute.
print(rs_clf.cv_results_)

# =================================================================
# Explicit parameter grid (min_samples_split must be >= 2, see above).
param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

# Run the exhaustive grid search
gs_clf = GridSearchCV(meta_clf, param_grid=param_grid)
start = time()
gs_clf.fit(X, y)
# cv_results_['params'] holds one entry per evaluated candidate.
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(gs_clf.cv_results_['params'])))
print(gs_clf.cv_results_)
4.PipeLine
第一個例子
# Pipeline example 1: univariate feature selection followed by a linear SVM.
from sklearn import svm
# make_classification is imported from sklearn.datasets directly; the
# sklearn.datasets.samples_generator module was removed from scikit-learn.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
# f_classif is the ANOVA F-test for class labels, matching the
# classification target used here (f_regression is for continuous targets).
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline

# Generate the data
X, y = make_classification(n_informative=5, n_redundant=0, random_state=42)

# Define the pipeline: ANOVA feature selection first, then the SVM
anova_filter = SelectKBest(f_classif, k=5)
clf = svm.SVC(kernel='linear')
pipe = Pipeline([('anova', anova_filter), ('svc', clf)])

# Set anova's parameter k=10 and svc's parameter C=0.1
# (step name and parameter name are joined with a double underscore "__")
pipe.set_params(anova__k=10, svc__C=.1)
pipe.fit(X, y)

prediction = pipe.predict(X)
# Print the score; in a script a bare pipe.score(...) call shows nothing.
print(pipe.score(X, y))

# Boolean mask of the features selected by anova_filter
s = pipe.named_steps['anova'].get_support()
print(s)
第二個例子
# Pipeline example 2: grid-search over a PCA + logistic-regression pipeline.
import numpy as np
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
# GridSearchCV lives in sklearn.model_selection; the old
# sklearn.grid_search module was removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

# Define the pipeline: dimensionality reduction (PCA) first, then
# logistic regression
pca = decomposition.PCA()
logistic = linear_model.LogisticRegression()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# Use the pipeline itself as the estimator of the grid search; parameters of
# a step are addressed as "<step name>__<parameter name>".
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs))
estimator.fit(X_digits, y_digits)

# Show the outcome of the search so the example produces visible output.
print(estimator.best_params_)
print(estimator.best_score_)