- 特徵抽取:對特徵進行向量化:可只根據詞頻(CountVectorizer),或同時根據詞頻和文檔頻率(TfidfVectorizer);並可選擇是否去除停用詞。stop_words='english' 表示去除英語中常見的停用詞。
# Feature extraction from a list of dicts with DictVectorizer: the list of
# dict records is converted into a numeric numpy array.
from sklearn.feature_extraction import DictVectorizer

# Temperatures are floats, not strings: a string value such as '33.' is treated
# as a category and one-hot encoded instead of being kept as a number.
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Fransisco', 'temperature': 18.},
]
vec = DictVectorizer()
# The results are printed explicitly; a bare expression only shows its value
# in an interactive session and is silently discarded in a script.
print(vec.fit_transform(measurements).toarray())
# Names of the generated feature columns.
print(vec.feature_names_)
# Vectorise the 20 newsgroups texts with CountVectorizer (term counts only)
# and classify them with multinomial naive Bayes.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

news = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

# Term counts only; no stop-word list is passed, so stop words are kept.
count_vec = CountVectorizer()
X_count_train = count_vec.fit_transform(X_train)
# The test texts are only transformed with the vectorizer fitted on the training texts.
X_count_test = count_vec.transform(X_test)

mnb_count = MultinomialNB()
mnb_count.fit(X_count_train, y_train)
mnb_count_y_predict = mnb_count.predict(X_count_test)

# Single-argument print(...) behaves identically on Python 2 and Python 3,
# unlike the Python-2-only print statement.
print('the accuracy : {}'.format(mnb_count.score(X_count_test, y_test)))
print(classification_report(y_test, mnb_count_y_predict, target_names=news.target_names))
# Vectorise the 20 newsgroups texts with TfidfVectorizer (term frequency
# weighted by document frequency) and classify them with multinomial naive Bayes.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

news = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

# Term frequency and document frequency; no stop-word list is passed,
# so stop words are kept.
tfi_vec = TfidfVectorizer()
X_tfi_train = tfi_vec.fit_transform(X_train)
# The test texts are only transformed with the vectorizer fitted on the training texts.
X_tfi_test = tfi_vec.transform(X_test)

mnb_tfi = MultinomialNB()
mnb_tfi.fit(X_tfi_train, y_train)
mnb_tfi_y_predict = mnb_tfi.predict(X_tfi_test)

# Single-argument print(...) behaves identically on Python 2 and Python 3,
# unlike the Python-2-only print statement.
print('the accuracy : {}'.format(mnb_tfi.score(X_tfi_test, y_test)))
print(classification_report(y_test, mnb_tfi_y_predict, target_names=news.target_names))
# Comparison run with stop-word filtering enabled.
# stop_words='english' selects the library's built-in English stop-word list.
from sklearn.metrics import classification_report

count_filter_vec = CountVectorizer(analyzer='word', stop_words='english')
tfi_filter_vec = TfidfVectorizer(analyzer='word', stop_words='english')

# CountVectorizer with stop-word filtering.
X_count_filter_train = count_filter_vec.fit_transform(X_train)
X_count_filter_test = count_filter_vec.transform(X_test)

# TfidfVectorizer with stop-word filtering.
X_tfi_filter_train = tfi_filter_vec.fit_transform(X_train)
X_tfi_filter_test = tfi_filter_vec.transform(X_test)

mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(X_count_filter_train, y_train)
y_count_predict = mnb_count_filter.predict(X_count_filter_test)

mnb_tfi_filter = MultinomialNB()
mnb_tfi_filter.fit(X_tfi_filter_train, y_train)
y_tfi_predict = mnb_tfi_filter.predict(X_tfi_filter_test)

# Single-argument print(...) behaves identically on Python 2 and Python 3,
# unlike the Python-2-only print statement.
print('the accuracy : {}'.format(mnb_count_filter.score(X_count_filter_test, y_test)))
print(classification_report(y_test, y_count_predict, target_names=news.target_names))
print('the accuracy : {}'.format(mnb_tfi_filter.score(X_tfi_filter_test, y_test)))
print(classification_report(y_test, y_tfi_predict, target_names=news.target_names))
# Author's observation: accuracy improved once stop words were filtered out.
- 特徵篩選:選擇不同比例的特徵進行測試,選擇效果最好的特徵。chi2是卡方檢驗。
# Load the Titanic data, vectorise it with DictVectorizer and fit a baseline
# decision tree on all features.
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
y = titanic['survived']
X = titanic.drop(['row.names', 'name', 'survived'], axis=1)

# Fill missing ages with the mean age. Assigning the result back avoids the
# chained `X['age'].fillna(..., inplace=True)` form, which may operate on a
# temporary copy and leave X unchanged.
X['age'] = X['age'].fillna(X['age'].mean())
# Every remaining missing value becomes the placeholder 'UNKNOWN'.
X = X.fillna('UNKNOWN')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

vec = DictVectorizer()
# 'records' is the full spelling of the orient name; the abbreviation 'record'
# is deprecated in pandas.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
# Number of feature columns after vectorisation.
print(len(vec.feature_names_))

# Predict with a decision tree trained on every feature.
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
y_predict = dt.predict(X_test)
# Printed explicitly; a bare expression is discarded outside an interactive session.
print(dt.score(X_test, y_test))
# Feature selection: keep only the most informative features.
# Author's debugging note: the "x and y first dimension mismatch" error was
# caused by typing `result` instead of `results`.
import numpy as np
from sklearn import feature_selection

# Keep the top 20% of features. chi2 is the chi-squared test, used here to
# score each single feature against the class label.
fs = feature_selection.SelectPercentile(
    feature_selection.chi2,
    percentile=20,
)
X_fs_train = fs.fit_transform(X_train, y_train)
X_fs_test = fs.transform(X_test)

# Refit the decision tree on the reduced feature set and score it on the
# matching reduced test set.
dt.fit(X_fs_train, y_train)
dt.score(X_fs_test, y_test)
# Sweep the percentage of features kept and record the cross-validated
# accuracy of the decision tree for each setting.
from sklearn.model_selection import cross_val_score

percentiles = np.arange(1, 100, 2)
mean_scores = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    # cv=5: five folds, each used once as the validation fold while the
    # remaining four are used for training.
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    # Keep the mean score across the folds for this percentile.
    mean_scores.append(scores.mean())

# Later code calls results.reshape(...), so results must be a numpy array.
results = np.array(mean_scores)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(results)

# Indices of the best mean score (all of them if there is a tie).
opt = np.where(results == results.max())[0]
print('Opt: {}'.format(percentiles[opt]))
# Plot the cross-validated accuracy against the percentage of features kept.
import pylab as pl

# Both series are turned into column vectors before plotting.
results = results.reshape(-1, 1)
percentiles = percentiles.reshape(-1, 1)

pl.plot(percentiles, results)
pl.ylabel('accuracy')
pl.xlabel('%%percentiles of features')
pl.show()
# Refit with the top 7% of features and score the tree on the test set.
from sklearn import feature_selection

fs = feature_selection.SelectPercentile(
    feature_selection.chi2,
    percentile=7,
)
X_train_fs = fs.fit_transform(X_train, y_train)
X_test_fs = fs.transform(X_test)

dt.fit(X_train_fs, y_train)
dt.score(X_test_fs, y_test)
- 模型正則化:欠擬合,過擬合。爲了防止過擬合,使用L1正則(加入參數w的絕對值約束,即Lasso),或者使用L2範數正則(加入參數w的平方和約束,即Ridge)。
# Relationship between pizza diameter and price: degree-1 (linear) fit.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# 100 evenly spaced sample points between 0 and 26, as a column vector.
xx = np.linspace(0, 26, 100)
xx = xx.reshape(xx.shape[0], 1)
yy = regressor.predict(xx)

plt.scatter(X_train, y_train)
# Keep the line handle so it can be passed to the legend explicitly.
plt1, = plt.plot(xx, yy, label="Degree=1")
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles=[plt1])
plt.show()

# Score on the training data.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('regressor : {}'.format(regressor.score(X_train, y_train)))
-
# Degree-2 polynomial model: y = a + b*x + c*x^2.
# Reuses X_train, y_train, xx and yy from the linear-regression step.
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

poly2 = PolynomialFeatures(degree=2)
# After fit_transform a sample [6] becomes [1, 6, 36].
X_train_poly2 = poly2.fit_transform(X_train)

regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_train_poly2, y_train)

xx_poly2 = poly2.transform(xx)
yy_poly2 = regressor_poly2.predict(xx_poly2)

plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label='Degree=1')
plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
# Axis limits and labels.
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
# Pass the line handles explicitly to control the legend.
plt.legend(handles=[plt1, plt2])
plt.show()

# Score on the training data.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('regressor_poly : {}'.format(regressor_poly2.score(X_train_poly2, y_train)))
-
# Degree-4 polynomial model: y = a + b*x + c*x^2 + d*x^3 + e*x^4.
# Reuses X_train, y_train, xx, yy and yy_poly2 from the previous steps.
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(X_train)

regressor_poly4 = LinearRegression()
regressor_poly4.fit(X_train_poly4, y_train)

xx_poly4 = poly4.transform(xx)
yy_poly4 = regressor_poly4.predict(xx_poly4)

plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label='Degree=1')
plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
plt4, = plt.plot(xx, yy_poly4, label='Degree=4')
# Axis limits and labels.
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
# Pass the line handles explicitly to control the legend.
plt.legend(handles=[plt1, plt2, plt4])
plt.show()

# Score on the training data.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('regressor_poly : {}'.format(regressor_poly4.score(X_train_poly4, y_train)))
-
# Part 1: score the pizza models on a held-out test set and compare plain
# least squares with L1 (Lasso) and L2 (Ridge) regularisation.
# Reuses regressor, poly2, poly4, regressor_poly2, regressor_poly4,
# X_train_poly4 and y_train from the previous steps.
import time

import numpy as np
from sklearn.linear_model import Lasso, Ridge

X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]

# Test-set scores of the three unregularised models. They are printed
# explicitly; a bare expression is discarded outside an interactive session.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(regressor.score(X_test, y_test))
X_test_poly2 = poly2.transform(X_test)
X_test_poly4 = poly4.transform(X_test)
print(regressor_poly2.score(X_test_poly2, y_test))
print(regressor_poly4.score(X_test_poly4, y_test))

# L1 regularisation: Lasso on the degree-4 features.
lasso_poly4 = Lasso()
lasso_poly4.fit(X_train_poly4, y_train)
print(lasso_poly4.score(X_test_poly4, y_test))
# coef_ holds the fitted coefficients; print the Lasso ones next to the
# unregularised ones for comparison.
print(lasso_poly4.coef_)
print(regressor_poly4.coef_)

# L2 regularisation: Ridge on the degree-4 features.
ridge_poly4 = Ridge()
ridge_poly4.fit(X_train_poly4, y_train)
print(ridge_poly4.score(X_test_poly4, y_test))
print(ridge_poly4.coef_)

# Sum of squared coefficients of each model.
print(np.sum(lasso_poly4.coef_ ** 2))
print(np.sum(ridge_poly4.coef_ ** 2))
print(np.sum(regressor_poly4.coef_ ** 2))

# Part 2: hyper-parameter grid search for a TF-IDF + SVC text classifier.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

news = fetch_20newsgroups(subset='all')
# Only the first 3000 documents are used.
X_train, X_test, y_train, y_test = train_test_split(
    news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)

# A Pipeline chains the preprocessing step and the classifier into one estimator.
clf = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english', analyzer='word')),
    ('svc', SVC()),
])
# Parameters of a pipeline step are addressed as '<step>__<param>' with a
# double underscore; the single-underscore keys 'svc_gamma' / 'svc_C' are
# rejected as invalid parameters.
parameters = {
    'svc__gamma': np.logspace(-2, 1, 4),
    'svc__C': np.logspace(-1, 1, 3),
}
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)

# Plain-Python replacement for the IPython-only `%time` magic.
start = time.time()
gs.fit(X_train, y_train)
print('grid search wall time: {:.1f} s'.format(time.time() - start))

# `bset_score_` was a typo for `best_score_`.
print('best params: {}'.format(gs.best_params_))
print('best CV score: {}'.format(gs.best_score_))
print(gs.score(X_test, y_test))
超參數搜索方法:網格搜索:單線程以及並行搜索。