python機器學習及實踐 第三章3.1

  • 特徵抽取:對特徵進行向量化:可以只根據詞頻,也可以同時根據詞頻和文檔頻率,並可選擇是否過濾停用詞。stop_words='english'表示過濾英語中常見的停用詞。
# Feature extraction from a list of dicts with DictVectorizer.
# The temperatures are floats rather than strings: DictVectorizer one-hot
# encodes string values, so string temperatures would each become their own
# indicator column instead of a single numeric feature.
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Fransisco', 'temperature': 18.},
]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
# DictVectorizer turns the list of dicts into a feature matrix;
# toarray() converts the (sparse) result into a dense numpy array.
print(vec.fit_transform(measurements).toarray())
# Column names of the matrix above. The feature_names_ attribute is used
# because get_feature_names() no longer exists in recent scikit-learn releases.
print(vec.feature_names_)
# Vectorize the 20 newsgroups texts with CountVectorizer (term counts only)
# and classify them with multinomial naive Bayes.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

news = fetch_20newsgroups(subset='all')
# Hold out 25% of the documents for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

# Counts term frequencies only; stop words are not removed by default.
count_vec = CountVectorizer()
# Learn the vocabulary on the training texts, then reuse it for the test texts.
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)

mnb_count = MultinomialNB()
mnb_count.fit(X_count_train, y_train)
mnb_count_y_predict = mnb_count.predict(X_count_test)

# print(...) with a single pre-formatted argument produces the same output on
# Python 2 and Python 3 (the original print statements only parse on Python 2).
print('the accuracy : %s' % mnb_count.score(X_count_test, y_test))
print(classification_report(y_test, mnb_count_y_predict, target_names=news.target_names))
# Vectorize the 20 newsgroups texts with TfidfVectorizer (term frequency and
# document frequency) and classify them with multinomial naive Bayes.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

news = fetch_20newsgroups(subset='all')
# Hold out 25% of the documents for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

# Uses term frequency and document frequency; stop words are not removed by default.
tfi_vec = TfidfVectorizer()
# Learn vocabulary and document frequencies on the training texts only.
X_tfi_train = tfi_vec.fit_transform(X_train)
X_tfi_test = tfi_vec.transform(X_test)

mnb_tfi = MultinomialNB()
mnb_tfi.fit(X_tfi_train, y_train)
mnb_tfi_y_predict = mnb_tfi.predict(X_tfi_test)

# print(...) with a single pre-formatted argument produces the same output on
# Python 2 and Python 3 (the original print statements only parse on Python 2).
print('the accuracy : %s' % mnb_tfi.score(X_tfi_test, y_test))
print(classification_report(y_test, mnb_tfi_y_predict, target_names=news.target_names))
# Repeat both experiments with stop-word filtering for comparison.
# Relies on X_train/X_test/y_train/y_test/news from the newsgroups split and on
# CountVectorizer, TfidfVectorizer and MultinomialNB already being imported.
from sklearn.metrics import classification_report

# stop_words='english' selects scikit-learn's built-in English stop-word list.
count_filter_vec = CountVectorizer(analyzer='word', stop_words='english')
tfi_filter_vec = TfidfVectorizer(analyzer='word', stop_words='english')

# CountVectorizer with stop-word filtering.
X_count_filter_train = count_filter_vec.fit_transform(X_train)
X_count_filter_test = count_filter_vec.transform(X_test)

# TfidfVectorizer with stop-word filtering.
X_tfi_filter_train = tfi_filter_vec.fit_transform(X_train)
X_tfi_filter_test = tfi_filter_vec.transform(X_test)

mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(X_count_filter_train, y_train)
y_count_predict = mnb_count_filter.predict(X_count_filter_test)

mnb_tfi_filter = MultinomialNB()
mnb_tfi_filter.fit(X_tfi_filter_train, y_train)
y_tfi_predict = mnb_tfi_filter.predict(X_tfi_filter_test)

# print(...) with a single pre-formatted argument produces the same output on
# Python 2 and Python 3 (the original print statements only parse on Python 2).
# First report: CountVectorizer + stop words.
print('the accuracy : %s' % mnb_count_filter.score(X_count_filter_test, y_test))
print(classification_report(y_test, y_count_predict, target_names=news.target_names))

# Second report: TfidfVectorizer + stop words.
print('the accuracy : %s' % mnb_tfi_filter.score(X_tfi_filter_test, y_test))
print(classification_report(y_test, y_tfi_predict, target_names=news.target_names))

# Author's observation from this experiment: accuracy improved once stop
# words were filtered out.
  • 特徵篩選:選擇不同比例的特徵進行測試,選擇效果最好的特徵。chi2是卡方檢驗。
# Load the Titanic data, vectorize it and fit a baseline decision tree on
# all features.
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
y = titanic['survived']
X = titanic.drop(['row.names', 'name', 'survived'], axis=1)
# Fill missing ages with the mean age. Assigning the result back (instead of
# calling fillna(inplace=True) on the column selection) guarantees that X
# itself is updated.
X['age'] = X['age'].fillna(X['age'].mean())
# Every remaining missing value is replaced by the marker 'UNKNOWN'.
X = X.fillna('UNKNOWN')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

vec = DictVectorizer()
# orient='records' yields one dict per row; the abbreviation 'record' is not
# accepted by current pandas.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
# Number of features after vectorization.
print(len(vec.feature_names_))

# Predict with a decision tree trained on all features.
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
y_predict = dt.predict(X_test)
# Printed explicitly: a bare expression's value is discarded when run as a script.
print('decision tree accuracy (all features): %s' % dt.score(X_test, y_test))
# Feature selection: find the percentage of features that works best.
# Relies on dt, X_train, X_test, y_train and y_test from the Titanic
# decision-tree section.
# Author's note: a "first dimension of x and y does not match" error hit while
# writing this was caused by typing `result` instead of `results`.
import numpy as np
import pylab as pl
from sklearn import feature_selection
from sklearn.model_selection import cross_val_score

# Keep the best 20% of the features. chi2 is the chi-squared test, used to
# score the relationship between each single feature and the class.
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
X_fs_train = fs.fit_transform(X_train, y_train)
dt.fit(X_fs_train, y_train)
X_fs_test = fs.transform(X_test)
# Printed explicitly: a bare expression's value is discarded when run as a script.
print('decision tree accuracy (top 20%% of features): %s' % dt.score(X_fs_test, y_test))

# Try 1%, 3%, ..., 99% of the features and cross-validate each choice.
percentiles = np.arange(1, 100, 2)
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    # cv=5: 5-fold cross-validation; each fold is used once as the validation
    # set while the other folds are used for training.
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    # Collect the mean score of this percentile.
    results.append(scores.mean())
# Convert once after the loop instead of re-allocating with np.append each time.
results = np.array(results)
print(results)

# Index/indices of the best mean score, and the matching percentile(s).
opt = np.where(results == results.max())[0]
print('Opt: %s' % (percentiles[opt],))

# Plot mean cross-validation accuracy against the percentage of features kept.
percentiles = percentiles.reshape(-1, 1)
results = results.reshape(-1, 1)
pl.plot(percentiles, results)
pl.xlabel('%%percentiles of features')
pl.ylabel('accuracy')
pl.show()

# Refit using a fixed 7% of the features and evaluate on the test set.
# NOTE(review): 7 is hard-coded, presumably the optimum printed above in the
# author's run - confirm against the 'Opt:' output.
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
print('decision tree accuracy (top 7%% of features): %s' % dt.score(X_test_fs, y_test))

  • 模型正則化:欠擬合,過擬合。爲了防止過擬合,使用L1正則(加入參數w的絕對值約束),或者使用L2範數正則。
    # Relationship between pizza diameter and price: degree-1 (straight line) fit.
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LinearRegression

    X_train = [[6], [8], [10], [14], [18]]
    y_train = [[7], [9], [13], [17.5], [18]]
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # linspace samples 100 evenly spaced points between 0 and 26; they are
    # reshaped into a column so they can be passed to predict().
    xx = np.linspace(0, 26, 100)
    xx = xx.reshape(xx.shape[0], 1)
    yy = regressor.predict(xx)

    plt.scatter(X_train, y_train)
    # Keep the line handle so it can be passed to the legend: degree-1 fit.
    plt1, = plt.plot(xx, yy, label="Degree=1")
    plt.axis([0, 25, 0, 25])
    plt.xlabel('Diameter of Pizza')
    plt.ylabel('Price of Pizza')
    # Pass the handles explicitly for full control over the legend.
    plt.legend(handles=[plt1])
    plt.show()
    # Score on the training data. print(...) with a single pre-formatted
    # argument produces the same output on Python 2 and Python 3.
    print('regressor : %s' % regressor.score(X_train, y_train))

  • #使用二次多項式模型
    # Degree-2 polynomial model: y = a + b*x + c*x^2.
    # Relies on X_train, y_train, xx, yy, plt and LinearRegression from the
    # linear-regression section.
    from sklearn.preprocessing import PolynomialFeatures

    # degree=2 is PolynomialFeatures' default; spelled out for clarity.
    poly2 = PolynomialFeatures(degree=2)
    # After fit_transform a sample [6] becomes [1, 6, 36].
    X_train_poly2 = poly2.fit_transform(X_train)
    regressor_poly2 = LinearRegression()
    regressor_poly2.fit(X_train_poly2, y_train)

    # Expand the plotting grid the same way before predicting on it.
    xx_poly2 = poly2.transform(xx)
    yy_poly2 = regressor_poly2.predict(xx_poly2)

    plt.scatter(X_train, y_train)
    plt1, = plt.plot(xx, yy, label='Degree=1')
    plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
    # Set the x/y axis ranges.
    plt.axis([0, 25, 0, 25])
    plt.xlabel('Diameter of Pizza')
    plt.ylabel('Price of Pizza')
    # Pass the handles explicitly for full control over the legend.
    plt.legend(handles=[plt1, plt2])
    plt.show()
    # Score on the training data. print(...) with a single pre-formatted
    # argument produces the same output on Python 2 and Python 3.
    print('regressor_poly : %s' % regressor_poly2.score(X_train_poly2, y_train))

  • #使用四次多項式模型
    # Degree-4 polynomial model: y = a + b*x + c*x^2 + d*x^3 + e*x^4.
    # Relies on X_train, y_train, xx, yy, yy_poly2, plt and LinearRegression
    # from the earlier regression sections.
    from sklearn.preprocessing import PolynomialFeatures

    poly4 = PolynomialFeatures(degree=4)
    X_train_poly4 = poly4.fit_transform(X_train)
    regressor_poly4 = LinearRegression()
    regressor_poly4.fit(X_train_poly4, y_train)

    # Expand the plotting grid the same way before predicting on it.
    xx_poly4 = poly4.transform(xx)
    yy_poly4 = regressor_poly4.predict(xx_poly4)

    plt.scatter(X_train, y_train)
    plt1, = plt.plot(xx, yy, label='Degree=1')
    plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
    plt4, = plt.plot(xx, yy_poly4, label='Degree=4')
    # Set the x/y axis ranges.
    plt.axis([0, 25, 0, 25])
    plt.xlabel('Diameter of Pizza')
    plt.ylabel('Price of Pizza')
    # Pass the handles explicitly for full control over the legend.
    plt.legend(handles=[plt1, plt2, plt4])
    plt.show()
    # Score on the training data. print(...) with a single pre-formatted
    # argument produces the same output on Python 2 and Python 3.
    print('regressor_poly : %s' % regressor_poly4.score(X_train_poly4, y_train))

  • #測試集進行測試
    # Evaluate the three fitted models on held-out test data.
    # Relies on regressor, poly2, poly4, regressor_poly2, regressor_poly4,
    # X_train_poly4, y_train and np from the earlier regression sections.
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import Ridge

    X_test = [[6], [8], [11], [16]]
    y_test = [[8], [12], [15], [18]]
    X_test_poly2 = poly2.transform(X_test)
    X_test_poly4 = poly4.transform(X_test)
    # Printed explicitly: a bare expression's value is discarded when run as a script.
    print('degree 1 test score: %s' % regressor.score(X_test, y_test))
    print('degree 2 test score: %s' % regressor_poly2.score(X_test_poly2, y_test))
    print('degree 4 test score: %s' % regressor_poly4.score(X_test_poly4, y_test))

    # Add L1 regularization: Lasso on the degree-4 features.
    lasso_poly4 = Lasso()
    lasso_poly4.fit(X_train_poly4, y_train)
    print(lasso_poly4.score(X_test_poly4, y_test))
    # coef_ holds the fitted coefficients; compare Lasso with the
    # unregularized degree-4 model.
    print(lasso_poly4.coef_)
    print(regressor_poly4.coef_)

    # Add L2 regularization: Ridge on the degree-4 features.
    ridge_poly4 = Ridge()
    ridge_poly4.fit(X_train_poly4, y_train)
    print(ridge_poly4.score(X_test_poly4, y_test))
    print(ridge_poly4.coef_)

    # Sum of squared coefficients for Lasso, Ridge and the unregularized model.
    print(np.sum(lasso_poly4.coef_ ** 2))
    print(np.sum(ridge_poly4.coef_ ** 2))
    print(np.sum(regressor_poly4.coef_ ** 2))
    # Hyper-parameter search with GridSearchCV for a TF-IDF + SVC text classifier.
    import time
    import numpy as np
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    news = fetch_20newsgroups(subset='all')
    # Only the first 3000 documents are used for this search.
    X_train, X_test, y_train, y_test = train_test_split(
        news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)

    # A Pipeline chains the steps: vectorize the text first, then classify.
    clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])
    # Parameters of a pipeline step are addressed as '<step name>__<parameter>'
    # with a DOUBLE underscore; the single-underscore keys used before are
    # rejected as invalid parameters when fitting.
    # 4 gamma values x 3 C values = 12 candidate combinations.
    parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}

    # cv=3: 3-fold cross-validation per combination; refit=True retrains the
    # best combination on the whole training set afterwards.
    gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)
    # Plain-Python timing in place of the IPython-only `%time` magic.
    start = time.time()
    gs.fit(X_train, y_train)
    print('grid search wall time: %.2f s' % (time.time() - start))
    # Best parameter combination and its mean cross-validation score
    # (the attribute is best_score_).
    print('%s %s' % (gs.best_params_, gs.best_score_))
    # Score of the refitted best model on the held-out test set.
    print(gs.score(X_test, y_test))
    

    超參數搜索方法:網格搜索:單線程以及並行搜索。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章