python機器學習及實踐第三章3.1

特徵抽取：對特徵進行向量化：可以只根據詞頻，也可以同時根據詞頻和文檔頻率；另外還可以選擇是否去除停用詞。stop_words='english' 表示過濾（去除）英語中常見的停用詞。

# DictVectorizer demo: convert a list of dicts into a NumPy feature array.
from sklearn.feature_extraction import DictVectorizer

# Temperatures are numbers, not strings: a string value would be treated as a
# category and one-hot encoded the same way as the city names.
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Fransisco', 'temperature': 18.},
]
vec = DictVectorizer()
# Print the results so they are visible when run as a plain script too.
print(vec.fit_transform(measurements).toarray())
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vec, 'get_feature_names_out'):
    print(vec.get_feature_names_out())
else:
    print(vec.get_feature_names())
# Vectorise the 20 Newsgroups texts with CountVectorizer (term counts only)
# and classify them with multinomial naive Bayes.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

news = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

# Term counts only; by default stop words are not removed.
count_vec = CountVectorizer()
X_count_train = count_vec.fit_transform(X_train)
# transform (not fit_transform): reuse the vectorizer fitted on the training split.
X_count_test = count_vec.transform(X_test)

mnb_count = MultinomialNB()
mnb_count.fit(X_count_train, y_train)
mnb_count_y_predict = mnb_count.predict(X_count_test)

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('the accuracy : {}'.format(mnb_count.score(X_count_test, y_test)))
print(classification_report(y_test, mnb_count_y_predict,
                            target_names=news.target_names))
# Vectorise the 20 Newsgroups texts with TfidfVectorizer (term frequency plus
# document frequency) and classify them with multinomial naive Bayes.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

news = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

# Term frequency and document frequency; by default stop words are not removed.
tfi_vec = TfidfVectorizer()
X_tfi_train = tfi_vec.fit_transform(X_train)
# transform (not fit_transform): reuse the vectorizer fitted on the training split.
X_tfi_test = tfi_vec.transform(X_test)

mnb_tfi = MultinomialNB()
mnb_tfi.fit(X_tfi_train, y_train)
mnb_tfi_y_predict = mnb_tfi.predict(X_tfi_test)

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('the accuracy : {}'.format(mnb_tfi.score(X_tfi_test, y_test)))
print(classification_report(y_test, mnb_tfi_y_predict,
                            target_names=news.target_names))
# Repeat both experiments with stop-word filtering switched on, for comparison.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# stop_words='english' selects the library's built-in English stop-word list.
count_filter_vec = CountVectorizer(analyzer='word', stop_words='english')
tfi_filter_vec = TfidfVectorizer(analyzer='word', stop_words='english')

# CountVectorizer with stop-word filtering.
X_count_filter_train = count_filter_vec.fit_transform(X_train)
X_count_filter_test = count_filter_vec.transform(X_test)

# TfidfVectorizer with stop-word filtering.
X_tfi_filter_train = tfi_filter_vec.fit_transform(X_train)
X_tfi_filter_test = tfi_filter_vec.transform(X_test)

mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(X_count_filter_train, y_train)
y_count_predict = mnb_count_filter.predict(X_count_filter_test)

mnb_tfi_filter = MultinomialNB()
mnb_tfi_filter.fit(X_tfi_filter_train, y_train)
y_tfi_predict = mnb_tfi_filter.predict(X_tfi_filter_test)

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('the accuracy : {}'.format(
    mnb_count_filter.score(X_count_filter_test, y_test)))
print(classification_report(y_test, y_count_predict,
                            target_names=news.target_names))

print('the accuracy : {}'.format(
    mnb_tfi_filter.score(X_tfi_filter_test, y_test)))
print(classification_report(y_test, y_tfi_predict,
                            target_names=news.target_names))

# Author's observation: accuracy improved once stop words were filtered out.

特徵篩選：選擇不同比例的特徵進行測試，選擇效果最好的特徵。chi2是卡方檢驗。

# Load the Titanic data, fill missing values, split, and vectorise the features.
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
y = titanic['survived']
X = titanic.drop(['row.names', 'name', 'survived'], axis=1)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which newer pandas does not guarantee to propagate.
X['age'] = X['age'].fillna(X['age'].mean())
# Every remaining missing value becomes the placeholder category 'UNKNOWN'.
X = X.fillna('UNKNOWN')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33)

vec = DictVectorizer()
# 'records' is the documented orient name; the abbreviation 'record' is
# rejected by current pandas.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
# Number of feature columns after vectorisation.
print(len(vec.feature_names_))

# Baseline: train a decision tree on all vectorised features, then retrain it
# on only the top 20% of features as ranked by the chi-squared test.
import numpy as np
from sklearn import feature_selection
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
y_predict = dt.predict(X_test)
dt.score(X_test, y_test)

# Feature selector: learn how to pick the most useful features.
# Author's note: an earlier "X and y first dimension mismatch" error came from
# typing `result` instead of `results`.
# chi2 is the chi-squared test, used to score how strongly each single feature
# relates to the class label; percentile=20 keeps the best 20% of features.
fs = feature_selection.SelectPercentile(
    score_func=feature_selection.chi2,
    percentile=20,
)
X_fs_train = fs.fit_transform(X_train, y_train)
X_fs_test = fs.transform(X_test)

dt.fit(X_fs_train, y_train)
dt.score(X_fs_test, y_test)

# Sweep the kept-feature percentile and cross-validate the tree at each setting
# to find the percentile with the best mean accuracy.
import numpy as np
import pylab as pl
from sklearn import feature_selection
from sklearn.model_selection import cross_val_score

percentiles = np.arange(1, 100, 2)
mean_scores = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    # cv=5: five folds, each used once as the validation fold while the
    # remaining folds are used for training.
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    mean_scores.append(scores.mean())
# Build the array once rather than re-allocating with np.append per iteration.
results = np.array(mean_scores)

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print(results)
opt = np.where(results == results.max())[0]
print('Opt: {}'.format(percentiles[opt]))

# Plot mean cross-validation accuracy against the percentile of features kept.
percentiles = percentiles.reshape(-1, 1)
results = results.reshape(-1, 1)
pl.plot(percentiles, results)
pl.xlabel('%%percentiles of features')
pl.ylabel('accuracy')
pl.show()

# Retrain the tree on the top 7% of features and score it on the test split.
# NOTE(review): 7 is presumably the optimum reported by the sweep above - confirm.
from sklearn import feature_selection

fs = feature_selection.SelectPercentile(
    score_func=feature_selection.chi2,
    percentile=7,
)
X_train_fs = fs.fit_transform(X_train, y_train)
X_test_fs = fs.transform(X_test)

dt.fit(X_train_fs, y_train)
dt.score(X_test_fs, y_test)

模型正則化：欠擬合，過擬合。爲了防止過擬合，使用L1正則（加入參數w的絕對值約束）,或者使用L2範數正則。

# Pizza diameter versus price: fit a straight line (degree 1).
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# 100 evenly spaced sample points between 0 and 26, reshaped into a column
# because predict() expects 2-D input.
xx = np.linspace(0, 26, 100)
xx = xx.reshape(xx.shape[0], 1)
yy = regressor.predict(xx)

plt.scatter(X_train, y_train)
# Keep the line handle so the legend can be built from it explicitly.
plt1, = plt.plot(xx, yy, label="Degree=1")
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
plt.legend(handles=[plt1])
plt.show()

# Score on the training data.
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('regressor : {}'.format(regressor.score(X_train, y_train)))

# Quadratic model: y = a + b*x + c*x^2
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# degree=2 is the default; spelled out to contrast with the degree-4 model.
poly2 = PolynomialFeatures(degree=2)
# After fit_transform a sample [6] becomes [1, 6, 36].
X_train_poly2 = poly2.fit_transform(X_train)

regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_train_poly2, y_train)

xx_poly2 = poly2.transform(xx)
yy_poly2 = regressor_poly2.predict(xx_poly2)

plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label='Degree=1')
plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
# Fix the x and y axis ranges.
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
# Pass the line handles so the legend lists exactly these curves.
plt.legend(handles=[plt1, plt2])
plt.show()

# Score on the training data.
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('regressor_poly : {}'.format(
    regressor_poly2.score(X_train_poly2, y_train)))

# Quartic model: y = a + b*x + c*x^2 + d*x^3 + e*x^4
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

poly4 = PolynomialFeatures(degree=4)
X_train_poly4 = poly4.fit_transform(X_train)

regressor_poly4 = LinearRegression()
regressor_poly4.fit(X_train_poly4, y_train)

xx_poly4 = poly4.transform(xx)
yy_poly4 = regressor_poly4.predict(xx_poly4)

plt.scatter(X_train, y_train)
plt1, = plt.plot(xx, yy, label='Degree=1')
plt2, = plt.plot(xx, yy_poly2, label='Degree=2')
plt4, = plt.plot(xx, yy_poly4, label='Degree=4')
# Fix the x and y axis ranges.
plt.axis([0, 25, 0, 25])
plt.xlabel('Diameter of Pizza')
plt.ylabel('Price of Pizza')
# Pass the line handles so the legend lists exactly these curves.
plt.legend(handles=[plt1, plt2, plt4])
plt.show()

# Score on the training data.
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('regressor_poly : {}'.format(
    regressor_poly4.score(X_train_poly4, y_train)))

# Evaluate the three fitted models on a held-out test set, then compare the
# unregularised degree-4 fit with L1 (Lasso) and L2 (Ridge) regularisation.
import numpy as np
from sklearn.linear_model import Lasso, Ridge

X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]
X_test_poly2 = poly2.transform(X_test)
X_test_poly4 = poly4.transform(X_test)

# Print the test scores: the comparison between models depends on them, and a
# bare expression is silently discarded when this runs as a script.
print('degree 1 test score : {}'.format(regressor.score(X_test, y_test)))
print('degree 2 test score : {}'.format(
    regressor_poly2.score(X_test_poly2, y_test)))
print('degree 4 test score : {}'.format(
    regressor_poly4.score(X_test_poly4, y_test)))

# L1 regularisation: Lasso on the degree-4 features.
lasso_poly4 = Lasso()
lasso_poly4.fit(X_train_poly4, y_train)
print(lasso_poly4.score(X_test_poly4, y_test))
# coef_ holds the fitted coefficients; compare Lasso with the plain fit.
print(lasso_poly4.coef_)
print(regressor_poly4.coef_)

# L2 regularisation: Ridge on the degree-4 features.
ridge_poly4 = Ridge()
ridge_poly4.fit(X_train_poly4, y_train)
print(ridge_poly4.score(X_test_poly4, y_test))
print(ridge_poly4.coef_)

# Sum of squared coefficients for each model, to compare their magnitudes.
print(np.sum(lasso_poly4.coef_ ** 2))
print(np.sum(ridge_poly4.coef_ ** 2))
print(np.sum(regressor_poly4.coef_ ** 2))
# Hyper-parameter grid search for a TF-IDF + SVC pipeline on the first 3000
# 20 Newsgroups documents.
import time

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

news = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(
    news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)

# A Pipeline chains preprocessing and the estimator: vectorise, then classify.
clf = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english', analyzer='word')),
    ('svc', SVC()),
])

# Pipeline parameters are addressed as '<step name>__<parameter>' with a
# DOUBLE underscore; a single underscore is rejected as an invalid parameter.
parameters = {
    'svc__gamma': np.logspace(-2, 1, 4),
    'svc__C': np.logspace(-1, 1, 3),
}

gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)

# Time the search with the standard library instead of the IPython-only
# `%time` magic, so this also runs as a plain Python script.
start = time.time()
gs.fit(X_train, y_train)
print('grid search took {:.1f}s'.format(time.time() - start))

# Best parameter combination and its mean cross-validation score.
print('{}, {}'.format(gs.best_params_, gs.best_score_))
# Score of the refitted best estimator on the held-out test split.
print(gs.score(X_test, y_test))

超參數搜索方法：網格搜索：單線程以及並行搜索。

python機器學習及實踐第三章3.1

[leetcode 83]刪除排序鏈表中的重複元素

python 負數開平方根精度控制

python換行

用tensorflow實現minist手寫數字識別

[leetcode 541]反轉字符串

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

python機器學習及實踐 第三章3.1

python機器學習及實踐第三章3.1