
ML Classifier 模塊


1. Random Forest + KFold

import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# English stop words, kept as a set so the per-token membership test done
# during text cleaning is O(1) instead of a scan over the whole list.
stopwords = set(nltk.corpus.stopwords.words('english'))
ps = nltk.PorterStemmer()

# Tab-separated corpus; the two columns are renamed to label / body_text.
# NOTE(review): read_csv treats the first line as a header by default. If the
# TSV has no header row, the first message is consumed as column names and
# lost -- confirm against the file and pass header=None if that is the case.
data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    """Return the percentage of non-space characters that are punctuation.

    Punctuation is defined by ``string.punctuation``. The ratio is rounded
    to three decimals before being scaled to a percentage.

    Args:
        text: The message to inspect.

    Returns:
        A float percentage; ``0.0`` when ``text`` is empty or contains only
        spaces (there is nothing to divide by).
    """
    non_space_len = len(text) - text.count(" ")
    if non_space_len == 0:
        # Avoid ZeroDivisionError on empty / all-space messages.
        return 0.0
    punct_count = sum(char in string.punctuation for char in text)
    return round(punct_count / non_space_len, 3) * 100

# Message length with spaces excluded.
data['body_len'] = data['body_text'].apply(lambda msg: len(msg) - msg.count(" "))
# Share of punctuation characters per message, as computed by count_punct.
data['punct%'] = data['body_text'].apply(count_punct)

def clean_text(text):
    """Normalize a message into a list of stemmed tokens.

    Steps: drop every character found in ``string.punctuation``, lowercase
    the rest, split on runs of non-word characters, discard stop words and
    empty tokens, and stem what remains with the module-level ``ps`` stemmer.

    Args:
        text: Raw message text.

    Returns:
        A list of stemmed token strings.
    """
    no_punct = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' for leading/trailing separators; skip those so the
    # empty string does not become a vocabulary term.
    return [ps.stem(token) for token in tokens if token and token not in stopwords]

# TF-IDF weights over the tokens produced by clean_text.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

# Hand-crafted features followed by one column per vocabulary term.
X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
# The concat mixes string names ('body_len', 'punct%') with integer names
# (0, 1, ...). Recent scikit-learn releases reject DataFrames whose column
# names are not all strings, so normalize them here.
X_features.columns = X_features.columns.astype(str)


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

rf = RandomForestClassifier(n_jobs=-1)  # parallel building
kfold = KFold(n_splits=10)

# Run the 10-fold cross-validation that this section sets up; without this
# call the classifier and the splitter above are never used.
cv_scores = cross_val_score(rf, X_features, data['label'], cv=kfold, scoring='accuracy')
print(cv_scores)

2. Holdout Test Set Evaluation

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the rows for evaluation.
xTrain, xTest, yTrain, yTest = train_test_split(
    X_features,
    data['label'],
    test_size=0.2,
)

rf = RandomForestClassifier(max_depth=20, n_estimators=50, n_jobs=-1)
rf_model = rf.fit(xTrain, yTrain)

# Five largest (importance, column) pairs. This is a bare expression, so the
# value is only displayed in an interactive session.
importance_by_column = zip(rf_model.feature_importances_, xTrain.columns)
sorted(importance_by_column, reverse=True)[:5]

y_pred = rf_model.predict(xTest)
precision, recall, fscore, support = score(yTest, y_pred, average="binary", pos_label="spam")

# Fraction of held-out messages whose predicted label matches the true one.
accuracy = (y_pred == yTest).sum() / len(y_pred)
print('precision: {} / recall: {} / accuracy: {}'.format(precision, recall, accuracy))

3. Grid Search + Model Evaluation


 def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    prec, recall, fscore, sup = score(y_test, y_pred, pos_label="spam", average="binary")
    print("Est:{}/Dpeth:{}\nprecision:{}/recall:{}/accur:{}".format(n_est, depth, prec, recall, (y_pred==y_test).sum()/len(y_pred)))
for n_est in [10,20,50]:
    for depth in range(10,40,10):
        train_RF(n_est, depth)


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
# Hyperparameter grid. The original dict literal was never closed.
# NOTE(review): further entries (e.g. a max_depth list) may have been lost
# along with the closing brace -- confirm against the intended grid.
param = {
    'n_estimators': [10, 150, 300],
}
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
# X_features is the feature frame built earlier in this file; the previously
# referenced X_tfidf_feat is not defined anywhere in it.
gs_fit = gs.fit(X_features, data["label"])
# Five best parameter combinations by mean cross-validated score (bare
# expression: displayed only in an interactive session).
pd.DataFrame(gs_fit.cv_results_).sort_values("mean_test_score", ascending=False)[:5]

 本機跑GridSearchCV的時候報了Memory Error錯誤。解決方法有別的博客講解,建議增大虛擬內存,具體操作在此不贅述。

4. Gradient Boost

定義:Ensemble learning method that takes an iterative approach to combining weak learners to create a strong learner by focusing on mistakes of prior iterations. Decision tree based. 



Random Forest:

  1. Bagging, so training can be done in parallel.
  2. Unweighted voting for final prediction. 
  3. Easier to tune, harder to overfit. 

Gradient Boosting:

  1. Boosting, so training must be done iteratively.
  2. Weighted voting for final prediction.
  3. Harder to tune, easier to overfit. 

Tradeoffs of GB:


Pros:

  1. powerful
  2. accepts various types of inputs
  3. can be used for classification or regression
  4. outputs feature importance


Cons:

  1. longer to train
  2. more likely to overfit
  3. more difficult to properly tune
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

gb = GradientBoostingClassifier()
# Hyperparameter grid. The original dict literal was never closed.
# NOTE(review): further entries may have been lost along with the closing
# brace -- confirm against the intended grid.
param = {
    "n_estimators": [100, 150],
    "max_depth": [7, 11, 15],
}
gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)
# X_features is the feature frame built earlier in this file; the previously
# referenced X_tfidf_feat is not defined anywhere in it.
cv_fit = gs.fit(X_features, data['label'])
# Five best parameter combinations by mean cross-validated score (bare
# expression: displayed only in an interactive session).
pd.DataFrame(cv_fit.cv_results_).sort_values("mean_test_score", ascending=False)[:5]

5. Pipeline總結

  • read in raw text
  • clean text and tokenize
  • feature engineering
  • fit simple model
  • tune hyperparameters and evaluate model
  • final model selection

Vectorizers should be fit on the training set and only be used to transform the test set. 


split data into training and test set -> train vectorizers on training set and use that to transform test set -> fit best RF and GB model on training set and predict on test set -> evaluate results of two models to select best model


import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# English stop words, kept as a set so the per-token membership test done
# during text cleaning is O(1) instead of a scan over the whole list.
stopwords = set(nltk.corpus.stopwords.words('english'))
ps = nltk.PorterStemmer()

# Tab-separated corpus; the two columns are renamed to label / body_text.
# NOTE(review): read_csv treats the first line as a header by default. If the
# TSV has no header row, the first message is consumed as column names and
# lost -- confirm against the file and pass header=None if that is the case.
data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    """Return the percentage of non-space characters that are punctuation.

    Punctuation is defined by ``string.punctuation``. The ratio is rounded
    to three decimals before being scaled to a percentage.

    Args:
        text: The message to inspect.

    Returns:
        A float percentage; ``0.0`` when ``text`` is empty or contains only
        spaces (there is nothing to divide by).
    """
    non_space_len = len(text) - text.count(" ")
    if non_space_len == 0:
        # Avoid ZeroDivisionError on empty / all-space messages.
        return 0.0
    punct_count = sum(char in string.punctuation for char in text)
    return round(punct_count / non_space_len, 3) * 100

# Message length with spaces excluded.
data['body_len'] = data['body_text'].apply(lambda msg: len(msg) - msg.count(" "))
# Share of punctuation characters per message, as computed by count_punct.
data['punct%'] = data['body_text'].apply(count_punct)

def clean_text(text):
    """Normalize a message into a list of stemmed tokens.

    Steps: drop every character found in ``string.punctuation``, lowercase
    the rest, split on runs of non-word characters, discard stop words and
    empty tokens, and stem what remains with the module-level ``ps`` stemmer.

    Args:
        text: Raw message text.

    Returns:
        A list of stemmed token strings.
    """
    no_punct = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' for leading/trailing separators; skip those so the
    # empty string does not become a vocabulary term.
    return [ps.stem(token) for token in tokens if token and token not in stopwords]
from sklearn.model_selection import train_test_split

# Split first so the vectorizer never sees the held-out messages.
X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

# Learn the vocabulary and IDF weights from the training messages only, then
# apply that same fitted vectorizer to both partitions.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

# reset_index(drop=True) lines the shuffled rows up with the freshly built,
# 0-based TF-IDF frames so the column-wise concat pairs rows positionally.
X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True),
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True),
           pd.DataFrame(tfidf_test.toarray())], axis=1)

# The concat mixes string and integer column names; recent scikit-learn
# releases reject DataFrames whose column names are not all strings.
X_train_vect.columns = X_train_vect.columns.astype(str)
X_test_vect.columns = X_test_vect.columns.astype(str)



from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

def _fit_and_time(model, train_features, train_labels, test_features):
    """Fit *model*, predict on *test_features*, and time both steps.

    Returns:
        A tuple ``(fitted_model, predictions, fit_seconds, predict_seconds)``.
    """
    # perf_counter is monotonic and high-resolution, which makes it the
    # appropriate clock for measuring short elapsed intervals.
    start = time.perf_counter()
    fitted = model.fit(train_features, train_labels)
    fit_seconds = time.perf_counter() - start

    start = time.perf_counter()
    predictions = fitted.predict(test_features)
    predict_seconds = time.perf_counter() - start
    return fitted, predictions, fit_seconds, predict_seconds


def _format_report(fit_seconds, predict_seconds, precision, recall, predictions, truth):
    """Build the one-line timing / precision / recall / accuracy summary."""
    accuracy = (predictions == truth).sum() / len(predictions)
    return 'Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        round(fit_seconds, 3), round(predict_seconds, 3), round(precision, 3), round(recall, 3), round(accuracy, 3))


# RF model
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)
rf_model, y_pred, fit_time, pred_time = _fit_and_time(rf, X_train_vect, y_train, X_test_vect)

# The fourth value is the support computed on the test labels.
precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print(_format_report(fit_time, pred_time, precision, recall, y_pred, y_test))

# GB model
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)
gb_model, y_pred, fit_time, pred_time = _fit_and_time(gb, X_train_vect, y_train, X_test_vect)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print(_format_report(fit_time, pred_time, precision, recall, y_pred, y_test))


發佈了34 篇原創文章 · 獲贊 43 · 訪問量 1萬+
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.