本博客參照了復旦大學計算機科學技術學院邱錫鵬教授的文章:https://www.zhihu.com/question/324189960
題目:實現基於logistic/softmax regression的文本分類
-
參考
-
數據集:Classify the sentiment of sentences from the Rotten Tomatoes dataset
-
實現要求:NumPy
-
需要了解的知識點:
- 文本特徵表示:Bag-of-Word,N-gram
- 分類器:logistic/softmax regression,損失函數、(隨機)梯度下降、特徵選擇
- 數據集:訓練集/驗證集/測試集的劃分
-
實驗:
- 分析不同的特徵、損失函數、學習率對最終分類性能的影響
- shuffle、batch、mini-batch
注:代碼並沒有嚴格參照要求去做,而是使用了sklearn封裝好的工具。
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib.pylab as plt
%matplotlib inline
# Load the training and test splits; both files are tab-separated.
df_train = pd.read_csv(
    r'sentiment-analysis-on-movie-reviews/train.tsv',
    sep='\t',
)
df_test = pd.read_csv(
    r'sentiment-analysis-on-movie-reviews/test.tsv',
    sep='\t',
)
# Preview the first rows of the training frame (only displayed when this is
# the last expression of a notebook cell).
df_train.head()
# Text preprocessing: lowercase, tokenize, then stem every phrase.
df_train['Phrase'] = df_train['Phrase'].apply(lambda phrase: phrase.lower())

# The default English stop-word list is deliberately NOT applied: it would
# remove single-letter words such as 'a', and some phrases consist of nothing
# but 'a'.

stemmer = SnowballStemmer('english')


def _stem_and_join(tokens):
    """Stem each token and join the stems into one space-separated string."""
    return ' '.join([stemmer.stem(token) for token in tokens])


# Tokenize each phrase, giving lists such as [a, joke, in, the, united, states].
token_lists = df_train['Phrase'].apply(nltk.word_tokenize)
# Store the stemmed, re-joined text. Stemming is sometimes crude, e.g.
# 'forced' becomes 'forc'.
df_train['tokenizer_sents'] = token_lists.apply(_stem_and_join)
# Split the data and build bag-of-words count features.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
X = df_train['tokenizer_sents']
y = df_train['Sentiment']
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Other settings can be tried here, e.g.
# CountVectorizer(ngram_range=(1, 1), analyzer='word', min_df=0.001),
# where ngram_range selects the N-gram features.
vect = CountVectorizer()
# Learn the vocabulary on the training split only, then encode the text.
X_train_df = vect.fit_transform(X_train)
# Use transform (not fit_transform) so the held-out split is encoded with the
# vocabulary learned from the training split.
X_test_df = vect.transform(X_test)
# len(vocabulary_) is the number of learned features and is available on every
# scikit-learn version, unlike get_feature_names() (removed in 1.2).
print('特徵數量:', len(vect.vocabulary_))  # recorded run: 10730
# Build and compare several machine-learning models, starting with
# logistic regression.
# metrics is imported here because this is its first use in the script;
# without it the accuracy computation below raises NameError.
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_df, y_train)
y_pred_class = lr.predict(X_test_df)
# Accuracy on the held-out split.
print('LR:', metrics.accuracy_score(y_test, y_pred_class))  # recorded run: 0.6295014737921313
# Multinomial naive Bayes on the same count features.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X=X_train_df, y=y_train)
# X_train_df / X_test_df are sparse matrices; printing one lists its non-zero
# entries, for example:
#   (0, 635)   1
#   (1, 3495)  1
y_pred_class = nb.predict(X_test_df)
print(
    'NB:',
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class),
)  # recorded run: 0.612392669486095
# Linear classifier trained with stochastic gradient descent (default settings).
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.fit(X=X_train_df, y=y_train)
y_pred_class = sgd.predict(X_test_df)
print(
    'SGD:',
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class),
)  # recorded run: 0.6094771241830066
# Random forest with default hyper-parameters.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X=X_train_df, y=y_train)
y_pred_class = rfc.predict(X_test_df)
print(
    'RF:',
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class),
)  # recorded run: 0.6248237857234397
# Gradient-boosted trees (XGBoost) with default hyper-parameters.
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train_df, y_train)
y_pred_class = xgb.predict(X_test_df)
print(
    'XGB:',
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class),
)  # recorded run: 0.5396642317057542
# Use XGBoost's plotting helper to visualize feature importance of the
# fitted model.
from xgboost import plot_importance
fig, ax = plt.subplots(figsize=(10, 15))
# Draw at most 64 features as bars of height 0.5 on the axes created above.
plot_importance(
    xgb,
    ax=ax,
    height=0.5,
    max_num_features=64,
)
plt.show()