17 機器學習案例——基於樸素貝葉斯算法的文本分類(留言板評論分類案例)

from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the old sklearn.cross_validation module is kept only as a fallback for
# legacy installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# 留言板評論分類案例

def testNB_skl(random_state=None):
    """Train and evaluate a multinomial naive Bayes classifier on toy posts.

    Splits a small hard-coded corpus of message-board posts into training
    and test subsets, builds bag-of-words count features, fits a
    ``MultinomialNB`` model, prints the predicted label for every test post
    and finally prints a classification report.

    Args:
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split (and therefore the printed output) is reproducible. The
            default ``None`` keeps the split random on every call.

    Returns:
        None. All results are written to standard output.
    """
    posting = ['my dog has flea problems help please', 'maybe not take him to dog park stupid',
               'my dalmation is so cute I love him', 'stop posting stupid worthless garbage',
               'mr licks ate my steak how to stop him', 'quit buying worthless dog food stupid']
    # Labels aligned with `posting`; they map onto classTarget_names below
    # (index 0 -> first name, index 1 -> second name).
    classVec = [0, 1, 0, 1, 0, 1]

    # Hold out 20% of the posts as the test set.
    train_data, test_data, train_y, test_y = train_test_split(
        posting, classVec, test_size=0.2, train_size=0.8, random_state=random_state)

    # Build the term-count matrix (bag-of-words model); the vocabulary is
    # learned from the training posts only.
    vectorizer = CountVectorizer()
    wordX = vectorizer.fit_transform(train_data)

    # Train the classifier.
    clf = MultinomialNB().fit(wordX, train_y)

    # Predict the test posts. The sparse matrix is passed as-is: the
    # classifier accepts sparse input, so densifying it is unnecessary.
    test_wordX = vectorizer.transform(test_data)
    predicted = clf.predict(test_wordX)
    for doc, category in zip(test_data, predicted):
        print(doc, ':', category)

    # Evaluate on the test set. `labels` is given explicitly because the tiny
    # random test split may contain only one class; without it the number of
    # observed classes can differ from len(target_names) and
    # classification_report raises ValueError.
    # NOTE(review): a class absent from both test_y and predicted may still
    # trigger an undefined-metric warning - confirm whether to silence it.
    classTarget_names = ['正常言論', '侮辱性言論']
    print(classification_report(test_y, predicted, labels=[0, 1],
                                target_names=classTarget_names))


testNB_skl()

輸出:

maybe not take him to dog park stupid : 0
stop posting stupid worthless garbage : 1
             precision    recall  f1-score   support

       正常言論       0.00      0.00      0.00         0
      侮辱性言論       1.00      0.50      0.67         2

avg / total       1.00      0.50      0.67         2
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章