垃圾短信檢測
代碼:
# -*- coding: utf-8 -*-
# 垃圾短信檢測
# 1、導入需要的包
import csv

import pandas as pd
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
# 2. Load the dataset.
# Each line of the file holds a label, a tab character, and the message text.
#   ham:  legitimate (non-spam) message
#   spam: spam message
# The file has no header row.
# Quote handling is switched off on the assumption that the file is plain
# tab-separated text rather than quoted CSV (TODO confirm against the data
# file). With pandas' default quoting, a message containing an unbalanced
# double quote makes the parser absorb the following line(s) into the same
# field, silently merging several samples into one row.
df = pd.read_csv(
    'SMSSpamCollection.txt',
    delimiter='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
)
# Column 0 (the labels) is assigned to y; column 1 (the raw message text)
# is assigned to X_train.
y, X_train = df[0], df[1]

# 3. Turn the raw text into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_train)

# 4. Train a logistic regression classifier on the vectorized messages.
lr = linear_model.LogisticRegression()
lr.fit(X, y)

# 5. Classify two unseen messages. They go through the already fitted
# vectorizer (transform, not fit_transform) so they are mapped onto the
# vocabulary learned in step 3.
testX = vectorizer.transform([
    'URGENT! Your mobile No. 1234 was awarded a Prize',
    'Hey honey, whats up?',
])
predictions = lr.predict(testX)
print(predictions)