很多博客都是在跑《機器學習實戰》書上的代碼,代碼和數據集在網上有很多資源。本篇博客重點記錄我在跑代碼時遇到的報錯,以及書上代碼有誤的地方:
def textParse(bigString):
    """Split a string into lowercase word tokens longer than two characters.

    Args:
        bigString: The raw text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance, keeping only
        tokens whose length is greater than 2.
    """
    import re

    # Use '\W+' rather than '\W*': since Python 3.7 re.split() also splits on
    # empty matches, so '\W*' breaks the text into single characters and the
    # length filter below would then discard every token.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Train and evaluate the spam classifier on a random hold-out split.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (labelled 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (labelled 0), tokenizes each
    file with ``textParse`` and builds a vocabulary with ``createVocabList``.
    Ten randomly chosen documents are held out as the test set; the remaining
    documents are vectorized with ``bagOfWords2VecMN`` and passed to
    ``trainNB0``. Each held-out document is then classified with
    ``classifyNB``; misclassified documents and the final error rate are
    printed.

    Relies on names defined outside this block: ``random``, ``array``,
    ``createVocabList``, ``bagOfWords2VecMN``, ``trainNB0`` and
    ``classifyNB``.

    Returns:
        None. All results are written to standard output.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Spam e-mail -> class 1.
        print('111111111 %d ' % i)
        # ``with`` guarantees the file handle is closed after reading.
        with open('email/spam/%d.txt' % i, 'r', encoding='utf-8') as spamFile:
            docList.append(textParse(spamFile.read()))
        classList.append(1)
        # Ham (legitimate) e-mail -> class 0.
        print('00000000 %d ' % i)
        with open('email/ham/%d.txt' % i, 'r', encoding='utf-8') as hamFile:
            docList.append(textParse(hamFile.read()))
        classList.append(0)

    # Build the vocabulary from every loaded document.
    vocabList = createVocabList(docList)

    # Start with every document index in the training set, then move 10
    # randomly chosen indices into the test set (hold-out validation).
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # NOTE(review): kept as random.uniform because this block does not
        # show whether ``random`` is the stdlib module or numpy.random;
        # ``uniform`` is available in both.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        # Remove the chosen index so it is not used for training.
        del trainingSet[randIndex]

    # Vectorize the training documents and collect their labels.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify the held-out documents and count disagreements with the
    # true labels.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', errorCount / len(testSet))