The data used here is the classic 20 Newsgroups corpus.
Dataset link: http://qwone.com/~jason/20Newsgroups/ (the download can be slow; a proxy/VPN or another mirror may be needed).
Here is the complete code:
# -*- coding: utf-8 -*-
import os
import math
import pandas as pd
def TF(wordSet, split):
    tf = dict.fromkeys(wordSet, 0)  # initialize every vocabulary word's count to 0
    for word in split:              # count raw occurrences of each word in this document
        tf[word] += 1
    return tf
def IDF(tfList):
    idfDict = dict.fromkeys(tfList[0], 0)   # keys are the vocabulary words, initial value 0
    N = len(tfList)                         # total number of documents
    for tf in tfList:                       # iterate over every document's TF dict
        for word, count in tf.items():      # iterate over every word in the current document
            if count > 0:                   # the word occurs in this document
                idfDict[word] += 1          # document frequency df of term tj increases by 1
    for word, Ni in idfDict.items():        # turn df into inverse document frequency idf
        idfDict[word] = math.log10(N / Ni)  # neither N nor Ni can be 0 here
    return idfDict                          # return the IDF dictionary
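# Worked example of the idf formula above: with N = 3 documents and a word
# that occurs in 2 of them, idf = log10(3/2) ≈ 0.176; a word that occurs in
# every document gets idf = log10(3/3) = 0.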
def TFIDF(tf, idfs):  # tf: term frequencies, idfs: inverse document frequencies
    tfidf = {}
    for word, tfval in tf.items():
        tfidf[word] = tfval * idfs[word]
    return tfidf
if __name__ == "__main__":
    # 1. Read the files
    text = []
    name_all = os.listdir(r'20news-bydate-train/alt.atheism/')
    for i in range(len(name_all)):
        name = "20news-bydate-train/alt.atheism/" + name_all[i]
        with open(name, "r", encoding="latin-1") as f:  # the corpus is not valid UTF-8
            text.append(f.read())
    # 2. Tokenize each document
    wordSet = set()
    split_list = []
    for i in range(len(text)):
        split = text[i].split()  # split on any whitespace
        split_list.append(split)
        wordSet = wordSet.union(split)  # build the vocabulary via set union (deduplication)
    # 3. Count each word's frequency in every document
    tf = []
    for i in range(len(split_list)):
        tf.append(TF(wordSet, split_list[i]))
    # 4. Compute the inverse document frequency over the collection
    idfs = IDF(tf)
    # 5. tf * idf gives the TF-IDF weights
    tfidf = []
    for i in range(len(tf)):
        tfidf.append(TFIDF(tf[i], idfs))
    print(pd.DataFrame(tfidf))  # a DataFrame is convenient for later processing
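As a cross-check on the hand-rolled implementation, the same corpus can be vectorized with scikit-learn's TfidfVectorizer. This is only a sketch under the same directory layout as above; note that scikit-learn's defaults differ from the code here (it uses a smoothed natural-log idf, L2-normalizes each row, and tokenizes with a regex rather than a plain whitespace split), so the resulting numbers and vocabulary size will not match exactly.

# Cross-check sketch with scikit-learn; defaults (smooth_idf, L2 norm,
# regex tokenizer) differ from the hand-rolled version above.
import os
from sklearn.feature_extraction.text import TfidfVectorizer

folder = "20news-bydate-train/alt.atheism/"
docs = []
for fname in os.listdir(folder):
    with open(os.path.join(folder, fname), "r", encoding="latin-1") as f:
        docs.append(f.read())

vec = TfidfVectorizer()
X = vec.fit_transform(docs)  # returns a scipy.sparse.csr_matrix directly
print(X.shape)               # (number of documents, vocabulary size)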
The hand-rolled script above reads 480 English documents and vectorizes them.
The final result is a 480×31412 DataFrame, which can be converted to an ndarray, a sparse matrix (scipy.sparse.csr_matrix), or other types as needed by downstream PCA dimensionality reduction and classification algorithms.
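A minimal sketch of those conversions, assuming tfidf is the list of dicts produced by the script above:

# Converting the TF-IDF table for downstream PCA / classifiers.
import pandas as pd
import scipy.sparse as sp

df = pd.DataFrame(tfidf)    # the 480-row DataFrame from the script above
X = df.to_numpy()           # numpy.ndarray, accepted by e.g. sklearn's PCA
X_csr = sp.csr_matrix(X)    # scipy.sparse.csr_matrix: most entries are 0,
                            # so the sparse form saves a lot of memory
print(type(X), X.shape, X_csr.nnz)  # nnz = number of stored nonzero entries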