這篇文章是做數據處理時輸出的數據
第一列:詞(包括 n-gram);第二列:DF(該詞出現的文章數/文章總數);第三列:全部文章中的 total TF;第四列:total TF/全部文章不去重的總字數;第五列:第四列/DF;第六列:total TF/該詞出現的文章中不去重的總字數;第七列:第六列/DF
import csv
from collections import Counter, defaultdict
def get_count(fPath):
    """Build per-document and corpus-wide term statistics from a tokenised file.

    Every line of the file is treated as one document whose tokens are
    separated by single spaces. The first token of a line is used as the
    document label; it is also counted as a term of that document.

    Args:
        fPath: Path of a UTF-8 text file holding one pre-tokenised document
            per line.

    Returns:
        A tuple ``(invertedIndex, countTime, text)``:

        * ``invertedIndex`` maps each term to a list of
          ``[doc_label, count_in_doc, doc_length]`` entries, one entry per
          document that contains the term.
        * ``countTime`` maps each term to
          ``[document_frequency, total_count, summed_doc_length]`` where
          ``summed_doc_length`` adds up the lengths of the documents in which
          the term occurs.
        * ``text`` is the list of token lists, one per line read (blank
          lines are kept here as ``['']``).
    """
    invertedIndex = defaultdict(list)
    text = []
    with open(fPath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            tokens = raw_line.strip('\n').split(' ')
            text.append(tokens)
            # ''.split(' ') yields [''] rather than [], so a length check can
            # never detect a blank line; look for any non-empty token instead
            # so blank lines are not indexed under the empty term.
            if not any(tokens):
                continue
            doc_label = tokens[0]
            doc_length = len(tokens)
            # Counter tallies every term in a single pass over the document.
            # NOTE: empty tokens produced by repeated or trailing spaces in a
            # non-blank line are still counted, as before.
            for term, count in Counter(tokens).items():
                invertedIndex[term].append([doc_label, count, doc_length])

    countTime = defaultdict(list)
    for term, postings in invertedIndex.items():
        countTime[term] = [
            len(postings),  # number of documents containing the term
            sum(posting[1] for posting in postings),  # total occurrences
            sum(posting[2] for posting in postings),  # summed document lengths
        ]
    return invertedIndex, countTime, text
# This function writes each word and its statistics to a CSV file.
def search_word(countTime, text2, total_words, total_docs=2060,
                out_path='data_understanding.csv'):
    """Write per-word statistics for the words in ``text2`` to a CSV file.

    One row is written for every word of ``text2`` that has an entry in
    ``countTime``; words without an entry are skipped so that each row always
    carries the statistics of its own word.

    Output columns:
        1. ``word``
        2. ``DF`` - number of documents containing the word / ``total_docs``
        3. ``total TF`` - total occurrences of the word
        4. ``totalTF/total_Word`` - column 3 / ``len(total_words)``
        5. ``4/DF`` - column 4 / column 2
        6. ``total TF/total_word_in_document`` - column 3 / summed length of
           the documents containing the word
        7. ``6/DF`` - column 6 / column 2

    Args:
        countTime: Mapping of word to
            ``[document_count, total_count, summed_doc_length]``.
        text2: Iterable of words to report on.
        total_words: Sized collection whose length is used as the corpus-wide
            word count.
        total_docs: Number of documents used to normalise the document count.
            Defaults to 2060, the value previously hard-coded here.
        out_path: Destination CSV path. Defaults to the previously hard-coded
            ``data_understanding.csv``.
    """
    header = ['word', 'DF', 'total TF', 'totalTF/total_Word', '4/DF',
              'total TF/total_word_in_document', '6/DF']
    corpus_word_count = len(total_words)

    # Build every row before touching the output file so that words and their
    # statistics stay paired even when some words are missing from countTime.
    rows = []
    for word in text2:
        stats = countTime.get(word)
        if not stats:
            continue
        doc_count, total_tf, summed_doc_length = stats
        df = doc_count / total_docs
        tf_over_corpus = total_tf / corpus_word_count
        tf_over_docs = total_tf / summed_doc_length
        rows.append([word, df, total_tf, tf_over_corpus, tf_over_corpus / df,
                     tf_over_docs, tf_over_docs / df])

    with open(out_path, 'w', newline='', encoding='utf_8_sig') as fw:
        # lineterminator='\n' keeps the line endings the file had before;
        # csv.writer quotes words that contain commas or quotes.
        writer = csv.writer(fw, lineterminator='\n')
        writer.writerow(header)
        writer.writerows(rows)