Python實現文本詞頻統計——讀取英文文本進行詞頻統計並輸出

1. 讀取文本

    # 1 Read the text
    # The context manager closes the file even if read() raises.
    with open("這裏修改爲你要讀取文件的地址", "r", encoding='UTF-8') as f:
        txt = f.read()
    txt = txt.lower()  # lower-case everything so counting is case-insensitive

2.劃分單詞

    # 2 Split into words
    # Break on space, comma, full stop or newline; adjacent delimiters
    # produce empty-string entries in the result.
    splitter = re.compile('[ ,.\n]')
    array = splitter.split(txt)
    # Debug aid: print('tokens', array)

3.詞頻統計

    # 3 Count word frequencies
    dic = {}
    for word in array:
        # A word seen for the first time starts from 0.
        dic[word] = dic.get(word, 0) + 1

4. 除掉無價值的詞

    # 4 Drop low-value words (common function words plus noise tokens
    # such as the empty string and HTML line-break fragments).
    # pop(word, None) skips words that never occurred in the text; a plain
    # `del dic[word]` raises KeyError as soon as one of them is missing.
    for word in ('', 'the', 'i', 'and', 'it', 'are', 'a', 'to', 'is',
                 'my', 'this', 'for', 'of', 'that', 'in', 'have', 'at',
                 'was', 'with', 'one', 'on', 'not', 'so', 'hair', 'dryer',
                 'you', "it's", 'had', 'has', 'be', 'dry', '<br', '/><br',
                 'we'):
        dic.pop(word, None)

5. 輸出出現頻率最高的100個單詞

    # 5 Print the 100 most frequent words
    print('\n')  # blank separator before the result
    top_words = order_dict1(dic, 100)
    print(top_words)

6.完整代碼

import os
import re

def order_dict(dicts, n):
    """Return the keys whose values are among the ``n`` largest distinct values.

    Keys are grouped by value, highest value first. Within one value the
    keys appear in descending key order. Because every key sharing a
    selected value is included, the result can hold more than ``n`` keys.

    Args:
        dicts: Mapping of key to a sortable value (e.g. word -> count).
        n: Number of distinct values to keep, counted from the largest.

    Returns:
        A list of keys ordered as described above.
    """
    # (key, value) pairs in descending order; this fixes the order of
    # keys inside each value group.
    pairs = sorted(dicts.items(), reverse=True)

    # The n largest distinct values, largest first.
    top_values = sorted({value for _, value in pairs}, reverse=True)[:n]

    keys = []
    for wanted in top_values:
        keys.extend(key for key, value in pairs if value == wanted)
    return keys


def order_dict1(dicts, n):
    """Return the ``n`` entries with the largest values, largest first.

    Args:
        dicts: Mapping of key to a sortable value (e.g. word -> count).
        n: Number of (key, value) pairs to return.

    Returns:
        A list of at most ``n`` ``(key, value)`` tuples in descending value
        order. Entries with equal values appear in reverse insertion order.

    Note:
        If an unwanted key such as the empty string is still in ``dicts``
        it will be ranked like any other entry; remove it beforehand.
    """
    ranked = sorted(dicts.items(), key=lambda pair: pair[1])
    # Sort ascending and then reverse (rather than sorting with
    # reverse=True) so that ties keep the reverse-insertion order.
    ranked.reverse()
    return ranked[:n]


if __name__ == "__main__":
    # 1 Read the text. The context manager closes the file even when
    # read() raises, which the manual open()/close() pair did not.
    with open("這裏修改爲你要讀取文件的地址", "r", encoding='UTF-8') as f:
        txt = f.read()
    txt = txt.lower()  # lower-case everything so counting is case-insensitive

    # 2 Split into words on space, comma, full stop or newline.
    # Adjacent delimiters produce empty-string entries, removed in step 4.
    array = re.split('[ ,.\n]', txt)

    # 3 Count word frequencies.
    dic = {}
    for word in array:
        dic[word] = dic.get(word, 0) + 1

    # 4 Drop low-value words (common function words plus noise tokens
    # such as the empty string and HTML line-break fragments).
    # pop(word, None) skips words that never occurred in the text; a plain
    # `del dic[word]` raises KeyError as soon as one of them is missing.
    stop_words = (
        '', 'the', 'i', 'and', 'it', 'are', 'a', 'to', 'is',
        'my', 'this', 'for', 'of', 'that', 'in', 'have', 'at',
        'was', 'with', 'one', 'on', 'not', 'so', 'hair', 'dryer',
        'you', "it's", 'had', 'has', 'be', 'dry', '<br', '/><br',
        'we',
    )
    for word in stop_words:
        dic.pop(word, None)

    # 5 Print the 100 most frequent words.
    print('\n')
    print(order_dict1(dic, 100))

7.運行結果

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章