Word Frequency Counting

Word Frequency Counting for English Text

import re
from nltk.stem.wordnet import WordNetLemmatizer

# Read the text file
def get_data(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read().strip()
    return text

# Expand common English contractions
def replace_abbreviations(text):
    text = text.lower().replace("it's", "it is").replace("i'm", "i am").replace("he's", "he is").replace("she's", "she is")\
        .replace("we're", "we are").replace("they're", "they are").replace("you're", "you are").replace("that's", "that is")\
        .replace("this's", "this is").replace("can't", "can not").replace("don't", "do not").replace("doesn't", "does not")\
        .replace("we've", "we have").replace("i've", " i have").replace("isn't", "is not").replace("won't", "will not")\
        .replace("hasn't", "has not").replace("wasn't", "was not").replace("weren't", "were not").replace("let's", "let us")\
        .replace("didn't", "did not").replace("hadn't", "had not").replace("waht's", "what is").replace("couldn't", "could not")\
        .replace("you'll", "you will").replace("you've", "you have")
    
    text = text.replace("'s", "")  # strip any remaining possessive 's
    
    return text

# Remove punctuation, digits, and any other non-letter characters
def clear_str(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    return " ".join(text.split())

# Lemmatize each word as a verb (lemma is the WordNetLemmatizer created in __main__)
def stem_words(text):
    text_words_stem = [lemma.lemmatize(word, pos='v') for word in text.split()]
    return " ".join(text_words_stem)

# Count word frequencies
def collection_words(text):
    # word -> count dictionary
    words_freq = {}
    for word in text.split():
        words_freq.setdefault(word, 0)
        words_freq[word] += 1
    return words_freq

# Main pipeline: read, expand contractions, clean, lemmatize, and count
def main(file_path):
    text = get_data(file_path)
    text = replace_abbreviations(text)
    text = clear_str(text)
    text = stem_words(text)
    words_freq = collection_words(text)
    return words_freq

if __name__ == "__main__":
    file_path = 'D:/Python/JupyterNotebook/wordcounter/sophiesworld_1_to_2.txt'
    lemma = WordNetLemmatizer()
    words_freq = main(file_path)
    
    # Print raw word frequencies
    # print(words_freq)
    
    # Sort by frequency in descending order
    words_freq_sorted = sorted(words_freq.items(), key=lambda x: x[1], reverse=True)
    print(words_freq_sorted)
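
One caveat: WordNetLemmatizer depends on the WordNet corpora that ship with NLTK but are not installed by default, so the script above raises a LookupError if they are missing. A minimal one-time setup sketch (the resource names are NLTK's standard corpus identifiers):

import nltk

# Download the WordNet data used by WordNetLemmatizer (one-time setup)
nltk.download('wordnet')
# Newer NLTK releases may also need the Open Multilingual WordNet data
nltk.download('omw-1.4')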

Word Frequency Counting for Chinese Text

Method 1: build the dictionary by hand

import jieba
import re

text = ['今晚19:30《天下足球》直播互动话题:国家德比,巴萨取胜的关键之处?欢迎积极留言,我们将选择您的精彩留言与全国观众分享。',
        '德甲前四捉对厮杀,“罗贝里”复活拜仁大胜、门兴多特平分秋色。',
        '今晚《天下足球》19:30,直播内容:专题《欧洲杯豪门恩怨》;专题《名人堂:苏格拉底,大师远去》;尤文米兰双双取胜,积分榜上你追我赶。',
        '今晚《天下足球》19:30,直播内容:国米罗马遭遇诡异失利;巴萨皇马用胜利迎国家德比。']
text_cut = []

for item in text:
    item = re.sub("[:《》,?。“”‘’;!、0-9()]", "", item)  # remove punctuation and digits
    item_cut = list(jieba.cut(item))  # tokenize with jieba
    text_cut.append(item_cut)
    
# Dictionary for counting word frequencies
words_freq = {}
for item in text_cut:
    for w in item:
        words_freq.setdefault(w, 0)  # insert the word with a count of 0 if it is not in the dictionary yet
        words_freq[w] += 1  # increment its count
        
print(words_freq)

Output:
{'今晚': 3, '天下足球': 3, '直播': 3, '互动': 1, '话题': 1, '国家': 2, '德比': 2, '巴萨': 2, '取胜': 2, '的': 2, '关键': 1, '之': 1, '处': 1, '欢迎': 1, '积极': 1, '留言': 2, '我们': 1, '将': 1, '选择': 1, '您': 1, '精彩': 1, '与': 1, '全国': 1, '观众': 1, '分享': 1, '德甲': 1, '前四捉': 1, '对': 1, '厮杀': 1, '罗': 1, '贝里': 1, '复活': 1, '拜仁': 1, '大胜': 1, '门兴': 1, '多特': 1, '平分秋色': 1, '内容': 2, '专题': 2, '欧洲杯': 1, '豪门': 1, '恩怨': 1, '名人堂': 1, '苏格拉底': 1, '大师': 1, '远去': 1, '尤文': 1, '米兰': 1, '双双': 1, '积分榜': 1, '上': 1, '你追我赶': 1, '国米': 1, '罗马': 1, '遭遇': 1, '诡异': 1, '失利': 1, '皇马': 1, '用': 1, '胜利': 1, '迎': 1}
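
The setdefault call above inserts a missing word with an initial count of 0 before incrementing it. An equivalent, slightly more compact formulation uses dict.get with a default value; a minimal sketch, reusing the text_cut list built above:

words_freq = {}
for item in text_cut:
    for w in item:
        # get(w, 0) returns 0 when w has not been seen yet
        words_freq[w] = words_freq.get(w, 0) + 1
print(words_freq)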
Method 2: use the collections library

import jieba
import collections
import re

text = ['今晚19:30《天下足球》直播互动话题:国家德比,巴萨取胜的关键之处?欢迎积极留言,我们将选择您的精彩留言与全国观众分享。',
        '德甲前四捉对厮杀,“罗贝里”复活拜仁大胜、门兴多特平分秋色。',
        '今晚《天下足球》19:30,直播内容:专题《欧洲杯豪门恩怨》;专题《名人堂:苏格拉底,大师远去》;尤文米兰双双取胜,积分榜上你追我赶。',
        '今晚《天下足球》19:30,直播内容:国米罗马遭遇诡异失利;巴萨皇马用胜利迎国家德比。']

text_str = ""

for item in text:
    item = re.sub("[:《》,?。“”‘’;!、0-9()]", "", item)  # remove punctuation and digits
    item_cut = list(jieba.cut(item))  # tokenize with jieba
    text_str += " ".join(item_cut)  # concatenate tokens into one space-separated string
    text_str += " "

words_freq = collections.Counter(text_str.split(" ")[:-1])  # [:-1] drops the empty string left by the trailing space
print(words_freq)

Output:
Counter({'今晚': 3, '天下足球': 3, '直播': 3, '国家': 2, '德比': 2, '巴萨': 2, '取胜': 2, '的': 2, '留言': 2, '内容': 2, '专题': 2, '互动': 1, '话题': 1, '关键': 1, '之': 1, '处': 1, '欢迎': 1, '积极': 1, '我们': 1, '将': 1, '选择': 1, '您': 1, '精彩': 1, '与': 1, '全国': 1, '观众': 1, '分享': 1, '德甲': 1, '前四捉': 1, '对': 1, '厮杀': 1, '罗': 1, '贝里': 1, '复活': 1, '拜仁': 1, '大胜': 1, '门兴': 1, '多特': 1, '平分秋色': 1, '欧洲杯': 1, '豪门': 1, '恩怨': 1, '名人堂': 1, '苏格拉底': 1, '大师': 1, '远去': 1, '尤文': 1, '米兰': 1, '双双': 1, '积分榜': 1, '上': 1, '你追我赶': 1, '国米': 1, '罗马': 1, '遭遇': 1, '诡异': 1, '失利': 1, '皇马': 1, '用': 1, '胜利': 1, '迎': 1})
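
Counter can also avoid the join/split round trip and the manual sorting: it can be updated directly with each token iterator, and most_common() returns (word, count) pairs already sorted by descending frequency. A small sketch, assuming the same text list as above:

words_freq = collections.Counter()
for item in text:
    item = re.sub("[:《》,?。“”‘’;!、0-9()]", "", item)  # remove punctuation and digits
    words_freq.update(jieba.cut(item))  # feed the token iterator straight into the Counter

# Ten most frequent words as (word, count) tuples
print(words_freq.most_common(10))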
Next, I apply the same word-frequency counting to the novel 《人民的名义》 (In the Name of the People):

import re
import jieba

# Read the text file
def get_data(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read().strip()
    return text

# Remove non-Chinese characters: punctuation, digits, Latin letters, and whitespace
def clear_str(text):
    text = re.sub(r"[!?!?©《》,,。()()‘’/“”::.;;、—【】…\ufeff\u3000\n 0-9a-zA-Z@$¥%*^~\-]", "", text)
    return text

# Tokenize with jieba
def text_cut(text):
    # jieba does not segment some person and place names well, so we register them explicitly, e.g.:
    jieba.suggest_freq('沙瑞金', True)
    jieba.suggest_freq('易学习', True)
    jieba.suggest_freq('王大路', True)
    jieba.suggest_freq('欧阳菁', True)
    jieba.suggest_freq('高育良', True)
    jieba.suggest_freq('李达康', True)
    jieba.suggest_freq('侯亮平', True)
    jieba.suggest_freq('赵东来', True)
    jieba.suggest_freq('京州', True)
    jieba.suggest_freq('毛娅', True)
    jieba.suggest_freq('陈海', True)
    jieba.suggest_freq('丁义珍', True)
    jieba.suggest_freq('赵德汉', True)
    jieba.suggest_freq('祁同伟', True)
    jieba.suggest_freq('陆亦可', True)
    jieba.suggest_freq('陈岩石', True)
    jieba.suggest_freq('郑西坡', True)
    jieba.suggest_freq('陈清泉', True)
    jieba.suggest_freq('蔡成功', True)
    jieba.suggest_freq('孙连城', True)
    jieba.suggest_freq('侦察处', True)
    jieba.suggest_freq('高小琴', True)
    
    text = list(jieba.cut(text))
    return text

# Count word frequencies
def collects_words(text):
    words_freq = {}
    for word in text:
        words_freq.setdefault(word, 0)
        words_freq[word] += 1
    return words_freq

# Main pipeline: read, clean, tokenize, and count
def main(file_path):
    text = get_data(file_path)
    text = clear_str(text)
    text = text_cut(text)
    words_freq = collects_words(text)
    return words_freq

if __name__ == "__main__":
    # Path to the novel's text file
    file_path = 'D:/Python/JupyterNotebook/in_the_name_of_people.txt'
    words_freq = main(file_path)
    # print(words_freq)
    
    # Sort by frequency in descending order
    words_freq_sorted = sorted(words_freq.items(), key=lambda x: x[1], reverse=True)
    print(words_freq_sorted)
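
A maintenance note: registering every name with a separate jieba.suggest_freq call works, but the list can also live in a plain-text user dictionary loaded once with jieba.load_userdict. A minimal sketch, where userdict.txt is a hypothetical file with one entry per line (optionally followed by a frequency and a part-of-speech tag):

# userdict.txt (hypothetical), one word per line, e.g.:
#   沙瑞金
#   侯亮平 10 nr
jieba.load_userdict('userdict.txt')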