统计词频
英文文本词频统计
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# 读取数据
# Read the whole input file as UTF-8 text, ignoring undecodable bytes.
def get_data(file_path):
    """Return the stripped contents of the text file at *file_path*."""
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
        content = fh.read()
    return content.strip()
# 英文缩写替换
# Expand common English contractions before tokenizing.
def replace_abbreviations(text):
    """Lower-case *text*, expand common contractions, drop possessive 's.

    Bug fixes vs. the original chained-replace version:
    - the key was misspelled "waht's", so "what's" was never expanded;
    - the expansion of "i've" had a stray leading space (" i have").
    """
    # Ordered contraction -> expansion map (applied to lower-cased text,
    # so only lower-case keys are needed). Order matches the original chain.
    contractions = {
        "it's": "it is", "i'm": "i am", "he's": "he is", "she's": "she is",
        "we're": "we are", "they're": "they are", "you're": "you are",
        "that's": "that is", "this's": "this is", "can't": "can not",
        "don't": "do not", "doesn't": "does not", "we've": "we have",
        "i've": "i have", "isn't": "is not", "won't": "will not",
        "hasn't": "has not", "wasn't": "was not", "weren't": "were not",
        "let's": "let us", "didn't": "did not", "hadn't": "had not",
        "what's": "what is", "couldn't": "could not",
        "you'll": "you will", "you've": "you have",
    }
    text = text.lower()
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    # Strip remaining possessive "'s" (e.g. "john's" -> "john").
    return text.replace("'s", "")
# 删除标签符号、数字及其他字符
# Keep only ASCII letters; everything else becomes a single-space separator.
def clear_str(text):
    """Remove punctuation, digits and other non-letter characters."""
    # Maximal runs of letters, re-joined with single spaces — equivalent
    # to substituting non-letters with spaces and collapsing whitespace.
    return " ".join(re.findall("[a-zA-Z]+", text))
# 词干提取
# Lemmatize every word as a verb using the module-level WordNet lemmatizer.
def stem_words(text):
    """Return *text* with each whitespace-separated word lemmatized (verb POS)."""
    lemmatized = []
    for token in text.split():
        # `lemma` is the WordNetLemmatizer created in the __main__ section.
        lemmatized.append(lemma.lemmatize(token, pos='v'))
    return " ".join(lemmatized)
# 统计词频
# Count occurrences of each whitespace-separated word.
def collection_words(text):
    """Return a dict mapping every word in *text* to its frequency."""
    words_freq = {}
    for token in text.split():
        words_freq[token] = words_freq.get(token, 0) + 1
    return words_freq
# 主函数
# Pipeline: read -> expand contractions -> clean -> lemmatize -> count.
def main(file_path):
    """Compute the word-frequency dict for the English text at *file_path*."""
    text = get_data(file_path)
    for step in (replace_abbreviations, clear_str, stem_words):
        text = step(text)
    return collection_words(text)
if __name__ == "__main__":
    # WordNet lemmatizer used (as a module global) by stem_words().
    lemma = WordNetLemmatizer()
    file_path = 'D:/Python/JupyterNotebook/wordcounter/sophiesworld_1_to_2.txt'
    words_freq = main(file_path)
    # print(words_freq)  # raw, unsorted frequencies
    # Sort by frequency, most common first.
    words_freq_sorted = sorted(words_freq.items(), key=lambda pair: pair[1], reverse=True)
    print(words_freq_sorted)
中文文本词频统计
1、方法1
构建字典
import jieba
import re
# Demo corpus: four football-news snippets.
text = ['今晚19:30《天下足球》直播互动话题:国家德比,巴萨取胜的关键之处?欢迎积极留言,我们将选择您的精彩留言与全国观众分享。',
        '德甲前四捉对厮杀,“罗贝里”复活拜仁大胜、门兴多特平分秋色。',
        '今晚《天下足球》19:30,直播内容:专题《欧洲杯豪门恩怨》;专题《名人堂:苏格拉底,大师远去》;尤文米兰双双取胜,积分榜上你追我赶。',
        '今晚《天下足球》19:30,直播内容:国米罗马遭遇诡异失利;巴萨皇马用胜利迎国家德比。']
# One token list per sentence.
text_cut = []
for sentence in text:
    # Strip punctuation marks and digits before segmenting.
    cleaned = re.sub("[:《》,?。“”‘’;!、0-9()]", "", sentence)
    text_cut.append(list(jieba.cut(cleaned)))
# Tally token frequencies across all sentences (first-seen order preserved).
words_freq = {}
for tokens in text_cut:
    for token in tokens:
        words_freq[token] = words_freq.get(token, 0) + 1
print(words_freq)
输出:
{'今晚': 3, '天下足球': 3, '直播': 3, '互动': 1, '话题': 1, '国家': 2, '德比': 2, '巴萨': 2, '取胜': 2, '的': 2, '关键': 1, '之': 1, '处': 1, '欢迎': 1, '积极': 1, '留言': 2, '我们': 1, '将': 1, '选择': 1, '您': 1, '精彩': 1, '与': 1, '全国': 1, '观众': 1, '分享': 1, '德甲': 1, '前四捉': 1, '对': 1, '厮杀': 1, '罗': 1, '贝里': 1, '复活': 1, '拜仁': 1, '大胜': 1, '门兴': 1, '多特': 1, '平分秋色': 1, '内容': 2, '专题': 2, '欧洲杯': 1, '豪门': 1, '恩怨': 1, '名人堂': 1, '苏格拉底': 1, '大师': 1, '远去': 1, '尤文': 1, '米兰': 1, '双双': 1, '积分榜': 1, '上': 1, '你追我赶': 1, '国米': 1, '罗马': 1, '遭遇': 1, '诡异': 1, '失利': 1, '皇马': 1, '用': 1, '胜利': 1, '迎': 1}
2、方法2
使用collections库
import jieba
import collections
import re
# Same corpus as method 1.
text = ['今晚19:30《天下足球》直播互动话题:国家德比,巴萨取胜的关键之处?欢迎积极留言,我们将选择您的精彩留言与全国观众分享。',
        '德甲前四捉对厮杀,“罗贝里”复活拜仁大胜、门兴多特平分秋色。',
        '今晚《天下足球》19:30,直播内容:专题《欧洲杯豪门恩怨》;专题《名人堂:苏格拉底,大师远去》;尤文米兰双双取胜,积分榜上你追我赶。',
        '今晚《天下足球》19:30,直播内容:国米罗马遭遇诡异失利;巴萨皇马用胜利迎国家德比。']
# Collect all tokens from all sentences in order.
all_tokens = []
for sentence in text:
    cleaned = re.sub("[:《》,?。“”‘’;!、0-9()]", "", sentence)  # drop punctuation/digits
    all_tokens.extend(jieba.cut(cleaned))
# Counter does the whole frequency tally in one call.
words_freq = collections.Counter(all_tokens)
print(words_freq)
输出:
Counter({'今晚': 3, '天下足球': 3, '直播': 3, '国家': 2, '德比': 2, '巴萨': 2, '取胜': 2, '的': 2, '留言': 2, '内容': 2, '专题': 2, '互动': 1, '话题': 1, '关键': 1, '之': 1, '处': 1, '欢迎': 1, '积极': 1, '我们': 1, '将': 1, '选择': 1, '您': 1, '精彩': 1, '与': 1, '全国': 1, '观众': 1, '分享': 1, '德甲': 1, '前四捉': 1, '对': 1, '厮杀': 1, '罗': 1, '贝里': 1, '复活': 1, '拜仁': 1, '大胜': 1, '门兴': 1, '多特': 1, '平分秋色': 1, '欧洲杯': 1, '豪门': 1, '恩怨': 1, '名人堂': 1, '苏格拉底': 1, '大师': 1, '远去': 1, '尤文': 1, '米兰': 1, '双双': 1, '积分榜': 1, '上': 1, '你追我赶': 1, '国米': 1, '罗马': 1, '遭遇': 1, '诡异': 1, '失利': 1, '皇马': 1, '用': 1, '胜利': 1, '迎': 1})
下面,我对《人民的名义》这篇小说进行词频统计:
import re
import jieba
# 读取数据
# Load the novel's text file; undecodable bytes are silently dropped.
def get_data(file_path):
    """Return the stripped UTF-8 contents of the file at *file_path*."""
    with open(file_path, mode='r', encoding='utf-8', errors='ignore') as source:
        return source.read().strip()
# 删除非中文字符
# Delete every non-Chinese character (punctuation, digits, Latin letters, ...).
def clear_str(text):
    """Keep only CJK characters of *text*, dropping everything else.

    The original character class was buggy: ``A-z`` also matched
    ``[ \\ ] ^ _ ` `` and the quoted entries (``'©'`` etc.) injected stray
    apostrophes, so some non-Chinese symbols slipped through. To match the
    stated intent ("delete non-Chinese characters"), everything outside the
    CJK Unified Ideographs range U+4E00..U+9FA5 is removed instead.
    """
    return re.sub(r"[^\u4e00-\u9fa5]", "", text)
# 分词
# Segment Chinese text with jieba, after teaching it the novel's proper nouns.
def text_cut(text):
    """Return the list of jieba tokens for *text*."""
    # Character and place names that jieba segments badly on its own;
    # register each one so it is kept as a single token.
    custom_words = (
        '沙瑞金', '易学习', '王大路', '欧阳菁', '高育良', '李达康',
        '侯亮平', '赵东来', '京州', '毛娅', '陈海', '丁义珍',
        '赵德汉', '祁同伟', '陆亦可', '陈岩石', '郑西坡', '陈清泉',
        '蔡成功', '孙连城', '侦察处', '高小琴',
    )
    for word in custom_words:
        jieba.suggest_freq(word, True)
    return list(jieba.cut(text))
# 统计词频
# Count how often each token occurs.
def collects_words(text):
    """Return a dict mapping each token of the iterable *text* to its count."""
    frequencies = {}
    for token in text:
        frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies
# 主函数
# Pipeline: read -> strip non-Chinese characters -> segment -> count.
def main(file_path):
    """Compute the word-frequency dict for the Chinese novel at *file_path*."""
    raw = get_data(file_path)
    cleaned = clear_str(raw)
    tokens = text_cut(cleaned)
    return collects_words(tokens)
if __name__ == "__main__":
    # Path to the novel "In the Name of the People".
    file_path = 'D:/Python/JupyterNotebook/in_the_name_of_people.txt'
    words_freq = main(file_path)
    # print(words_freq)  # raw, unsorted frequencies
    # Most frequent words first.
    words_freq_sorted = sorted(words_freq.items(), key=lambda item: item[1], reverse=True)
    print(words_freq_sorted)