Mac——Python生成中文詞雲
目標:對文本text進行分詞,提取出topK的高頻詞,生成可視化的詞雲圖。
本文接python-wordcloud詞雲練習,主要實現中文的詞雲。
Python代碼:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@Author :geekzw
@File :wordcloud_demo.py
@Time :2020/3/12 9:20 PM
@Software :Pycharm
@Copyright (c) 2020,All Rights Reserved.
"""
import re
import collections
import numpy as np
import jieba
import wordcloud
from PIL import Image
import matplotlib.pyplot as plt
import docx
class Text2WordCloud:
    """Turn a Chinese text document into a masked word-cloud image.

    Typical use is ``load()`` -> ``count()`` -> ``plot()``; each step stores
    its result on the instance for the next one:

    * ``load()`` sets ``mask``, ``stopwords``, ``ww``, ``pattern``, ``data``
      and ``string_data``.
    * ``count()`` sets ``word_counts``.
    * ``plot()`` renders, saves and shows the image.
    """

    # Font used when ``plot()`` is called without ``font_path``. This is a
    # macOS system font path; pass another font on other platforms.
    DEFAULT_FONT_PATH = '/System/Library/Fonts/Hiragino Sans GB.ttc'

    def __init__(self,
                 file_path,
                 stopwords_path,
                 template_pic_path,
                 output_pic_path):
        """Remember the input and output locations; no file is opened here.

        Args:
            file_path: Text to visualise (``.docx`` or ``.txt``).
            stopwords_path: File listing stop words, one per line.
            template_pic_path: Image used as the cloud mask and colour source.
            output_pic_path: Where ``plot()`` writes the rendered image.
        """
        self.file_path = file_path
        self.stopwords_path = stopwords_path
        self.template_pic_path = template_pic_path
        self.output_pic_path = output_pic_path

    def load(self):
        """Read the mask image, the stop words and the text to analyse.

        The cleaned text ends up in ``self.string_data`` with every stop word
        removed.
        """
        self.mask = np.array(Image.open(self.template_pic_path))  # mask / colour template
        self.stopwords = self.read_file(self.stopwords_path)
        self.ww = [self._first_field(row) for row in self.stopwords]
        self.pattern = self._build_stopword_pattern(self.ww)
        self.data = self.read_file(self.file_path)
        # Join with newlines so the last word of one line cannot fuse with the
        # first word of the next one before segmentation.
        raw_text = "\n".join(self._first_field(row) for row in self.data)
        # Assign (not append): only the cleaned text must be kept.
        self.string_data = self.pattern.sub('', raw_text)

    @staticmethod
    def _first_field(row):
        """Return the text carried by one row returned by ``read_file``.

        Rows are either plain strings or lists of separated fields; for a list
        the first field is used, and an empty list yields ``""``.
        """
        if isinstance(row, str):
            # NOTE(review): the docx helper appears to return whole paragraphs
            # as strings; indexing them would keep only their first character.
            return row
        return row[0] if row else ""

    @staticmethod
    def _build_stopword_pattern(words):
        """Compile a regex matching any of ``words`` literally.

        Empty entries are ignored. Longer words are tried first so that a
        short stop word cannot pre-empt a longer one starting with it. With no
        usable word the returned pattern never matches.
        """
        unique_words = sorted({word for word in words if word},
                              key=lambda word: (-len(word), word))
        if not unique_words:
            return re.compile(r"(?!)")
        # Escape each word: stop-word lists usually contain punctuation such
        # as "?" or "(" that would otherwise be read as regex syntax.
        return re.compile("|".join(re.escape(word) for word in unique_words))

    @staticmethod
    def read_file(filename, sep="\t"):
        """Read a ``.docx`` or ``.txt`` file (extension is case-insensitive).

        Args:
            filename: Path of the file to read.
            sep: Field separator applied to each line of a ``.txt`` file, or
                ``None`` to keep lines unsplit. Ignored for ``.docx``.

        Returns:
            For ``.docx``, whatever ``docx.getdocumenttext`` returns. For
            ``.txt``, a list with one entry per line (newline removed): a list
            of fields when ``sep`` is given, otherwise the line itself.

        Raises:
            ValueError: If the extension is neither ``.docx`` nor ``.txt``.
        """
        lowered = filename.lower()
        if lowered.endswith(".docx"):
            return docx.getdocumenttext(docx.opendocx(filename))
        if lowered.endswith(".txt"):
            # utf-8-sig drops a leading byte-order mark if the file has one.
            with open(filename, 'r', encoding="utf-8-sig") as file:
                lines = [line.replace("\n", "") for line in file]
            if sep is not None:
                return [line.split(sep) for line in lines]
            return lines
        raise ValueError(
            "unsupported file type (expected .docx or .txt): %r" % (filename,))

    def count(self):
        """Segment ``self.string_data`` with jieba and count each token.

        Requires ``load()`` to have run. The result is stored in
        ``self.word_counts`` as a ``collections.Counter``.
        """
        tokens = jieba.cut(self.string_data, cut_all=False)  # precise mode
        # Whitespace-only tokens (spaces, line breaks) are not words; skip
        # them so they cannot be drawn in the cloud.
        self.word_counts = collections.Counter(
            token for token in tokens if token.strip())

    def plot(self, font_path=None, max_words=400, max_font_size=80, scale=64):
        """Render the cloud, save it to ``output_pic_path`` and display it.

        Requires ``load()`` and ``count()`` to have run.

        Args:
            font_path: Font file used to draw the words; defaults to
                ``DEFAULT_FONT_PATH``.
            max_words: Maximum number of words drawn.
            max_font_size: Largest font size used.
            scale: Scaling factor handed to ``wordcloud.WordCloud``.
                NOTE(review): 64 is the original value and looks very large;
                confirm memory use with a big mask image.
        """
        if font_path is None:
            font_path = self.DEFAULT_FONT_PATH
        wc = wordcloud.WordCloud(
            background_color='white',
            font_path=font_path,
            mask=self.mask,
            max_words=max_words,
            max_font_size=max_font_size,
            scale=scale,
        )
        wc.generate_from_frequencies(self.word_counts)
        # Recolour the words using a colour scheme built from the mask image.
        image_colors = wordcloud.ImageColorGenerator(self.mask)
        wc.recolor(color_func=image_colors)
        wc.to_file(self.output_pic_path)
        plt.imshow(wc)
        plt.axis('off')  # hide the axes around the image
        plt.show()
if __name__ == "__main__":
    # Demo run: build the cloud for the sample document, then execute the
    # three pipeline stages in order.
    demo_settings = {
        "file_path": "text.docx",
        "stopwords_path": "stopwords.txt",
        "template_pic_path": "鳴人.jpg",
        "output_pic_path": "res.png",
    }
    cloud_builder = Text2WordCloud(**demo_settings)
    for stage in (cloud_builder.load, cloud_builder.count, cloud_builder.plot):
        stage()
效果:
說明:代碼中使用的 docx.opendocx 與 docx.getdocumenttext 屬於舊版 docx.py 的接口(新版 python-docx 套件不提供這兩個函數);docx.py 文件可以從 GitHub 獲取,或留言獲取。
比較有價值的博客,請參考: