Python3進行中文文章分詞實現詞雲圖與TOP詞頻統計

工具：Python 3

以下是代碼，實現對docx文件的中文文章分詞以及作詞雲圖、TOP詞頻統計：

import docx
import jieba
from scipy.misc import imread
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
from wordcloud import WordCloud, ImageColorGenerator
# Run configuration: every output file name carries the version tag below,
# so bumping versionN keeps the results of earlier runs side by side.
versionN = 6
filePath01 = r'F://data_temp/wordTest01.docx'  # input: source .docx document
filePath02 = r'F://data_temp/wordCut-v%s.txt' % versionN  # output: segmentation result
filePath03 = r'F://data_temp/wordCount-v%s.txt' % versionN  # output: word-frequency table
filePath04 = r"F://data_temp/test.jpg"  # input: background picture for the word cloud
filePath05 = r'F://data_temp/wordCloud-v%s.jpg' % versionN  # output: word-cloud image
filePath06 = r'F://data_temp/全新硬筆行書簡體.ttf'  # input: font file used by the bar chart
filePath07 = r'F://data_temp/wordCountBar-v%s.jpg' % versionN  # output: top-N frequency bar chart

# Read every paragraph of the source .docx and concatenate the text.
# NOTE(review): paragraphs are joined with no separator (unchanged from the
# original), so the last word of one paragraph directly abuts the first word
# of the next one -- confirm this is what the segmenter should see.
file01 = docx.Document(filePath01)
docText01 = ''.join(paragraph.text for paragraph in file01.paragraphs)

# Precise-mode segmentation (cut_all=False). The token list is kept as
# produced instead of being re-split from the joined string, so a literal '/'
# inside the document can no longer turn into empty pseudo-tokens.
wordList = list(jieba.cut(docText01, cut_all=False))
segList = '/'.join(wordList)  # '/'-delimited form used for the text file and the word cloud

# Save the segmentation result. Mode 'w' replaces the file on every run;
# append mode would stack a second copy on top of the previous run's output
# whenever the script is re-run with the same versionN.
with open(filePath02, 'w', encoding='utf-8') as f1:
    f1.write(segList)
# Count how often each distinct token occurs.
# np.unique returns the distinct tokens together with their counts in a single
# pass, replacing one full boolean-mask scan of the array per distinct token.
arr = np.array(wordList)
keyUse, keyCounts = np.unique(arr, return_counts=True)
# int() keeps the stored counts plain Python ints, as before.
wordDict = {word: int(count) for word, count in zip(keyUse, keyCounts)}
# (token, count) pairs ordered by count, highest first.
wordDictSorted = sorted(wordDict.items(), key=lambda item: item[1], reverse=True)
# Tokens that must never appear in the frequency table.
# NOTE(review): the many empty-string entries look like characters that were
# lost before this file reached its current form -- confirm against the
# author's original list. They are kept exactly as found.
PunctuationS = ['，', '。', '?', '、', ' ', '“', '”', '：', '（', '）',
                '.', '', '', '', '', '', '', '', '', '', '', '', '',
                '', '', '', '', '', '', '', '', '', '', '', '', '']
# Write one 'word|count' line per token, already ordered by descending count.
# Mode 'w' replaces the file on every run: with append mode, a re-run using
# the same versionN would leave the previous ranking at the top of the file,
# and any reader taking the first N lines would get stale data.
with open(filePath03, 'w', encoding='utf-8') as f2:
    for word, count in wordDictSorted:
        # Skip listed punctuation and every single-character token.
        if word not in PunctuationS and len(word) > 1:
            f2.write('{0}|{1}\n'.format(word, count))

# Load the background picture as an array: it is passed to WordCloud as the
# mask and to ImageColorGenerator as the colour source.
# scipy.misc.imread was removed in SciPy 1.2, so the picture is read through
# Pillow and NumPy, both already imported by this file. The top-of-file
# `from scipy.misc import imread` is no longer needed by this section and
# will itself fail on a modern SciPy until it is removed.
color_mask = np.array(Image.open(filePath04))
wc = WordCloud(
    scale=6,  # larger value -> higher output resolution
    font_path="simkai.ttf",  # a CJK-capable font is required, otherwise the words render garbled
    background_color='white',  # canvas colour
    mask=color_mask,  # picture used as the cloud mask
    max_words=2000,  # upper bound on the number of words drawn
    max_font_size=60,  # largest font size used
)
wc.generate(segList)  # build the cloud from the '/'-delimited segmented text
image_colors = ImageColorGenerator(color_mask)  # colour function derived from the background picture
# NOTE(review): the file is written before the recolour below, so the saved
# image presumably keeps the default colours -- confirm this is intended.
wc.to_file(filePath05)
plt.figure()  # open a fresh figure (original author's workaround for the image not showing)
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")  # show the cloud coloured after the picture
plt.axis("off")  # hide the axes
plt.show()

# Horizontal bar chart of the topN most frequent words, drawn with a colour
# gradient.
fig, ax = matplotlib.pyplot.subplots()  # fig: Figure object, ax: Axes object
# Font loaded from a .ttf file and applied to the tick labels, title and
# x-axis label below, all of which contain Chinese text.
myFont = matplotlib.font_manager.FontProperties(fname=filePath06)

words = []
counts = []
topN = 30
# Read the frequency table: one 'word|count' line per word, in file order.
# The context manager closes the file, which the original never did.
with open(filePath03, 'r', encoding='utf-8') as wordCount01:
    for line in wordCount01:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on the split
        fields = line.split('|')
        words.append(fields[0])
        # int() ignores the trailing newline. The original .strip(r'\n')
        # stripped backslash and 'n' characters, not the newline.
        counts.append(int(fields[1]))

# Never plot more bars than there are words: with fewer than topN entries the
# positions and the values passed to barh would otherwise differ in length.
shownN = min(topN, len(words))

# Gradient: start from the base colour and lower its last four digits by one
# per bar. The digits are decremented as a decimal number, exactly as the
# original did, so the resulting colours are unchanged.
baseColor = '#FA8072'
colors = ['#FA{0}'.format(int(baseColor[3:]) - step) for step in range(shownN)]

positions = np.arange(shownN)
rectS = ax.barh(positions, counts[:shownN], align='center', color=colors)  # horizontal bars
# Replace the numeric Y ticks with the words themselves; the first shownN
# lines of the file are used as they come.
ax.set_yticks(positions)
ax.set_yticklabels(words[:shownN], fontproperties=myFont)
ax.invert_yaxis()  # first word in the file goes at the top
ax.set_title('文章中的高頻詞彙', fontproperties=myFont, fontsize=17)  # chart title
ax.set_xlabel(u"出現次數", fontproperties=myFont)  # X-axis label
# Print each count just past the end of its bar.
for rect in rectS:
    width = rect.get_width()
    ax.text(1.03 * width, rect.get_y() + rect.get_height()/2., '%d' % int(width), ha='center', va='center')
# NOTE(review): these settings are applied after the figure already exists;
# figure.figsize and figure.dpi presumably only affect figures created later,
# while savefig.dpi is read when saving -- confirm the intended output size.
plt.rcParams['figure.figsize'] = (8.0, 4.0)  # default figure size
plt.rcParams['savefig.dpi'] = 300  # resolution of saved images
plt.rcParams['figure.dpi'] = 300  # on-screen resolution
plt.savefig(filePath07)
# Original author's note: a "ValueError: setting an array element with a
# sequence." was reported here for unknown reasons, yet the image was saved.
plt.show()

以下是部分分詞結果：

簡單/易懂/的/機器/學習/知識/（/一/）/：/人工智能/、/建模/和/機器/學習/96/ / /Vency/ /2017.11/./28/ /00/:/42/*/ /字數/ 
/1454/ /閱讀/ /437/評論/ /0/喜歡/ /4/寫/在/前面/：/本文/是/系列/的/第一篇/文章/，/“/簡單/易懂/的/機器/學習/知識/”/
系列/文章/一方面/是/爲了/讓/更/多/人/瞭解/、/入門/機器/學習/；/另一方面/，/也/是/爲了/讓/自己/在/機器/學習/領域/持
續/學習/下去/。/在/本/系列/中/，/不會/講/細節/的/算法/和/論證/過程/（/我/暫時/也/不會/•/•/•/）/，/會/講/一些/簡單/
易懂/的/基礎知識/，/並/附以/案例/助於/理解/。/1/./人工智能/是/什麼/？/在/狹義/上/，/人工智能/（/AI/）/是/指以/Siri/、
/Alexa/等/語音/助手/，/用/語音/代替/界面/交互/的/個人/虛擬/助手/。/在/廣義/上/，/人工智能/（/AI/）/是/指/由/人工/制
造/出/的/智能/機器/，/是/一種/能夠/學習/的/計算機程序/，/可/代替/人類/去/解決/需要/人類/智慧/才能/解決/的/問題/。/
人工智能/包括/自然語言/處理/、/語音/識別/、/圖像識別/、/機器/學習/等/，/每/一個/分支/都/很/複雜/和/龐大/。/本文/主
講/機器/學習/，/其他/的/大家/可/自行/研究/。/2/./什麼/是/建模/？/在/我們/深入/瞭解/之前/，/先說/一下/建模/的/概念/。
/建模/是/指/把/具體/問題/抽象/成爲/某/一類/問題/並用/數學模型/表示/，/是/應用/於/工程/、/科學/等/各/方面/的/通用/方
法/，/是/一種/對/現實/世界/的/抽象/總結/。/（/PS/：/實際/建模/應用/於/社會/各個方面/，/產品/經理/在/從/實際/業務/中
/梳理/出/角色/、/流程/和/實體/也/是/建模/過程/。/如果/眼中/只有/數學/建模/就/過於/狹隘/了/。/）/建模/的/流程/具體/如
下/：/分析/問題/中/的/各種因素/，/並用/變量/表示/→/分析/變量/之間/的/關係/，/相互依存/或/獨立/等/→/根據/實際/問題/
選用/合適/的/數學/框架/（/典型/的/有/優化/問題/，/配置/問題/等/）/，/並/將/具體/問題/在/此/框架/下/表達/出/某種/公式/
→/選用/合適/的/算法/求解/表達/出/的/公式/→/使用/計算結果/解釋/實際/問題/，/並/分析/結果/。/由此可見/，/在/我們/描述

詞雲圖：

TOP詞頻圖：

效果還不錯。

Python3進行中文文章分詞實現詞雲圖與TOP詞頻統計

數據庫中表建立索引的優缺點

NLP之文本預處理

spark環境idea與sbt的配置

用本機VM虛擬機作爲網站的服務器

服務器搭建本地局域網下載文件（sz下載大文件總是出問題）

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結