data.txt 是經過預處理等操作後生成的文本數據,每一行代表一條數據:
in conjunction with the release of the the allen institute for ai partnered with
the recent outbreak of the deadly and highly infectious covid disease caused by
coronaviruses is related illness that vary from a common cold more severe
it is shown that the evaporation rate of a liquid sample containing the
covid illness an on going epidemic started in wuhan city china in december
in the beginning of december covid virus that slipped from animals humans in
建模代碼:
from gensim import corpora
import gensim # pip install gensim
def get_topic(all_contents, num_topic=10):
    """Train an LDA topic model, print its topics and save it to disk.

    Args:
        all_contents: Tokenised corpus as a list of documents, each document
            being a list of word strings (a list of lists).
        num_topic: Number of topics the LDA model is trained with.

    Returns:
        The trained LDA model, or ``None`` if training, printing or saving
        failed.

    Side effects:
        Prints the 20-word description of each topic returned by
        ``print_topics`` and writes the model to ``lda_<num_topic>.model``
        in the current working directory.

    Failures are reported on stdout instead of being raised, so a caller that
    sweeps over several topic counts keeps going after one failed run.
    """
    try:
        lda = _train_lda(all_contents, num_topic)
        # Each entry is indexed at [1] to print the topic description only.
        for topic in lda.print_topics(num_words=20):
            print(topic[1])
        # Persist the model so every topic count can be inspected later.
        lda.save('lda_' + str(num_topic) + '.model')
    except Exception as e:
        # Deliberate best-effort: report which run failed and carry on.
        print('LDA training failed for num_topic=' + str(num_topic) + ': ' + str(e))
        return None
    return lda


def _train_lda(all_contents, num_topic):
    """Build the dictionary and bag-of-words corpus, then fit the LDA model."""
    dictionary = corpora.Dictionary(all_contents)
    corpus = [dictionary.doc2bow(sentence) for sentence in all_contents]
    return gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=num_topic
    )
# Load the corpus: every line of data.txt is one document, split on whitespace
# into a list of tokens. The context manager closes the file once it is read,
# and the explicit encoding keeps the read independent of the platform locale.
with open('data.txt', encoding='utf-8') as corpus_file:
    data = [content.split() for content in corpus_file]

# Train one model for every topic count from 1 through 16; get_topic prints
# and saves each result.
for num_topic in range(1, 17):
    get_topic(data, num_topic)