LDA主題模型練習1

**1.** 本文針對LDA主題模型進行學習和練習,核心摘要如下:
[圖片:在這裏插入圖片描述]
**2.** NLP中的共現對應條件概率(獨立時最特殊),用最大似然估計計算字符共現的例子:
[圖片:在這裏插入圖片描述] [圖片:在這裏插入圖片描述]
**3.** LDA主題模型代碼實例

#-*-coding:utf8-*-
import jieba

# Base directory under which the scripts below look for their input,
# stop-word and output files (paths are built by string concatenation,
# so the trailing slash matters).
dir1 = 'E:/ssssszzz/lda/'
def stopwordslist(filepath):
    """Load a stop-word file into a list, one entry per line.

    Args:
        filepath: Path of a UTF-8 text file with one stop word per line.

    Returns:
        A list with one string per line of the file, with surrounding
        whitespace stripped. Blank lines are kept as empty strings.
    """
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
    # mark, which str.strip() would otherwise leave on the first stop word.
    # The context manager closes the file even if reading fails.
    with open(filepath, 'r', encoding="utf-8-sig") as stopword_file:
        return [line.strip() for line in stopword_file]

def seg_sentence(sentence, stopwords=None):
    """Segment one line of text with jieba and drop stop words.

    Args:
        sentence: Raw text of one line; leading/trailing whitespace is
            stripped before segmentation.
        stopwords: Optional iterable of stop words. When omitted, the list
            is loaded from ``dir1 + 'stopWords/stopwords.txt'`` through
            ``stopwordslist`` (the original behaviour). Callers that
            segment many lines can load it once and pass it in.

    Returns:
        A string made of every kept token, lower-cased, each followed by a
        single space. Tokens that are stop words or a tab character are
        dropped.
    """
    if stopwords is None:
        stopwords = stopwordslist(dir1 + 'stopWords/stopwords.txt')
    # A set gives O(1) membership tests instead of scanning a list per token.
    stopword_set = set(stopwords)

    kept = []
    for word in jieba.cut(sentence.strip()):
        # Lower-case so English tokens compare case-insensitively.
        word = word.lower()
        if word not in stopword_set and word != '\t':
            kept.append(word)
    # Every token is followed by one space, including the last one.
    return ''.join(word + ' ' for word in kept)

# Read the raw corpus. An explicit encoding is required for Chinese text;
# without it, a system whose default codec is gbk fails with:
# UnicodeDecodeError: 'gbk' codec can't decode byte 0xbe in position 44: illegal multibyte sequence
# "utf-8-sig" additionally drops a leading byte-order mark so that it does
# not end up in the corpus as a "\ufeff" token. The file is only read, so
# it is opened with 'r' rather than 'r+'.
with open(dir1 + 'input/copurs.txt', 'r', encoding="utf-8-sig") as inputs:
    content1 = inputs.readlines()

# Write the jieba segmentation result: one output line per input line.
with open(dir1 + 'input/copurs_out.txt', 'w', encoding="UTF-8") as outputs:
    for line in content1:
        line_seg = seg_sentence(line)
        outputs.write(line_seg + '\n')


import codecs
from gensim import corpora
from gensim.models import LdaModel
from gensim import models
from gensim.corpora import Dictionary


# Load the segmented corpus: every line of the file becomes one document,
# represented as the list of its whitespace-separated tokens.
te = []
with codecs.open(dir1 + 'input/copurs_out.txt', 'r', encoding="UTF-8") as fp:
    for line in fp:
        # A blank line yields an empty token list, i.e. an empty document.
        te.append(line.split())
print(len(te))
dictionary = corpora.Dictionary(te)
corpus = [dictionary.doc2bow(text) for text in te]
# Feed the bag-of-words corpus into TF-IDF.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Train an LDA model with 50 topics on the bag-of-words corpus, then
# materialise the per-document topic distribution as a list.
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=50,
    passes=2000,
)
doc_topic = list(lda[corpus])

####### write the topics in file topics_result.txt ####################
topics_r = lda.print_topics(num_topics=50, num_words=10)
# The context manager closes (and therefore flushes) the file; the original
# handle was never closed.
with codecs.open(dir1 + 'output/topics_result.txt', 'w', encoding="UTF-8") as topic_name:
    for v in topics_r:
        # One "(topic id, 'weight*"word" + ...')" tuple per line.
        topic_name.write(str(v) + '\n')

    
######################  write the class results to file  #########################
###################### each document belongs to which topic ######################

# For every document write its dominant topic as "<topic id>  <probability>".
with codecs.open(dir1 + 'output/documents_result.txt', 'w', encoding="UTF-8") as fp2:
    for t in doc_topic:
        if not t:
            # No (topic, probability) pair was returned for this document;
            # emit an empty line so the output stays one line per document
            # instead of crashing on max() of an empty sequence.
            fp2.write('\n')
            continue
        # Compare on the probability only. max() returns the first pair with
        # the highest probability, which matches the original scan order but
        # can no longer match a topic id by accident.
        best_topic, best_prob = max(t, key=lambda pair: pair[1])
        fp2.write(str(best_topic) + '  ' + str(best_prob) + '\n')
################################ OVER ############################################

# Train an LDA model with 50 topics and collect the per-document topics.
# NOTE(review): this repeats the training step that appears earlier in this
# script with identical arguments, and it rebinds `lda` and `doc_topic`
# after the result files have been written - confirm whether it is needed.
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=50,
    passes=2000,
)
doc_topic = list(lda[corpus])

運行效果:不知道是因爲只用了單篇文章,還是哪裏沒弄對,LDA並沒有給出具體的主題,也沒有實現對多文檔的歸類功能。(注:上面的代碼把語料文件的每一行當作一篇文檔,空行對應空文檔;空文檔在50個主題上得到均勻分佈 1/50 = 0.02,這很可能就是下面大量「0  0.0199…」結果的來源。)
1>.documents_result.txt

41  0.8911111111111114
22  0.988604651162792
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
41  0.6289422620757548
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
43  0.9711764705882348
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
28  0.9772093023255799
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
41  0.7800000000000002
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
8  0.9591666666666674
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
13  0.8775000000000017
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
13  0.9608000000000005
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
14  0.7695384615384631
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
30  0.9387500000000005
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
9  0.9683870967741941
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
38  0.8366666666666672
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
49  0.981153846153846
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
14  0.9777272727272702
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
31  0.9920325203247484
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
26  0.7549999999999999
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
35  0.9821818181818174
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
21  0.9833898305084731
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
27  0.9711764705882333
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
21  0.984444444444443
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
41  0.7525000000000003
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
8  0.986756756756759
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
19  0.9920967741935494
0  0.01999999999999999
0  0.01999999999999999
0  0.01999999999999999
45  0.9836666666666665
0  0.01999999999999999
15  0.5099999999999999

2>.topics_result.txt

(0, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(1, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(2, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(3, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(4, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(5, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(6, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(7, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(8, '0.086*"創新" + 0.057*"標準" + 0.048*"技術" + 0.038*"銀行" + 0.038*"市場" + 0.038*"間" + 0.038*"體系" + 0.038*"金融" + 0.029*"科技" + 0.029*"標準化"')
(9, '0.077*"技術" + 0.052*"產品" + 0.052*"未來" + 0.052*"世界" + 0.026*"很大" + 0.026*"帶來" + 0.026*"隨之而來" + 0.026*"新" + 0.026*"顛覆性" + 0.026*"充分運用"')
(10, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(11, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(12, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(13, '0.093*"世界" + 0.074*"科技" + 0.074*"金融" + 0.056*"技術" + 0.056*"支撐" + 0.056*"標準化" + 0.056*"未來" + 0.056*"元素" + 0.037*"工作" + 0.037*"一是"')
(14, '0.098*"標準" + 0.049*"工作組" + 0.049*"技術標準" + 0.039*"銀行" + 0.039*"市場" + 0.039*"間" + 0.030*"體系" + 0.020*"區塊" + 0.020*"人工智能" + 0.020*"包括"')
(15, '0.101*"途徑" + 0.002*"業務" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"sc9" + 0.002*"wg1" + 0.002*"此項工作" + 0.002*"金標委" + 0.002*"提升"')
(16, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(17, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(18, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(19, '0.061*"市場" + 0.046*"銀行" + 0.030*"間" + 0.030*"建設" + 0.023*"技術" + 0.023*"金融" + 0.023*"中國" + 0.023*"中國外匯交易中心" + 0.023*"基礎設施" + 0.023*"平臺"')
(20, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(21, '0.047*"金融" + 0.047*"標準" + 0.047*"國際標準" + 0.031*"市場" + 0.031*"科技" + 0.031*"間" + 0.031*"銀行" + 0.023*"發展" + 0.023*"我國" + 0.023*"中國外匯交易中心"')
(22, '0.065*"金融" + 0.065*"市場" + 0.054*"間" + 0.043*"科技" + 0.032*"銀行" + 0.032*"發展" + 0.021*"影響" + 0.021*"顛覆性" + 0.021*"交易量" + 0.021*"中"')
(23, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(24, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(25, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(26, '0.084*"三是" + 0.084*"國際標準" + 0.084*"兼容" + 0.002*"核心成員" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"各項" + 0.002*"業務" + 0.002*"大部分"')
(27, '0.072*"fix" + 0.048*"相關" + 0.048*"外" + 0.048*"組織" + 0.048*"工作組" + 0.048*"國際標準" + 0.024*"exchange" + 0.024*"參與" + 0.024*"iso" + 0.024*"互聯互通"')
(28, '0.057*"新" + 0.057*"活動" + 0.038*"世界" + 0.038*"物" + 0.038*"機" + 0.038*"融合" + 0.019*"鏈" + 0.019*"區塊" + 0.019*"中" + 0.019*"未來"')
(29, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(30, '0.125*"貨幣" + 0.042*"世界" + 0.042*"金融交易" + 0.042*"數字" + 0.042*"介質" + 0.042*"思考" + 0.042*"構建" + 0.042*"如果說" + 0.042*"實物" + 0.042*"轉換"')
(31, '0.069*"間" + 0.054*"銀行" + 0.054*"市場" + 0.046*"標準" + 0.038*"生態圈" + 0.031*"機構" + 0.015*"機制" + 0.015*"服務" + 0.015*"開發" + 0.015*"深度"')
(32, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(33, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(34, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(35, '0.095*"標準" + 0.048*"iso" + 0.048*"語義" + 0.048*"各項" + 0.032*"統一" + 0.032*"組織" + 0.032*"建立" + 0.032*"工作組" + 0.016*"開放" + 0.016*"門戶"')
(36, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(37, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(38, '0.072*"二是" + 0.072*"建立" + 0.072*"體系" + 0.072*"生態圈" + 0.072*"標準" + 0.001*"sc9" + 0.001*"模型" + 0.001*"fix" + 0.001*"核心成員" + 0.001*"semantic"')
(39, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(40, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(41, '0.114*"銀行" + 0.114*"科技" + 0.113*"間" + 0.112*"市場" + 0.112*"金融" + 0.088*"標準化" + 0.059*"建設" + 0.030*"\ufeff" + 0.001*"此項工作" + 0.001*"核心成員"')
(42, '0.002*"sc9" + 0.002*"此項工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成員" + 0.002*"各項" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(43, '0.048*"世界" + 0.048*"外匯交易" + 0.048*"做" + 0.048*"程序" + 0.048*"交易" + 0.024*"半神" + 0.024*"另一端" + 0.024*"構築" + 0.024*"很難" + 0.024*"創造"')

4.從《銀行間市場》文章中提取20個關鍵詞

import jieba
import jieba.analyse as janali #這裏必須導入
#sentence = '全國港澳研究會會長徐澤在會上發言指出,學習系列重要講話要深刻領會 主席關於香港迴歸後的憲制基礎和憲制秩序的論述,這是過去20年特別是中共十八大以來"一國兩制"在香港實踐取得成功的根本經驗。首先,要在夯實 香港的憲制基礎、鞏固香港的憲制秩序上着力。只有牢牢確立起"一國兩制"的憲制秩序,才能保證"一國兩制"實踐不走樣 、不變形。其次,要在完善基本法實施的制度和機制上用功。中央直接行使的權力和特區高度自治權的結合是特區憲制秩 序不可或缺的兩個方面,同時必須切實建立以行政長官爲核心的行政主導體制。第三,要切實加強香港社會特別是針對公 職人員和青少年的憲法、基本法宣傳,牢固樹立"一國"意識,堅守"一國"原則。第四,要努力在全社會形成聚焦發展、抵 制泛政治化的氛圍和勢能,全面準確理解和落實基本法有關經濟事務的規定,使香港繼續在國家發展中發揮獨特作用並由 此讓最廣大民衆獲得更實在的利益。' 
# Read the article. "utf-8-sig" drops a leading byte-order mark so it does
# not become part of the text; the file is only read, so 'r' is sufficient.
with open(dir1 + 'input/copurs.txt', 'r', encoding="utf-8-sig") as fp:
    content1 = fp.readlines()
# Concatenate all lines into one string. Blank lines strip to '' and so
# contribute nothing (the original `len(line)==0` test could never be true
# for readlines() output, which always carries the line terminator).
sentence = ''.join(line.strip() for line in content1)
janali.set_stop_words(dir1 + 'stopWords/stopwords.txt')  # load the stop-word dictionary
jieba.load_userdict(dir1 + 'input/cefts_dict.txt')       # load the user-defined dictionary
# Top 20 keywords with their weights, restricted to the POS tags n / nr / ns.
keywords = janali.extract_tags(sentence, topK=20, withWeight=True, allowPOS=('n', 'nr', 'ns'))

效果

('銀行間市場', 0.6035416603405825)
('金融科技', 0.27855768938796116)
('標準', 0.2627427868752427)
('標準體系', 0.23213140782330097)
('中國外匯交易中心', 0.172578511944)
('銀行間市場技術標準工作組', 0.16249198547631066)
('金融科技標準化', 0.13927884469398058)
('國際標準', 0.1262114562056505)
('創新', 0.12261755526291263)
('生態圈', 0.11933242465533982)
('ISO', 0.11606570391165048)
('建設', 0.11446558441118447)
('技術', 0.10996793425794174)
('金融', 0.10195540484135922)
('標準化', 0.0947095517924272)
('世界', 0.09328703584712622)
('FIX', 0.09285256312932039)
('顛覆性', 0.08509699940038834)
('工作組', 0.08246122214097089)
('發展', 0.07636060204823301)

5.補充例程(LDA代碼及數據結構、存儲結構解析)

from gensim import corpora, models, similarities
from pprint import pprint

# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


if __name__ == '__main__':
    # Demo: build a dictionary and bag-of-words corpus from a small text
    # file, then run TF-IDF, LSI, LDA and HDP on it and print the results.
    stop_list = set('for a of the and to in'.split())
    # One document per line: lower-case, split on whitespace, drop stop
    # words. The context manager closes the file once it has been read.
    with open('../data/LDA_test.txt', encoding='utf-8') as f:
        texts = [[word for word in line.strip().lower().split() if word not in stop_list] for line in f]
    print('Text = ')
    pprint(texts)

    # `dictionary` is a gensim.corpora.dictionary.Dictionary; iterating its
    # items shows how it is organised (integer id -> token).
    dictionary = corpora.Dictionary(texts)
    for key, value in dictionary.items():
        print(key, '===========', value)
    V = len(dictionary)
    # `corpus` is a list with one entry per document; each entry is a list
    # of (token id, count) tuples.
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Build the gensim.models.tfidfmodel.TfidfModel once and reuse it
    # (the original constructed it three times).
    tfidf_model = models.TfidfModel(corpus)
    # TF-IDF weights for every token of every document; use
    # tfidf_model[corpus[0]] to transform only the first document.
    corpus_tfidf = tfidf_model[corpus]
    # Prints e.g. TfidfModel(num_docs=9, num_nnz=51): 9 documents and
    # 51 non-zero (document, token) entries.
    print(tfidf_model)
    print('TF-IDF:')
    for c in corpus_tfidf:  # one list of (token id, tf-idf weight) per document
        print(c)

    print('\nLSI Model:')
    lsi = models.LsiModel(corpus_tfidf, num_topics=3, id2word=dictionary)
    topic_result = list(lsi[corpus_tfidf])
    pprint(topic_result)
    print('LSI Topics:')
    pprint(lsi.print_topics(num_topics=3, num_words=5))
    # Pairwise similarity between the documents in LSI space.
    similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])   # similarities.Similarity()
    print('Similarity:')
    # In the sample run the first document is most similar to the third
    # and fourth documents.
    pprint(list(similarity))

    print('\nLDA Model:')
    num_topics = 2
    # `lda` is a gensim.models.ldamodel.LdaModel.
    lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                          alpha='auto', eta='auto', minimum_probability=0.001)
    doc_topic = list(lda[corpus_tfidf])  # topic distribution of every document
    # Rebound on purpose: only the first document's distribution is printed.
    doc_topic = list(lda[corpus_tfidf[0]])
    print('Document-Topic:\n')
    pprint(doc_topic)
    # Topic distribution of every document; pass corpus_tfidf[0] instead to
    # get the scores of a single document.
    for topics_of_doc in lda.get_document_topics(corpus_tfidf):
        print(topics_of_doc)
    for topic_id in range(num_topics):
        print('Topic', topic_id)
        # lda.get_topic_terms(topicid=topic_id) returns token ids instead of words.
        pprint(lda.show_topic(topic_id))
    similarity = similarities.MatrixSimilarity(lda[corpus_tfidf])
    print('Similarity:')
    pprint(list(similarity))
    for top in lda.print_topics(10):
        print(top)  # each topic as a weighted linear combination of keywords
    hda = models.HdpModel(corpus_tfidf, id2word=dictionary)
    topic_result = list(hda[corpus_tfidf])
    print('\n\nUSE WITH CARE--\nHDA Model:')
    pprint(topic_result)
    print('HDA Topics:')
    print(hda.print_topics(num_topics=2, num_words=5))
   
Text = 
[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]
17 =========== relation
18 =========== perceived
34 =========== quasi
28 =========== paths
23 =========== random
29 =========== widths
19 =========== measurement
2 =========== human
20 =========== error
27 =========== graph
21 =========== binary
1 =========== applications
8 =========== response
9 =========== user
25 =========== generation
5 =========== computer
16 =========== engineering
14 =========== management
7 =========== opinion
32 =========== ordering
10 =========== survey
26 =========== intersection
0 =========== abc
30 =========== well
15 =========== testing
31 =========== iv
13 =========== eps
24 =========== unordered
4 =========== interface
11 =========== system
22 =========== trees
3 =========== lab
33 =========== minors
12 =========== time
6 =========== machine
TfidfModel(num_docs=9, num_nnz=51)
TF-IDF:
[(0, 0.4301019571350565), (1, 0.4301019571350565), (2, 0.2944198962221451), (3, 0.4301019571350565), (4, 0.2944198962221451), (5, 0.2944198962221451), (6, 0.4301019571350565)]
[(5, 0.3726494271826947), (7, 0.5443832091958983), (8, 0.3726494271826947), (9, 0.27219160459794917), (10, 0.3726494271826947), (11, 0.27219160459794917), (12, 0.3726494271826947)]
[(4, 0.438482464916089), (9, 0.32027755044706185), (11, 0.32027755044706185), (13, 0.438482464916089), (14, 0.6405551008941237)]
[(2, 0.3449874408519962), (11, 0.5039733231394895), (13, 0.3449874408519962), (15, 0.5039733231394895), (16, 0.5039733231394895)]
[(8, 0.30055933182961736), (9, 0.21953536176370683), (12, 0.30055933182961736), (17, 0.43907072352741366), (18, 0.43907072352741366), (19, 0.43907072352741366), (20, 0.43907072352741366)]
[(21, 0.48507125007266594), (22, 0.24253562503633297), (23, 0.48507125007266594), (24, 0.48507125007266594), (25, 0.48507125007266594)]
[(22, 0.31622776601683794), (26, 0.6324555320336759), (27, 0.31622776601683794), (28, 0.6324555320336759)]
[(22, 0.20466057569885868), (27, 0.20466057569885868), (29, 0.40932115139771735), (30, 0.40932115139771735), (31, 0.40932115139771735), (32, 0.40932115139771735), (33, 0.2801947048062438), (34, 0.40932115139771735)]
[(10, 0.6282580468670046), (27, 0.45889394536615247), (33, 0.6282580468670046)]

LSI Model:
[[(0, 0.34057117986841867), (1, -0.20602251622679593), (2, 0.2516322932612177)],
 [(0, 0.6933040002171559),
  (1, 0.007232758390388435),
  (2, -0.42828031011610446)],
 [(0, 0.5902607670389721), (1, -0.3526046949085568), (2, 0.30883209258107536)],
 [(0, 0.5214901821825134), (1, -0.33887976154055316), (2, 0.4328304015902547)],
 [(0, 0.3953319317635442),
  (1, -0.059192853366600705),
  (2, -0.6817088379096499)],
 [(0, 0.036353173528493994), (1, 0.18146550208819023), (2, 0.2040948457195041)],
 [(0, 0.14709012328778975), (1, 0.49432948127822346), (2, 0.2520741552399365)],
 [(0, 0.21407117317565383), (1, 0.6406456664453949), (2, 0.21254395627262415)],
 [(0, 0.40066568318170775), (1, 0.6413108299093997), (2, -0.04311301052997242)]]
LSI Topics:
[(0,
  '0.400*"system" + 0.318*"survey" + 0.290*"user" + 0.274*"eps" + '
  '0.236*"management"'),
 (1,
  '0.421*"minors" + 0.420*"graph" + 0.293*"survey" + 0.239*"trees" + '
  '0.226*"paths"'),
 (2,
  '-0.318*"response" + -0.318*"time" + -0.261*"error" + -0.261*"perceived" + '
  '-0.261*"relation"')]
Similarity:
[array([ 1.        ,  0.33056617,  0.989982  ,  0.99866086, -0.06638948,
        0.20311232,  0.04321319, -0.01678792, -0.0182744 ], dtype=float32),
 array([ 0.33056617,  0.99999994,  0.4467398 ,  0.28133896,  0.87825346,
       -0.271207  , -0.0051405 ,  0.10747484,  0.48745936], dtype=float32),
 array([ 0.989982  ,  0.4467398 ,  1.        ,  0.9822476 ,  0.07334356,
        0.09873419, -0.02226283, -0.06350847, -0.00516158], dtype=float32),
 array([ 0.99866086,  0.28133896,  0.9822476 ,  1.0000001 , -0.11496954,
        0.21939825,  0.04205609, -0.02510544, -0.04712418], dtype=float32),
 array([-0.06638948,  0.87825346,  0.07334356, -0.11496954,  1.        ,
       -0.62235594, -0.31511295, -0.17545702,  0.2503119 ], dtype=float32),
 array([ 0.20311232, -0.271207  ,  0.09873419,  0.21939825, -0.62235594,
        0.99999994,  0.9262762 ,  0.85813475,  0.5853234 ], dtype=float32),
 array([ 0.04321319, -0.0051405 , -0.02226283,  0.04205609, -0.31511295,
        0.9262762 ,  1.        ,  0.9883332 ,  0.83967173], dtype=float32),
 array([-0.01678792,  0.10747484, -0.06350847, -0.02510544, -0.17545702,
        0.85813475,  0.9883332 ,  1.        ,  0.9088737 ], dtype=float32),
 array([-0.0182744 ,  0.48745936, -0.00516158, -0.04712418,  0.2503119 ,
        0.5853234 ,  0.83967173,  0.9088737 ,  1.        ], dtype=float32)]

LDA Model:
Document-Topic:

[(0, 0.2965872083745731), (1, 0.7034127916254268)]
[(0, 0.2964665697022038), (1, 0.7035334302977961)]
[(0, 0.24696754833255918), (1, 0.7530324516674408)]
[(0, 0.3391022982510855), (1, 0.6608977017489145)]
[(0, 0.5928334226714413), (1, 0.40716657732855865)]
[(0, 0.24102655276664547), (1, 0.7589734472333545)]
[(0, 0.2841667707331107), (1, 0.7158332292668892)]
[(0, 0.34667048063848893), (1, 0.6533295193615112)]
[(0, 0.7336864063998245), (1, 0.2663135936001754)]
[(0, 0.5700078334633175), (1, 0.4299921665366824)]
Topic 0
[('minors', 0.041682411039787835),
 ('graph', 0.041520911134093876),
 ('system', 0.038329206254127626),
 ('survey', 0.03753585153191179),
 ('eps', 0.0351287056411886),
 ('human', 0.033430984047230196),
 ('trees', 0.031461369346122106),
 ('testing', 0.030907375780524896),
 ('well', 0.030744358008127703),
 ('engineering', 0.030595539801796162)]
Topic 1
[('system', 0.040118726033734085),
 ('survey', 0.03734100599657068),
 ('user', 0.03678637729732701),
 ('time', 0.03483279508494677),
 ('computer', 0.033932015303668),
 ('interface', 0.033793948617101006),
 ('response', 0.03378231469734522),
 ('trees', 0.033445559925836345),
 ('graph', 0.033062305667871876),
 ('paths', 0.03165595482730171)]
Similarity:
[array([1.        , 0.996622  , 0.9971814 , 0.8418874 , 0.9958073 ,
       0.99977267, 0.9960444 , 0.6795992 , 0.865116  ], dtype=float32),
 array([0.996622  , 1.        , 0.9876513 , 0.79472446, 0.99995595,
       0.9981465 , 0.98538244, 0.617058  , 0.821002  ], dtype=float32),
 array([0.9971814 , 0.9876513 , 1.        , 0.88000363, 0.9861372 ,
       0.995355  , 0.99990374, 0.7327231 , 0.90030956], dtype=float32),
 array([0.8418874 , 0.79472446, 0.88000363, 1.        , 0.78899217,
       0.8301899 , 0.88650906, 0.96802676, 0.99900514], dtype=float32),
 array([0.9958073 , 0.99995595, 0.9861372 , 0.78899217, 1.        ,
       0.9975313 , 0.98374   , 0.6096444 , 0.8156069 ], dtype=float32),
 array([0.99977267, 0.9981465 , 0.995355  , 0.8301899 , 0.9975313 ,
       1.        , 0.9939234 , 0.6638038 , 0.85422516], dtype=float32),
 array([0.9960444 , 0.98538244, 0.99990374, 0.88650906, 0.98374   ,
       0.9939234 , 1.        , 0.74209493, 0.90626204], dtype=float32),
 array([0.6795992 , 0.617058  , 0.7327231 , 0.96802676, 0.6096444 ,
       0.6638038 , 0.74209493, 0.99999994, 0.95587707], dtype=float32),
 array([0.865116  , 0.821002  , 0.90030956, 0.99900514, 0.8156069 ,
       0.85422516, 0.90626204, 0.95587707, 1.        ], dtype=float32)]
(0, '0.042*"minors" + 0.042*"graph" + 0.038*"system" + 0.038*"survey" + 0.035*"eps" + 0.033*"human" + 0.031*"trees" + 0.031*"testing" + 0.031*"well" + 0.031*"engineering"')
(1, '0.040*"system" + 0.037*"survey" + 0.037*"user" + 0.035*"time" + 0.034*"computer" + 0.034*"interface" + 0.034*"response" + 0.033*"trees" + 0.033*"graph" + 0.032*"paths"')


USE WITH CARE--
HDA Model:
[[(0, 0.5565702203077617),
  (1, 0.054574104632418734),
  (2, 0.2729517480403645),
  (3, 0.029618745787742838),
  (4, 0.0222580699344652),
  (5, 0.01655557651607404),
  (6, 0.012229490830839239)],
 [(0, 0.5205840667404564),
  (1, 0.3231107910741006),
  (2, 0.03961346309936006),
  (3, 0.029821838447607565),
  (4, 0.02240797412838224),
  (5, 0.016668006423738994),
  (6, 0.012312527505113606)],
 [(0, 0.517790262705016),
  (1, 0.06191946439382389),
  (2, 0.28805465028468186),
  (3, 0.03377804590557444),
  (4, 0.025396487881103134),
  (5, 0.01889150110691897),
  (6, 0.013955025456098604),
  (7, 0.010465705629518941)],
 [(0, 0.7643410301426959),
  (1, 0.061048475498996585),
  (2, 0.04417331958826799),
  (3, 0.03332724492765827),
  (4, 0.025048725716184123),
  (5, 0.018632957487399803),
  (6, 0.013764043119053765),
  (7, 0.010322476602212013)],
 [(0, 0.0757181237596145),
  (1, 0.7679391858106653),
  (2, 0.039586714359868345),
  (3, 0.029828166513061593),
  (4, 0.02242222612483417),
  (5, 0.016679291901008995),
  (6, 0.012320882327277141)],
 [(0, 0.08089166189934743),
  (1, 0.7435023552504705),
  (2, 0.044403927330132396),
  (3, 0.033510143050357315),
  (4, 0.025198860042726232),
  (5, 0.01874461730245409),
  (6, 0.013846529244831806),
  (7, 0.010384337848137939)],
 [(0, 0.7398447964237507),
  (1, 0.06698283248044715),
  (2, 0.04899313390216687),
  (3, 0.036862205074928295),
  (4, 0.02768180787661879),
  (5, 0.02059139176174965),
  (6, 0.01521071547172867),
  (7, 0.011407422425566253)],
 [(0, 0.7962703635234022),
  (1, 0.05398861469394166),
  (2, 0.03795941400773601),
  (3, 0.028556923952855452),
  (4, 0.021467485379882995),
  (5, 0.015968638663872797),
  (6, 0.011795927264606435)],
 [(0, 0.5331648916261047),
  (1, 0.2608024607901832),
  (2, 0.05217821699693843),
  (3, 0.03934504740893295),
  (4, 0.02953790052514554),
  (5, 0.02197118958402161),
  (6, 0.0162299684418014),
  (7, 0.012171820906447469)]]
HDA Topics:
[(0, '0.088*management + 0.087*abc + 0.085*ordering + 0.068*system + 0.053*human'), (1, '0.112*random + 0.105*error + 0.072*eps + 0.062*measurement + 0.058*abc')]

注:LDA_test.txt數據:

Human machine interface for lab abc computer applications
A survey of user opinion of computer system response time
The EPS user interface management system
System and human system engineering testing of EPS
Relation of user perceived response time to error measurement
The generation of random binary unordered trees
The intersection graph of paths in trees
Graph minors IV Widths of trees and well quasi ordering
Graph minors A survey

參考

  1. https://blog.csdn.net/sinat_34022298/article/details/75943272
  2. https://blog.csdn.net/weixin_40662229/article/details/80802325
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章