- import jieba, os
- import codecs
- from gensim import corpora, models, similarities
- from pprint import pprint
- from collections import defaultdict
- import sys
- import pickle
# Python 2 only: make implicit str <-> unicode conversions use UTF-8 instead
# of ASCII. The interpreter's startup code removes ``sys.setdefaultencoding``,
# so ``sys`` has to be reloaded first to get it back.
# On Python 3 ``reload`` is not a builtin and the default encoding is already
# UTF-8, so the whole step is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')
def print_dict(dict):
    """Print all entries of a mapping on a single, newline-terminated line.

    Each entry is rendered as ``<type of key> <key> <value>`` and entries are
    separated by one space. An empty mapping produces an empty line.

    The previous version used a trailing-comma ``print`` statement, which
    never ended the line, so whatever was printed next was appended to the
    dump. Building the line first and printing it once fixes that and works
    on both Python 2 and Python 3.

    Args:
        dict: the mapping to dump. The name shadows the builtin, but it is
            kept so existing keyword callers keep working.
    """
    entries = ["%s %s %s" % (type(key), key, dict[key]) for key in dict]
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(" ".join(entries))
def test3():
    """Demonstrate the basic ``gensim.corpora.Dictionary`` API on a toy corpus.

    Builds a dictionary from two tiny tokenised documents, prints its
    statistics and id/token mappings, converts a third document to a
    bag-of-words with ``doc2bow`` (also reporting the tokens the dictionary
    does not know), and finally prunes the dictionary with
    ``filter_extremes`` and prints what is left.

    Everything is written to stdout; nothing is returned.
    """
    a = [['一', '一', '二'], ['一', '二', '三']]
    b = ['一', '一', '三', '四', '四']
    dictionary = corpora.Dictionary(a)

    print("########dictionary信息##########")
    print(str(dictionary))
    print("字典,{單詞id,在多少文檔中出現}")
    print(dictionary.dfs)  # {token id: number of documents containing it}
    print("文檔數目")
    print(dictionary.num_docs)  # number of documents processed
    print("dictionary.items()")
    print_dict(dict(dictionary.items()))
    print("字典,{單詞id,對應的詞}")
    # NOTE: gensim documents id2token as lazily initialised; it is expected to
    # be populated here because the items() call above looked up every id.
    print_dict(dictionary.id2token)  # {token id: token}
    print("字典,{詞,對應的單詞id}")
    print_dict(dictionary.token2id)  # {token: token id}
    print("所有詞的個數")
    print(dictionary.num_pos)  # total number of tokens seen
    print("每個文件中不重複詞個數的和")
    print(dictionary.num_nnz)  # sum over documents of their distinct tokens

    print("########doc2bow##########")
    # dictionary.add_documents([b]) would instead extend the dictionary with
    # the tokens of b.
    # allow_update   -> update the current dictionary with unseen tokens
    # return_missing -> also return the tokens that are not in the dictionary
    # result is the bag-of-words of document b: a list of (token id, count).
    result, missing = dictionary.doc2bow(b, allow_update=False, return_missing=True)
    print("詞袋b,列表[(單詞id,詞頻)]")
    print(result)
    print("不在字典中的詞及其詞頻,字典[(單詞,詞頻)]")
    print_dict(missing)

    print("########bow信息##########")
    for token_id, freq in result:
        # dictionary[token_id] goes through __getitem__, which builds the
        # reverse mapping on demand, so this does not depend on id2token
        # having been populated earlier.
        print("%s %s %s" % (token_id, dictionary[token_id], freq))

    print("########dictionary信息##########")
    # Keep only tokens that occur in at least `no_below` documents and in at
    # most `no_above` (a fraction of all documents) of them, then keep at most
    # the `keep_n` most frequent of those.
    dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=10)
    # Show the pruned dictionary; previously the header above was printed
    # with nothing after it.
    print(str(dictionary))
# Run the demo only when the file is executed as a script, so importing the
# module does not trigger it.
if __name__ == "__main__":
    test3()
【gensim--dictionary】使用方法
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。