make
就會多出來一個build文件夾
sh demo.sh
其中,可以在demo.sh裏面設置訓練語料路徑(默認是從網上下載一個語料,把這段刪了,改成自己的語料路徑就行了),還可以設置迭代次數、向量的維度等等,自己隨便折騰就行了。
# Demo: load the GloVe output file vectors.txt through the gensim API.
# NOTE(review): load_word2vec_format presumably needs a word2vec-style header
# line ("<vocab_size> <dims>") at the top of the file -- confirm vectors.txt
# has one, otherwise convert the file first.
# The file name previously carried a stray leading space (' vectors.txt'),
# which would make the lookup fail; it is removed here.
model = gensim.models.Word2Vec.load_word2vec_format('vectors.txt', binary=False)  # GloVe Model
def load(filename, dims=300):
    """Convert a GloVe text vector file to word2vec text format and load it.

    The function counts the lines of *filename*, writes a copy of it to
    ``glove_model.txt`` (in the current working directory) with a
    ``"<num_lines> <dims>"`` header line prepended, loads that copy through
    gensim, saves the resulting model under the ``model`` directory and
    returns it.

    More pre-trained GloVe models can be downloaded from
    http://nlp.stanford.edu/projects/glove/

    Args:
        filename: Path of the GloVe vector file in text format.
        dims: Vector dimensionality written into the header line. Defaults
            to 300, the value that used to be hard-coded.
            NOTE(review): presumably this must match the actual width of the
            vectors in *filename* -- confirm against the training settings.

    Returns:
        The model object returned by
        ``gensim.models.Word2Vec.load_word2vec_format``.
    """
    # Local import so this block does not depend on the file's import header.
    import os

    glove_file = filename
    num_lines = getFileLineNums(filename)
    # Single-argument call form prints the same under Python 2 and Python 3.
    print(num_lines)

    # Output: gensim-readable model in word2vec text format.
    gensim_file = 'glove_model.txt'
    gensim_first_line = "{} {}".format(num_lines, dims)

    # Prepend the header line; the helper is chosen by platform.
    if platform == "linux" or platform == "linux2":
        prepend_line(glove_file, gensim_file, gensim_first_line)
    else:
        prepend_slow(glove_file, gensim_file, gensim_first_line)

    # Load the newly created glove_model.txt through the gensim API.
    model = gensim.models.Word2Vec.load_word2vec_format(gensim_file, binary=False)  # GloVe Model

    # NOTE(review): the slice drops the first 5 and last 4 characters of the
    # path -- presumably a prefix such as "data/" and a ".txt" extension;
    # verify against callers.
    model_name = filename[5:-4]

    # Build the path with os.path.join instead of a hard-coded backslash so
    # the file lands inside the "model" directory on every platform.
    model_dir = 'model'
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    model.save(os.path.join(model_dir, model_name))
    return model
def getFileLineNums(filename):
    """Return the number of lines in the file at *filename*.

    The file is opened in binary mode because counting lines needs no text
    decoding, and inside a ``with`` block so the handle is always closed
    (the previous version left it open).

    Args:
        filename: Path of the file to count.

    Returns:
        The line count as an int; 0 for an empty file. A final line without
        a trailing newline still counts as one line.
    """
    with open(filename, 'rb') as f:
        return sum(1 for _ in f)
def prepend_line(infile, outfile, line):
    """Write *line* plus a newline to *outfile*, then copy *infile* after it.

    The copy is done with ``shutil.copyfileobj`` on the two open file
    objects, so *infile* is left untouched and *outfile* is overwritten.
    (source: http://stackoverflow.com/a/10850588/610569)

    Args:
        infile: Path of the file whose contents follow the new first line.
        outfile: Path of the file to create or overwrite.
        line: Value written as the first line; it is passed through ``str``.
    """
    with open(infile, 'r') as src, open(outfile, 'w') as dst:
        header = str(line) + "\n"
        dst.write(header)
        shutil.copyfileobj(src, dst)
def prepend_slow(infile, outfile, line):
    """Re-create *infile* as *outfile* with *line* inserted as the first line.

    The header is written first, then the input is streamed into the output
    one line at a time.

    Args:
        infile: Path of the file to read.
        outfile: Path of the file to create or overwrite.
        line: String written, followed by a newline, before the copied
            contents.
    """
    with open(infile, 'r') as source, open(outfile, 'w') as target:
        target.write(line + "\n")
        # writelines consumes the input file iterator line by line.
        target.writelines(source)
# Reload the saved model, print its vocabulary size, then print the ten most
# similar entries (word and score) for each query word, with a blank line
# after each group.
# NOTE(review): model_name is not assigned at module level in the visible
# code -- confirm it is defined before this snippet runs.
model = gensim.models.Word2Vec.load('model/'+model_name)
print(len(model.vocab))
word_list = [u'發燒',u'流感']
for word in word_list:
    print(' '.join((word, '--')))
    neighbours = model.most_similar(word, topn=10)
    for hit in neighbours:
        # Each hit is indexed as (word, score).
        print('%s %s' % (hit[0], hit[1]))
    print('')
結果如下
發燒 --
瘟癘 0.561131298542
多無發 0.438511788845
感冒 0.423784643412
寒戰 0.41094905138
發冷 0.400202810764
肌肉痠痛 0.394035518169
畏寒 0.391746163368
頭痛 0.390283048153
惡寒 0.387357711792
石岐 0.385719358921
流感 --
芭比 0.693880617619
嗜血 0.660785496235
H1N1 0.543790698051
肺炎 0.520848989487
流行性感冒 0.517322063446
副流感 0.51515519619
甲型 0.495822429657
肺炎球菌 0.491611480713
H10N8 0.490446418524
H3N2 0.486712753773
# Analogy-style query: ask the model for entries similar to the two positive
# terms and dissimilar to the negative term, then print each returned word
# with its score.
hits = w2v_model.most_similar(positive=['肺炎', '肺'], negative=['胃炎'])
for hit in hits:
    print('%s %s' % (hit[0], hit[1]))
肺部 0.662135243416
通氣 0.548550665379
肺泡 0.529182732105
肺氣腫 0.525536477566
慢阻 0.512038588524
胸片 0.503533244133
萎陷 0.502206265926
肺透明膜病 0.498196214437
肺段 0.492621898651