# coding = utf8
"""Train a Word2Vec model from a CSV of questions.

Reads ``question.csv`` (gb18030-encoded) from the current directory, keeps
only lines containing Chinese characters, splits them into sentences on
Chinese/ASCII end-of-sentence punctuation, tokenizes each sentence, trains
a gensim Word2Vec model and saves it to disk.
"""
import csv
import os
import pickle
from collections import deque

import gensim
import numpy as np
import psutil

import classify
import create_dict
import parameters
import processHandler
import vectorize
import wordcut


def cos_sim(arrA, arrB):
    """Return the cosine similarity of two 1-D numpy arrays.

    NOTE(review): raises ZeroDivisionError/returns nan for zero-norm input —
    callers are assumed to pass non-zero vectors (unverified here).
    """
    return arrA.dot(arrB) / (np.linalg.norm(arrA) * np.linalg.norm(arrB))


def main():
    """Build the training corpus and train/save the Word2Vec model."""
    current_dir = os.path.abspath('.')
    stopword_set = set()  # try using stop-words (currently empty)
    parameter = parameters.Parameters(
        os.path.join(current_dir, 'config.ini'), stopword_set)

    question_path = os.path.join(current_dir, 'question.csv')
    w2v_file = os.path.join(current_dir, 'w2v_file_2017012.bin')

    word_cutter = wordcut.WordCutter(overlapping=parameter.overlapping)
    # "Prerocessor" is the class's actual (misspelled) name in processHandler.
    preprocessor = processHandler.Prerocessor(False, False, False)

    # Load raw lines; `with` guarantees the handle is closed even on error.
    trainingset = []
    with open(question_path, encoding='gb18030') as file:
        cache = file.readlines()
    for item in cache:
        trainingset.append(item.replace('\n', '').replace('\r', ''))

    # Keep only lines containing Chinese; split them into sentences on
    # end-of-sentence punctuation (full-width and ASCII variants).
    sentence_list = deque()
    while trainingset:
        last = trainingset.pop()
        if any(word_cutter.is_chinese(char) for char in last):
            temp = last
            for symbol in (u'。', u'?', u'!', '!', '?'):
                temp = temp.replace(symbol, ' ')
            for sentence in temp.split():
                sentence_list.append(sentence)
    del trainingset  # release raw corpus before tokenization

    # Tokenize each sentence; process_main returns a list whose last element
    # is the token sequence (or None to drop the sentence).
    sentence_token = deque()
    total = len(sentence_list)
    i = 0
    while sentence_list:
        i += 1
        temp = preprocessor.process_main(sentence_list.pop())[-1]
        if temp is not None:
            sentence_token.append(temp)
        if i >= 10000:
            # Progress report: [remaining, total]
            print([len(sentence_list), total])
            i = 0

    # sg defaults to 0 (CBOW); sg=1 represents using skip-gram.
    # NOTE(review): `size`/`iter` are the pre-4.0 gensim keyword names
    # (renamed vector_size/epochs in gensim 4) — confirm installed version.
    dic = gensim.models.Word2Vec(
        sentence_token, size=parameter.n_neuron, workers=3, seed=1024, iter=20)
    sentence_token = deque()
    dic.save(w2v_file)


if __name__ == '__main__':
    main()
【word2vec實例1】
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布。