from gensim.models import Word2Vec,FastText
from keras.preprocessing.text import Tokenizer
import jieba
from keras.preprocessing.sequence import pad_sequences
import os
from collections import Counter
# Pin CUDA device enumeration to PCI bus order and restrict TF/Keras to GPU 1.
os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
from data.data_process import run
# run() is a project-local loader; presumably returns train/test DataFrames
# each with an `ocr` text column — TODO confirm against data/data_process.py.
train_df,test_df = run()
# Word-level corpus: jieba-segment each OCR string, re-join tokens with spaces.
data_word = [' ' .join(jieba.lcut(x)) for x in train_df.ocr]
def get_wordcount(data_word):
    """Return the number of distinct space-delimited tokens across all docs.

    Each element of *data_word* is split on a single space character, so
    consecutive spaces contribute empty-string tokens (same as the original).
    """
    vocab = {token for doc in data_word for token in doc.split(' ')}
    return len(vocab)
# data_len_word = get_wordcount(data_word)
# NOTE(review): data_word is already a list (built by a list comprehension
# above), so this list() call is a no-op copy — kept as-is.
data_word = (list(data_word))
# Char-level corpus: join every character of each OCR string with spaces.
# Not used in the visible part of this file — presumably consumed further
# down or by another module; verify before removing.
data_char = [' '.join(x) for x in train_df.ocr]
def set_tokenizer(docs, split_char=' ', max_len=100):
    """Fit a Keras Tokenizer on *docs* and return padded integer sequences.

    Parameters:
        docs: iterable of strings, tokens separated by *split_char*.
        split_char: token separator passed to the Tokenizer.
        max_len: sequences are pre-padded/truncated to this length.

    Returns:
        (X, word_index): X is an array of shape (len(docs), max_len);
        word_index maps each token to its integer id.
    """
    tok = Tokenizer(lower=False, char_level=False, split=split_char)
    tok.fit_on_texts(docs)
    sequences = tok.texts_to_sequences(docs)
    # pad_sequences defaults to pre-padding/pre-truncation with value=0.
    padded = pad_sequences(sequences, maxlen=max_len, value=0)
    return padded, tok.word_index
# Word-level sequences padded/truncated to 330 tokens; `index` is the vocab map.
X,index = set_tokenizer(data_word,max_len= 330)
def train_word2vec(docs, embed_size=300, save_name='w2v.txt', split_char=' '):
    """Train a skip-gram Word2Vec model on pre-joined documents and save it.

    Parameters:
        docs: iterable of strings; each is split on *split_char* into tokens.
        embed_size: embedding dimensionality.
        save_name: path the trained model is saved to (gensim native format,
            regardless of the .txt extension).
        split_char: token separator.

    Returns:
        The trained gensim Word2Vec model.
    """
    tokenized = [doc.split(split_char) for doc in docs]
    # NOTE(review): `size` and `iter` are the gensim < 4.0 parameter names
    # (renamed vector_size/epochs in 4.x) — kept as-is for the pinned version.
    model = Word2Vec(tokenized, size=embed_size, sg=1, window=8, seed=1024,
                     workers=24, min_count=1, iter=10)
    model.save(save_name)
    return model
# NOTE(review): the four lines below were pasted comment-widget text from a
# code-sharing site ("post a comment / all comments / no one has commented
# yet..."), not Python — they broke the file with SyntaxErrors. Commented out
# rather than deleted so the original paste is still visible:
# 啊
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.