import os
# Pin the GPU before importing keras so the TensorFlow backend sees
# CUDA_VISIBLE_DEVICES when it initializes.
os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import jieba
from collections import Counter
from gensim.models import Word2Vec, FastText
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from data.data_process import run

train_df, test_df = run()
data_word = [' '.join(jieba.lcut(x)) for x in train_df.ocr]  # word-level: jieba tokens joined by spaces
def get_wordcount(data_word):
    """Count the distinct tokens across all space-separated documents."""
    word_set = set()
    for doc in data_word:
        for token in doc.split(' '):
            word_set.add(token)
    return len(word_set)
# data_len_word = get_wordcount(data_word)
# data_word is already a list, so no extra list() conversion is needed.
data_char = [' '.join(x) for x in train_df.ocr]  # char-level: individual characters joined by spaces
def set_tokenizer(docs, split_char=' ', max_len=100):
    """Fit a Keras Tokenizer on docs; return padded id sequences and the vocabulary."""
    tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)
    tokenizer.fit_on_texts(docs)
    X = tokenizer.texts_to_sequences(docs)
    X = pad_sequences(X, maxlen=max_len, value=0)
    word_index = tokenizer.word_index
    return X, word_index
X, index = set_tokenizer(data_word, max_len=330)  # word-level inputs
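# data_char is built above but never tokenized in the original flow. A minimal
# char-level sketch using the same helper; character sequences run longer than
# word sequences, so the 330 cap here is only an illustrative guess:
X_char, index_char = set_tokenizer(data_char, max_len=330)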
def train_word2vec(docs, embed_size=300, save_name='w2v.txt', split_char=' '):
    """Train a skip-gram Word2Vec model on pre-tokenized, space-joined docs."""
    input_docs = [doc.split(split_char) for doc in docs]
    # gensim < 4.0 parameter names; gensim >= 4.0 renames size -> vector_size
    # and iter -> epochs.
    w2v = Word2Vec(input_docs, size=embed_size, sg=1, window=8, seed=1024,
                   workers=24, min_count=1, iter=10)
    # Note: model.save() writes gensim's own format despite the .txt suffix;
    # use w2v.wv.save_word2vec_format(save_name) for a plain-text file.
    w2v.save(save_name)
    return w2v
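# The trained vectors still need to be aligned with the tokenizer vocabulary
# before they can initialize an Embedding layer. A minimal sketch, assuming
# the word-level `index` returned by set_tokenizer above; rows for ids with
# no trained vector stay zero:
import numpy as np

w2v = train_word2vec(data_word, embed_size=300, save_name='w2v.txt')
embedding_matrix = np.zeros((len(index) + 1, 300))  # row 0 is the padding id
for word, i in index.items():
    if word in w2v.wv:
        embedding_matrix[i] = w2v.wv[word]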
 
