# ELMo debugging practice (blog-scrape title, converted to a comment so the file parses)

import tensorflow_hub as hub
import tensorflow as tf
import re
import numpy as np
import pickle
import pandas as pd
# Fixed typo: original imported non-existent `WordNeatLemmatizer`, which
# raises ImportError before anything else runs.
from nltk import WordNetLemmatizer, word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras import Model, Input, layers
from keras.utils.np_utils import to_categorical

# English stopword list consumed by clean() below.
stoplist = stopwords.words('english')
# Pre-trained ELMo module from TF Hub; trainable=True exposes its weights
# for fine-tuning (requires a TF1-style session to evaluate).
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
# Example usage:
# x = ["Roasted ants are a popular snack in Columbia"]
# embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

# Show full (untruncated) column contents; None is the supported spelling —
# the original -1 sentinel is deprecated in modern pandas.
pd.set_option('display.max_colwidth', None)

# Kaggle sentiment-analysis data (Colab paths): phrases + integer labels.
data_train = pd.read_csv(r'/content/sample_data/train.tsv', sep='\t')
data_test = pd.read_csv(r'/content/sample_data/test.tsv', sep='\t')
sub = pd.read_csv(r'/content/sample_data/sampleSubmission.csv')
data_train_X = data_train.Phrase.values
X_test = list(data_test.Phrase.values)
data_train_Y = list(data_train.Sentiment.values)
lemmat = WordNetLemmatizer()
def clean(data):
    """Normalize raw phrases for embedding.

    For each phrase: strip non-letters, tokenize, drop English stopwords,
    lowercase, and lemmatize, then re-join into one space-separated string.

    Parameters
    ----------
    data : iterable of str
        Raw phrase strings.

    Returns
    -------
    list of str
        One cleaned string per input phrase, in input order.
    """
    # Keep only ASCII letters; every other character becomes a space.
    data = [re.sub('[^a-zA-Z]', ' ', phrase) for phrase in data]
    data_x = []
    for phrase in data:
        tokens = word_tokenize(phrase)  # split into word tokens
        # Drop stopwords. The original loop called list.remove() while
        # iterating the same list, which silently skipped consecutive
        # stopwords; a comprehension removes them all correctly.
        kept = [w for w in tokens if w not in stoplist]
        # If the phrase consisted entirely of stopwords, keep the original
        # tokens rather than emptying the phrase (the original code's
        # unreachable `elif data_word == []` branch attempted this).
        if not kept:
            kept = tokens
        # Lowercase + lemmatize to collapse inflected forms.
        lemmas = [lemmat.lemmatize(w.lower()) for w in kept]
        data_x.append(' '.join(lemmas))
    return data_x
def elmo_vector(x):
    """Embed a batch of strings with ELMo, mean-pooled over tokens.

    Parameters
    ----------
    x : list of str
        Batch of phrases to embed.

    Returns
    -------
    numpy.ndarray
        One pooled embedding vector per input phrase.

    NOTE(review): relies on the module-level `elmo` hub module and spins up
    a fresh TF1 session per call, re-initializing all variables each time —
    correct but expensive; confirm this matches the intended TF1 workflow.
    """
    token_embeddings = elmo(x, signature='default', as_dict=True)["elmo"]
    pooled = tf.reduce_mean(token_embeddings, 1)
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        result = sess.run(pooled)
    finally:
        sess.close()
    return result
# Clean the training phrases and one-hot encode the integer sentiment labels.
data_train_X = clean(data_train_X)
data_train_Y = to_categorical(data_train_Y)
# Embed in chunks of 250 phrases so each TF session stays within memory.
data_train_X_list = [data_train_X[i:i + 250] for i in range(0, len(data_train_X), 250)]
elmo_train = [elmo_vector(batch) for batch in data_train_X_list]
elmo_train_new = np.concatenate(elmo_train, axis=0)
# Persist the embeddings. `with` guarantees the handle is closed even if
# pickling raises (the original bare open/close leaked it on error).
with open('elmo_train.pickle', 'wb') as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)


# X_train,X_val,Y_train,Y_val = train_test_split(data_train_X,data_train_Y,stratify=data_train_Y,test_size=0.2,random_state=123)#切分數據

 

# Blog-scrape footer residue (comment-section boilerplate), commented out so the file parses:
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# 相關文章