12_16 Morning

Use a CountVectorizer object to create a binary value for each word

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# binary=True records word presence/absence (1/0) rather than counts
vectorizer = CountVectorizer(lowercase=True, analyzer='word', binary=True)
representation = vectorizer.fit_transform(corpus)
representation_df = pd.DataFrame(data=representation.toarray(), columns=sorted(vectorizer.vocabulary_.keys()))
representation_df
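The corpus variable is defined earlier in the notebook; a hypothetical two-sentence stand-in makes the snippet self-contained:

corpus = ['The cat sat on the mat', 'The dog sat on the log']

# Resulting binary DataFrame (columns are the sorted vocabulary):
#    cat  dog  log  mat  on  sat  the
# 0    1    0    0    1   1    1    1
# 1    0    1    1    0   1    1    1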

Use the CountVectorizer object to remove English stop words

# Passing a custom collection to stop_words removes those words from the vocabulary
vectorizer = CountVectorizer(lowercase=True, analyzer='word', binary=True, stop_words={'the', 'on', 'in'})
representation = vectorizer.fit_transform(corpus)
representation_df = pd.DataFrame(data=representation.toarray(), columns=sorted(vectorizer.vocabulary_.keys()))
representation_df
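Only three stop words are filtered here; scikit-learn also ships a built-in English stop-word list, selected by passing the string 'english' instead of a custom set:

vectorizer = CountVectorizer(lowercase=True, analyzer='word', binary=True, stop_words='english')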

Convert the sentences in the test corpus into a BoW representation

test_corpus = ['The keyboard sat on the mat', 'The bird sat on the mat']
# Reuse the already-fitted vectorizer: transform() maps new text onto the training vocabulary
X_test = vectorizer.transform(test_corpus)
y_test = [0, 1]
print("Expected Results for (keyboard, bird):  {}".format(y_test))
print("Actual   Results for (keyboard, bird):  {}".format(logistic.predict(X_test)))

Build and train the model

from keras.layers import Embedding, Input, Dense, Reshape, Dot
from keras.models import Model
from keras.optimizers import RMSprop



target_word = Input((1,))
context_word = Input((1,))

# An embedding layer is just a lookup table - a matrix of size vocabulary_size x EMBEDDING_SIZE
# We select input_length rows from this matrix

# Embedding layer named 'embedding_layer'; add 1 to the vocabulary size so index 0 stays available
embedding_layer = Embedding(vocabulary_size + 1, EMBEDDING_SIZE, input_length=1, name='embedding_layer')

# The output is a single similarity score between 0 and 1
output_layer = Dense(1, activation='sigmoid')

# Select the row indexed by target_word, reshape it for convenience
target_embedding = embedding_layer(target_word)
target_embedding = Reshape((EMBEDDING_SIZE,))(target_embedding)

# Select the row indexed by context_word, reshape it for convenience
context_embedding = embedding_layer(context_word)
context_embedding = Reshape((EMBEDDING_SIZE,))(context_embedding)


# Perform the dot product on the two embeddings, and run through the output sigmoid 
output = Dot(axes=1)([target_embedding, context_embedding])
output = output_layer(output)
    
# Setup a model for training
model = Model(inputs=[target_word, context_word], outputs=output)

optimizer = RMSprop(learning_rate=0.0001, rho=0.99)
model.compile(loss='binary_crossentropy', optimizer=optimizer)

model.summary()
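The post does not show how the training pairs are built. A minimal sketch, assuming the corpus has already been encoded as a list of integer word ids named token_ids (a hypothetical name), using Keras's skipgrams helper to generate positive and negative (target, context) pairs:

import numpy as np
from keras.preprocessing.sequence import skipgrams

# token_ids: the corpus as integer word ids (assumed to exist already)
pairs, labels = skipgrams(token_ids, vocabulary_size + 1, window_size=2, negative_samples=1.0)
pairs = np.array(pairs)
labels = np.array(labels)

# Column 0 holds the target word ids, column 1 the (true or negative-sampled) context word ids
model.fit([pairs[:, 0], pairs[:, 1]], labels, epochs=5, batch_size=256)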

Build and train the CBOW model

from keras.layers import Lambda
from keras import backend as K

# The input is the 2 * window_size context word ids surrounding the target word
inputs = Input((window_size * 2,))

# Shared lookup table: one EMBEDDING_SIZE-dimensional vector per word
embedding_layer = Embedding(num_classes, EMBEDDING_SIZE, input_length=2*window_size, name='embedding_layer')
# Average the context embeddings into a single vector
mean_layer = Lambda(lambda x: K.mean(x, axis=1))
# Predict the centre word as a probability distribution over the vocabulary
output_layer = Dense(num_classes, activation='softmax')

output = embedding_layer(inputs)
output = mean_layer(output)
output = output_layer(output)

model = Model(inputs=[inputs], outputs=output)

optimizer = RMSprop(learning_rate=0.1, rho=0.99)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
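Again the training data is not shown. A minimal sketch, assuming the same hypothetical token_ids list: slide a window over the corpus, take the surrounding words as the context and the centre word (one-hot encoded) as the target:

import numpy as np
from keras.utils import to_categorical

contexts, targets = [], []
for i in range(window_size, len(token_ids) - window_size):
    # Words before and after position i, excluding the centre word itself
    window = token_ids[i - window_size:i] + token_ids[i + 1:i + window_size + 1]
    contexts.append(window)
    targets.append(token_ids[i])

X = np.array(contexts)
y = to_categorical(targets, num_classes=num_classes)
model.fit(X, y, epochs=5, batch_size=256)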
