使用 CountVectorizer 對象爲每個詞創建二進制值
# Binary bag-of-words: each vocabulary word is marked 1 if it appears in the
# document and 0 otherwise (binary=True suppresses raw counts).
vectorizer = CountVectorizer(lowercase=True, analyzer='word', binary=True)
representation = vectorizer.fit_transform(corpus)
# Column order follows the alphabetically sorted vocabulary.
vocab_columns = sorted(vectorizer.vocabulary_)
representation_df = pd.DataFrame(representation.toarray(), columns=vocab_columns)
representation_df
使用 CountVectorizer 對象去掉英文停止詞
# Same binary bag-of-words setup, but with a small explicit stopword set so
# that 'the', 'on' and 'in' are excluded from the vocabulary.
vectorizer = CountVectorizer(lowercase=True, analyzer='word', binary=True,
                             stop_words={'the', 'on', 'in'})
representation = vectorizer.fit_transform(corpus)
# Column order follows the alphabetically sorted vocabulary.
vocab_columns = sorted(vectorizer.vocabulary_)
representation_df = pd.DataFrame(representation.toarray(), columns=vocab_columns)
representation_df
將測試語料庫中的句子轉換成 BoW 表徵
# Project the held-out sentences into the fitted BoW space (transform only —
# no refitting) and compare the classifier's predictions to the true labels.
test_corpus = ['The keyboard sat on the mat', 'The bird sat on the mat']
X_test = vectorizer.transform(test_corpus)
y_test = [0, 1]
print("Expected Results for (keyboard, bird): {}".format(y_test))
print("Actual Results for (keyboard, bird): {}".format(logistic.predict(X_test)))
構建並訓練模型
from keras.layers import Embedding, Input, Dense, Reshape
from keras.layers.merge import Dot
from keras.models import Model
from keras.optimizers import RMSprop
# Skip-gram style model: takes a (target, context) pair of word indices and
# outputs the probability that the two words co-occur.
target_word = Input((1,))
context_word = Input((1,))
# The embedding layer is a lookup table of shape
# (vocabulary_size + 1, EMBEDDING_SIZE); the +1 follows the exercise's
# instruction to add one to the vocabulary size. It is named so its trained
# weights can be retrieved via the layer name later.
embedding_layer = Embedding(vocabulary_size + 1, EMBEDDING_SIZE,
                            input_length=1, name='embedding_layer')
# Squashes the dot product into a similarity score in (0, 1).
output_layer = Dense(1, activation='sigmoid')
# Look up each word's vector and flatten the (1, EMBEDDING_SIZE) result.
target_embedding = Reshape((EMBEDDING_SIZE,))(embedding_layer(target_word))
context_embedding = Reshape((EMBEDDING_SIZE,))(embedding_layer(context_word))
# similarity = sigmoid(dot(target, context))
output = output_layer(Dot(axes=1)([target_embedding, context_embedding]))
# Wire the two inputs through to the sigmoid output and compile for training.
model = Model(inputs=[target_word, context_word], outputs=output)
optimizer = RMSprop(lr=0.0001, rho=0.99)
model.compile(loss='binary_crossentropy', optimizer=optimizer)
model.summary()
構建並訓練 CBOW 模型
# CBOW model: average the embeddings of the 2 * window_size context words,
# then predict the center word with a softmax over all num_classes words.
inputs = Input((window_size * 2,))
embedding_layer = Embedding(num_classes, EMBEDDING_SIZE,
                            input_length=2 * window_size, name='embedding_layer')
# Average the context-word embeddings along the word axis (axis=1).
mean_layer = Lambda(lambda v: K.mean(v, axis=1))
output_layer = Dense(num_classes, activation='softmax')
# context indices -> embeddings -> mean vector -> softmax over vocabulary
output = output_layer(mean_layer(embedding_layer(inputs)))
model = Model(inputs=[inputs], outputs=output)
optimizer = RMSprop(lr=0.1, rho=0.99)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
CBOW 模型構建完成