-
word2vec介紹
Word2vec,是一羣用來產生詞向量的相關模型。這些模型爲淺而雙層的神經網絡,用來訓練以重新建構語言學之詞文本。網絡以詞表現,並且需猜測相鄰位置的輸入詞,在word2vec中詞袋模型假設下,詞的順序是不重要的。訓練完成之後,word2vec模型可用來映射每個詞到一個向量,可用來表示詞對詞之間的關係,該向量爲神經網絡之隱藏層。
-
CBOW算法:CBOW是已知當前詞的上下文,來預測當前詞。
-
skip-gram算法:已知當前詞的情況下,預測其上下文。
-
優缺點
-
skip-gram比Cbow準確率高,能更好的處理生僻字(即出現頻率低的字)。因爲在計算時,Cbow會將context word加起來,所以在遇到生僻詞時預測效果將會大大降低,而skip-gram則會預測生僻字的使用環境
-
Cbow比skip-gram訓練快
-
-
代碼執行
- 下載源碼,在TensorFlow-Examples\tensorflow_v1\examples\2_BasicModels\word2vec.py路徑下,會找到word2vec.py文件。
""" Word2Vec. Implement Word2Vec algorithm to compute vector representations of words. This example is using a small chunk of Wikipedia articles to train from. References: - Mikolov, Tomas et al. "Efficient Estimation of Word Representations in Vector Space.", 2013. Links: - [Word2Vec] https://arxiv.org/pdf/1301.3781.pdf Author: Aymeric Damien Project: https://github.com/aymericdamien/TensorFlow-Examples/ """ from __future__ import division, print_function, absolute_import import collections import os import random import urllib import zipfile import numpy as np import tensorflow as tf # Training Parameters learning_rate = 0.1 batch_size = 128 num_steps = 3000000 display_step = 10000 eval_step = 200000 # Evaluation Parameters#評價參數 eval_words = [b'five', b'of', b'going', b'hardware', b'american', b'britain'] # Word2Vec Parameters embedding_size = 200 # Dimension of the embedding vector max_vocabulary_size = 50000 # Total number of different words in the vocabulary min_occurrence = 10 # Remove all words that does not appears at least n times skip_window = 3 # How many words to consider left and right num_skips = 2 # How many times to reuse an input to generate a label num_sampled = 64 # Number of negative examples to sample # Download a small chunk of Wikipedia articles collection url = 'http://mattmahoney.net/dc/text8.zip' data_path = 'text8.zip' if not os.path.exists(data_path): print("Downloading the dataset... (It may take some time)") filename, _ = urllib.request.urlretrieve(url, data_path) print("Done!") # Unzip the dataset file. Text has already been processed with zipfile.ZipFile(data_path) as f: text_words = f.read(f.namelist()[0]).lower().split() # Build the dictionary and replace rare words with UNK token #構建字典,並用UNK令牌替換稀有單詞 count = [('UNK', -1)] # Retrieve the most common words#檢索最常用的單詞 count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1)) # Remove samples with less than 'min_occurrence' occurrences#刪除小於'min_occurrence'的樣本 #刪除出現次數小於'min_occurrence'的樣本 for i in range(len(count) - 1, -1, -1): if count[i][1] < min_occurrence: count.pop(i) else: # The collection is ordered, so stop when 'min_occurrence' is reached break # Compute the vocabulary size vocabulary_size = len(count) # Assign an id to each word word2id = dict() for i, (word, _)in enumerate(count): word2id[word] = i data = list() unk_count = 0 for word in text_words: # Retrieve a word id, or assign it index 0 ('UNK') if not in dictionary index = word2id.get(word, 0) if index == 0: unk_count += 1 data.append(index) count[0] = ('UNK', unk_count) id2word = dict(zip(word2id.values(), word2id.keys())) print("Words count:", len(text_words)) print("Unique words:", len(set(text_words))) print("Vocabulary size:", vocabulary_size) print("Most common words:", count[:10]) data_index = 0 # Generate training batch for the skip-gram model def next_batch(batch_size, num_skips, skip_window): global data_index assert batch_size % num_skips == 0 assert num_skips <= 2 * skip_window batch = np.ndarray(shape=(batch_size), dtype=np.int32) labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) # get window size (words left and right + current one) span = 2 * skip_window + 1 buffer = collections.deque(maxlen=span) if data_index + span > len(data): data_index = 0 buffer.extend(data[data_index:data_index + span]) data_index += span for i in range(batch_size // num_skips): context_words = [w for w in range(span) if w != skip_window] words_to_use = random.sample(context_words, num_skips) for j, context_word in enumerate(words_to_use): batch[i * num_skips + j] = buffer[skip_window] labels[i * num_skips + j, 0] = buffer[context_word] if data_index == len(data): buffer.extend(data[0:span]) data_index = span else: buffer.append(data[data_index]) data_index += 1 # Backtrack a little bit to avoid skipping words in the end of a batch data_index = (data_index + len(data) - span) % len(data) return batch, labels # Input data X = tf.placeholder(tf.int32, shape=[None]) # Input label Y = tf.placeholder(tf.int32, shape=[None, 1]) # Ensure the following ops & var are assigned on CPU # (some ops are not compatible on GPU) with tf.device('/cpu:0'): # Create the embedding variable (each row represent a word embedding vector) embedding = tf.Variable(tf.random_normal([vocabulary_size, embedding_size])) # Lookup the corresponding embedding vectors for each sample in X X_embed = tf.nn.embedding_lookup(embedding, X) # Construct the variables for the NCE loss nce_weights = tf.Variable(tf.random_normal([vocabulary_size, embedding_size])) nce_biases = tf.Variable(tf.zeros([vocabulary_size])) # Compute the average NCE loss for the batch loss_op = tf.reduce_mean( tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, labels=Y, inputs=X_embed, num_sampled=num_sampled, num_classes=vocabulary_size)) # Define the optimizer optimizer = tf.train.GradientDescentOptimizer(learning_rate) train_op = optimizer.minimize(loss_op) # Evaluation # Compute the cosine similarity between input data embedding and every embedding vectors X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed))) embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True)) cosine_sim_op = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True) # Initialize the variables (i.e. assign their default value) init = tf.global_variables_initializer() with tf.Session() as sess: # Run the initializer sess.run(init) # Testing data x_test = np.array([word2id[w] for w in eval_words]) average_loss = 0 for step in range(1, num_steps + 1): # Get a new batch of data batch_x, batch_y = next_batch(batch_size, num_skips, skip_window) # Run training op _, loss = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y}) average_loss += loss if step % display_step == 0 or step == 1: if step > 1: average_loss /= display_step print("Step " + str(step) + ", Average Loss= " + \ "{:.4f}".format(average_loss)) average_loss = 0 # Evaluation if step % eval_step == 0 or step == 1: print("Evaluation...") sim = sess.run(cosine_sim_op, feed_dict={X: x_test}) for i in range(len(eval_words)): top_k = 8 # number of nearest neighbors nearest = (-sim[i, :]).argsort()[1:top_k + 1] log_str = '"%s" nearest neighbors:' % eval_words[i] for k in range(top_k): log_str = '%s %s,' % (log_str, id2word[nearest[k]]) print(log_str)
-
運行結果
word2Vec實戰解讀
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.