Unlike most word-embedding methods, GloVe's goal is to train word vectors and context vectors so that together they can reconstruct the co-occurrence matrix. The training code in GloVe is written in much the same style as the training code in word2vec, so with the earlier word2vec walkthrough as background it is easy to follow what GloVe does. GloVe trains on triples, and the triple data structure is the same as before.
typedef struct cooccur_rec {
int word1; // 1-based index of the first word
int word2; // 1-based index of the second (context) word
real val;  // their co-occurrence count X_ij
} CREC;
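These triples feed GloVe's weighted least-squares objective (from the GloVe paper): the dot product of a word vector and a context vector, plus the two words' biases, should match the log of their co-occurrence count:

$$ J = \sum_{X_{ij} > 0} f(X_{ij}) \left( w_i^\top \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij} \right)^2, \qquad f(x) = \begin{cases} (x / x_{\max})^{\alpha} & x < x_{\max} \\ 1 & \text{otherwise.} \end{cases} $$

This is exactly what the cost computation in glove_thread below evaluates, up to a factor of 1/2.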
Let us first look at the parameters GloVe trains. From initialize_parameters we can see that they are basically the same as in word2vec: one block of word vectors and one block of context vectors. The difference is that each GloVe vector carries one extra dimension for its bias. Here is the code.
void initialize_parameters() {
long long a, b;
vector_size++; // Temporarily increment to allocate space for bias: each vector gets one extra dimension
/* Allocate space for word vectors and context word vectors, and corresponding gradsq */
a = posix_memalign((void **)&W, 128, 2 * vocab_size * (vector_size + 1) * sizeof(real)); // Might perform better than malloc. The factor of 2 covers both the word vectors and the context vectors. Since vector_size was already incremented above, the extra +1 here looks like a mistake; it merely over-allocates one real per vector.
if (W == NULL) {
fprintf(stderr, "Error allocating memory for W\n");
exit(1);
}
a = posix_memalign((void **)&gradsq, 128, 2 * vocab_size * (vector_size + 1) * sizeof(real)); // Might perform better than malloc. GloVe does gradient descent with AdaGrad, so it also keeps an accumulated squared gradient for every parameter.
if (gradsq == NULL) {
fprintf(stderr, "Error allocating memory for gradsq\n");
exit(1);
}
for (b = 0; b < vector_size; b++) for (a = 0; a < 2 * vocab_size; a++) W[a * vector_size + b] = (rand() / (real)RAND_MAX - 0.5) / vector_size; // Initialize parameters with small random values
for (b = 0; b < vector_size; b++) for (a = 0; a < 2 * vocab_size; a++) gradsq[a * vector_size + b] = 1.0; // So initial value of eta is equal to initial learning rate
vector_size--; // Restore the true (bias-free) vector size
}
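Because of the bias slot, every vector stored in W has stride vector_size + 1, with the bias in the last position. The following helpers are not in glove.c; they are an illustrative sketch that mirrors the index arithmetic glove_thread uses below:

/* Illustrative only: how W is laid out. Word vectors occupy rows
 * 0 .. vocab_size-1, context vectors the next vocab_size rows;
 * each row holds vector_size values followed by one bias. */
static inline long long word_offset(long long i) { return i * (vector_size + 1); }
static inline long long ctx_offset(long long j)  { return (vocab_size + j) * (vector_size + 1); }
static inline long long bias_index(long long o)  { return o + vector_size; }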
Next is the train_glove function, which spawns multiple threads that each call glove_thread to do the training, the same pattern as in word2vec.
/* Train model */
int train_glove() {
long long a, file_size;
int save_params_return_code;
int b;
FILE *fin;
real total_cost = 0;
fprintf(stderr, "TRAINING MODEL\n");
fin = fopen(input_file, "rb"); // Open the shuffled triple (cooccurrence) file
if (fin == NULL) {fprintf(stderr,"Unable to open cooccurrence file %s.\n",input_file); return 1;}
fseeko(fin, 0, SEEK_END);
file_size = ftello(fin);
num_lines = file_size/(sizeof(CREC)); // Assuming the file isn't corrupt and consists only of CRECs; this gives the total number of triples
fclose(fin);
fprintf(stderr,"Read %lld lines.\n", num_lines);
if (verbose > 1) fprintf(stderr,"Initializing parameters...");
initialize_parameters();
if (verbose > 1) fprintf(stderr,"done.\n");
if (verbose > 0) fprintf(stderr,"vector size: %d\n", vector_size);
if (verbose > 0) fprintf(stderr,"vocab size: %lld\n", vocab_size);
if (verbose > 0) fprintf(stderr,"x_max: %lf\n", x_max);
if (verbose > 0) fprintf(stderr,"alpha: %lf\n", alpha);
pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); // One handle per worker thread
lines_per_thread = (long long *) malloc(num_threads * sizeof(long long)); // Number of triples each thread processes
time_t rawtime;
struct tm *info;
char time_buffer[80];
// Lock-free asynchronous SGD
for (b = 0; b < num_iter; b++) { // Unlike word2vec, the loop over training epochs sits on the outside
total_cost = 0;
for (a = 0; a < num_threads - 1; a++) lines_per_thread[a] = num_lines / num_threads; // How many triples each thread handles
lines_per_thread[a] = num_lines / num_threads + num_lines % num_threads; // The last thread also takes the remainder
long long *thread_ids = (long long*)malloc(sizeof(long long) * num_threads); // Identifies each thread
for (a = 0; a < num_threads; a++) thread_ids[a] = a;
for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, glove_thread, (void *)&thread_ids[a]); // Launch the worker threads
for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); // Wait for all threads to finish
for (a = 0; a < num_threads; a++) total_cost += cost[a];
free(thread_ids);
time(&rawtime);
info = localtime(&rawtime);
strftime(time_buffer,80,"%x - %I:%M.%S%p", info);
fprintf(stderr, "%s, iter: %03d, cost: %lf\n", time_buffer, b+1, total_cost/num_lines);
if (checkpoint_every > 0 && (b + 1) % checkpoint_every == 0) { // GloVe can also checkpoint the parameters every few epochs, which is why the epoch loop lives out here
fprintf(stderr," saving intermediate parameters for iter %03d...", b+1);
save_params_return_code = save_params(b+1);
if (save_params_return_code != 0)
return save_params_return_code;
fprintf(stderr,"done.\n");
}
}
free(pt);
free(lines_per_thread);
return save_params(0);
}
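Before moving on, the split of triples across threads is easy to sanity-check with a tiny stand-alone program (the values of num_lines and num_threads here are made up):

#include <stdio.h>

int main(void) {
    long long num_lines = 10, num_threads = 3; // hypothetical sizes
    for (long long id = 0; id < num_threads; id++) {
        // Every thread gets num_lines / num_threads triples; the last one also takes the remainder.
        long long n = num_lines / num_threads
                    + (id == num_threads - 1 ? num_lines % num_threads : 0);
        printf("thread %lld: starts at record %lld, processes %lld triples\n",
               id, num_lines / num_threads * id, n);
    }
    return 0; // prints start records 0, 3, 6 with counts 3, 3, 4
}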
Next let us walk through glove_thread in detail; this is the core of GloVe training.
/* Train the GloVe model */
void *glove_thread(void *vid) {
long long a, b ,l1, l2;
long long id = *(long long*)vid;
CREC cr;
real diff, fdiff, temp1, temp2;
FILE *fin;
fin = fopen(input_file, "rb");
fseeko(fin, (num_lines / num_threads * id) * (sizeof(CREC)), SEEK_SET); // Threads spaced roughly equally throughout file: seek to where this thread's share of triples begins
cost[id] = 0;
real* W_updates1 = (real*)malloc(vector_size * sizeof(real));
real* W_updates2 = (real*)malloc(vector_size * sizeof(real));
for (a = 0; a < lines_per_thread[id]; a++) { // Loop over every triple assigned to this thread
fread(&cr, sizeof(CREC), 1, fin); // Read one triple
if (feof(fin)) break;
if (cr.word1 < 1 || cr.word2 < 1) { continue; } // Word ids are 1-based; skip invalid records
/* Get location of words in W & gradsq */
l1 = (cr.word1 - 1LL) * (vector_size + 1); // cr word indices start at 1: locate the two words of this triple in W
l2 = ((cr.word2 - 1LL) + vocab_size) * (vector_size + 1); // shift by vocab_size to get separate vectors for context words
/* Calculate cost, save diff for gradients */
diff = 0;
for (b = 0; b < vector_size; b++) diff += W[b + l1] * W[b + l2]; // dot product of word and context word vector
diff += W[vector_size + l1] + W[vector_size + l2] - log(cr.val); // add separate bias for each word; the total should be as close as possible to log(val), the log of the co-occurrence count
fdiff = (cr.val > x_max) ? diff : pow(cr.val / x_max, alpha) * diff; // multiply weighting function (f) with diff: triples are not equally important, and those with higher val carry more weight (capped at x_max)
// Check for NaN and inf() in the diffs.
if (isnan(diff) || isnan(fdiff) || isinf(diff) || isinf(fdiff)) {
fprintf(stderr,"Caught NaN in diff for kdiff for thread. Skipping update");
continue;
}
cost[id] += 0.5 * fdiff * diff; // weighted squared error
/* Adaptive gradient updates */
fdiff *= eta; // for ease in calculating gradient
real W_updates1_sum = 0;
real W_updates2_sum = 0;
for (b = 0; b < vector_size; b++) { // Compute the updates for the word vector and the context vector
// learning rate times gradient for word vectors
temp1 = fdiff * W[b + l2];
temp2 = fdiff * W[b + l1];
// adaptive updates (AdaGrad)
W_updates1[b] = temp1 / sqrt(gradsq[b + l1]); // Update for the word vector, per the AdaGrad formula
W_updates2[b] = temp2 / sqrt(gradsq[b + l2]); // Update for the context vector
W_updates1_sum += W_updates1[b];
W_updates2_sum += W_updates2[b];
gradsq[b + l1] += temp1 * temp1; // Accumulate squared gradients; as the accumulator grows, the effective learning rate shrinks, a mechanism similar in spirit to word2vec's decaying rate
gradsq[b + l2] += temp2 * temp2;
}
if (!isnan(W_updates1_sum) && !isinf(W_updates1_sum) && !isnan(W_updates2_sum) && !isinf(W_updates2_sum)) {
for (b = 0; b < vector_size; b++) {
W[b + l1] -= W_updates1[b]; // Apply the updates
W[b + l2] -= W_updates2[b];
}
}
// updates for bias terms
W[vector_size + l1] -= check_nan(fdiff / sqrt(gradsq[vector_size + l1]));
W[vector_size + l2] -= check_nan(fdiff / sqrt(gradsq[vector_size + l2]));
fdiff *= fdiff; // Square the bias gradient for the AdaGrad accumulators below
gradsq[vector_size + l1] += fdiff;
gradsq[vector_size + l2] += fdiff;
}
free(W_updates1);
free(W_updates2);
fclose(fin);
pthread_exit(NULL);
}
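To tie the code back to the objective: for one triple, glove_thread computes

$$ d = w_i^\top \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij} \quad (\texttt{diff}), \qquad \text{cost} \mathrel{+}= \tfrac{1}{2} f(X_{ij})\, d^2 \quad (\texttt{0.5 * fdiff * diff}). $$

The gradient of this term with respect to $w_i$ is $f(X_{ij})\, d\, \tilde{w}_j$ (temp1 in the code, with the learning rate $\eta$ already folded into fdiff), and AdaGrad divides each coordinate by the root of its accumulated squared gradients:

$$ w_i \leftarrow w_i - \frac{\eta\, f(X_{ij})\, d\, \tilde{w}_j}{\sqrt{G_{w_i}}}, \qquad G_{w_i} \leftarrow G_{w_i} + \left( \eta\, f(X_{ij})\, d\, \tilde{w}_j \right)^2, $$

with symmetric updates for $\tilde{w}_j$ and the two biases. (A quirk of this implementation: because $\eta$ is folded into fdiff, it also ends up inside the accumulator $G$.)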
That completes GloVe training; the code really is highly similar to word2vec's. word2vec updates the word and context vectors once per word pair, whereas GloVe updates them once per triple. Finally, let us look at how GloVe saves the parameters the model has learned. GloVe can save not only the word vectors but also the context vectors, as well as each word's bias. The code is fairly long, but the logic is simple.
int save_params(int nb_iter) {
/*
* nb_iter is the number of iteration (= a full pass through the cooccurrence matrix).
* nb_iter > 0 => checkpointing the intermediate parameters, so nb_iter is in the filename of output file.
* else => saving the final parameters, so nb_iter is ignored.
*/
// The vocabulary, which maps words to ids, is not in memory yet; it will be read from vocab_file below
long long a, b;
char format[20];
char output_file[MAX_STRING_LENGTH], output_file_gsq[MAX_STRING_LENGTH];
char *word = malloc(sizeof(char) * MAX_STRING_LENGTH + 1);
FILE *fid, *fout, *fgs;
if (use_binary > 0) { // Save parameters in binary file
if (nb_iter <= 0)
sprintf(output_file,"%s.bin",save_W_file);
else
sprintf(output_file,"%s.%03d.bin",save_W_file,nb_iter);
fout = fopen(output_file,"wb");
if (fout == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_W_file); return 1;}
for (a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&W[a], sizeof(real), 1,fout); // In binary mode just dump the raw parameters; there is no need to read the words
fclose(fout);
if (save_gradsq > 0) { // The accumulated squared gradients can be saved too
if (nb_iter <= 0)
sprintf(output_file_gsq,"%s.bin",save_gradsq_file);
else
sprintf(output_file_gsq,"%s.%03d.bin",save_gradsq_file,nb_iter);
fgs = fopen(output_file_gsq,"wb");
if (fgs == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_gradsq_file); return 1;}
for (a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&gradsq[a], sizeof(real), 1,fgs);
fclose(fgs);
}
}
if (use_binary != 1) { // Save parameters in text file; text mode supports several output variants via model
if (nb_iter <= 0)
sprintf(output_file,"%s.txt",save_W_file);
else
sprintf(output_file,"%s.%03d.txt",save_W_file,nb_iter);
if (save_gradsq > 0) {
if (nb_iter <= 0)
sprintf(output_file_gsq,"%s.txt",save_gradsq_file);
else
sprintf(output_file_gsq,"%s.%03d.txt",save_gradsq_file,nb_iter);
fgs = fopen(output_file_gsq,"wb");
if (fgs == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_gradsq_file); return 1;}
}
fout = fopen(output_file,"wb");
if (fout == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_W_file); return 1;}
fid = fopen(vocab_file, "r"); // First, open the vocabulary file
sprintf(format,"%%%ds",MAX_STRING_LENGTH);
if (fid == NULL) {fprintf(stderr, "Unable to open file %s.\n",vocab_file); return 1;}
if (write_header) fprintf(fout, "%lld %d\n", vocab_size, vector_size); // Conventionally, the first line of a vector file holds the vocabulary size and the vector dimensionality
for (a = 0; a < vocab_size; a++) { // Walk the vocabulary: read a word, write that word, then write its vector(s)
if (fscanf(fid,format,word) == 0) return 1;
// input vocab cannot contain special <unk> keyword
if (strcmp(word, "<unk>") == 0) return 1;
fprintf(fout, "%s",word);//每行以單詞開頭
if (model == 0) { // Save all parameters (including bias): word vector, context vector, and both biases. This no longer matches the vector_size written in the header line above, which is arguably a bug.
for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]);
for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[(vocab_size + a) * (vector_size + 1) + b]);
}
if (model == 1) // Save only "word" vectors (without bias)
for (b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]);
if (model == 2) // Save "word + context word" vectors, i.e. their elementwise sum (without bias)
for (b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b] + W[(vocab_size + a) * (vector_size + 1) + b]);
fprintf(fout,"\n");
if (save_gradsq > 0) { // Save gradsq
fprintf(fgs, "%s",word);
for (b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[a * (vector_size + 1) + b]);
for (b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[(vocab_size + a) * (vector_size + 1) + b]);
fprintf(fgs,"\n");
}
if (fscanf(fid,format,word) == 0) return 1; // Eat irrelevant frequency entry: the vocab file also stores each word's count, which is read but not used here
}
if (use_unk_vec) { // What follows handles OOV words by averaging the vectors of the rarest words into an <unk> vector; it is of minor importance
real* unk_vec = (real*)calloc((vector_size + 1), sizeof(real));
real* unk_context = (real*)calloc((vector_size + 1), sizeof(real));
word = "<unk>";
int num_rare_words = vocab_size < 100 ? vocab_size : 100;
for (a = vocab_size - num_rare_words; a < vocab_size; a++) {
for (b = 0; b < (vector_size + 1); b++) {
unk_vec[b] += W[a * (vector_size + 1) + b] / num_rare_words;
unk_context[b] += W[(vocab_size + a) * (vector_size + 1) + b] / num_rare_words;
}
}
fprintf(fout, "%s",word);
if (model == 0) { // Save all parameters (including bias)
for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", unk_vec[b]);
for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", unk_context[b]);
}
if (model == 1) // Save only "word" vectors (without bias)
for (b = 0; b < vector_size; b++) fprintf(fout," %lf", unk_vec[b]);
if (model == 2) // Save "word + context word" vectors (without bias)
for (b = 0; b < vector_size; b++) fprintf(fout," %lf", unk_vec[b] + unk_context[b]);
fprintf(fout,"\n");
free(unk_vec);
free(unk_context);
}
fclose(fid);
fclose(fout);
if (save_gradsq > 0) fclose(fgs);
}
return 0;
}
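To close the loop, here is a minimal sketch of reading the text output back. It assumes write_header was on and model = 1 or 2, so each line is a word followed by vector_size values; the file name "vectors.txt" is hypothetical:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    FILE *f = fopen("vectors.txt", "r"); // hypothetical output file
    if (f == NULL) { fprintf(stderr, "Unable to open vectors.txt\n"); return 1; }
    long long vocab_size; int vector_size;
    if (fscanf(f, "%lld %d", &vocab_size, &vector_size) != 2) return 1; // header line
    char word[1024];
    double *vec = malloc(vector_size * sizeof(double));
    for (long long a = 0; a < vocab_size; a++) {
        if (fscanf(f, "%1023s", word) != 1) break;  // the word itself
        for (int b = 0; b < vector_size; b++)       // then its vector
            if (fscanf(f, "%lf", &vec[b]) != 1) { free(vec); fclose(f); return 1; }
        // ... use word / vec here ...
    }
    free(vec);
    fclose(f);
    return 0;
}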