Word Vector Source Code Analysis: (3.5) GloVe Source Code — glove

Unlike most word-vector models, GloVe's goal is to train word vectors and context vectors so that together they reconstruct the co-occurrence matrix. The training code in GloVe reads very much like the training code in word2vec, so with the earlier word2vec walkthroughs it is easy to follow what GloVe does. GloVe trains on triples (word, context word, co-occurrence count), stored in the same data structure as before.

typedef struct cooccur_rec {
    int word1;
    int word2;
    real val;
} CREC;
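For reference, the quantity that the training code below drives toward zero is the per-pair error of the weighted least-squares objective from the GloVe paper; the variables diff and fdiff in glove_thread correspond to $d_{ij}$ and $f(X_{ij})\,d_{ij}$, and cost accumulates $\tfrac{1}{2} f(X_{ij})\, d_{ij}^2$:

$$J = \sum_{X_{ij} > 0} f(X_{ij}) \left( w_i^{\top} \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij} \right)^2, \qquad f(x) = \begin{cases} (x / x_{\max})^{\alpha} & x \le x_{\max} \\ 1 & \text{otherwise} \end{cases}$$

where $d_{ij} = w_i^{\top} \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij}$.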

First, the parameters GloVe trains. Looking at initialize_parameters, they are essentially the same as in word2vec: one set of word vectors and one set of context vectors. The only difference is that GloVe adds one extra dimension per vector for a bias term. The code is below.

void initialize_parameters() {
    long long a, b;
    vector_size++; // Temporarily increment to allocate space for bias // one extra dimension per vector for the bias


    /* Allocate space for word vectors and context word vectors, and correspodning gradsq */
    a = posix_memalign((void **)&W, 128, 2 * vocab_size * (vector_size + 1) * sizeof(real)); // Might perform better than malloc // word vectors and context vectors; the factor of 2 covers both. Note vector_size was already incremented above, so the extra +1 here is probably unnecessary and merely over-allocates
    if (W == NULL) {
        fprintf(stderr, "Error allocating memory for W\n");
        exit(1);
    }
    a = posix_memalign((void **)&gradsq, 128, 2 * vocab_size * (vector_size + 1) * sizeof(real)); // Might perform better than malloc // GloVe uses AdaGrad for gradient descent, so an accumulated squared gradient is also kept for every parameter
    if (gradsq == NULL) {
        fprintf(stderr, "Error allocating memory for gradsq\n");
        exit(1);
    }
    for (b = 0; b < vector_size; b++) for (a = 0; a < 2 * vocab_size; a++) W[a * vector_size + b] = (rand() / (real)RAND_MAX - 0.5) / vector_size; // initialize the parameters
    for (b = 0; b < vector_size; b++) for (a = 0; a < 2 * vocab_size; a++) gradsq[a * vector_size + b] = 1.0; // So initial value of eta is equal to initial learning rate
    vector_size--;
}
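To make the memory layout concrete, here is a minimal sketch (not part of glove.c; the helper names and the real typedef are my own assumptions) of how a 1-based word id from a CREC maps into the flat W array: the word vector occupies one row, the context vector occupies the row shifted by vocab_size, and each row stores vector_size values followed by one bias slot.

typedef double real; /* glove.c typedefs real itself; double is assumed here only for this sketch */

/* Row holding a word's word vector; its bias sits at index vector_size of the row. */
real *word_row(real *W, long long word_id, long long vector_size) {
    return W + (word_id - 1LL) * (vector_size + 1);
}

/* Row holding a word's context vector, shifted past all vocab_size word-vector rows. */
real *context_row(real *W, long long word_id, long long vocab_size, long long vector_size) {
    return W + ((word_id - 1LL) + vocab_size) * (vector_size + 1);
}

These correspond exactly to the offsets l1 and l2 computed in glove_thread below.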

Next is the train_glove function, which spawns multiple threads running glove_thread to do the training, following the same pattern as word2vec.

/* Train model */
int train_glove() {
    long long a, file_size;
    int save_params_return_code;
    int b;
    FILE *fin;
    real total_cost = 0;


    fprintf(stderr, "TRAINING MODEL\n");
    
    fin = fopen(input_file, "rb"); // open the shuffled cooccurrence (triple) file
    if (fin == NULL) {fprintf(stderr,"Unable to open cooccurrence file %s.\n",input_file); return 1;}
    fseeko(fin, 0, SEEK_END);
    file_size = ftello(fin);
    num_lines = file_size/(sizeof(CREC)); // Assuming the file isn't corrupt and consists only of CREC's // total number of triples
    fclose(fin);
    fprintf(stderr,"Read %lld lines.\n", num_lines);
    if (verbose > 1) fprintf(stderr,"Initializing parameters...");
    initialize_parameters();
    if (verbose > 1) fprintf(stderr,"done.\n");
    if (verbose > 0) fprintf(stderr,"vector size: %d\n", vector_size);
    if (verbose > 0) fprintf(stderr,"vocab size: %lld\n", vocab_size);
    if (verbose > 0) fprintf(stderr,"x_max: %lf\n", x_max);
    if (verbose > 0) fprintf(stderr,"alpha: %lf\n", alpha);
    pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); // multithreading
    lines_per_thread = (long long *) malloc(num_threads * sizeof(long long)); // number of triples each thread will process
    
    time_t rawtime;
    struct tm *info;
    char time_buffer[80];
    // Lock-free asynchronous SGD
    for (b = 0; b < num_iter; b++) {// unlike word2vec, the loop over epochs is on the outside
        total_cost = 0;
        for (a = 0; a < num_threads - 1; a++) lines_per_thread[a] = num_lines / num_threads; // how many triples each thread processes
        lines_per_thread[a] = num_lines / num_threads + num_lines % num_threads; // the last thread also takes the remainder
        long long *thread_ids = (long long*)malloc(sizeof(long long) * num_threads); // identifies which thread is which
        for (a = 0; a < num_threads; a++) thread_ids[a] = a;
        for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, glove_thread, (void *)&thread_ids[a]); // launch the threads
        for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); // wait for the threads to finish
        for (a = 0; a < num_threads; a++) total_cost += cost[a];
        free(thread_ids);


        time(&rawtime);
        info = localtime(&rawtime);
        strftime(time_buffer,80,"%x - %I:%M.%S%p", info);
        fprintf(stderr, "%s, iter: %03d, cost: %lf\n", time_buffer,  b+1, total_cost/num_lines);


        if (checkpoint_every > 0 && (b + 1) % checkpoint_every == 0) {// GloVe can also dump intermediate parameters every checkpoint_every epochs, which is why the epoch loop is on the outside
            fprintf(stderr,"    saving itermediate parameters for iter %03d...", b+1);
            save_params_return_code = save_params(b+1);
            if (save_params_return_code != 0)
                return save_params_return_code;
            fprintf(stderr,"done.\n");
        }


    }
    free(pt);
    free(lines_per_thread);
    return save_params(0);
}
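A quick worked example of the split above: with num_lines = 10 and num_threads = 3, lines_per_thread becomes {3, 3, 4}; every thread gets num_lines / num_threads triples, and the last thread additionally absorbs the remainder num_lines % num_threads.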

Now let's go through glove_thread in detail; this is the core of GloVe training.

/* Train the GloVe model */
void *glove_thread(void *vid) {
    long long a, b ,l1, l2;
    long long id = *(long long*)vid;
    CREC cr;
    real diff, fdiff, temp1, temp2;
    FILE *fin;
    fin = fopen(input_file, "rb");
    fseeko(fin, (num_lines / num_threads * id) * (sizeof(CREC)), SEEK_SET); //Threads spaced roughly equally throughout file // first, seek to where this thread's share of triples starts
    cost[id] = 0;
    
    real* W_updates1 = (real*)malloc(vector_size * sizeof(real));
    real* W_updates2 = (real*)malloc(vector_size * sizeof(real));
    for (a = 0; a < lines_per_thread[id]; a++) {// iterate over all triples this thread is responsible for
        fread(&cr, sizeof(CREC), 1, fin); // read one triple
        if (feof(fin)) break;
        if (cr.word1 < 1 || cr.word2 < 1) { continue; }
        
        /* Get location of words in W & gradsq */
        l1 = (cr.word1 - 1LL) * (vector_size + 1); // cr word indices start at 1 // locate the rows of the two words in this triple
        l2 = ((cr.word2 - 1LL) + vocab_size) * (vector_size + 1); // shift by vocab_size to get separate vectors for context words
        
        /* Calculate cost, save diff for gradients */
        diff = 0;
        for (b = 0; b < vector_size; b++) diff += W[b + l1] * W[b + l2]; // dot product of word and context word vector
        diff += W[vector_size + l1] + W[vector_size + l2] - log(cr.val); // add separate bias for each word // plus the two words' biases; the sum should be as close as possible to the log of the co-occurrence count val
        fdiff = (cr.val > x_max) ? diff : pow(cr.val / x_max, alpha) * diff; // multiply weighting function (f) with diff // triples are not equally important: those with larger val get more weight


        // Check for NaN and inf() in the diffs.
        if (isnan(diff) || isnan(fdiff) || isinf(diff) || isinf(fdiff)) {
            fprintf(stderr,"Caught NaN in diff for kdiff for thread. Skipping update");
            continue;
        }


        cost[id] += 0.5 * fdiff * diff; // weighted squared error
        
        /* Adaptive gradient updates */
        fdiff *= eta; // for ease in calculating gradient
        real W_updates1_sum = 0;
        real W_updates2_sum = 0;
        for (b = 0; b < vector_size; b++) {// update the word vector and the context vector
            // learning rate times gradient for word vectors
            temp1 = fdiff * W[b + l2];
            temp2 = fdiff * W[b + l1];
            // adaptive updates // AdaGrad step
            W_updates1[b] = temp1 / sqrt(gradsq[b + l1]); // amount to adjust the word vector by, per the AdaGrad formula
            W_updates2[b] = temp2 / sqrt(gradsq[b + l2]); // amount to adjust the context vector by
            W_updates1_sum += W_updates1[b];
            W_updates2_sum += W_updates2[b];
            gradsq[b + l1] += temp1 * temp1; // accumulate squared gradients; as they grow, the effective learning rate shrinks, similar in spirit to word2vec's decaying learning rate
            gradsq[b + l2] += temp2 * temp2;
        }
        }
        if (!isnan(W_updates1_sum) && !isinf(W_updates1_sum) && !isnan(W_updates2_sum) && !isinf(W_updates2_sum)) {
            for (b = 0; b < vector_size; b++) {
                W[b + l1] -= W_updates1[b]; // apply the updates
                W[b + l2] -= W_updates2[b];
            }
        }


        // updates for bias terms // update the bias values
        W[vector_size + l1] -= check_nan(fdiff / sqrt(gradsq[vector_size + l1]));
        W[vector_size + l2] -= check_nan(fdiff / sqrt(gradsq[vector_size + l2]));
        fdiff *= fdiff;
        gradsq[vector_size + l1] += fdiff;
        gradsq[vector_size + l2] += fdiff;
        
    }
    free(W_updates1);
    free(W_updates2);
    
    fclose(fin);
    pthread_exit(NULL);
}
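To summarize the update in formulas: let $d = w_i^{\top} \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij}$ and, as in the code after the fdiff *= eta line, $g = \eta\, f(X_{ij})\, d$. The loop then performs element-wise AdaGrad steps

$$w_i \leftarrow w_i - \frac{g\, \tilde{w}_j}{\sqrt{G_{w_i}}}, \qquad \tilde{w}_j \leftarrow \tilde{w}_j - \frac{g\, w_i}{\sqrt{G_{\tilde{w}_j}}}, \qquad b_i \leftarrow b_i - \frac{g}{\sqrt{G_{b_i}}},$$

with the accumulators updated as $G_{w_i} \leftarrow G_{w_i} + (g\, \tilde{w}_j)^2$ and $G_{b_i} \leftarrow G_{b_i} + g^2$, and symmetrically for the context side. Because fdiff has already been scaled by $\eta$, the values accumulated in gradsq also carry the $\eta^2$ factor, a small quirk of this implementation compared with textbook AdaGrad.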

That completes GloVe training. The code is indeed highly similar to word2vec: word2vec updates the word and context vectors once per word pair, while GloVe updates them once per triple. Finally, let's look at how glove saves the learned parameters. GloVe can save not only the word vectors but also the context vectors, as well as each word's bias. The code is fairly long but the logic is simple.

int save_params(int nb_iter) {
    /*
     * nb_iter is the number of iteration (= a full pass through the cooccurrence matrix).
     *   nb_iter > 0 => checkpointing the intermediate parameters, so nb_iter is in the filename of output file.
     *   else        => saving the final paramters, so nb_iter is ignored.
     */

    // At this point the vocabulary is not in memory yet; the vocab file is what maps words to ids
    long long a, b;
    char format[20];
    char output_file[MAX_STRING_LENGTH], output_file_gsq[MAX_STRING_LENGTH];
    char *word = malloc(sizeof(char) * MAX_STRING_LENGTH + 1);
    FILE *fid, *fout, *fgs;
    
    if (use_binary > 0) { // Save parameters in binary file // binary output
        if (nb_iter <= 0)
            sprintf(output_file,"%s.bin",save_W_file);
        else
            sprintf(output_file,"%s.%03d.bin",save_W_file,nb_iter);


        fout = fopen(output_file,"wb");
        if (fout == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_W_file); return 1;}
        for (a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&W[a], sizeof(real), 1,fout); // in the binary case, just dump the parameter array; the words themselves are not needed
        fclose(fout);
        if (save_gradsq > 0) {// the accumulated squared gradients can be saved as well
            if (nb_iter <= 0)
                sprintf(output_file_gsq,"%s.bin",save_gradsq_file);
            else
                sprintf(output_file_gsq,"%s.%03d.bin",save_gradsq_file,nb_iter);


            fgs = fopen(output_file_gsq,"wb");
            if (fgs == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_gradsq_file); return 1;}
            for (a = 0; a < 2 * (long long)vocab_size * (vector_size + 1); a++) fwrite(&gradsq[a], sizeof(real), 1,fgs);
            fclose(fgs);
        }
    }
    if (use_binary != 1) { // Save parameters in text file // in the text case, several output variants are supported
        if (nb_iter <= 0)
            sprintf(output_file,"%s.txt",save_W_file);
        else
            sprintf(output_file,"%s.%03d.txt",save_W_file,nb_iter);
        if (save_gradsq > 0) {
            if (nb_iter <= 0)
                sprintf(output_file_gsq,"%s.txt",save_gradsq_file);
            else
                sprintf(output_file_gsq,"%s.%03d.txt",save_gradsq_file,nb_iter);


            fgs = fopen(output_file_gsq,"wb");
            if (fgs == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_gradsq_file); return 1;}
        }
        fout = fopen(output_file,"wb");
        if (fout == NULL) {fprintf(stderr, "Unable to open file %s.\n",save_W_file); return 1;}
        fid = fopen(vocab_file, "r"); // first, read in the vocabulary
        sprintf(format,"%%%ds",MAX_STRING_LENGTH);
        if (fid == NULL) {fprintf(stderr, "Unable to open file %s.\n",vocab_file); return 1;}
        if (write_header) fprintf(fout, "%ld %d\n", vocab_size, vector_size); // word-vector files usually start with a line giving the vocabulary size and the vector dimension
        for (a = 0; a < vocab_size; a++) {// walk the vocabulary: read a word, write the word, then write its vector
            if (fscanf(fid,format,word) == 0) return 1;
            // input vocab cannot contain special <unk> keyword
            if (strcmp(word, "<unk>") == 0) return 1;
            fprintf(fout, "%s",word); // each line starts with the word
            if (model == 0) { // Save all parameters (including bias) // save word vector, context vector, and the biases; this no longer matches the vector_size written in the header line above, which is arguably a bug
                for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]);
                for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", W[(vocab_size + a) * (vector_size + 1) + b]);
            }
            if (model == 1) // Save only "word" vectors (without bias) // save only the word vector, no bias
                for (b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b]);
            if (model == 2) // Save "word + context word" vectors (without bias) // save the sum of the word vector and the context vector
                for (b = 0; b < vector_size; b++) fprintf(fout," %lf", W[a * (vector_size + 1) + b] + W[(vocab_size + a) * (vector_size + 1) + b]);
            fprintf(fout,"\n");
            if (save_gradsq > 0) { // Save gradsq
                fprintf(fgs, "%s",word);
                for (b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[a * (vector_size + 1) + b]);
                for (b = 0; b < (vector_size + 1); b++) fprintf(fgs," %lf", gradsq[(vocab_size + a) * (vector_size + 1) + b]);
                fprintf(fgs,"\n");
            }
            if (fscanf(fid,format,word) == 0) return 1; // Eat irrelevant frequency entry // the vocab file also contains each word's frequency, which is not needed here, so it is read and discarded
        }


        if (use_unk_vec) {// what follows builds an <unk> vector for out-of-vocabulary words by averaging the vectors of the rarest words; a minor detail
            real* unk_vec = (real*)calloc((vector_size + 1), sizeof(real));
            real* unk_context = (real*)calloc((vector_size + 1), sizeof(real));
            word = "<unk>";


            int num_rare_words = vocab_size < 100 ? vocab_size : 100;


            for (a = vocab_size - num_rare_words; a < vocab_size; a++) {
                for (b = 0; b < (vector_size + 1); b++) {
                    unk_vec[b] += W[a * (vector_size + 1) + b] / num_rare_words;
                    unk_context[b] += W[(vocab_size + a) * (vector_size + 1) + b] / num_rare_words;
                }
            }


            fprintf(fout, "%s",word);
            if (model == 0) { // Save all parameters (including bias)
                for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", unk_vec[b]);
                for (b = 0; b < (vector_size + 1); b++) fprintf(fout," %lf", unk_context[b]);
            }
            if (model == 1) // Save only "word" vectors (without bias)
                for (b = 0; b < vector_size; b++) fprintf(fout," %lf", unk_vec[b]);
            if (model == 2) // Save "word + context word" vectors (without bias)
                for (b = 0; b < vector_size; b++) fprintf(fout," %lf", unk_vec[b] + unk_context[b]);
            fprintf(fout,"\n");


            free(unk_vec);
            free(unk_context);
        }


        fclose(fid);
        fclose(fout);
        if (save_gradsq > 0) fclose(fgs);
    }
    return 0;
}
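As a schematic (placeholders, not actual output), with write_header enabled and model == 1 the text file produced above looks like a standard word-vector file: a header line with the vocabulary size and the dimension, then one word per line followed by its vector_size values.

vocab_size vector_size
word_1 v_1 v_2 ... v_vector_size
word_2 v_1 v_2 ... v_vector_size
...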
