#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>
#include <stdint.h>

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40

// Shim for Windows toolchains without posix_memalign; _aligned_malloc is
// declared in <malloc.h> there.
#ifdef _WIN32
#include <malloc.h>
#include <errno.h>
#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#endif

// Size of the word hash table; 30,000,000 in the original (maximum
// 30M * 0.7 = 21M words in the vocabulary), reduced to 30 here so the
// tables stay small enough to print and trace.
const int vocab_hash_size = 30;

typedef float real;  // Precision of float numbers

// The basic per-word record.
struct vocab_word {
  long long cn;  // word count, tallied from the training set or read from a vocabulary file
  int *point;    // Huffman-tree path from the root to this word: the indices of the nodes along the way
  // word: the literal string; code: the word's Huffman code;
  // codelen: the length of that code.
  char *word, *code, codelen;
};

char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];

// The vocabulary; a word's index in this array is its position in the
// vocabulary.
struct vocab_word *vocab;

int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 1, num_threads = 1, min_reduce = 1;

// The word hash table, indexed by each word's hash (computed from its ASCII
// characters); vocab_hash[hash] stores the word's index in the vocabulary.
int *vocab_hash;

// vocab_max_size: current capacity of the vocabulary array, grown by 1000
// entries at a time whenever the vocabulary is about to exceed it.
// vocab_size: number of distinct words in the training set, i.e. the size of
// the vocabulary.
// layer1_size: dimensionality of the word vectors.
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 10;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;

// syn0: the word vectors, one per vocabulary entry.
// syn1: the vectors of the Huffman tree's internal (non-leaf) nodes.
// syn1neg: each word's auxiliary vector for negative sampling.
// expTable: the precomputed sigmoid table.
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;

int hs = 0, negative = 5;
// Unigram ("energy") table used for negative sampling; 1e8 entries in the
// original, reduced to 1e2 here so the whole table can be printed.
const int table_size = 1e2;
int *table;

// Build the table that assigns each word slots in proportion to its count
// raised to the 0.75 power; negative samples are later drawn from it.
void InitUnigramTable() {
  int a, i;
  double train_words_pow = 0;
  real d1, power = 0.75;
  // Allocate the table: table_size entries.
  table = (int *)malloc(table_size * sizeof(int));
  // Total "energy": sum of cn^0.75 over the whole vocabulary.
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  // d1: cumulative share of the total energy covered by words 0..i.
  d1 = pow(vocab[i].cn, power) / train_words_pow;
  printf("\ntable_size:%d", table_size);
  printf("\ntrain_words_pow:%f, d1:%f\n", train_words_pow, d1);
  // a: index into the table; i: index into the vocabulary.
  for (a = 0; a < table_size; a++) {
    // Word i occupies slot a of the table.
    table[a] = i;
    // A word with more energy occupies more slots. Once the fraction of the
    // table filled so far exceeds the cumulative share d1, advance i and
    // extend d1; otherwise keep i so it claims further slots.
    if (a / (real)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / train_words_pow;
    }
    // If the vocabulary is exhausted before the table is full, fill the
    // remaining slots with the last word.
    if (i >= vocab_size) i = vocab_size - 1;
  }
  // Trace: dump the finished table, ten entries per row.
  for (a = 0; a < table_size; a++) {
    printf("\t%d", table[a]);
    if ((a + 1) % 10 == 0) printf("\n");
  }
}
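/* Worked example (using the sample vocabulary at the end of this post,
   counts 12, 8, 5, 4, 3, 1, 1): train_words_pow = 12^0.75 + 8^0.75 + 5^0.75
   + 4^0.75 + 3^0.75 + 1 + 1 ≈ 6.45 + 4.76 + 3.34 + 2.83 + 2.28 + 1 + 1
   ≈ 21.66, so </s> alone covers d1 ≈ 6.45/21.66 ≈ 0.30 and occupies roughly
   the first 30 of the 100 table slots. */

// A minimal sketch (added for illustration, not part of the original code,
// never called): how a negative sample is later drawn from the finished
// table, using the same linear-congruential recurrence as TrainModelThread.
static int SampleFromTable(unsigned long long *next_random) {
  *next_random = *next_random * (unsigned long long)25214903917 + 11;
  return table[(*next_random >> 16) % table_size];
}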
// Read a single word from a file; space ' ', tab '\t', and newline '\n' act
// as word boundaries. Words longer than MAX_STRING are truncated, and the
// end of each line is returned as the marker </s>.
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13) continue;  // skip carriage returns
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        // A newline also ends the current word; push it back so the next
        // call returns </s>.
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *)"</s>");
        return;
      } else continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;  // Truncate too long words
  }
  word[a] = 0;
}

// Return a word's hash, computed from its literal characters; different
// words may collide on the same hash value.
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}

// Return a word's position in the vocabulary, or -1 if it is absent.
// Compute the word's hash and inspect that slot of the hash table: -1 means
// the word is not in the vocabulary; if the slot points at a different word,
// a collision occurred, so probe forward (open addressing).
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1) return -1;
    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
  return -1;
}

// Read a word from a file and return its vocabulary index; a thin wrapper
// around the two functions above.
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}
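/* Worked example: with vocab_hash_size = 30, GetWordHash("bb") computes
   hash = (98 * 257 + 98) % 30 = 25284 % 30 = 24. If slot 24 is already held
   by a different word, SearchVocab and AddWordToVocab simply try 25, 26, ...
   (wrapping around) until the word or an empty slot is found. */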
// Create a vocab_word entry for a word and append it to the vocabulary.
// The count is initialized to 0 and the hash computed as above.
// Returns the word's position in the vocabulary.
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING) length = MAX_STRING;
  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Whenever the vocabulary is about to outgrow its buffer, grow the buffer
  // by another 1000 entries at once.
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 1000;
    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word);
  // If the hash collides with another word, resolve it by open addressing
  // (probe for a free slot).
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
  // Store the word's vocabulary position in the free slot.
  vocab_hash[hash] = vocab_size - 1;
  return vocab_size - 1;
}

// Comparator: sort by count, descending.
int VocabCompare(const void *a, const void *b) {
  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}
// Sort the vocabulary by count, descending, and rebuild the hash table.
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort everything except </s>, which stays in position 0.
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  // Reset the hash table.
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Drop words that occur fewer than min_count times; re-hash the rest,
    // since their positions changed after sorting.
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // Compute the hash.
      hash = GetWordHash(vocab[a].word);
      // Resolve collisions.
      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      // Accumulate the total word count.
      train_words += vocab[a].cn;
    }
  }
  // Low-count words were removed, so shrink the vocabulary buffer.
  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
  // Pre-allocate room for each word's Huffman-tree data.
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}
// Remove words with count <= min_reduce from the vocabulary; min_reduce
// grows by one every time this function runs.
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < vocab_size; a++)
    if (vocab[a].cn > min_reduce) {
      vocab[b].cn = vocab[a].cn;
      vocab[b].word = vocab[a].word;
      b++;
    } else free(vocab[a].word);
  vocab_size = b;
  // Reset the hash table,
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  // then rebuild it.
  for (a = 0; a < vocab_size; a++) {
    // Compute the hash.
    hash = GetWordHash(vocab[a].word);
    // Resolve collisions.
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}
// Build the Huffman tree from the collected counts. By the tree's nature,
// more frequent words get shorter paths, i.e. shorter binary codes.
void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2;
  // Scratch space for one word's path (node indices) up to the root.
  long long point[MAX_CODE_LENGTH];
  // Scratch space for one word's Huffman code.
  char code[MAX_CODE_LENGTH];
  // A Huffman tree with n leaves has 2n-1 nodes in total.
  // count[0..vocab_size-1] are the leaves, initialized with the word counts;
  // count[vocab_size..] are the internal (merged) nodes still to be created,
  // initialized to the large value 1e15.
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  // binary[] records each node's bit (0/1) relative to its parent.
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  // parent_node[] records each node's parent.
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  // Initialize count[].
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
  // Standard Huffman construction; assumes the vocabulary is already sorted
  // by count, descending. pos1 and pos2 track the two smallest unmerged
  // nodes (initially the last two vocabulary entries); </s> is part of the
  // tree as well.
  pos1 = vocab_size - 1;
  pos2 = vocab_size;
  // vocab_size - 1 merges, one new node each, complete the tree.
  for (a = 0; a < vocab_size - 1; a++) {
    // Find the nodes with the smallest and second-smallest counts, min1i and
    // min2i; each may be a leaf or an already-merged internal node.
    if (pos1 >= 0) {
      // If count[pos1] is smaller, move pos1 left; otherwise move pos2 right.
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }
    // The merged node's count is count[min1i] + count[min2i]; store it in
    // the second half of count[].
    count[vocab_size + a] = count[min1i] + count[min2i];
    // Record the parent of min1i and min2i.
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    // Of the two merged children, min2i (the one with the larger count) is
    // assigned bit 1; min1i keeps bit 0.
    binary[min2i] = 1;
  }
  // Trace: dump index, count, bit, and parent for every node.
  for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%d\t", ii);
  printf("\n");
  for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%lld\t", count[ii]);
  printf("\n");
  for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%lld\t", binary[ii]);
  printf("\n");
  for (int ii = 0; ii < 2 * vocab_size + 1; ii++) printf("%lld\t", parent_node[ii]);
  printf("\n");
  printf("\n");
  // Assign each word (each leaf) its Huffman code; loop over all vocab_size
  // words.
  for (a = 0; a < vocab_size; a++) {
    b = a;
    i = 0;
    while (1) {
      // Climb toward the root, appending each node's bit from binary[] to
      // code[]
      code[i] = binary[b];
      // and its node index to point[].
      point[i] = b;
      // i is the current code length: the depth climbed so far.
      i++;
      b = parent_node[b];
      // The tree has vocab_size*2-1 nodes, so vocab_size*2-2 is the root.
      if (b == vocab_size * 2 - 2) break;
    }
    // Record the word's code length, i.e. its depth below the root.
    vocab[a].codelen = i;
    // point[] stores internal nodes numbered relative to the internal nodes
    // only (vocab_size subtracted, leaves not counted), so the root's number
    // is (vocab_size*2-2) - vocab_size = vocab_size - 2.
    vocab[a].point[0] = vocab_size - 2;
    // code[] and point[] were collected leaf-to-root but must run
    // root-to-leaf, so reverse them here.
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b];
      vocab[a].point[i - b] = point[b] - vocab_size;
    }
  }
  // Trace: print each word's code and path.
  printf("vocab_size:%lld\n", vocab_size);
  for (b = 0; b < vocab_size; b++) {
    struct vocab_word temp = vocab[b];
    printf("%s\t", temp.word);
    int codeLen = temp.codelen;
    printf("%d\t(\t", codeLen);
    for (int a = 0; a < codeLen; a++) printf("%d\t", temp.code[a]);
    printf(")\t\t\t\t\t\t");
    printf("point:(\t");
    for (int a = 0; a < codeLen; a++) printf("%d\t", temp.point[a]);
    printf(")\n");
  }
  free(count);
  free(binary);
  free(parent_node);
}
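/* Worked trace with the sample vocabulary (counts 12, 8, 5, 4, 3, 1, 1, so
   vocab_size = 7): the merges produce internal counts 1+1=2, 2+3=5, 4+5=9,
   5+8=13, 9+12=21, and 13+21=34 (node 12, the root). Given the tie-breaking
   above, the resulting code lengths are </s>:2, bb:2, cc:2, ee:3, ac:4,
   xx:5, dd:5 -- frequent words sit closer to the root. */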
// Scan the training file, collecting every word into the vocabulary and the
// hash table.
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  // Initialize the hash table.
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  // Open the training file.
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  // Start with an empty vocabulary,
  vocab_size = 0;
  // with </s> in the very first slot.
  AddWordToVocab((char *)"</s>");
  // Process the training file.
  while (1) {
    // Read one word.
    ReadWord(word, fin);
    if (feof(fin)) break;
    // Bump the total word count and report progress.
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    // Look the word up in the vocabulary.
    i = SearchVocab(word);
    // If it is absent, add it (hash entry included) with count 1; otherwise
    // increment its count.
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    // If the vocabulary outgrows the hash table's load limit, prune the
    // lowest-count words.
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  // Sort the vocabulary, dropping words below min_count, and report the
  // vocabulary size and total word count.
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  // Record the training file's size and close the handle.
  file_size = ftell(fin);
  fclose(fin);
}
// Write each word and its count to a file.
void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}

// Build the vocabulary and hash table from a saved vocabulary file instead.
// Such a file contains no duplicates, so unlike LearnVocabFromTrainFile
// there is no need to check for repeated words.
void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  // Open the vocabulary file.
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  // Initialize the hash table.
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  vocab_size = 0;
  // Process the vocabulary file.
  while (1) {
    // Read one word.
    ReadWord(word, fin);
    if (feof(fin)) break;
    // Add it to the vocabulary and hash table, taking its count from the
    // file.
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  // Sort the vocabulary, dropping words below min_count, and report the
  // vocabulary size and total word count.
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  // Open the training file and seek to its end to record its size.
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  // Close the handle.
  fclose(fin);
}
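/* The saved format is one "word count" pair per line, e.g. the vocab.txt
   shown at the end of this post:
     </s> 12
     bb 8
     ...
   which ReadVocab parses back with fscanf(fin, "%lld%c", ...). */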
// Initialize the network.
void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;
  // syn0 holds the word vectors, one per vocabulary entry. Allocate
  // vocab_size * layer1_size reals, 128-byte aligned, via posix_memalign
  // (layer1_size is the vector dimensionality).
  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) { printf("Memory allocation failed\n"); exit(1); }
  // Hierarchical softmax:
  if (hs) {
    // syn1 holds one vector per internal node of the Huffman tree; allocate
    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1 == NULL) { printf("Memory allocation failed\n"); exit(1); }
    // and zero-initialize it.
    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1[a * layer1_size + b] = 0;
  }
  // If negative sampling is used, syn1neg holds each word's auxiliary
  // vector; allocate
  if (negative > 0) {
    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1neg == NULL) { printf("Memory allocation failed\n"); exit(1); }
    // and zero-initialize it too.
    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1neg[a * layer1_size + b] = 0;
  }
  // Initialize the word vectors syn0 with uniform random values in
  // [-0.5, 0.5] / layer1_size.
  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
  }
  // Build the Huffman tree.
  CreateBinaryTree();
}
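/* With layer1_size = 10, every initial weight lies in [-0.05, 0.05]. This
   toy run makes only a handful of updates, which is presumably why the final
   vectors in the output.txt at the end of this post still sit within that
   range. */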
// Thread worker: the core of the training algorithm. By the time threads
// run, the vocabulary is sorted and the Huffman tree and codes are built.
void *TrainModelThread(void *id) {
  long long a, b, d;
  // cw: number of context words in the window (center word excluded).
  long long cw;
  // word: vocabulary index of the current word while reading a sentence.
  // last_word: the context word currently scanned within the window.
  // sentence_length / sentence_position: length of the current sentence and
  // the position of the current word within it.
  long long word, last_word, sentence_length = 0, sentence_position = 0;
  // word_count: words this thread has processed so far;
  // last_word_count: its value at the previous progress report.
  long long word_count = 0, last_word_count = 0;
  // sen: the sentence being processed, as vocabulary indices.
  long long sen[MAX_SENTENCE_LENGTH + 1];
  // l1: offset of the current word's vector in syn0 (skip-gram);
  // l2: offset of the internal-node vector in syn1 or the auxiliary vector
  // in syn1neg; target/label: the current sample and its label during
  // negative sampling.
  long long l1, l2, c, target, label, local_iter = iter;
  // next_random: per-thread RNG state.
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  // neu1: the input vector -- the sum of the context(w) vectors in CBOW, or
  // the center word's vector in skip-gram.
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
  // neu1e: accumulated error term.
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
  FILE *fi = fopen(train_file, "rb");
  // Each thread works on its own slice of the file; seek to this thread's
  // starting offset (file_size was recorded by LearnVocabFromTrainFile or
  // ReadVocab).
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  // Main loop.
  while (1) {
    // Report progress periodically (every ~10000 words in the original;
    // every 10 here, for tracing).
    if (word_count - last_word_count > 10) {
      // word_count_actual counts words processed across all threads.
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        // Report the current learning rate alpha, the overall progress
        // (words processed / (iterations * total words), as a percentage),
        // and the words processed per thread per second.
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
               word_count_actual / (real)(iter * train_words + 1) * 100,
               word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      // Decay the learning rate linearly from its starting value as the
      // number of actually-trained words grows,
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      // but never below starting_alpha * 0.0001.
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    // Load the next sentence; sentences are separated by newlines.
    if (sentence_length == 0) {
      while (1) {
        // Read one word; word is its vocabulary index.
        word = ReadWordIndex(fi);
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        // Index 0 is </s>, i.e. a newline: the sentence ends here.
        if (word == 0) break;
        // Randomly subsample frequent words: discarding some of them makes
        // the vectors of rare words more accurate and speeds up training;
        // it can be seen as a form of smoothing.
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          // Keep the word with probability ran (discard with 1 - ran).
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        // Truncate sentences that exceed the maximum length.
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      // Start at the head of the sentence.
      sentence_position = 0;
    }
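    /* Worked example of the subsampling formula: ran = sqrt(s*T/cn) + s*T/cn,
       where s = sample and T = train_words. With s = 1e-3 and the toy corpus
       (T = 34), bb (cn = 8) gets ran = sqrt(0.034/8) + 0.034/8 ≈ 0.069, so on
       this tiny corpus even bb is kept only ~7% of the time; on realistic
       corpora ran exceeds 1 (always keep) for all but the most frequent
       words. */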
    // If this thread has processed its share of the corpus (or hit EOF),
    // start the next iteration, or stop once local_iter reaches zero.
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }
    // Fetch the current (center) word.
    word = sen[sentence_position];
    if (word == -1) continue;
    // Reset the input vector
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    // and the accumulated error.
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    // Draw b uniformly from [0, window-1] to shrink the effective width of
    // the context(w) window, so that nearer words are used as context more
    // often.
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
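    /* The window is [sentence_position - window + b, sentence_position +
       window - b], i.e. window - b words on each side: with window = 5,
       b is uniform on 0..4, so the effective one-sided width averages 3. */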
    /******** CBOW: the word vectors of the context around the center word
       are the input, and the model predicts the center word itself. ********/
    if (cbow) {
      cw = 0;
      // Sum the context vectors over the window described above (total width
      // 2*window - 2*b + 1), skipping a == window: that is the center word
      // being predicted, so only the context is collected.
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        // sen[] holds the vocabulary index of each sentence word.
        last_word = sen[c];
        if (last_word == -1) continue;
        // Accumulate the context word's vector
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        // and count the valid context words.
        cw++;
      }
      if (cw) {
        // Average the summed context vectors.
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
        // Hierarchical softmax: walk the Huffman path from the root to the
        // current word's leaf, visiting every internal node along the way.
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          // l2: offset of the current internal node's vector in syn1.
          l2 = vocab[word].point[d] * layer1_size;
          // f: inner product of the input vector neu1 and the node vector.
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          // Skip the update if f falls outside the sigmoid table's range;
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          // otherwise apply the sigmoid via table lookup.
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // g is the gradient times the learning rate: the larger the
          // learning rate, the larger the penalty for misclassification and
          // the larger the correction to the node vector. Note: word2vec
          // treats a Huffman bit of 1 as the negative class and a bit of 0
          // as the positive class, so a node's label is 1 - code[d].
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Accumulate the error using g and the node vector.
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Update the node vector using g and the input vector. Intuitively:
          // if code[d] = 1 (negative class), the label is 1 - 1 = 0, while
          // the sigmoid output lies in (0, 1), i.e. above the label, so the
          // node vector should be nudged down; g = (label - f) * alpha is
          // then negative and does exactly that, in proportion to how far
          // the sigmoid output deviates from the label.
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
        }
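        /* Numeric example of the update: suppose code[d] = 1 (label 0) and
           the table lookup gives f = 0.6; then g = (0 - 0.6) * alpha =
           -0.6 * alpha, so syn1 moves away from neu1 and the accumulated
           error later pushes the context vectors away from this node's
           vector as well. */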
        // Negative sampling: loop over one positive sample plus `negative`
        // noise samples.
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            // The first pass handles the target word itself: the positive
            // sample.
            target = word;
            label = 1;
          } else {
            // Draw a negative sample from the unigram table.
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          // Under negative sampling every word has an auxiliary vector in
          // syn1neg; l2 is the offset of the target word's vector there.
          l2 = target * layer1_size;
          f = 0;
          // f: inner product of the input vector neu1 and the auxiliary
          // vector.
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
          // g = (label - sigmoid(f)) * alpha, with the sigmoid clamped to
          // 1 (f > MAX_EXP) or 0 (f < -MAX_EXP) outside the table's range.
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          // Accumulate the error from the auxiliary vector,
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          // then update the auxiliary vector from the input vector.
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
        }
        // Finally, apply the accumulated error to the vector of every word
        // in context(w).
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    }
    /******** Skip-gram: the center word is the input, and the model
       predicts its context. ********/
    else {
      // Every word of context(w) is predicted, so loop over the whole
      // window (2*window - 2*b + 1 positions), skipping the center word.
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        // last_word: the context word currently being predicted.
        last_word = sen[c];
        if (last_word == -1) continue;
        // l1: offset of that word's vector in syn0.
        l1 = last_word * layer1_size;
        // Reset the accumulated error.
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // Hierarchical softmax: walk the Huffman path from the root to the
        // current word's leaf, visiting every internal node along the way.
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Note the model symmetry p(u|w) = p(w|u), where w is the center
          // word and u a word in context(w): although skip-gram nominally
          // predicts the context from the center word, training actually
          // predicts the center word from each context word. Unlike CBOW,
          // u here is a single word vector rather than a window sum;
          // otherwise the flow matches the CBOW hierarchical-softmax branch
          // above.
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          g = (1 - vocab[word].code[d] - f) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
        }
        // Negative sampling: one positive sample plus `negative` noise
        // samples; the flow matches the CBOW branch and relies on the same
        // model symmetry.
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Apply the accumulated error to the context word's vector.
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    // One word trained; advance to the next position in the sentence.
    sentence_position++;
    // At the end of a sentence, reset sentence_length so the loop reads the
    // next sentence and processes it word by word.
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
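/* Each thread seeks to offset file_size / num_threads * id and processes
   train_words / num_threads words per iteration; with num_threads = 1, as in
   this trace, the single thread simply sweeps the whole file iter times. */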
// The complete training pipeline.
void TrainModel() {
  long a, b, c, d;
  FILE *fo;
  // Create num_threads training threads.
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  printf("Starting training using file %s\n", train_file);
  // Record the initial learning rate.
  starting_alpha = alpha;
  // Load the vocabulary and hash table from a vocabulary file if one is
  // given, otherwise build them from the training file.
  printf("read_vocab_file:%d\t", read_vocab_file[0]);
  if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
  // Optionally save the words and counts to a file.
  if (save_vocab_file[0] != 0) SaveVocab();
  if (output_file[0] == 0) return;
  // Initialize the network,
  InitNet();
  // and, if negative sampling is used, the unigram table.
  if (negative > 0) InitUnigramTable();
  // Start the clock.
  start = clock();
  // Launch and join the training threads.
  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)(intptr_t)a);
  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
  fo = fopen(output_file, "wb");
  // With classes == 0, write all word vectors to the output file.
  if (classes == 0) {
    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    for (a = 0; a < vocab_size; a++) {
      fprintf(fo, "%s ", vocab[a].word);
      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  }
  // With classes != 0, K-means cluster the word vectors and write each
  // word's class instead; classes is the number of clusters to produce.
  else {
    // clcn: number of clusters; iter: number of K-means iterations;
    // closeid: index of the cluster nearest to the current word.
    int clcn = classes, iter = 10, closeid;
    // centcn: number of words in each cluster.
    int *centcn = (int *)malloc(classes * sizeof(int));
    // cl: cluster assignment of each word.
    int *cl = (int *)calloc(vocab_size, sizeof(int));
    // x: inner product of a word vector and a cluster centroid -- the
    // larger, the closer; closev: the largest inner product seen so far,
    // i.e. the nearest centroid.
    real closev, x;
    // cent: the centroid vector of each cluster.
    real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
    // Assign every word to a cluster, round-robin.
    for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
    // Run iter iterations.
    for (a = 0; a < iter; a++) {
      // Zero the centroid vectors
      for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
      // and start each cluster's word count at 1.
      for (b = 0; b < clcn; b++) centcn[b] = 1;
      // Sum the vectors of the words currently assigned to each cluster and
      // count the words per cluster.
      for (c = 0; c < vocab_size; c++) {
        for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
        centcn[cl[c]]++;
      }
      for (b = 0; b < clcn; b++) {
        closev = 0;
        for (c = 0; c < layer1_size; c++) {
          // Average to get the centroid,
          cent[layer1_size * b + c] /= centcn[b];
          // accumulating the squared L2 norm of the centroid in closev.
          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
        }
        // Take the square root to get the L2 norm itself,
        closev = sqrt(closev);
        // and normalize the centroid by it.
        for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
      }
      // Reassign every word to its nearest cluster.
      for (c = 0; c < vocab_size; c++) {
        closev = -10;
        closeid = 0;
        for (d = 0; d < clcn; d++) {
          x = 0;
          // Inner product of the word vector and the normalized centroid;
          // since the centroids are unit length, this ranks clusters by
          // cosine similarity,
          for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
          // so pick the cluster with the largest inner product.
          if (x > closev) {
            closev = x;
            closeid = d;
          }
        }
        cl[c] = closeid;
      }
    }
    // After several iterations the assignments settle down; write the
    // K-means result to the output file.
    for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
    free(centcn);
    free(cent);
    free(cl);
  }
  fclose(fo);
}

// Locate a command-line argument; complain if its value is missing.
int ArgPos(char *str, int argc, char **argv) {
  int a;
  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
    if (a == argc - 1) {
      printf("Argument missing for %s\n", str);
      exit(1);
    }
    return a;
  }
  return -1;
}
void prepare() {
  int i;
  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
  printf("%d", vocab_hash[0]);
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);  // Precompute the exp() table
    expTable[i] = expTable[i] / (expTable[i] + 1);                    // Precompute f(x) = x / (x + 1)
  }
}
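/* expTable[i] holds sigmoid(x) for x = (2*i/EXP_TABLE_SIZE - 1) * MAX_EXP,
   i.e. x in [-6, 6) in steps of 0.012. A lookup maps f to index
   (int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)); note the constant
   is evaluated in integer arithmetic as 1000/6/2 = 83, so f = 0 maps to
   index 498 and returns sigmoid(-0.024) ≈ 0.494 rather than exactly 0.5 --
   a small quantization error the training tolerates. */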
int main(int argc, char **argv) {
  int i;
  prepare();
  strcpy(train_file, "record/input.txt");
  strcpy(save_vocab_file, "record/vocab.txt");
  strcpy(output_file, "record/output.txt");
  /**
  argc = 2;
if (argc == 1) {
printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
printf("Options:\n");
printf("Parameters for training:\n");
printf("\t-train <file>\n");
printf("\t\tUse text data from <file> to train the model\n");
printf("\t-output <file>\n");
printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
printf("\t-size <int>\n");
printf("\t\tSet size of word vectors; default is 100\n");
printf("\t-window <int>\n");
printf("\t\tSet max skip length between words; default is 5\n");
printf("\t-sample <float>\n");
printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
printf("\t-hs <int>\n");
printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
printf("\t-negative <int>\n");
printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
printf("\t-threads <int>\n");
printf("\t\tUse <int> threads (default 12)\n");
printf("\t-iter <int>\n");
printf("\t\tRun more training iterations (default 5)\n");
printf("\t-min-count <int>\n");
printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
printf("\t-alpha <float>\n");
printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
printf("\t-classes <int>\n");
printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
printf("\t-debug <int>\n");
printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
printf("\t-binary <int>\n");
printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
printf("\t-save-vocab <file>\n");
printf("\t\tThe vocabulary will be saved to <file>\n");
printf("\t-read-vocab <file>\n");
printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
printf("\t-cbow <int>\n");
printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
printf("\nExamples:\n");
printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
return 0;
}
output_file[0] = 0;
save_vocab_file[0] = 0;
read_vocab_file[0] = 0;
if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
if (cbow) alpha = 0.05;
if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
**/
  // (prepare() above already allocated vocab, vocab_hash, and expTable, so
  // those allocations are not repeated here.)
  TrainModel();
  return 0;
}
Input data (record/input.txt):
bb cc
bb
dd ee
bb
cc ac
bb cc ee
bb cc
ac bb
ee xx
bb
ac cc
ee bb
Saved vocabulary (record/vocab.txt):
</s> 12
bb 8
cc 5
ee 4
ac 3
xx 1
dd 1
Output word vectors (record/output.txt):
7 10
</s> 0.040027 0.044194 -0.038303 -0.032780 0.013666 0.030211 0.009409 0.002113 -0.036035 0.022185
bb -0.043564 0.012495 -0.007513 -0.009572 -0.033157 -0.018822 0.025793 0.030254 0.029691 0.015974
cc 0.015448 -0.038026 -0.040958 0.049696 0.038013 0.030901 -0.006039 0.040157 -0.004950 0.007347
ee -0.001492 -0.029832 0.013123 -0.013374 -0.038254 0.047542 0.043793 -0.010951 -0.002261 0.005092
ac -0.036377 -0.040071 0.045547 0.000630 -0.025824 -0.030421 -0.030765 0.016969 0.002014 0.013310
xx -0.042136 -0.038078 -0.001300 0.011436 0.025497 -0.031700 0.040796 0.009270 0.011197 -0.006084
dd 0.029865 -0.022878 -0.020975 0.021584 -0.007532 0.010307 0.018045 -0.040886 -0.019830 0.029137