Sentencepiece是google開源的文本Tokenizer工具,其主要原理是利用統計算法,在語料庫中生成一個類似分詞器的工具,外加可以將詞token化的功能;對比開源的分詞器,它會將頻繁出現的字符串作爲詞,然後形成詞庫進行切分,所以它切分的粒度會更大些。當前各個大模型的分詞器基本都是基於該工具實現的。
由於原生LLaMa的訓練語料大部分都是英文,中文語料相對較少,使得模型對中文編解碼效率不高,擴充LLaMa中文詞表可有效提升LLaMa對中文的編解碼效率;此外,詞表擴充後同樣的上下文窗口能容納更多中文內容,變相增加了模型一次可處理的中文文本長度(上下文窗口本身的token數並不會改變)。
安裝sentencepiece
pip install sentencepiece
訓練詞表代碼:
# Train a BPE vocabulary on a plain-text corpus with sentencepiece.
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input='./file_name.txt',      # training corpus, one sentence per line
    input_format='text',
    model_prefix='bpe_test',      # prefix for the generated output files
    model_type='bpe',
    vocab_size=10000,
    character_coverage=0.9995,    # fraction of characters the vocab must cover
    num_threads=32,
    split_digits=True,            # split numbers into single digits (LLaMA-style)
    byte_fallback=True,           # unknown characters fall back to raw bytes
    max_sentence_length=24000,
)
該代碼運行後會在當前目錄下生成兩個文件:bpe_test.model 和 bpe_test.vocab。
合併LLaMa詞表代碼:
# Merge a newly trained Chinese sentencepiece vocabulary into the LLaMA
# tokenizer, save the result in both sentencepiece and HuggingFace formats,
# and compare tokenization before/after the merge.
import os

# Force the pure-Python protobuf backend so the sentencepiece ModelProto can
# be parsed and re-serialized without C++ descriptor-pool conflicts.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
from transformers import LlamaTokenizer

# Paths: the base model's tokenizer and the sentencepiece model trained above.
llama_tokenizer_dir = "/mnt/models/Baichuan-7B"  # change to your model's location
chinese_sp_model_file = "./bpe_test.model"       # model trained in the previous step

# Load both tokenizers and expose their underlying sentencepiece protos.
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
chinese_sp_model = spm.SentencePieceProcessor()
chinese_sp_model.Load(chinese_sp_model_file)

llama_spm = sp_pb2_model.ModelProto()
llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())
chinese_spm = sp_pb2_model.ModelProto()
chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())

# Show both vocabulary sizes and LLaMA's special tokens.
print(len(llama_tokenizer), len(chinese_sp_model))
print(llama_tokenizer.all_special_tokens)
print(llama_tokenizer.all_special_ids)
print(llama_tokenizer.special_tokens_map)

# Append every Chinese piece LLaMA does not already have.  Domain-specific
# words can be added here in the same way.
llama_spm_tokens_set = {p.piece for p in llama_spm.pieces}
print(f"Before:{len(llama_spm_tokens_set)}")
for p in chinese_spm.pieces:
    if p.piece not in llama_spm_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = p.piece
        new_p.score = 0  # neutral score: merged pieces rank below originals
        llama_spm.pieces.append(new_p)
print(f"New model pieces: {len(llama_spm.pieces)}")

# Save the merged model as a raw sentencepiece model and in HF format.
output_sp_dir = 'merged_tokenizer_sp_test'
output_hf_dir = 'merged_tokenizer_hf_test'
os.makedirs(output_sp_dir, exist_ok=True)
merged_model_file = os.path.join(output_sp_dir, 'chinese_llama.model')
with open(merged_model_file, 'wb') as f:
    f.write(llama_spm.SerializeToString())
tokenizer = LlamaTokenizer(vocab_file=merged_model_file)
tokenizer.save_pretrained(output_hf_dir)
print(f"Chinese-LLaMA tokenizer has been saved to {output_hf_dir}")

# Sanity check: tokenize sample text with the original and merged tokenizers.
# (llama_tokenizer was never mutated above, so no reload is needed.)
chinese_llama_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)


def _compare(text):
    """Print how the original and merged tokenizers split *text*."""
    print("Test text:\n", text)
    print(f"Tokenized by LLaMA tokenizer:{llama_tokenizer.tokenize(text)}")
    print(f"Tokenized length by LLaMA tokenizer:{len(llama_tokenizer.tokenize(text))}")
    print(f"Tokenized by chinese_llama tokenizer:{chinese_llama_tokenizer.tokenize(text)}")
    print(f"Tokenized length by chinese_llama tokenizer:{len(chinese_llama_tokenizer.tokenize(text))}")


_compare("The excellence of a translation can only be judged by noting")
_compare("麒麟,是中國古代神話中的一種瑞獸")