I previously implemented MLP, CNN, LSTM, and BERT models in pure numpy, so this week I did GPT-2 the same way. The main point of a pure-numpy implementation is that it can run on a Raspberry Pi or any other board where pytorch cannot be installed, and generate text there.
Only now, while implementing it, did I fully understand GPT-2's mask-multi-headed-self-attention. It really is impressive, a big step beyond BERT's multi-headed-self-attention for generation, and whoever came up with the causal mask was remarkably clever.
The model again comes from Hugging Face, the GPT-2 small variant (uer/gpt2-chinese-cluecorpussmall, loaded in the export script at the end). It actually has fewer parameters than BERT, mainly because GPT-2 has no token-type embedding (a 2x768 matrix) and no pooler matrix; everything else is present.
GPT-2's structure is similar to BERT's, except that multi-headed-self-attention is replaced by mask-multi-headed-self-attention. In addition, as the weights below show, GPT-2 applies LayerNorm before each sub-layer (pre-norm, the ln_1/ln_2 weights) rather than after it as BERT does, and adds a final LayerNorm (ln_f) after the last block.
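Before the full implementation, here is a minimal standalone sketch (toy numbers, not the model code) of what the causal mask does: a lower-triangular matrix drives the softmax weights over later positions to zero, so token i can only attend to tokens 0..i.

import numpy as np

n = 4
scores = np.random.randn(n, n)               # raw attention scores for a 4-token toy sequence
mask = np.tril(np.ones((n, n), dtype=bool))  # row i may only attend to columns 0..i
masked = np.where(mask, scores, -1e9)        # future positions get a huge negative score
weights = np.exp(masked - masked.max(axis=-1, keepdims=True))
weights /= weights.sum(axis=-1, keepdims=True)
print(weights.round(2))                      # upper triangle is 0: no peeking at future tokens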
import numpy as np
import time


def top_k_sampling(probs, k):
    # Sort the distribution with argsort and reverse to get descending order
    sorted_indices = np.argsort(probs)[::-1]
    # Keep the indices of the k highest-scoring tokens
    topk_indices = sorted_indices[:k]
    # The caller can renormalize over these indices or sample from them
    return topk_indices


def random_sampling(array, k):
    sample = np.random.choice(array, size=k, replace=False)
    return sample


def word_embedding(input_ids, word_embeddings):
    return word_embeddings[input_ids]


def position_embedding(position_ids, position_embeddings):
    return position_embeddings[position_ids]


def token_type_embedding(token_type_ids, token_type_embeddings):
    return token_type_embeddings[token_type_ids]


def softmax(x, axis=None):
    # e_x = np.exp(x).astype(np.float32)
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))  # subtract the max for numerical stability
    sum_ex = np.sum(e_x, axis=axis, keepdims=True).astype(np.float32)
    return e_x / sum_ex


def scaled_dot_product_attention(Q, K, V, mask=None):
    d_k = Q.shape[-1]
    attention_scores = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(d_k)  # matches the reference model
    if mask is not None:
        # min_value = attention_scores.min()
        min_value = np.finfo(attention_scores.dtype).min  # most negative value representable in this dtype
        # attention_scores = np.where(mask, attention_scores, np.full_like(attention_scores, -np.inf))
        attention_scores = np.where(mask, attention_scores, np.full_like(attention_scores, min_value))  # masked positions get (near) -inf
    attention_weights = softmax(attention_scores, axis=-1)  # so masked positions end up with ~0 weight
    output = np.matmul(attention_weights, V)
    return output, attention_weights


def scaled_dot_product_attention2(Q, K, V):
    # Single-step inference: the new token may attend to everything cached,
    # so no mask is needed and the computation is much smaller
    d_k = Q.shape[-1]
    attention_scores = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(d_k)
    attention_weights = softmax(attention_scores, axis=-1)
    output = np.matmul(attention_weights, V)
    return output, attention_weights


global_q = {}
global_k = {}
global_v = {}


def mask_multihead_attention(i, input, num_heads, W_Q, B_Q, W_K, B_K, W_V, B_V, W_O, B_O):
    global global_q, global_k, global_v
    q = np.matmul(input, W_Q) + B_Q
    k = np.matmul(input, W_K) + B_K
    v = np.matmul(input, W_V) + B_V
    if q.shape[-2] == 1:
        # Incremental decoding step: append the new k/v to the cache of layer i
        # (assumes the prompt pass was longer than one token, so the cache exists)
        k = global_k[i] = np.concatenate([global_k[i], k], axis=-2)
        v = global_v[i] = np.concatenate([global_v[i], v], axis=-2)
    else:
        # Full (prompt) pass: initialize the cache
        global_q[i] = q
        global_k[i] = k
        global_v[i] = v
    _, n, _ = k.shape
    # Split the projections into num_heads heads
    q = np.split(q, num_heads, axis=-1)
    k = np.split(k, num_heads, axis=-1)
    v = np.split(v, num_heads, axis=-1)
    outputs = []
    if q[0].shape[-2] != 1:
        mask = np.tril(np.ones((n, n)))  # lower-triangular causal mask
        for q_, k_, v_ in zip(q, k, v):
            output, attention_weights = scaled_dot_product_attention(q_, k_, v_, mask)
            outputs.append(output)
    else:
        for q_, k_, v_ in zip(q, k, v):
            output, attention_weights = scaled_dot_product_attention2(q_, k_, v_)
            outputs.append(output)
    outputs = np.concatenate(outputs, axis=-1)
    outputs = np.matmul(outputs, W_O) + B_O
    return outputs


def layer_normalization(x, weight, bias, eps=1e-12):
    mean = np.mean(x, axis=-1, keepdims=True)
    variance = np.var(x, axis=-1, keepdims=True)
    std = np.sqrt(variance + eps)
    normalized_x = (x - mean) / std
    output = weight * normalized_x + bias
    return output


def feed_forward_layer(inputs, weight, bias=None, activation='relu'):
    if bias is not None:
        linear_output = np.matmul(inputs, weight) + bias
    else:
        linear_output = np.matmul(inputs, weight)
    if activation == 'relu':
        activated_output = np.maximum(0, linear_output)  # ReLU
    elif activation == 'gelu':
        # tanh approximation of GELU, as used by GPT-2
        activated_output = 0.5 * linear_output * (1 + np.tanh(np.sqrt(2 / np.pi) * (linear_output + 0.044715 * np.power(linear_output, 3))))
    elif activation == "tanh":
        activated_output = np.tanh(linear_output)
    else:
        activated_output = linear_output  # no activation
    return activated_output


def residual_connection(inputs, residual):
    # Residual connection
    residual_output = inputs + residual
    return residual_output


with open('vocab.txt', 'r', encoding='utf-8') as f:
    vocab = f.readlines()
vocab = [i.strip() for i in vocab]
# print(len(vocab))


def tokenize_sentence(sentence):
    tokenized_sentence = list(sentence)  # character-level tokenization; GPT-2 needs no [CLS]
    token_ids = [vocab.index(token) for token in tokenized_sentence]
    return token_ids


# Load the saved model parameters
model_data = np.load('gpt2_model_params.npz')
# for i in model_data:
#     print(i, model_data[i].shape)


def get_sentence_ids(sentence):
    token_ids = tokenize_sentence(sentence)
    input_ids = np.array(token_ids)  # token ids of the input
    return input_ids


word_embeddings = model_data["transformer.wte.weight"]
position_embeddings = model_data["transformer.wpe.weight"]


def model_input(input_ids, position_ids):
    word_embedded = word_embedding(input_ids, word_embeddings)
    position_ids = np.array(position_ids)  # position ids
    # position embedding matrix of shape (max_position, embedding_size)
    position_embedded = position_embedding(position_ids, position_embeddings)
    embedding_output = np.expand_dims(word_embedded + position_embedded, axis=0)
    return embedding_output


def gpt2(input, num_heads):
    for i in range(12):
        LayerNorm1_weight = model_data['transformer.h.{}.ln_1.weight'.format(i)]
        LayerNorm1_bias = model_data['transformer.h.{}.ln_1.bias'.format(i)]
        # weights of the masked multi-head self-attention
        W_QKV = model_data['transformer.h.{}.attn.c_attn.weight'.format(i)]
        B_QKV = model_data['transformer.h.{}.attn.c_attn.bias'.format(i)]
        W_O = model_data['transformer.h.{}.attn.c_proj.weight'.format(i)]
        B_O = model_data['transformer.h.{}.attn.c_proj.bias'.format(i)]
        LayerNorm2_weight = model_data['transformer.h.{}.ln_2.weight'.format(i)]
        LayerNorm2_bias = model_data['transformer.h.{}.ln_2.bias'.format(i)]
        intermediate_weight = model_data['transformer.h.{}.mlp.c_fc.weight'.format(i)]
        intermediate_bias = model_data['transformer.h.{}.mlp.c_fc.bias'.format(i)]
        dense_weight = model_data['transformer.h.{}.mlp.c_proj.weight'.format(i)]
        dense_bias = model_data['transformer.h.{}.mlp.c_proj.bias'.format(i)]

        input1 = layer_normalization(input, LayerNorm1_weight, LayerNorm1_bias)  # pre-norm; matches the reference output
        W_Q, W_K, W_V = np.split(W_QKV, 3, axis=-1)
        B_Q, B_K, B_V = np.split(B_QKV, 3, axis=-1)
        output = mask_multihead_attention(i, input1, num_heads, W_Q, B_Q, W_K, B_K, W_V, B_V, W_O, B_O)
        output1 = residual_connection(input, output)
        output = layer_normalization(output1, LayerNorm2_weight, LayerNorm2_bias)
        output = feed_forward_layer(output, intermediate_weight, intermediate_bias, activation='gelu')
        output = feed_forward_layer(output, dense_weight, dense_bias, activation='')
        output2 = residual_connection(output1, output)
        input = output2
    ln_f_weight = model_data['transformer.ln_f.weight']
    ln_f_bias = model_data['transformer.ln_f.bias']
    output = layer_normalization(output2, ln_f_weight, ln_f_bias)
    return output


classifier_weight = model_data['lm_head.weight']


def predict(sentence="今天是個好日子", accelerater=True, gen_len=100):
    start = time.time()
    sentence_ids = get_sentence_ids(sentence)
    position_ids = range(len(sentence_ids))
    print("prompt input:", sentence)
    for i in range(gen_len):
        embeddings = model_input(sentence_ids, position_ids)
        output = gpt2(embeddings, num_heads=12)
        # print(output)
        output = feed_forward_layer(output[:, -1], classifier_weight.T, activation='')
        samples = top_k_sampling(output[0], k=1)
        label_id = random_sampling(samples, k=1)
        print(vocab[label_id[0]], end="")
        if accelerater:
            # Fast decoding: only the newest token is fed in; its q attends to the full
            # cached k/v, which is far cheaper than recomputing q for every past step
            sentence_ids = label_id
            position_ids = [position_ids[-1] + 1]
        else:
            # Slow decoding: recompute the whole prefix each step, so the cost keeps growing
            sentence_ids = np.concatenate([sentence_ids, label_id], axis=-1)
            position_ids = range(len(sentence_ids))
    end = time.time()
    print("\nspend time:", end - start)


if __name__ == "__main__":
    accelerater = False
    sentence = "今天是個好日子"  # 我們要做的就是把握住這個機會
    predict(sentence, accelerater)
    # embeddings = model_input(sentence_ids)
    # output = gpt2(embeddings, num_heads=12)
    # # print(output)
    # output = feed_forward_layer(output[:, :], classifier_weight.T, activation='')
    # samples = np.argmax(output, axis=-1)
    # for i in samples[0]:
    #     print(vocab[i], end="")
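One note on sampling: with k=1 the top_k_sampling / random_sampling pair above degenerates to greedy argmax decoding, which is why the two timing runs below produce identical text. A hypothetical variation of the loop body for more diverse output:

# Hypothetical tweak inside predict()'s loop: sample among the top 5 tokens
logits = feed_forward_layer(output[:, -1], classifier_weight.T, activation='')
candidates = top_k_sampling(logits[0], k=5)   # ids of the 5 highest logits
label_id = random_sampling(candidates, k=1)   # uniform pick among them
# random_sampling ignores the probabilities; for probability-weighted sampling,
# pass p=softmax(logits[0][candidates]) to np.random.choice instead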
Results: both modes generate exactly the same text, and the longer the sequence, the more pronounced the time difference. Even at only 100 generated tokens the speed gap is already obvious.
With acceleration:

prompt input: 今天是個好日子
,我們要做的就是把握住這個機會,把握住這個機會,把握住了,我相信我們的投資將會創造奇蹟,而且我相信我們的投資團隊一定能夠創造奇蹟,我相信我們的投資團隊一定能夠創造奇蹟,我相信我們的投資團隊一定能夠創造
spend time: 22.19951105117798

Without acceleration:

prompt input: 今天是個好日子
,我們要做的就是把握住這個機會,把握住這個機會,把握住了,我相信我們的投資將會創造奇蹟,而且我相信我們的投資團隊一定能夠創造奇蹟,我相信我們的投資團隊一定能夠創造奇蹟,我相信我們的投資團隊一定能夠創造
spend time: 42.4955039024353
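A rough way to see where the gap comes from: with full recomputation, step t pushes the entire prefix through all 12 layers again, while cached decoding pushes only one token. A small counting sketch (hypothetical numbers matching this run: a 7-token prompt and 100 generated tokens):

# Count of token passes through the transformer stack over one generation
prompt, T = 7, 100
full = sum(prompt + t for t in range(T))   # whole prefix recomputed at every step
cached = T                                 # only the newest token per step
print(full, cached, round(full / cached))  # 5650 vs 100, roughly 56x fewer token passes

The measured speedup here (about 1.9x) is much smaller than that ratio, since BLAS handles one large matmul far more efficiently than many single-row ones, but the gap keeps widening as the sequence grows.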
Model parameters:
transformer.wte.weight (21128, 768)
transformer.wpe.weight (1024, 768)
transformer.h.0.ln_1.weight (768,)
transformer.h.0.ln_1.bias (768,)
transformer.h.0.attn.c_attn.weight (768, 2304)
transformer.h.0.attn.c_attn.bias (2304,)
transformer.h.0.attn.c_proj.weight (768, 768)
transformer.h.0.attn.c_proj.bias (768,)
transformer.h.0.ln_2.weight (768,)
transformer.h.0.ln_2.bias (768,)
transformer.h.0.mlp.c_fc.weight (768, 3072)
transformer.h.0.mlp.c_fc.bias (3072,)
transformer.h.0.mlp.c_proj.weight (3072, 768)
transformer.h.0.mlp.c_proj.bias (768,)
... (transformer.h.1 through transformer.h.11 repeat the same shapes) ...
transformer.ln_f.weight (768,)
transformer.ln_f.bias (768,)
lm_head.weight (21128, 768)
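Summing those shapes gives the total parameter count. A small sketch, assuming the gpt2_model_params.npz produced by the export script below:

import numpy as np

model_data = np.load('gpt2_model_params.npz')
# lm_head.weight is tied to transformer.wte.weight, so skip it to avoid double counting
total = sum(model_data[k].size for k in model_data.files if k != 'lm_head.weight')
print(f"{total:,} parameters")  # about 102M for this 12-layer, 768-dim model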
Finally, the script below loads the original Hugging Face model, runs a quick generation test, and saves the parameters in NumPy format so that the numpy GPT-2 above can load them.
import numpy as np
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline

tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
text_generator = TextGenerationPipeline(model, tokenizer)
print(text_generator("今天是個好日子", max_length=20, do_sample=True))
print(model)

# Print the weight shapes of the model
# for name, param in model.named_parameters():
#     print(name, param.data.shape)
# print(model.lm_head.weight)
# print(model.lm_head.bias)

# Save the model parameters in NumPy format; lm_head.weight is tied to the input
# embedding and does not appear in named_parameters(), so add it explicitly
model_params = {name: param.data.cpu().numpy() for name, param in model.named_parameters()}
model_params["lm_head.weight"] = model.lm_head.weight.data.cpu().numpy()
np.savez('gpt2_model_params.npz', **model_params)
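Optionally, a quick hypothetical sanity check can run right after saving (in the same session, so model is still in memory) to verify that every exported array matches the live PyTorch weights:

# Hypothetical check: reload the .npz and compare against the model in memory
params = np.load('gpt2_model_params.npz')
for name, param in model.named_parameters():
    assert np.allclose(params[name], param.data.cpu().numpy())
assert np.allclose(params['lm_head.weight'], model.lm_head.weight.data.cpu().numpy())
print("export verified:", len(params.files), "arrays")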