I previously implemented MLP, CNN, LSTM, and BERT models in pure numpy, so this week I did GPT-2 the same way. The main point of a pure-numpy implementation is that it can run on a Raspberry Pi or any other board where pytorch cannot be installed, and generate text there.
Only now, while implementing it, did I fully understand GPT-2's mask-multi-headed-self-attention. It really is impressive, a big step beyond BERT's multi-headed-self-attention for generation, and whoever came up with the causal mask was remarkably clever.
The model again comes from Hugging Face, the GPT-2 small variant (uer/gpt2-chinese-cluecorpussmall, loaded in the export script at the end). It actually has fewer parameters than BERT, mainly because GPT-2 has no token-type embedding (a 2x768 matrix) and no pooler matrix; everything else is present.
GPT-2's structure is similar to BERT's, except that multi-headed-self-attention is replaced by mask-multi-headed-self-attention. In addition, as the weights below show, GPT-2 applies LayerNorm before each sub-layer (pre-norm, the ln_1/ln_2 weights) rather than after it as BERT does, and adds a final LayerNorm (ln_f) after the last block.
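Before the full implementation, here is a minimal standalone sketch (toy numbers, not the model code) of what the causal mask does: a lower-triangular matrix drives the softmax weights over later positions to zero, so token i can only attend to tokens 0..i.

import numpy as np

n = 4
scores = np.random.randn(n, n)               # raw attention scores for a 4-token toy sequence
mask = np.tril(np.ones((n, n), dtype=bool))  # row i may only attend to columns 0..i
masked = np.where(mask, scores, -1e9)        # future positions get a huge negative score
weights = np.exp(masked - masked.max(axis=-1, keepdims=True))
weights /= weights.sum(axis=-1, keepdims=True)
print(weights.round(2))                      # upper triangle is 0: no peeking at future tokens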
import numpy as np
import time


def top_k_sampling(probs, k):
    # Sort the distribution with argsort and reverse to get descending order
    sorted_indices = np.argsort(probs)[::-1]
    # Keep the indices of the k highest-scoring tokens
    topk_indices = sorted_indices[:k]
    # The caller can renormalize over these indices or sample from them
    return topk_indices


def random_sampling(array, k):
    sample = np.random.choice(array, size=k, replace=False)
    return sample


def word_embedding(input_ids, word_embeddings):
    return word_embeddings[input_ids]


def position_embedding(position_ids, position_embeddings):
    return position_embeddings[position_ids]


def token_type_embedding(token_type_ids, token_type_embeddings):
    return token_type_embeddings[token_type_ids]


def softmax(x, axis=None):
    # e_x = np.exp(x).astype(np.float32)
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))  # subtract the max for numerical stability
    sum_ex = np.sum(e_x, axis=axis, keepdims=True).astype(np.float32)
    return e_x / sum_ex


def scaled_dot_product_attention(Q, K, V, mask=None):
    d_k = Q.shape[-1]
    attention_scores = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(d_k)  # matches the reference model
    if mask is not None:
        # min_value = attention_scores.min()
        min_value = np.finfo(attention_scores.dtype).min  # most negative value representable in this dtype
        # attention_scores = np.where(mask, attention_scores, np.full_like(attention_scores, -np.inf))
        attention_scores = np.where(mask, attention_scores, np.full_like(attention_scores, min_value))  # masked positions get (near) -inf
    attention_weights = softmax(attention_scores, axis=-1)  # so masked positions end up with ~0 weight
    output = np.matmul(attention_weights, V)
    return output, attention_weights


def scaled_dot_product_attention2(Q, K, V):
    # Single-step inference: the new token may attend to everything cached,
    # so no mask is needed and the computation is much smaller
    d_k = Q.shape[-1]
    attention_scores = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(d_k)
    attention_weights = softmax(attention_scores, axis=-1)
    output = np.matmul(attention_weights, V)
    return output, attention_weights


global_q = {}
global_k = {}
global_v = {}


def mask_multihead_attention(i, input, num_heads, W_Q, B_Q, W_K, B_K, W_V, B_V, W_O, B_O):
    global global_q, global_k, global_v
    q = np.matmul(input, W_Q) + B_Q
    k = np.matmul(input, W_K) + B_K
    v = np.matmul(input, W_V) + B_V
    if q.shape[-2] == 1:
        # Incremental decoding step: append the new k/v to the cache of layer i
        # (assumes the prompt pass was longer than one token, so the cache exists)
        k = global_k[i] = np.concatenate([global_k[i], k], axis=-2)
        v = global_v[i] = np.concatenate([global_v[i], v], axis=-2)
    else:
        # Full (prompt) pass: initialize the cache
        global_q[i] = q
        global_k[i] = k
        global_v[i] = v
    _, n, _ = k.shape
    # Split the projections into num_heads heads
    q = np.split(q, num_heads, axis=-1)
    k = np.split(k, num_heads, axis=-1)
    v = np.split(v, num_heads, axis=-1)
    outputs = []
    if q[0].shape[-2] != 1:
        mask = np.tril(np.ones((n, n)))  # lower-triangular causal mask
        for q_, k_, v_ in zip(q, k, v):
            output, attention_weights = scaled_dot_product_attention(q_, k_, v_, mask)
            outputs.append(output)
    else:
        for q_, k_, v_ in zip(q, k, v):
            output, attention_weights = scaled_dot_product_attention2(q_, k_, v_)
            outputs.append(output)
    outputs = np.concatenate(outputs, axis=-1)
    outputs = np.matmul(outputs, W_O) + B_O
    return outputs


def layer_normalization(x, weight, bias, eps=1e-12):
    mean = np.mean(x, axis=-1, keepdims=True)
    variance = np.var(x, axis=-1, keepdims=True)
    std = np.sqrt(variance + eps)
    normalized_x = (x - mean) / std
    output = weight * normalized_x + bias
    return output


def feed_forward_layer(inputs, weight, bias=None, activation='relu'):
    if bias is not None:
        linear_output = np.matmul(inputs, weight) + bias
    else:
        linear_output = np.matmul(inputs, weight)
    if activation == 'relu':
        activated_output = np.maximum(0, linear_output)  # ReLU
    elif activation == 'gelu':
        # tanh approximation of GELU, as used by GPT-2
        activated_output = 0.5 * linear_output * (1 + np.tanh(np.sqrt(2 / np.pi) * (linear_output + 0.044715 * np.power(linear_output, 3))))
    elif activation == "tanh":
        activated_output = np.tanh(linear_output)
    else:
        activated_output = linear_output  # no activation
    return activated_output


def residual_connection(inputs, residual):
    # Residual connection
    residual_output = inputs + residual
    return residual_output


with open('vocab.txt', 'r', encoding='utf-8') as f:
    vocab = f.readlines()
vocab = [i.strip() for i in vocab]
# print(len(vocab))


def tokenize_sentence(sentence):
    tokenized_sentence = list(sentence)  # character-level tokenization; GPT-2 needs no [CLS]
    token_ids = [vocab.index(token) for token in tokenized_sentence]
    return token_ids


# Load the saved model parameters
model_data = np.load('gpt2_model_params.npz')
# for i in model_data:
#     print(i, model_data[i].shape)


def get_sentence_ids(sentence):
    token_ids = tokenize_sentence(sentence)
    input_ids = np.array(token_ids)  # token ids of the input
    return input_ids


word_embeddings = model_data["transformer.wte.weight"]
position_embeddings = model_data["transformer.wpe.weight"]


def model_input(input_ids, position_ids):
    word_embedded = word_embedding(input_ids, word_embeddings)
    position_ids = np.array(position_ids)  # position ids
    # position embedding matrix of shape (max_position, embedding_size)
    position_embedded = position_embedding(position_ids, position_embeddings)
    embedding_output = np.expand_dims(word_embedded + position_embedded, axis=0)
    return embedding_output


def gpt2(input, num_heads):
    for i in range(12):
        LayerNorm1_weight = model_data['transformer.h.{}.ln_1.weight'.format(i)]
        LayerNorm1_bias = model_data['transformer.h.{}.ln_1.bias'.format(i)]
        # weights of the masked multi-head self-attention
        W_QKV = model_data['transformer.h.{}.attn.c_attn.weight'.format(i)]
        B_QKV = model_data['transformer.h.{}.attn.c_attn.bias'.format(i)]
        W_O = model_data['transformer.h.{}.attn.c_proj.weight'.format(i)]
        B_O = model_data['transformer.h.{}.attn.c_proj.bias'.format(i)]
        LayerNorm2_weight = model_data['transformer.h.{}.ln_2.weight'.format(i)]
        LayerNorm2_bias = model_data['transformer.h.{}.ln_2.bias'.format(i)]
        intermediate_weight = model_data['transformer.h.{}.mlp.c_fc.weight'.format(i)]
        intermediate_bias = model_data['transformer.h.{}.mlp.c_fc.bias'.format(i)]
        dense_weight = model_data['transformer.h.{}.mlp.c_proj.weight'.format(i)]
        dense_bias = model_data['transformer.h.{}.mlp.c_proj.bias'.format(i)]

        input1 = layer_normalization(input, LayerNorm1_weight, LayerNorm1_bias)  # pre-norm; matches the reference output
        W_Q, W_K, W_V = np.split(W_QKV, 3, axis=-1)
        B_Q, B_K, B_V = np.split(B_QKV, 3, axis=-1)
        output = mask_multihead_attention(i, input1, num_heads, W_Q, B_Q, W_K, B_K, W_V, B_V, W_O, B_O)
        output1 = residual_connection(input, output)
        output = layer_normalization(output1, LayerNorm2_weight, LayerNorm2_bias)
        output = feed_forward_layer(output, intermediate_weight, intermediate_bias, activation='gelu')
        output = feed_forward_layer(output, dense_weight, dense_bias, activation='')
        output2 = residual_connection(output1, output)
        input = output2
    ln_f_weight = model_data['transformer.ln_f.weight']
    ln_f_bias = model_data['transformer.ln_f.bias']
    output = layer_normalization(output2, ln_f_weight, ln_f_bias)
    return output


classifier_weight = model_data['lm_head.weight']


def predict(sentence="今天是個好日子", accelerater=True, gen_len=100):
    start = time.time()
    sentence_ids = get_sentence_ids(sentence)
    position_ids = range(len(sentence_ids))
    print("prompt input:", sentence)
    for i in range(gen_len):
        embeddings = model_input(sentence_ids, position_ids)
        output = gpt2(embeddings, num_heads=12)
        # print(output)
        output = feed_forward_layer(output[:, -1], classifier_weight.T, activation='')
        samples = top_k_sampling(output[0], k=1)
        label_id = random_sampling(samples, k=1)
        print(vocab[label_id[0]], end="")
        if accelerater:
            # Fast decoding: only the newest token is fed in; its q attends to the full
            # cached k/v, which is far cheaper than recomputing q for every past step
            sentence_ids = label_id
            position_ids = [position_ids[-1] + 1]
        else:
            # Slow decoding: recompute the whole prefix each step, so the cost keeps growing
            sentence_ids = np.concatenate([sentence_ids, label_id], axis=-1)
            position_ids = range(len(sentence_ids))
    end = time.time()
    print("\nspend time:", end - start)


if __name__ == "__main__":
    accelerater = False
    sentence = "今天是個好日子"  # 我們要做的就是把握住這個機會
    predict(sentence, accelerater)
    # embeddings = model_input(sentence_ids)
    # output = gpt2(embeddings, num_heads=12)
    # # print(output)
    # output = feed_forward_layer(output[:, :], classifier_weight.T, activation='')
    # samples = np.argmax(output, axis=-1)
    # for i in samples[0]:
    #     print(vocab[i], end="")
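One note on sampling: with k=1 the top_k_sampling / random_sampling pair above degenerates to greedy argmax decoding, which is why the two timing runs below produce identical text. A hypothetical variation of the loop body for more diverse output:

# Hypothetical tweak inside predict()'s loop: sample among the top 5 tokens
logits = feed_forward_layer(output[:, -1], classifier_weight.T, activation='')
candidates = top_k_sampling(logits[0], k=5)   # ids of the 5 highest logits
label_id = random_sampling(candidates, k=1)   # uniform pick among them
# random_sampling ignores the probabilities; for probability-weighted sampling,
# pass p=softmax(logits[0][candidates]) to np.random.choice instead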
Results: both modes generate exactly the same text, and the longer the sequence, the more pronounced the time difference. Even at only 100 generated tokens the speed gap is already obvious.
With acceleration:

prompt input: 今天是個好日子
,我們要做的就是把握住這個機會,把握住這個機會,把握住了,我相信我們的投資將會創造奇蹟,而且我相信我們的投資團隊一定能夠創造奇蹟,我相信我們的投資團隊一定能夠創造奇蹟,我相信我們的投資團隊一定能夠創造
spend time: 22.19951105117798

Without acceleration:

prompt input: 今天是個好日子
,我們要做的就是把握住這個機會,把握住這個機會,把握住了,我相信我們的投資將會創造奇蹟,而且我相信我們的投資團隊一定能夠創造奇蹟,我相信我們的投資團隊一定能夠創造奇蹟,我相信我們的投資團隊一定能夠創造
spend time: 42.4955039024353
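A rough way to see where the gap comes from: with full recomputation, step t pushes the entire prefix through all 12 layers again, while cached decoding pushes only one token. A small counting sketch (hypothetical numbers matching this run: a 7-token prompt and 100 generated tokens):

# Count of token passes through the transformer stack over one generation
prompt, T = 7, 100
full = sum(prompt + t for t in range(T))   # whole prefix recomputed at every step
cached = T                                 # only the newest token per step
print(full, cached, round(full / cached))  # 5650 vs 100, roughly 56x fewer token passes

The measured speedup here (about 1.9x) is much smaller than that ratio, since BLAS handles one large matmul far more efficiently than many single-row ones, but the gap keeps widening as the sequence grows.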
Model parameters:
transformer.wte.weight (21128, 768)
transformer.wpe.weight (1024, 768)
transformer.h.0.ln_1.weight (768,)
transformer.h.0.ln_1.bias (768,)
transformer.h.0.attn.c_attn.weight (768, 2304)
transformer.h.0.attn.c_attn.bias (2304,)
transformer.h.0.attn.c_proj.weight (768, 768)
transformer.h.0.attn.c_proj.bias (768,)
transformer.h.0.ln_2.weight (768,)
transformer.h.0.ln_2.bias (768,)
transformer.h.0.mlp.c_fc.weight (768, 3072)
transformer.h.0.mlp.c_fc.bias (3072,)
transformer.h.0.mlp.c_proj.weight (3072, 768)
transformer.h.0.mlp.c_proj.bias (768,)
... (transformer.h.1 through transformer.h.11 repeat the same shapes) ...
transformer.ln_f.weight (768,)
transformer.ln_f.bias (768,)
lm_head.weight (21128, 768)
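Summing those shapes gives the total parameter count. A small sketch, assuming the gpt2_model_params.npz produced by the export script below:

import numpy as np

model_data = np.load('gpt2_model_params.npz')
# lm_head.weight is tied to transformer.wte.weight, so skip it to avoid double counting
total = sum(model_data[k].size for k in model_data.files if k != 'lm_head.weight')
print(f"{total:,} parameters")  # about 102M for this 12-layer, 768-dim model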
Finally, the script below loads the original Hugging Face model, runs a quick generation test, and saves the parameters in NumPy format so that the numpy GPT-2 above can load them.
import numpy as np
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline

tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
text_generator = TextGenerationPipeline(model, tokenizer)
print(text_generator("今天是個好日子", max_length=20, do_sample=True))
print(model)

# Print the weight shapes of the model
# for name, param in model.named_parameters():
#     print(name, param.data.shape)
# print(model.lm_head.weight)
# print(model.lm_head.bias)

# Save the model parameters in NumPy format; lm_head.weight is tied to the input
# embedding and does not appear in named_parameters(), so add it explicitly
model_params = {name: param.data.cpu().numpy() for name, param in model.named_parameters()}
model_params["lm_head.weight"] = model.lm_head.weight.data.cpu().numpy()
np.savez('gpt2_model_params.npz', **model_params)
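Optionally, a quick hypothetical sanity check can run right after saving (in the same session, so model is still in memory) to verify that every exported array matches the live PyTorch weights:

# Hypothetical check: reload the .npz and compare against the model in memory
params = np.load('gpt2_model_params.npz')
for name, param in model.named_parameters():
    assert np.allclose(params[name], param.data.cpu().numpy())
assert np.allclose(params['lm_head.weight'], model.lm_head.weight.data.cpu().numpy())
print("export verified:", len(params.files), "arrays")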