Quantization and Multithreading in llama2c (1)
To make debugging easier and keep model = Transformer(config) from running out of memory, set "n_layers": 2 in the config and take a look at the overall Transformer architecture. Note: the config is simply the set of parameters that configure the Transformer.
# Excerpt from llama2.c's model.py (ModelArgs, Attention, FeedForward, RMSNorm and
# precompute_freqs_cis are defined elsewhere in the same file)
import math
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


class Transformer(nn.Module):
    last_loss: Optional[torch.Tensor]

    def __init__(self, params: ModelArgs):
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
        self.dropout = nn.Dropout(params.dropout)
        self.layers = torch.nn.ModuleList()
        # append n_layers TransformerBlock modules to self.layers
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))
        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

        # share the unembedding parameters with the embedding parameters
        self.tok_embeddings.weight = self.output.weight  # https://paperswithcode.com/method/weight-tying

        # some useful precompute for the RoPE relative positional embeddings
        freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
        self.register_buffer("freqs_sin", freqs_sin, persistent=False)

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('w3.weight') or pn.endswith('wo.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers))

        # Initialize attribute for the loss of the last forward call.
        # This will be set if the forward is called with a targets tensor.
        self.last_loss = None

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> torch.Tensor:
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        h = self.dropout(h)
        freqs_cos = self.freqs_cos[:seqlen]
        freqs_sin = self.freqs_sin[:seqlen]

        for layer in self.layers:
            h = layer(h, freqs_cos, freqs_sin)
        # RMSNorm
        h = self.norm(h)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            # Linear
            logits = self.output(h)
            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the output on the very last position
            logits = self.output(h[:, [-1], :])  # note: using list [-1] to preserve the time dim
            self.last_loss = None

        return logits


class TransformerBlock(nn.Module):
    def __init__(self, layer_id: int, args: ModelArgs):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=args.hidden_dim,
            multiple_of=args.multiple_of,
            dropout=args.dropout,
        )
        self.layer_id = layer_id
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x, freqs_cos, freqs_sin):
        h = x + self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin)
        out = h + self.feed_forward.forward(self.ffn_norm(h))
        return out
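For reference, the architecture dump below can be reproduced with a 2-layer configuration along these lines. This is a minimal sketch: it assumes the ModelArgs dataclass from model.py accepts these fields, and the 7B-style dimensions are filled in by hand.

from model import ModelArgs, Transformer  # model.py from llama2.c

# 7B-style dimensions, but only 2 TransformerBlocks so the model stays small enough to inspect
config = ModelArgs(
    dim=4096,
    n_layers=2,
    n_heads=32,
    n_kv_heads=32,
    vocab_size=32000,
    hidden_dim=11008,
    max_seq_len=2048,
    dropout=0.0,
)
model = Transformer(config)
print(model)  # prints the module tree shown below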
Transformer(
  (tok_embeddings): Embedding(32000, 4096)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-1): 2 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=4096, out_features=4096, bias=False)
        (wk): Linear(in_features=4096, out_features=4096, bias=False)
        (wv): Linear(in_features=4096, out_features=4096, bias=False)
        (wo): Linear(in_features=4096, out_features=4096, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=4096, out_features=11008, bias=False)
        (w2): Linear(in_features=11008, out_features=4096, bias=False)
        (w3): Linear(in_features=4096, out_features=11008, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=4096, out_features=32000, bias=False)
)
model = load_meta_model(args.meta_llama)

This step opens params.json and uses those model parameters as the config when calling model = Transformer(config), so a fresh Transformer is built from them. It then loads the checkpoint weights and maps them into a state_dict-style dictionary. With both the newly built Transformer and the weights in hand, the weights can be assigned to the model, for example:

model.tok_embeddings.weight = nn.Parameter(state_dict['tok_embeddings.weight'])
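A simplified sketch of what this loading step does is given below. Assumptions: a single consolidated.00.pth shard, the Meta checkpoint key names shown in the code, and hand-set vocab_size/max_seq_len; the real load_meta_model also concatenates multi-shard checkpoints, which is omitted here.

import json
import torch
import torch.nn as nn

def load_meta_model_sketch(model_path):
    # read params.json; its fields become the config for the new Transformer
    with open(f"{model_path}/params.json") as f:
        params = json.load(f)
    config = ModelArgs(
        dim=params["dim"],
        n_layers=params["n_layers"],
        n_heads=params["n_heads"],
        norm_eps=params["norm_eps"],
        vocab_size=32000,      # set explicitly (params.json may store -1 here)
        max_seq_len=2048,
    )
    model = Transformer(config)

    # load the checkpoint weights into a state_dict-style dictionary
    state_dict = torch.load(f"{model_path}/consolidated.00.pth", map_location="cpu")

    # assign the weights to the freshly built Transformer, tensor by tensor
    model.tok_embeddings.weight = nn.Parameter(state_dict["tok_embeddings.weight"])
    model.norm.weight = nn.Parameter(state_dict["norm.weight"])
    for layer in model.layers:
        i = layer.layer_id
        layer.attention_norm.weight = nn.Parameter(state_dict[f"layers.{i}.attention_norm.weight"])
        layer.ffn_norm.weight = nn.Parameter(state_dict[f"layers.{i}.ffn_norm.weight"])
        layer.attention.wq.weight = nn.Parameter(state_dict[f"layers.{i}.attention.wq.weight"])
        layer.attention.wk.weight = nn.Parameter(state_dict[f"layers.{i}.attention.wk.weight"])
        layer.attention.wv.weight = nn.Parameter(state_dict[f"layers.{i}.attention.wv.weight"])
        layer.attention.wo.weight = nn.Parameter(state_dict[f"layers.{i}.attention.wo.weight"])
        layer.feed_forward.w1.weight = nn.Parameter(state_dict[f"layers.{i}.feed_forward.w1.weight"])
        layer.feed_forward.w2.weight = nn.Parameter(state_dict[f"layers.{i}.feed_forward.w2.weight"])
        layer.feed_forward.w3.weight = nn.Parameter(state_dict[f"layers.{i}.feed_forward.w3.weight"])
    model.output.weight = nn.Parameter(state_dict["output.weight"])
    model.eval()
    return model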
This returns a Transformer built from params and carrying the loaded weights.
1. Quantization in export
def version2_export(model, filepath, group_size=64):

Here model.layers is the ModuleList shown in the architecture dump above ((0-1): 2 x TransformerBlock(...)). The function proceeds in the following steps:

S1: Group the model parameters. Quantization is done per group, which limits the influence of outliers.

S2: Collect the model's weight tensors into a list called weights, e.g. model.tok_embeddings.weight and *[layer.attention.wq.weight for layer in model.layers]. With 2 TransformerBlocks, weights holds 15 tensors, since 7 * 2 + 1 = 15 (each block contributes 7 weight matrices, plus the token embedding).

S3: Check that every weight w in weights has an element count divisible by group_size, so the grouped quantization splits evenly.
Why grouped quantization helps: it lowers quantization error. Quantizing a continuous float weight tensor with a single scale can produce large errors, especially when the values are unevenly distributed. Splitting the weights into small groups and computing an independent quantization parameter (step size / scale factor) per group keeps individual extreme values from inflating the error globally.

S4: Write the file header into the bin file: magic, version, and the params (7 int fields).
Header: a file header is the block of data at the start of a file that describes the file's content, format, version, structure, and how the data that follows should be interpreted; different file types use different header layouts.

S5: Write all the norm-layer weights, kept in fp32, into the bin file via numpy.

S6: Split each weight into groups, find each group's wmax, compute s = wmax / 127.0 and q = w / s, then round q to get int8. Dequantize back to fp32 (fp32valr) and compute the maximum error err against w, which should ideally be on the order of ~0.001.

for i, w in enumerate(weights):  # i is the 0-based index, w is the weight at that index
    # quantize this weight
    q, s, err = quantize_q80(w, group_size)
    # save the int8 weights to file
    serialize_int8(out_file, q)  # save the tensor in int8
    serialize_fp32(out_file, s)  # save scale factors

def quantize_q80(w, group_size):
    """
    takes a tensor and returns the Q8_0 quantized version
    i.e. symmetric quantization into int8, range [-127,127]
    """
    assert w.numel() % group_size == 0
    ori_shape = w.shape
    w = w.float()  # convert to float32
    # split w into groups of group_size elements each, e.g. torch.Size([2048000, 64]) here
    w = w.reshape(-1, group_size)
    # find the max in each group
    wmax = torch.abs(w).max(dim=1).values
    # calculate the scaling factor such that float = quant * scale
    scale = wmax / 127.0
    # scale into range [-127, 127]
    quant = w / scale[:,None]
    # round to nearest integer
    int8val = torch.round(quant).to(torch.int8)
    # dequantize by rescaling
    fp32val = (int8val.float() * scale[:,None]).view(-1)
    fp32valr = fp32val.reshape(-1, group_size)
    # calculate the max error in each group
    err = torch.abs(fp32valr - w).max(dim=1).values
    # find the max error across all groups
    maxerr = err.max().item()
    return int8val, scale, maxerr
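As a quick sanity check of step S6, quantize_q80 can be run on a random tensor of roughly the same scale as the model weights and the reported maximum error inspected (illustrative only; the exact numbers depend on the weight distribution):

import torch

w = torch.randn(4096, 4096) * 0.02       # random matrix, roughly the init scale of the model weights
int8val, scale, maxerr = quantize_q80(w, group_size=64)

print(int8val.shape)                      # torch.Size([262144, 64]): 4096*4096/64 groups of 64 int8 values
print(scale.shape)                        # torch.Size([262144]): one fp32 scale per group
print(f"max error: {maxerr:.6f}")         # expected to be on the order of ~1e-4 to 1e-3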
The overall layout of the bin file is a 256-byte header followed by the weights. As the comments in version2_export put it: "first write out the header. the header will be 256 bytes", and once that is done, "now that the header is done, let's write out the model".
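To make that layout concrete, here is a condensed sketch of the writing logic: the 256-byte header (magic, version, the 7 int params, two flags, zero padding), then the fp32 norm weights, then each quantized weight as its int8 groups followed by their fp32 scales. The helper names serialize_fp32/serialize_int8 match the loop above, but the wrapper name write_q80_bin is just for this sketch, and the exact field order and magic value are reconstructed from export.py rather than copied verbatim, so treat the details as indicative:

import struct
import numpy as np
import torch

def serialize_fp32(file, tensor):
    # write one fp32 tensor to a file opened in 'wb' mode
    d = tensor.detach().cpu().view(-1).to(torch.float32).numpy()
    file.write(struct.pack(f'{len(d)}f', *d))

def serialize_int8(file, tensor):
    # write one int8 tensor to a file opened in 'wb' mode
    d = tensor.detach().cpu().view(-1).numpy().astype(np.int8)
    file.write(struct.pack(f'{len(d)}b', *d))

def write_q80_bin(model, weights, filepath, group_size=64, version=2):
    p = model.params
    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
    out_file = open(filepath, 'wb')

    # --- 256-byte header ---
    out_file.write(struct.pack('I', 0x616b3432))   # magic ("ak42")
    out_file.write(struct.pack('i', version))      # export version
    out_file.write(struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                               n_kv_heads, p.vocab_size, p.max_seq_len))  # the 7 int params
    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
    out_file.write(struct.pack('B', int(shared_classifier)))  # flag: classifier tied to embedding
    out_file.write(struct.pack('i', group_size))               # group size used for quantization
    out_file.write(b'\0' * (256 - out_file.tell()))            # zero-pad the header up to 256 bytes

    # --- norm weights, kept in fp32 ---
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention_norm.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.ffn_norm.weight)
    serialize_fp32(out_file, model.norm.weight)    # final pre-classifier RMSNorm

    # --- quantized weights: int8 groups, then their fp32 scale factors ---
    for w in weights:
        q, s, _err = quantize_q80(w, group_size)
        serialize_int8(out_file, q)
        serialize_fp32(out_file, s)
    out_file.close()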
The End