Quantization and Multithreading in llama2c (1)
To make debugging easier and keep model = Transformer(config) from running out of memory, set "n_layers": 2 in the config and take a look at the overall Transformer architecture. Note: the config is simply the set of parameters that configure the Transformer.
# Excerpt from llama2.c's model.py (ModelArgs, Attention, FeedForward, RMSNorm and
# precompute_freqs_cis are defined elsewhere in the same file)
import math
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


class Transformer(nn.Module):
    last_loss: Optional[torch.Tensor]

    def __init__(self, params: ModelArgs):
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
        self.dropout = nn.Dropout(params.dropout)
        self.layers = torch.nn.ModuleList()
        # append n_layers TransformerBlock modules to self.layers
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))
        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

        # share the unembedding parameters with the embedding parameters
        self.tok_embeddings.weight = self.output.weight  # https://paperswithcode.com/method/weight-tying

        # some useful precompute for the RoPE relative positional embeddings
        freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
        self.register_buffer("freqs_sin", freqs_sin, persistent=False)

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('w3.weight') or pn.endswith('wo.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers))

        # Initialize attribute for the loss of the last forward call.
        # This will be set if the forward is called with a targets tensor.
        self.last_loss = None

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> torch.Tensor:
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        h = self.dropout(h)
        freqs_cos = self.freqs_cos[:seqlen]
        freqs_sin = self.freqs_sin[:seqlen]

        for layer in self.layers:
            h = layer(h, freqs_cos, freqs_sin)
        # RMSNorm
        h = self.norm(h)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            # Linear
            logits = self.output(h)
            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the output on the very last position
            logits = self.output(h[:, [-1], :])  # note: using list [-1] to preserve the time dim
            self.last_loss = None

        return logits


class TransformerBlock(nn.Module):
    def __init__(self, layer_id: int, args: ModelArgs):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=args.hidden_dim,
            multiple_of=args.multiple_of,
            dropout=args.dropout,
        )
        self.layer_id = layer_id
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x, freqs_cos, freqs_sin):
        h = x + self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin)
        out = h + self.feed_forward.forward(self.ffn_norm(h))
        return out
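For reference, the architecture dump below can be reproduced with a 2-layer configuration along these lines. This is a minimal sketch: it assumes the ModelArgs dataclass from model.py accepts these fields, and the 7B-style dimensions are filled in by hand.

from model import ModelArgs, Transformer  # model.py from llama2.c

# 7B-style dimensions, but only 2 TransformerBlocks so the model stays small enough to inspect
config = ModelArgs(
    dim=4096,
    n_layers=2,
    n_heads=32,
    n_kv_heads=32,
    vocab_size=32000,
    hidden_dim=11008,
    max_seq_len=2048,
    dropout=0.0,
)
model = Transformer(config)
print(model)  # prints the module tree shown below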
Transformer(
  (tok_embeddings): Embedding(32000, 4096)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-1): 2 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=4096, out_features=4096, bias=False)
        (wk): Linear(in_features=4096, out_features=4096, bias=False)
        (wv): Linear(in_features=4096, out_features=4096, bias=False)
        (wo): Linear(in_features=4096, out_features=4096, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=4096, out_features=11008, bias=False)
        (w2): Linear(in_features=11008, out_features=4096, bias=False)
        (w3): Linear(in_features=4096, out_features=11008, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=4096, out_features=32000, bias=False)
)
model = load_meta_model(args.meta_llama)

This step opens params.json and uses those model parameters as the config when calling model = Transformer(config), so a fresh Transformer is built from them. It then loads the checkpoint weights and maps them into a state_dict-style dictionary. With both the newly built Transformer and the weights in hand, the weights can be assigned to the model, for example:

model.tok_embeddings.weight = nn.Parameter(state_dict['tok_embeddings.weight'])
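A simplified sketch of what this loading step does is given below. Assumptions: a single consolidated.00.pth shard, the Meta checkpoint key names shown in the code, and hand-set vocab_size/max_seq_len; the real load_meta_model also concatenates multi-shard checkpoints, which is omitted here.

import json
import torch
import torch.nn as nn

def load_meta_model_sketch(model_path):
    # read params.json; its fields become the config for the new Transformer
    with open(f"{model_path}/params.json") as f:
        params = json.load(f)
    config = ModelArgs(
        dim=params["dim"],
        n_layers=params["n_layers"],
        n_heads=params["n_heads"],
        norm_eps=params["norm_eps"],
        vocab_size=32000,      # set explicitly (params.json may store -1 here)
        max_seq_len=2048,
    )
    model = Transformer(config)

    # load the checkpoint weights into a state_dict-style dictionary
    state_dict = torch.load(f"{model_path}/consolidated.00.pth", map_location="cpu")

    # assign the weights to the freshly built Transformer, tensor by tensor
    model.tok_embeddings.weight = nn.Parameter(state_dict["tok_embeddings.weight"])
    model.norm.weight = nn.Parameter(state_dict["norm.weight"])
    for layer in model.layers:
        i = layer.layer_id
        layer.attention_norm.weight = nn.Parameter(state_dict[f"layers.{i}.attention_norm.weight"])
        layer.ffn_norm.weight = nn.Parameter(state_dict[f"layers.{i}.ffn_norm.weight"])
        layer.attention.wq.weight = nn.Parameter(state_dict[f"layers.{i}.attention.wq.weight"])
        layer.attention.wk.weight = nn.Parameter(state_dict[f"layers.{i}.attention.wk.weight"])
        layer.attention.wv.weight = nn.Parameter(state_dict[f"layers.{i}.attention.wv.weight"])
        layer.attention.wo.weight = nn.Parameter(state_dict[f"layers.{i}.attention.wo.weight"])
        layer.feed_forward.w1.weight = nn.Parameter(state_dict[f"layers.{i}.feed_forward.w1.weight"])
        layer.feed_forward.w2.weight = nn.Parameter(state_dict[f"layers.{i}.feed_forward.w2.weight"])
        layer.feed_forward.w3.weight = nn.Parameter(state_dict[f"layers.{i}.feed_forward.w3.weight"])
    model.output.weight = nn.Parameter(state_dict["output.weight"])
    model.eval()
    return model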
This returns a Transformer built from params and carrying the loaded weights.
1. Quantization in export
def version2_export(model, filepath, group_size=64):

Here model.layers is the ModuleList shown in the architecture dump above ((0-1): 2 x TransformerBlock(...)). The function proceeds in the following steps:

S1: Group the model parameters. Quantization is done per group, which limits the influence of outliers.

S2: Collect the model's weight tensors into a list called weights, e.g. model.tok_embeddings.weight and *[layer.attention.wq.weight for layer in model.layers]. With 2 TransformerBlocks, weights holds 15 tensors, since 7 * 2 + 1 = 15 (each block contributes 7 weight matrices, plus the token embedding).

S3: Check that every weight w in weights has an element count divisible by group_size, so the grouped quantization splits evenly.
Why grouped quantization helps: it lowers quantization error. Quantizing a continuous float weight tensor with a single scale can produce large errors, especially when the values are unevenly distributed. Splitting the weights into small groups and computing an independent quantization parameter (step size / scale factor) per group keeps individual extreme values from inflating the error globally.

S4: Write the file header into the bin file: magic, version, and the params (7 int fields).
Header: a file header is the block of data at the start of a file that describes the file's content, format, version, structure, and how the data that follows should be interpreted; different file types use different header layouts.

S5: Write all the norm-layer weights, kept in fp32, into the bin file via numpy.

S6: Split each weight into groups, find each group's wmax, compute s = wmax / 127.0 and q = w / s, then round q to get int8. Dequantize back to fp32 (fp32valr) and compute the maximum error err against w, which should ideally be on the order of ~0.001.

for i, w in enumerate(weights):  # i is the 0-based index, w is the weight at that index
    # quantize this weight
    q, s, err = quantize_q80(w, group_size)
    # save the int8 weights to file
    serialize_int8(out_file, q)  # save the tensor in int8
    serialize_fp32(out_file, s)  # save scale factors

def quantize_q80(w, group_size):
    """
    takes a tensor and returns the Q8_0 quantized version
    i.e. symmetric quantization into int8, range [-127,127]
    """
    assert w.numel() % group_size == 0
    ori_shape = w.shape
    w = w.float()  # convert to float32
    # split w into groups of group_size elements each, e.g. torch.Size([2048000, 64]) here
    w = w.reshape(-1, group_size)
    # find the max in each group
    wmax = torch.abs(w).max(dim=1).values
    # calculate the scaling factor such that float = quant * scale
    scale = wmax / 127.0
    # scale into range [-127, 127]
    quant = w / scale[:,None]
    # round to nearest integer
    int8val = torch.round(quant).to(torch.int8)
    # dequantize by rescaling
    fp32val = (int8val.float() * scale[:,None]).view(-1)
    fp32valr = fp32val.reshape(-1, group_size)
    # calculate the max error in each group
    err = torch.abs(fp32valr - w).max(dim=1).values
    # find the max error across all groups
    maxerr = err.max().item()
    return int8val, scale, maxerr
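As a quick sanity check of step S6, quantize_q80 can be run on a random tensor of roughly the same scale as the model weights and the reported maximum error inspected (illustrative only; the exact numbers depend on the weight distribution):

import torch

w = torch.randn(4096, 4096) * 0.02       # random matrix, roughly the init scale of the model weights
int8val, scale, maxerr = quantize_q80(w, group_size=64)

print(int8val.shape)                      # torch.Size([262144, 64]): 4096*4096/64 groups of 64 int8 values
print(scale.shape)                        # torch.Size([262144]): one fp32 scale per group
print(f"max error: {maxerr:.6f}")         # expected to be on the order of ~1e-4 to 1e-3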
The overall layout of the bin file is a 256-byte header followed by the weights. As the comments in version2_export put it: "first write out the header. the header will be 256 bytes", and once that is done, "now that the header is done, let's write out the model".
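To make that layout concrete, here is a condensed sketch of the writing logic: the 256-byte header (magic, version, the 7 int params, two flags, zero padding), then the fp32 norm weights, then each quantized weight as its int8 groups followed by their fp32 scales. The helper names serialize_fp32/serialize_int8 match the loop above, but the wrapper name write_q80_bin is just for this sketch, and the exact field order and magic value are reconstructed from export.py rather than copied verbatim, so treat the details as indicative:

import struct
import numpy as np
import torch

def serialize_fp32(file, tensor):
    # write one fp32 tensor to a file opened in 'wb' mode
    d = tensor.detach().cpu().view(-1).to(torch.float32).numpy()
    file.write(struct.pack(f'{len(d)}f', *d))

def serialize_int8(file, tensor):
    # write one int8 tensor to a file opened in 'wb' mode
    d = tensor.detach().cpu().view(-1).numpy().astype(np.int8)
    file.write(struct.pack(f'{len(d)}b', *d))

def write_q80_bin(model, weights, filepath, group_size=64, version=2):
    p = model.params
    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
    out_file = open(filepath, 'wb')

    # --- 256-byte header ---
    out_file.write(struct.pack('I', 0x616b3432))   # magic ("ak42")
    out_file.write(struct.pack('i', version))      # export version
    out_file.write(struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                               n_kv_heads, p.vocab_size, p.max_seq_len))  # the 7 int params
    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
    out_file.write(struct.pack('B', int(shared_classifier)))  # flag: classifier tied to embedding
    out_file.write(struct.pack('i', group_size))               # group size used for quantization
    out_file.write(b'\0' * (256 - out_file.tell()))            # zero-pad the header up to 256 bytes

    # --- norm weights, kept in fp32 ---
    for layer in model.layers:
        serialize_fp32(out_file, layer.attention_norm.weight)
    for layer in model.layers:
        serialize_fp32(out_file, layer.ffn_norm.weight)
    serialize_fp32(out_file, model.norm.weight)    # final pre-classifier RMSNorm

    # --- quantized weights: int8 groups, then their fp32 scale factors ---
    for w in weights:
        q, s, _err = quantize_q80(w, group_size)
        serialize_int8(out_file, q)
        serialize_fp32(out_file, s)
    out_file.close()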
The End