"""AWQ (Activation aware Weight Quantization) integration file"""

import importlib

from packaging import version

from ..activations import ACT2FN
from ..modeling_utils import PreTrainedModel
from ..utils import is_auto_awq_available, is_ipex_available, is_torch_available, logging
from ..utils.quantization_config import (
    AwqBackendPackingMethod,
    AwqConfig,
    AWQLinearVersion,
    ExllamaVersion,
)


if is_torch_available():
    import torch
    import torch.nn as nn


logger = logging.get_logger(__name__)

# Per-architecture module names used when fusing attention / MLP / layernorm layers.
AWQ_FUSED_MAPPINGS = {
    "mistral": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
    "mixtral": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["w1", "w3", "w2"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
        "rope_theta": 1000000.0,
    },
    "llama": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
    "llava": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
}

# Activation modules whose scales need to be wrapped with `ScaledActivation`.
AWQ_SCALES_MAPPINGS = {
    "starcoder2": {"act": "act", "layer_before_act": "c_fc"},
    "RefinedWebModel": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "falcon": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "mpt": {"act": "act", "layer_before_act": "up_proj"},
    "gptj": {"act": "act", "layer_before_act": "fc_in"},
    "gpt_neox": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "gpt_bigcode": {"act": "act", "layer_before_act": "c_fc"},
    "bloom": {"act": "gelu_impl", "layer_before_act": "dense_h_to_4h"},
}


def replace_quantization_scales(model, model_type):
    from awq.modules.act import ScaledActivation

    if model_type not in AWQ_SCALES_MAPPINGS:
        return model
    for name, module in model.named_children():
        act_name = AWQ_SCALES_MAPPINGS[model_type]["act"]
        layer_before_act_name = AWQ_SCALES_MAPPINGS[model_type]["layer_before_act"]
        if name == act_name and hasattr(model, layer_before_act_name):
            layer_before_act = getattr(model, AWQ_SCALES_MAPPINGS[model_type]["layer_before_act"])
            size = layer_before_act.out_features
            scale_like = torch.ones(size)
            model._modules[name] = ScaledActivation(module, scale_like)
        _ = replace_quantization_scales(module, model_type)
    return model


def replace_with_awq_linear(
    model,
    modules_to_not_convert=None,
    quantization_config=None,
    current_key_name=None,
    has_been_replaced=False,
) -> bool:
    """
    Public method that recursively replaces the Linear layers of the given model with AWQ quantized layers.
    `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
    conversion has been successful or not.

    During the module replacement, we also infer the backend to use through the `quantization_config` object.

    Args:
        model (`torch.nn.Module`):
            The model to convert, can be any `torch.nn.Module` instance.
        quantization_config (`AwqConfig`):
            The quantization config object that contains the quantization parameters.
        modules_to_not_convert (`list`, *optional*):
            A list of modules to not convert. If a module name is in the list (e.g. `lm_head`), it will not be
            converted.
        current_key_name (`list`, *optional*):
            A list that contains the current key name. This is used for recursion and should not be passed by the user.
        has_been_replaced (`bool`, *optional*):
            A boolean that indicates if the conversion has been successful or not. This is used for recursion and
            should not be passed by the user.
    """
    if modules_to_not_convert is None:
        modules_to_not_convert = []

    backend = quantization_config.backend

    if not is_auto_awq_available():
        raise ValueError(
            "AWQ (either `autoawq` or `llmawq`) is not available. Please install it with `pip install autoawq` or"
            " check out the installation guide in https://github.com/mit-han-lab/llm-awq"
        )

    # Pick the quantized linear class that matches the requested backend and kernel version.
    if backend == AwqBackendPackingMethod.AUTOAWQ:
        if quantization_config.version == AWQLinearVersion.GEMM:
            from awq.modules.linear.gemm import WQLinear_GEMM

            target_cls = WQLinear_GEMM
        elif quantization_config.version == AWQLinearVersion.GEMV:
            from awq.modules.linear.gemv import WQLinear_GEMV

            target_cls = WQLinear_GEMV
        elif quantization_config.version == AWQLinearVersion.EXLLAMA:
            if quantization_config.exllama_config["version"] == ExllamaVersion.ONE:
                from awq.modules.linear.exllama import WQLinear_Exllama

                target_cls = WQLinear_Exllama
            elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO:
                from awq.modules.linear.exllamav2 import WQLinear_ExllamaV2

                target_cls = WQLinear_ExllamaV2
            else:
                raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}")
        elif quantization_config.version == AWQLinearVersion.IPEX:
            from awq.modules.linear.gemm_ipex import WQLinear_IPEX

            target_cls = WQLinear_IPEX
        else:
            raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}")
    else:
        from awq.quantize.qmodule import WQLinear

        target_cls = WQLinear

    for name, module in model.named_children():
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)

        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                in_features = module.in_features
                out_features = module.out_features

                model._modules[name] = target_cls(
                    w_bit=quantization_config.bits,
                    group_size=quantization_config.group_size,
                    in_features=in_features,
                    out_features=out_features,
                    bias=module.bias is not None,
                    dev=module.weight.device,
                )
                has_been_replaced = True

                # Force requires_grad to False to avoid unexpected errors
                model._modules[name].requires_grad_(False)
        if len(list(module.children())) > 0:
            _, has_been_replaced = replace_with_awq_linear(
                module,
                modules_to_not_convert=modules_to_not_convert,
                current_key_name=current_key_name,
                quantization_config=quantization_config,
                has_been_replaced=has_been_replaced,
            )
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced


def get_modules_to_fuse(model, quantization_config):
    """
    Returns the fusing mapping given the quantization config and the model

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`~transformers.quantization_config.AWQConfig`):
            The quantization configuration to use.
    """
    if not isinstance(model, PreTrainedModel):
        raise TypeError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}")

    # Always default to `quantization_config.modules_to_fuse` when it is provided
    if quantization_config.modules_to_fuse is not None:
        current_fused_mapping = quantization_config.modules_to_fuse
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    elif model.config.model_type in AWQ_FUSED_MAPPINGS:
        current_fused_mapping = AWQ_FUSED_MAPPINGS[model.config.model_type]

        # Properly deal with multi-modal models as well (e.g. Llava) by resolving the text config
        config = model.config.get_text_config(decoder=True)

        hidden_size = config.hidden_size
        num_attention_heads = config.num_attention_heads
        num_key_value_heads = getattr(config, "num_key_value_heads", num_attention_heads)

        # Fill `current_fused_mapping` with the expected values
        current_fused_mapping["hidden_size"] = hidden_size
        current_fused_mapping["num_attention_heads"] = num_attention_heads
        current_fused_mapping["num_key_value_heads"] = num_key_value_heads
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    else:
        raise ValueError(
            "Fusing mapping not found either on the quantization config or the supported `AWQ_FUSED_MAPPINGS`. Please"
            " pass a `fused_mapping` argument in the `quantization_config` or raise an issue on transformers"
            " https://github.com/huggingface/transformers to add its support."
        )
    return current_fused_mapping


def fuse_awq_modules(model, quantization_config):
    """
    Optionally fuse some modules in the model to speedup inference.

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`Union[AwqConfig, dict]`):
            The quantization configuration to use.
    """
    # Convert a plain dict into an `AwqConfig` object so fields such as `backend` are available
    if isinstance(quantization_config, dict):
        quantization_config = AwqConfig.from_dict(quantization_config)
    backend = quantization_config.backend

    modules_to_fuse = get_modules_to_fuse(model, quantization_config)
    modules_to_not_convert = getattr(quantization_config, "modules_to_not_convert", None)

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        from awq.modules.fused.attn import QuantAttentionFused
        from awq.modules.fused.mlp import QuantFusedMLP
        from awq.modules.fused.norm import FasterTransformerRMSNorm
    else:
        raise ValueError("Fusing is only supported for the AutoAWQ backend")

    fused_attention_modules = []

    for name, module in model.named_modules():
        if modules_to_not_convert is not None:
            if any(module_name_to_not_convert in name for module_name_to_not_convert in modules_to_not_convert):
                continue

        # Replace layer norms
        _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm)

        # Replace MLP layers if awq version is not ipex.
        if quantization_config.version != "ipex":
            _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP)
        else:
            logger.info("The IPEX version AWQ does not support fuse mlp for now.")

        # Replace attention layers
        attention_has_been_fused = _fuse_awq_attention_layers(
            model, module, modules_to_fuse, name, QuantAttentionFused
        )

        if attention_has_been_fused:
            fused_attention_modules.append(name.split(".")[0])

    # For AWQ fused + Llama we need to set `config._attn_implementation` to "custom" to avoid unexpected behavior and
    # pass `None` attention mask to the fused attention modules, as the attention mask is dropped by our models and
    # dealt with by the `AttentionMaskConverter` module.
    if len(fused_attention_modules) > 0:
        for module_name, module in model.named_modules():
            if any(
                module_name in fused_attention_modules for fused_attention_parent_module in fused_attention_modules
            ):
                if hasattr(module, "config") and hasattr(module.config, "_attn_implementation"):
                    module.config._attn_implementation = "custom"
    return model


def _fuse_awq_layernorm(fuse_module_names, module, target_cls):
    """
    Fuse the LayerNorm layers into a target class using autoawq

    Args:
        fuse_module_names (`list[str]`):
            The list of module names to fuse
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.FasterTransformerRMSNorm`):
            The `FasterTransformerRMSNorm` class as it only supports that class
            for now.
    """
    for module_name in fuse_module_names:
        if hasattr(module, module_name):
            old_module = getattr(module, module_name)
            module._modules[module_name] = target_cls(
                old_module.weight,
                old_module.variance_epsilon,
            ).to(old_module.weight.device)
            del old_module


def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_cls):
    """
    Fuse the MLP layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        current_module_name (`str`):
            The current submodule name
        fuse_module_names (`list[str]`):
            The list of module names to fuse. For the MLP layers it has to be an array
            of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers)
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.QuantFusedMLP`):
            The `QuantFusedMLP` class as it only supports that class
            for now.
    """
    if len(fuse_module_names) == 0:
        return

    if hasattr(module, fuse_module_names[0]):
        gate_proj = getattr(module, fuse_module_names[0])
        up_proj = getattr(module, fuse_module_names[1])
        down_proj = getattr(module, fuse_module_names[2])

        previous_device = gate_proj.qweight.device

        # Deal also with the case where the model has a `text_config` attribute
        config = model.config.get_text_config(decoder=True)
        hidden_act = config.hidden_act
        activation_fn = ACT2FN[hidden_act]
        new_module = target_cls(gate_proj, down_proj, up_proj, activation_fn)

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, new_module.to(previous_device))

        del gate_proj, up_proj, down_proj


def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_name, target_cls):
    """
    Fuse the Attention layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        modules_to_fuse (`list[str]`):
            The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
            in the correct order: q, k, v, o layer
        current_module_name (`str`):
            The current submodule name
        target_cls (`~autoawq.QuantAttentionFused`):
            The `QuantAttentionFused` class as it only supports that class
            for now.
    """
    from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV

    module_has_been_fused = False

    if len(modules_to_fuse["attention"]) == 0:
        return module_has_been_fused

    if hasattr(module, modules_to_fuse["attention"][0]):
        # First, we pack the QKV layers together
        q_proj = getattr(module, modules_to_fuse["attention"][0])

        if isinstance(q_proj, WQLinear_GEMV):
            linear_target_cls = WQLinear_GEMV
            cat_dim = 0
        elif isinstance(q_proj, WQLinear_GEMM):
            linear_target_cls = WQLinear_GEMM
            cat_dim = 1
        elif is_ipex_available() and version.parse(importlib.metadata.version("autoawq")) > version.parse("0.2.6"):
            from awq.modules.linear import WQLinear_IPEX

            if isinstance(q_proj, WQLinear_IPEX):
                linear_target_cls = WQLinear_IPEX
                cat_dim = 1
        else:
            raise ValueError(f"Unsupported q_proj type: {type(q_proj)}")

        previous_device = q_proj.qweight.device

        k_proj = getattr(module, modules_to_fuse["attention"][1])
        v_proj = getattr(module, modules_to_fuse["attention"][2])
        o_proj = getattr(module, modules_to_fuse["attention"][3])

        bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None

        qkv_layer = linear_target_cls(
            q_proj.w_bit,
            q_proj.group_size,
            q_proj.in_features,
            q_proj.out_features + k_proj.out_features + v_proj.out_features,
            q_proj.bias is not None,
            next(iter(module.state_dict().values())).device,
        )

        qkv_layer.qweight = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=cat_dim)
        qkv_layer.qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=cat_dim)
        qkv_layer.scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=cat_dim)

        if isinstance(qkv_layer, WQLinear_GEMV):
            qkv_layer.split_k_iters = q_proj.split_k_iters

        qkv_layer.bias = bias

        fused_attention_layer = target_cls(
            modules_to_fuse["hidden_size"],
            modules_to_fuse["num_attention_heads"],
            modules_to_fuse["num_key_value_heads"],
            qkv_layer,
            o_proj,
            previous_device,
            modules_to_fuse["max_seq_len"],
            use_alibi=modules_to_fuse["use_alibi"],
            # The default value in autoawq is set to 10000.0
            rope_theta=modules_to_fuse.get("rope_theta", 10000.0),
        )

        fused_attention_layer.is_hf_transformers = True

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, fused_attention_layer.to(previous_device))

        del q_proj, k_proj, v_proj, o_proj

        module_has_been_fused = True

    return module_has_been_fused


def post_init_awq_exllama_modules(model, exllama_config):
    """
    Runs post init for Exllama layers which performs:
        - Weights unpacking, reordering and repacking
        - Devices scratch space allocation
    """
    if exllama_config["version"] == ExllamaVersion.ONE:
        from awq.modules.linear.exllama import exllama_post_init

        model = exllama_post_init(model)
    elif exllama_config["version"] == ExllamaVersion.TWO:
        from awq.modules.linear.exllamav2 import exllamav2_post_init

        model = exllamav2_post_init(
            model,
            max_input_len=exllama_config["max_input_len"],
            max_batch_size=exllama_config["max_batch_size"],
        )
    else:
        raise ValueError(f"Unrecognized Exllama version: {exllama_config['version']}")

    return model


def post_init_awq_ipex_modules(model):
    """
    Runs post init for IPEX layers which performs:
        - Weights packing, reordering and repacking
    """
    from awq.modules.linear.gemm_ipex import ipex_post_init

    model = ipex_post_init(model)

    return model