from typing import TYPE_CHECKING, Any, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import (
    is_accelerate_available,
    is_kernels_available,
    is_torch_available,
    is_triton_available,
    logging,
)
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class Mxfp4HfQuantizer(HfQuantizer):
    """
    FP4 quantization using fbgemm kernels
    """

    requires_parameters_quantization = True
    requires_calibration = False
    required_packages = ["accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config
        self.triton_kernels_hub = None

    def _lazy_import_kernels(self):
        """Lazy import and initialize kernels only when needed"""
        if self.triton_kernels_hub is None:
            try:
                from kernels import get_kernel

                self.triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
            except ImportError:
                raise ImportError("kernels package is required for MXFP4 quantization")
        return self.triton_kernels_hub

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using mxfp4 quantization requires torch. "
                "Please install the latest version of torch ( pip install --upgrade torch )"
            )

        if self.quantization_config.dequantize:
            return

        if not torch.cuda.is_available():
            if self.pre_quantized:
                logger.warning_once(
                    "Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
            else:
                raise RuntimeError("Quantizing a model using MXFP4 requires a GPU")

        if not is_accelerate_available():
            raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")

        compute_capability = torch.cuda.get_device_capability()
        gpu_is_supported = compute_capability >= (7, 5)
        kernels_available = is_triton_available("3.4.0") and is_kernels_available()

        if self.pre_quantized:
            # For pre-quantized checkpoints, unsupported hardware or missing triton/kernels
            # triggers a bf16 dequantization fallback instead of a hard failure.
            if not gpu_is_supported:
                logger.warning_once(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 "
                    "(e.g T4, A100, L4, H100, or B200). We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
            if not kernels_available:
                logger.warning_once(
                    "MXFP4 quantization requires triton >= 3.4.0 and kernels installed, "
                    "we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
        else:
            # Quantizing on the fly has hard requirements.
            if not gpu_is_supported:
                raise ValueError(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 "
                    "(e.g T4, A100, L4, H100, or B200)"
                )
            if not kernels_available:
                raise ValueError("MXFP4 quantization requires triton >= 3.4.0 and kernels installed")

        if not self.pre_quantized:
            self._lazy_import_kernels()

        device_map = kwargs.get("device_map")
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP4 model on CPU and have a CUDA device available, make sure to set "
                "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. "
            )
        elif (
            not self.pre_quantized
            and isinstance(device_map, dict)
            and ("cpu" in device_map.values() or "disk" in device_map.values())
        ):
            raise ValueError(
                "You are attempting to load an FP4 model with a device_map that contains a CPU or disk device. "
                "This is not supported when the model is quantized on the fly. "
                "Please use a quantized checkpoint or remove the CPU or disk device from the device_map."
            )

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        if dtype is None:
            dtype = torch.bfloat16
            logger.info(
                "Overriding dtype=%s with `dtype=torch.bfloat16` due to requirements of `fbgemm-gpu` to enable "
                "model loading in fp4. Pass your own dtype to specify the dtype of the remaining non-linear layers "
                "or pass dtype=torch.bfloat16 to remove this warning.",
                dtype,
            )
        return dtype

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: dict[str, Any],
        **kwargs,
    ) -> bool:
        from ..integrations import Mxfp4GptOssExperts
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if self.quantization_config.dequantize and ("blocks" in param_name or "scales" in param_name):
            # "_blocks" and "_scales" have the same length, so one slice handles both suffixes.
            module, tensor_name = get_module_from_name(model, param_name[: -len("_blocks")])
        else:
            module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, Mxfp4GptOssExperts) or (
            isinstance(module, GptOssExperts) and self.quantization_config.dequantize
        ):
            if tensor_name in ("down_proj_bias", "gate_up_proj_bias"):
                return False
            return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: dict[str, Any],
        unexpected_keys: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations import (
            Mxfp4GptOssExperts,
            dequantize,
            load_and_swizzle_mxfp4,
            quantize_to_mxfp4,
            swizzle_mxfp4,
        )
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if not self.pre_quantized:
            # Quantize full-precision expert weights to MXFP4 on the fly, then swizzle them
            # into the layout expected by the triton matmul kernels.
            triton_kernels_hub = self._lazy_import_kernels()
            module, _ = get_module_from_name(model, param_name)
            with torch.device(target_device):
                if isinstance(module, Mxfp4GptOssExperts):
                    triton_weight_tensor, weight_scale = quantize_to_mxfp4(param_value, triton_kernels_hub)
                    PrecisionConfig, FlexCtx, InFlexData = (
                        triton_kernels_hub.matmul_ogs.PrecisionConfig,
                        triton_kernels_hub.matmul_ogs.FlexCtx,
                        triton_kernels_hub.matmul_ogs.InFlexData,
                    )
                    proj = "gate_up_proj" if "gate_up_proj" in param_name else "down_proj"
                    triton_weight_tensor, weight_scale = swizzle_mxfp4(
                        triton_weight_tensor, weight_scale, triton_kernels_hub
                    )
                    setattr(module, proj, triton_weight_tensor)
                    setattr(
                        module,
                        f"{proj}_precision_config",
                        PrecisionConfig(weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())),
                    )
                    # The quantized tensors replace the raw blocks/scales buffers.
                    delattr(module, f"{proj}_blocks")
                    delattr(module, f"{proj}_scales")
        else:
            empty_param = kwargs.get("empty_param")
            casting_dtype = kwargs.get("casting_dtype")
            to_contiguous = kwargs.get("to_contiguous")
            rank = kwargs.get("rank")
            device_mesh = kwargs.get("device_mesh")

            if ("blocks" in param_name or "scales" in param_name) and self.quantization_config.dequantize:
                module, tensor_name = get_module_from_name(model, param_name[: -len("_blocks")])
            else:
                module, tensor_name = get_module_from_name(model, param_name)

            shard_kwargs = {
                "empty_param": empty_param,
                "casting_dtype": casting_dtype,
                "to_contiguous": to_contiguous,
                "rank": rank,
                "device_mesh": device_mesh,
                "model": model,
            }
            if isinstance(module, Mxfp4GptOssExperts) or (
                isinstance(module, GptOssExperts) and self.quantization_config.dequantize
            ):
                if self.quantization_config.dequantize:
                    dq_param_name = param_name[: -len("_blocks")]
                    dequantize(module, param_name, param_value, target_device, dq_param_name, **shard_kwargs)
                else:
                    load_and_swizzle_mxfp4(
                        module, param_name, param_value, target_device, self._lazy_import_kernels(), **shard_kwargs
                    )

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        if self.quantization_config.dequantize:
            self.remove_quantization_config(model)
        # Free the temporary buffers created while swizzling / dequantizing.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def update_expected_keys(
        self, model: "PreTrainedModel", expected_keys: list[str], checkpoint_keys: list[str]
    ) -> list[str]:
        new_expected_keys = []
        for key in expected_keys:
            if key.endswith(".mlp.experts.gate_up_proj"):
                base = key[: -len("gate_up_proj")]
                new_expected_keys.append(base + "gate_up_proj_blocks")
                new_expected_keys.append(base + "gate_up_proj_scales")
            elif key.endswith(".mlp.experts.down_proj"):
                base = key[: -len("down_proj")]
                new_expected_keys.append(base + "down_proj_blocks")
                new_expected_keys.append(base + "down_proj_scales")
            elif not self.pre_quantized:
                # When quantizing on the fly, the checkpoint holds plain projections
                # rather than blocks/scales pairs.
                if key.endswith(".mlp.experts.down_proj_blocks"):
                    new_expected_keys.append(key[: -len("_blocks")])
                elif key.endswith(".mlp.experts.gate_up_proj_blocks"):
                    new_expected_keys.append(key[: -len("_blocks")])
                elif key.endswith("_scales"):
                    continue
                else:
                    new_expected_keys.append(key)
            else:
                new_expected_keys.append(key)
        return new_expected_keys

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations import replace_with_mxfp4_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        use_kernels = kwargs.get("use_kernels", False)
        if use_kernels:
            logger.warning_once(
                "You are using full precision kernels, we will dequantize the model to bf16. "
                "To use the quantized model with quantization kernels, please set use_kernels=False"
            )
            self.quantization_config.dequantize = True

        config = model.config
        model = replace_with_mxfp4_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            config=config,
        )
        model.config.quantization_config = self.quantization_config

    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
        from ..integrations import Mxfp4GptOssExperts

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, Mxfp4GptOssExperts):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "GptOssConfig" in config.__class__.__name__ and getattr(config, "base_model_tp_plan", None) is not None:
            config.base_model_tp_plan.update(
                {
                    "layers.*.mlp.experts.gate_up_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.gate_up_proj_scales": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_blocks": "grouped_gemm",
                    "layers.*.mlp.experts.down_proj_scales": "grouped_gemm",
                }
            )
        return config

    def update_param_name(self, param_name: str) -> str:
        if self.quantization_config.dequantize:
            if "_blocks" in param_name:
                return param_name.replace("_blocks", "")
            if "_scales" in param_name:
                return param_name.replace("_scales", "")
        elif not self.pre_quantized:
            if param_name.endswith("gate_up_proj"):
                return param_name.replace("gate_up_proj", "gate_up_proj_blocks")
            if param_name.endswith("down_proj"):
                return param_name.replace("down_proj", "down_proj_blocks")
        return param_name

    def get_state_dict(self, model):
        from ..integrations import Mxfp4GptOssExperts

        state_dict = model.state_dict()
        for name, module in model.named_modules():
            if isinstance(module, Mxfp4GptOssExperts) and hasattr(module, "gate_up_proj") and hasattr(module, "down_proj"):
                # Unswizzle the triton kernel tensors back into the serialized checkpoint
                # layout: uint8 blocks of packed fp4 values plus per-block scales. The
                # reshape constants (32 experts, hidden size 2880, 90 = 2880 // 32 block
                # rows, 16-byte packs) are recovered from the artifact and match the
                # gpt-oss layout.
                state_dict[name + ".gate_up_proj_blocks"] = (
                    module.gate_up_proj.storage.layout.unswizzle_data(module.gate_up_proj.storage.data)
                    .transpose(-1, -2)
                    .reshape(32, -1, 90, 16)
                )
                state_dict[name + ".gate_up_proj_scales"] = (
                    module.gate_up_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
                        module.gate_up_proj_precision_config.weight_scale.storage.data
                    ).transpose(-1, -2)
                )
                state_dict[name + ".down_proj_blocks"] = (
                    module.down_proj.storage.layout.unswizzle_data(module.down_proj.storage.data)
                    .transpose(-1, -2)
                    .reshape(32, 2880, -1, 16)
                )
                state_dict[name + ".down_proj_scales"] = (
                    module.down_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
                        module.down_proj_precision_config.weight_scale.storage.data
                    ).transpose(-1, -2)
                )
        return state_dict

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        logger.warning_once(
            "MXFP4 quantization doesn't support training, please consider dequantizing the model first by "
            "passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()"
        )
        return False

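# Usage note: a minimal sketch of how this quantizer is exercised, assuming the
# `openai/gpt-oss-20b` MXFP4 checkpoint and a CUDA GPU meeting the requirements
# checked in `validate_environment` (compute capability >= 7.5, triton >= 3.4.0,
# the `kernels` package). The quantizer is not instantiated directly; it is
# selected by `from_pretrained` when the checkpoint config carries an `Mxfp4Config`.
#
#     from transformers import AutoModelForCausalLM, Mxfp4Config
#
#     # Load the pre-quantized checkpoint; the blocks/scales tensors are swizzled
#     # into triton kernel tensors during loading.
#     model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", device_map="cuda")
#
#     # On unsupported hardware, dequantize to bf16 instead (the same fallback
#     # `validate_environment` applies automatically):
#     model = AutoModelForCausalLM.from_pretrained(
#         "openai/gpt-oss-20b",
#         quantization_config=Mxfp4Config(dequantize=True),
#         device_map="auto",
#     )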