from typing import TYPE_CHECKING, Any, Optional

from ..utils import is_accelerate_available, is_torch_available, is_torch_xpu_available, logging
from .base import HfQuantizer
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

logger = logging.get_logger(__name__)


class FineGrainedFP8HfQuantizer(HfQuantizer):
    """
    FP8 quantization implementation supporting both standard and MoE models.
    Supports both e4m3fn formats based on platform.
    """

    requires_parameters_quantization = True
    requires_calibration = False
    required_packages = ["accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using fp8 quantization requires torch >= 2.1.0. "
                "Please install the latest version of torch (`pip install --upgrade torch`)"
            )

        if not is_accelerate_available():
            raise ImportError("Loading an FP8 quantized model requires accelerate (`pip install accelerate`)")

        if kwargs.get("from_tf", False) or kwargs.get("from_flax", False):
            raise ValueError(
                "Converting into FP8 weights from tf/flax weights is currently not supported, "
                "please make sure the weights are in PyTorch format."
            )

        if not torch.cuda.is_available() and not is_torch_xpu_available():
            raise RuntimeError("No GPU or XPU found. A GPU or XPU is needed for FP8 quantization.")

        if torch.cuda.is_available():
            compute_capability = torch.cuda.get_device_capability()
            major, minor = compute_capability
            if (major < 8) or (major == 8 and minor < 9):
                raise ValueError(
                    "FP8 quantized models are only supported on GPUs with compute capability >= 8.9 (e.g. 4090/H100)"
                    f", actual = `{major}.{minor}`"
                )

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP8 model on CPU and have a CUDA or XPU device available, make sure to set "
                "your model on a GPU or XPU device in order to run your model. To remove this warning, "
                "pass device_map = 'cuda' or 'xpu'."
            )
        elif device_map is not None:
            if (
                not self.pre_quantized
                and isinstance(device_map, dict)
                and ("cpu" in device_map.values() or "disk" in device_map.values())
            ):
                raise ValueError(
                    "You are attempting to load an FP8 model with a device_map that contains a cpu/disk device. "
                    "This is not supported when the model is quantized on the fly. "
                    "Please use a quantized checkpoint or remove the cpu/disk device from the device_map."
                )

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        if dtype is None:
            logger.info("Setting dtype to torch.float32 as no dtype was specified in from_pretrained")
            dtype = torch.float32
        return dtype

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: dict[str, Any],
        unexpected_keys: Optional[list[str]] = None,
    ):
        """
        Quantizes weights to FP8 format using block-wise quantization.
        """
        from ..modeling_utils import _load_parameter_into_model

        param_value = param_value.to(target_device)

        # Representable range of the fp8 e4m3fn format
        fp8_min = torch.finfo(torch.float8_e4m3fn).min
        fp8_max = torch.finfo(torch.float8_e4m3fn).max

        block_size_m, block_size_n = self.quantization_config.weight_block_size

        rows, cols = param_value.shape[-2:]
        if rows % block_size_m != 0 or cols % block_size_n != 0:
            raise ValueError(
                f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_size_m}, {block_size_n})"
            )
        param_value_orig_shape = param_value.shape

        # Split the weight into (block_size_m, block_size_n) tiles
        param_value = param_value.reshape(
            -1, rows // block_size_m, block_size_m, cols // block_size_n, block_size_n
        ).permute(0, 1, 3, 2, 4)

        # Calculate the scaling factor for each block
        max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2))
        scale = fp8_max / max_abs
        scale_orig_shape = scale.shape
        scale = scale.unsqueeze(-1).unsqueeze(-1)

        # Quantize the weights
        quantized_param = torch.clamp(param_value * scale, min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

        # Reshape the quantized weights back to the original matrix shape
        quantized_param = quantized_param.permute(0, 1, 3, 2, 4)
        quantized_param = quantized_param.reshape(param_value_orig_shape)

        # Store the inverse scale (one value per block) alongside the weight
        scale = scale.reshape(scale_orig_shape).squeeze().reciprocal()

        _load_parameter_into_model(model, param_name, quantized_param)
        _load_parameter_into_model(model, param_name.rsplit(".", 1)[0] + ".weight_scale_inv", scale)

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: dict[str, Any],
        **kwargs,
    ):
        from ..integrations.finegrained_fp8 import FP8Linear

        module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, FP8Linear):
            if self.pre_quantized or tensor_name == "bias":
                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
                    raise ValueError("Expect quantized weights but got an unquantized weight")
                return False
            else:
                if tensor_name == "weight_scale_inv":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        return False

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[list[str]] = None,
        **kwargs,
    ):
        from ..integrations.finegrained_fp8 import replace_with_fp8_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        model = replace_with_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
        )

        model.config.quantization_config = self.quantization_config

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        return model

    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
        from ..integrations import FP8Linear

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, FP8Linear):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "Qwen3" in config.__class__.__name__:
            text_plan = {
                "layers.*.self_attn.q_proj.weight": "local_colwise",
                "layers.*.self_attn.q_proj.weight_scale_inv": "local_colwise",
                "layers.*.self_attn.k_proj.weight": "local_colwise",
                "layers.*.self_attn.k_proj.weight_scale_inv": "local_colwise",
                "layers.*.self_attn.v_proj.weight": "local_colwise",
                "layers.*.self_attn.v_proj.weight_scale_inv": "local_colwise",
                "layers.*.self_attn.o_proj.weight": "local_rowwise",
                "layers.*.self_attn.o_proj.weight_scale_inv": "local_rowwise",
                "layers.*.self_attn": "gather",
                "layers.*.mlp.gate_proj.weight": "local_colwise",
                "layers.*.mlp.gate_proj.weight_scale_inv": "local_colwise",
                "layers.*.mlp.up_proj.weight": "local_colwise",
                "layers.*.mlp.up_proj.weight_scale_inv": "local_colwise",
                "layers.*.mlp.down_proj.weight": "local_rowwise",
                "layers.*.mlp.down_proj.weight_scale_inv": "local_rowwise",
                "layers.*.mlp": "gather",
            }
            config.base_model_tp_plan = text_plan
        return config

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False

    def get_accelerator_warm_up_factor(self):
        # fp8 weights take 1 byte per value (plus the per-block scales), so a warm-up factor of 2 is sufficient.
        return 2
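

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original transformers module): a
# standalone reproduction of the block-wise FP8 math that
# `create_quantized_param` performs, assuming torch >= 2.1 with
# `torch.float8_e4m3fn` support. It tiles a weight matrix into
# (128, 128) blocks (the typical `weight_block_size`), quantizes each block
# with its own scale, keeps the inverse scale as `weight_scale_inv` would be
# stored, then dequantizes to check the round-trip error.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    if is_torch_available():
        block_m, block_n = 128, 128  # assumed block size for the sketch
        weight = torch.randn(256, 512)

        fp8_max = torch.finfo(torch.float8_e4m3fn).max
        rows, cols = weight.shape

        # Split into tiles: (rows // block_m, cols // block_n, block_m, block_n)
        tiles = weight.reshape(rows // block_m, block_m, cols // block_n, block_n).permute(0, 2, 1, 3)

        # One scale per tile, chosen so the largest magnitude maps to fp8_max
        scale = fp8_max / tiles.abs().amax(dim=(-1, -2), keepdim=True)
        quantized = (tiles * scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)

        # The quantizer stores the reciprocal of the scale next to the fp8 weight
        weight_scale_inv = scale.squeeze(-1).squeeze(-1).reciprocal()

        # Dequantize and measure the block-wise round-trip error
        dequant = quantized.to(torch.float32) * weight_scale_inv.unsqueeze(-1).unsqueeze(-1)
        dequant = dequant.permute(0, 2, 1, 3).reshape(rows, cols)
        print("max abs round-trip error:", (weight - dequant).abs().max().item())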