a
    h{5                     @   s   d dl mZmZmZ ddlmZ er0ddlmZ ddlm	Z	m
Z
mZmZ ddlmZ e rbd dlZeeZG d	d
 d
eZdS )    )TYPE_CHECKINGAnyOptional   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_fbgemm_gpu_availableis_torch_availablelogging)get_module_from_nameNc                	       s   e Zd ZdZdZdZddgZ fddZdd	 Zd
d
dddZ	dde
ee
ef dddZd(dde
dee
ef eee
  dddZddddZd)deee
  dddZee
 e
ee
 ddd Zd!d" Zd*d#d$Zeed%d&d'Z  ZS )+FbgemmFp8HfQuantizerz/
    FP8 quantization using fbgemm kernels
    TFz
fbgemm-gpuZ
acceleratec                    s   t  j|fi | || _d S N)super__init__quantization_config)selfr   kwargs	__class__ h/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/quantizers/quantizer_fbgemm_fp8.pyr   +   s    zFbgemmFp8HfQuantizer.__init__c                 O   s   t  stdt stdtds,tdtj s>tdtj }|\}}|dk r`t	d|
d}|d u r~td	 n8|d ur| jst|trd
| v sd| v rt	dd S )NzUsing fbgemm fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zUsing fbgemm fp8 quantization requires fbgemm-gpu libraryPlease install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-librariesz0.32.2z`Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)z=Using FP8 quantized models with fbgemm kernels requires a GPU	   zXFP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)
device_mapzYou have loaded an FP8 model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. cpuZdiskzYou are attempting to load an FP8 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r   ImportErrorr
   r	   torchcudaZis_availableRuntimeErrorZget_device_capability
ValueErrorgetloggerZwarning_oncepre_quantized
isinstancedictvalues)r   argsr   Zcompute_capabilitymajorminorr   r   r   r   validate_environment/   sJ    




z)FbgemmFp8HfQuantizer.validate_environmentztorch.dtype)dtypereturnc                 C   s2   |d u rt j}td| n|t jkr.td|S )NzOverriding dtype=%s with `dtype=torch.bloat16` due to requirements of `fbgemm-gpu` to enable model loading in fp8. Pass your own dtype to specify the dtype of the remaining non-linear layers or pass dtype=torch.bfloat16 to remove this warning.zYYou cannot use FP8 with dtype=torch.float16.We recommend you passing dtype=torch.bfloat16)r   Zbfloat16r"   infoZfloat16r    )r   r+   r   r   r   update_dtype\   s    
z!FbgemmFp8HfQuantizer.update_dtyper   ztorch.Tensor)modelparam_value
param_name
state_dictc           
      K   s   ddl m}m} t||\}}	t||rj| js6|	dkrV|	dkrR|jtjkrRt	ddS |	dkrft	dd	S t||r| js|	dkrdS |	d
ks|	dkrt	dd	S dS )Nr   FbgemmFp8LinearFbgemmFp8Llama4TextExpertsZbiasweightz6Expect quantized weights but got an unquantized weightFweight_scalez;Expect unquantized weights but got a quantized weight_scaleTZgate_up_proj_scaleZdown_proj_scale)
integrationsr4   r5   r   r$   r#   r+   r   Zfloat8_e4m3fnr    )
r   r/   r0   r1   r2   r   r4   r5   moduletensor_namer   r   r   check_quantized_paraml   s"    

z*FbgemmFp8HfQuantizer.check_quantized_paramNztorch.device)r/   r0   r1   target_devicer2   unexpected_keysc                 C   s  ddl m} t||\}}	t||r|	dkr|dd}
|
j}|
d|d }tjj	
|\}}||}|dd}||d d|d }nh|	dkr|dd}
|
j}|
d|d }tjj	
|\}}||}|dd}||d |d d}tj|||j|	 d< n>tjj	
|\}}tj||jd d||j|	 d< tj|||j|	< |d	ur||v r|| ~d	S )
z@
        Quantizes weights into weight and weight_scale
        r   )r5   Zgate_up_projr   r   Z	down_projZ_scaleN)r8   r5   r   r$   Z	transposeshapeZreshaper   opsZfbgemmZquantize_fp8_per_rownn	Parameterto_parametersviewremove)r   r/   r0   r1   r<   r2   r=   r5   r9   r:   Ztransposed_paramZoriginal_shapeZflattened_paramZnew_value_flatZweight_scale_flat	new_valuer7   r   r   r   create_quantized_param   s8    

 
z+FbgemmFp8HfQuantizer.create_quantized_param)r/   c                 K   s   |S r   r   )r   r/   r   r   r   r   #_process_model_after_weight_loading   s    z8FbgemmFp8HfQuantizer._process_model_after_weight_loading)r/   keep_in_fp32_modulesc                 K   sT   ddl m} |j}| || jj|| _|j}||| j| j| j||d}| j|j_d S )Nr   )replace_with_fbgemm_fp8_linear)modules_to_not_convertr   r#   configtp_plan)r8   rK   Z_tp_planZget_modules_to_not_convertr   rL   rM   r#   )r   r/   rJ   r   rK   rN   rM   r   r   r   $_process_model_before_weight_loading   s    
	z9FbgemmFp8HfQuantizer._process_model_before_weight_loading)missing_keysprefixr,   c           	         s   ddl m}m} g  | D ]X\}}t|||fr|D ]<}||v sT|| d| v r6|ds6|ds6 | q6q fdd|D S )Nr   r3   .z.weightz.biasc                    s   g | ]}| vr|qS r   r   ).0kZnot_missing_keysr   r   
<listcomp>       z<FbgemmFp8HfQuantizer.update_missing_keys.<locals>.<listcomp>)r8   r4   r5   Znamed_modulesr$   endswithappend)	r   r/   rP   rQ   r4   r5   namer9   missingr   rU   r   update_missing_keys   s    z(FbgemmFp8HfQuantizer.update_missing_keysc                 C   sl   d|j jv rhddddddddddddddddddddddddddd}| d ur^|| _n||_|S |S )	NZLlama4Zlocal_colwiseZlocal_rowwiseZgatherZsequence_parallellocalZlocal_packed_rowwise)z layers.*.self_attn.q_proj.weightz&layers.*.self_attn.q_proj.weight_scalez layers.*.self_attn.k_proj.weightz&layers.*.self_attn.k_proj.weight_scalez layers.*.self_attn.v_proj.weightz&layers.*.self_attn.v_proj.weight_scalez layers.*.self_attn.o_proj.weightzlayers.*.self_attnzlayers.*.input_layernorm.weightz(layers.*.post_attention_layernorm.weightznorm.weightz4layers.*.feed_forward.shared_expert.gate_proj.weightz:layers.*.feed_forward.shared_expert.gate_proj.weight_scalez2layers.*.feed_forward.shared_expert.up_proj.weightz8layers.*.feed_forward.shared_expert.up_proj.weight_scalez4layers.*.feed_forward.shared_expert.down_proj.weightzlayers.*.feed_forward.expertszlayers.*.feed_forwardz0layers.*.feed_forward.experts.*.gate_proj.weightz6layers.*.feed_forward.experts.*.gate_proj.weight_scalez.layers.*.feed_forward.experts.*.up_proj.weightz4layers.*.feed_forward.experts.*.up_proj.weight_scalez0layers.*.feed_forward.experts.*.down_proj.weightz*layers.*.feed_forward.experts.gate_up_projz0layers.*.feed_forward.experts.gate_up_proj_scalez'layers.*.feed_forward.experts.down_proj)r   __name__Zget_text_configZbase_model_tp_plan)r   rM   Z	text_planr   r   r   update_tp_plan   sB    %z#FbgemmFp8HfQuantizer.update_tp_planc                 C   s   dS )NTr   )r   Zsafe_serializationr   r   r   is_serializable#  s    z$FbgemmFp8HfQuantizer.is_serializable)r,   c                 C   s   dS )NFr   )r   r   r   r   is_trainable&  s    z!FbgemmFp8HfQuantizer.is_trainable)N)N)N)r^   
__module____qualname____doc__Z requires_parameters_quantizationZrequires_calibrationZrequired_packagesr   r*   r.   strr%   r   r;   r   listrH   rI   rO   r\   r_   r`   propertyboolra   __classcell__r   r   r   r   r   !   s@   -
% 

? 
/
r   )typingr   r   r   baser   Zmodeling_utilsr   utilsr	   r
   r   r   Zquantizers_utilsr   r   Z
get_loggerr^   r"   r   r   r   r   r   <module>   s   
