from typing import TYPE_CHECKING, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import is_accelerate_available, is_torch_available, is_vptq_available, logging
from ..utils.quantization_config import QuantizationConfigMixin


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class VptqHfQuantizer(HfQuantizer):
    """
    Quantizer of the VPTQ method. Enables the loading of prequantized models.
    """

    requires_calibration = True
    required_packages = ["vptq"]

    def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_accelerate_available():
            raise ImportError("Using `vptq` quantization requires Accelerate: `pip install accelerate`")

        if not is_vptq_available():
            raise ImportError("Using `vptq` quantization requires VPTQ>=0.0.4: `pip install -U vptq`")

    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        if dtype is None:
            if torch.cuda.is_available():
                # Default to fp16 for GPU inference of prequantized checkpoints.
                dtype = torch.float16
                logger.info(
                    "CUDA available. Assuming VPTQ inference on GPU and loading the model in `torch.float16`. "
                    "To overwrite it, set `dtype` manually."
                )
            else:
                import vptq

                # Probe the installed VPTQ package for CPU support; older releases do not expose it.
                device_availability = getattr(vptq, "device_availability", lambda device: False)
                if device_availability("cpu") is True:
                    raise RuntimeError("No GPU found. Please wait for the next release of VPTQ to use CPU inference")
                dtype = torch.float32
                logger.info("No GPU found. Assuming VPTQ inference on CPU and loading the model in `torch.float32`.")
        return dtype

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[list[str]] = None,
        **kwargs,
    ):
        """
        We don't take a `modules_to_not_convert` parameter to indicate which layers should not be quantized,
        because `quantization_config` already includes the layers that should be quantized.
        """
        from ..integrations import replace_with_vptq_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )
        replace_with_vptq_linear(
            model,
            quantization_config=self.quantization_config,
            modules_to_not_convert=self.modules_to_not_convert,
        )
        model.config.quantization_config = self.quantization_config

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        return model

    @property
    def is_trainable(self) -> bool:
        return False

    def is_serializable(self, safe_serialization=None):
        return True
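
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library code): a VPTQ-prequantized
# checkpoint is loaded through the regular `from_pretrained` path, and this
# quantizer is picked up automatically from the checkpoint's quantization_config.
# The repo id below is hypothetical.
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   model_id = "your-org/llama-2-7b-vptq"  # hypothetical VPTQ-quantized repo
#   model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
#   tokenizer = AutoTokenizer.from_pretrained(model_id)
# ---------------------------------------------------------------------------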
