import importlib.metadata
import re
import types
from typing import TYPE_CHECKING, Optional, Union

from packaging import version

from .base import HfQuantizer
from .quantizers_utils import get_module_from_name


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from typing import Any

from ..utils import is_torch_available, is_torchao_available, logging
from ..utils.quantization_config import TorchAoConfig


if is_torch_available():
    import torch
    import torch.nn as nn


logger = logging.get_logger(__name__)


def fuzzy_match_size(config_name: str) -> Optional[str]:
    """
    Extract the size digit from strings like "4weight", "8weight".
    Returns the digit as a string if found, otherwise None.
    """
    config_name = config_name.lower()
    str_match = re.search(r"(\d)weight", config_name)
    if str_match:
        return str_match.group(1)
    return None


def find_parent(model, name):
    """Return the parent module of the module referenced by the dotted `name`."""
    module_tree = name.split(".")[:-1]
    parent = model
    for m in module_tree:
        parent = parent._modules[m]
    return parent


def _quantization_type(weight):
    from torchao.dtypes import AffineQuantizedTensor
    from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor

    if isinstance(weight, AffineQuantizedTensor):
        return f"{weight.__class__.__name__}({weight._quantization_type()})"

    if isinstance(weight, LinearActivationQuantizedTensor):
        return (
            f"{weight.__class__.__name__}(activation={weight.input_quant_func}, "
            f"weight={_quantization_type(weight.original_weight_tensor)})"
        )


def _linear_extra_repr(self):
    weight = _quantization_type(self.weight)
    if weight is None:
        return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight=None"
    else:
        return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight={weight}"


class TorchAoHfQuantizer(HfQuantizer):
    """
    Quantizer for torchao: https://github.com/pytorch/ao/
    """

    requires_parameters_quantization = True
    requires_calibration = False
    required_packages = ["torchao"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)

    def validate_environment(self, *args, **kwargs):
        if not is_torchao_available():
            raise ImportError("Loading a torchao quantized model requires the torchao library (`pip install torchao`)")
        self.offload = False

        device_map = kwargs.get("device_map", None)
        if isinstance(device_map, dict):
            if "disk" in device_map.values() or "cpu" in device_map.values():
                if len(device_map) > 1:
                    self.offload = True
                    if self.pre_quantized and "disk" in device_map.values():
                        raise ValueError(
                            "You are attempting to perform disk offload with a pre-quantized torchao model. "
                            "This is not supported yet. Please remove the disk device from the device_map."
                        )

        if self.pre_quantized:
            weights_only = kwargs.get("weights_only", None)
            if weights_only:
                torch_version = version.parse(importlib.metadata.version("torch"))
                if torch_version < version.parse("2.5.0"):
                    raise RuntimeError(
                        f"In order to use a torchao pre-quantized model, you need to have torch>=2.5.0. However, the current version is {torch_version}."
                        f" You can also set `weights_only=False` in `from_pretrained` if you don't want to update torch"
                    )

    def update_dtype(self, dtype):
        if self.quantization_config.quant_type == "int4_weight_only":
            if dtype is not None and dtype != torch.bfloat16:
                logger.warning_once(
                    f"Setting dtype to {dtype} for int4_weight_only quantization, but only bfloat16 is supported right now. Please set the dtype to bfloat16."
                )
            if dtype is None:
                logger.warning_once(
                    "Setting dtype to torch.bfloat16 for int4_weight_only quantization since only bfloat16 is supported right now. Please set dtype=torch.bfloat16 to remove this warning."
                )
                dtype = torch.bfloat16
        if self.quantization_config.quant_type == "int8_dynamic_activation_int8_weight":
            if dtype is None:
                logger.info(
                    "Setting dtype to torch.float32 for int8_dynamic_activation_int8_weight quantization as no dtype was specified in from_pretrained"
                )
                dtype = torch.float32
        return dtype

    def adjust_target_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
        if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.19.0"):
            from accelerate.utils import CustomDtype

            if self.quantization_config._get_ao_version() > version.Version("0.9.0"):
                from torchao.core.config import AOBaseConfig

                quant_type = self.quantization_config.quant_type
                if isinstance(quant_type, AOBaseConfig):
                    # With the config-object API, infer the bit-width from the config class name,
                    # e.g. Int4WeightOnlyConfig -> "4".
                    config_name = quant_type.__class__.__name__
                    size_digit = fuzzy_match_size(config_name)
                    if size_digit == "4":
                        return CustomDtype.INT4
                    return torch.int8

            map_to_target_dtype = {
                "int4_weight_only": CustomDtype.INT4,
                "int8_weight_only": torch.int8,
                "int8_dynamic_activation_int8_weight": torch.int8,
                "autoquant": None,
            }
            return map_to_target_dtype[self.quantization_config.quant_type]
        else:
            raise ValueError(
                "You are using `device_map='auto'` on a torchao quantized model. To automatically compute"
                " the appropriate device map, you should upgrade your `accelerate` library with"
                " `pip install --upgrade accelerate`"
            )

    def adjust_max_memory(self, max_memory: dict[str, Union[int, str]]) -> dict[str, Union[int, str]]:
        # Leave ~10% headroom on every device for buffers created during quantization.
        max_memory = {key: val * 0.9 for key, val in max_memory.items()}
        return max_memory
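    # A rough sketch of how the hooks above interact (example values, assuming quant_type="int4_weight_only"
    # and device_map="auto"): update_dtype() falls back to torch.bfloat16 when no dtype is given,
    # adjust_target_dtype() reports CustomDtype.INT4 so accelerate plans roughly half a byte per weight
    # element, and adjust_max_memory() then trims 10% off each device budget for quantization buffers.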
    def _process_model_before_weight_loading(
        self, model: "PreTrainedModel", keep_in_fp32_modules: Optional[list[str]] = None, **kwargs
    ):
        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )
        if self.quantization_config.include_input_output_embeddings:
            # Input/output embeddings were explicitly requested for quantization, so drop them
            # from the "do not convert" list.
            input_emb = model.get_input_embeddings()
            input_emb_names = [name for name, module in model.named_modules() if id(module) == id(input_emb)]
            output_emb = model.get_output_embeddings()
            output_emb_names = [name for name, module in model.named_modules() if id(module) == id(output_emb)]
            self.modules_to_not_convert = [
                x for x in self.modules_to_not_convert if x not in input_emb_names + output_emb_names
            ]

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: dict[str, Any],
        **kwargs,
    ) -> bool:
        if self.quantization_config.quant_type == "autoquant":
            return False

        param_device = kwargs.pop("param_device", None)
        # check if the param_name is not in self.modules_to_not_convert
        if any((key + "." in param_name) or (key == param_name) for key in self.modules_to_not_convert):
            return False
        elif param_device == "cpu" and self.offload:
            # we don't quantize the weights that we offload
            return False
        else:
            # we only quantize the weight of nn.Linear (and nn.Embedding when embeddings are included)
            module, tensor_name = get_module_from_name(model, param_name)
            _QUANTIZABLE = [torch.nn.Linear]
            if self.quantization_config.include_input_output_embeddings:
                _QUANTIZABLE.append(torch.nn.Embedding)
            return isinstance(module, tuple(_QUANTIZABLE)) and (tensor_name == "weight")

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: dict[str, Any],
        unexpected_keys: Optional[list[str]] = None,
    ):
        """
        Each nn.Linear layer that needs to be quantized is processed here.
        First, we set the value of the weight tensor, then we move it to the target device. Finally, we quantize the module.
        """
        if self.quantization_config.quant_type == "autoquant":
            return

        from torchao.quantization import quantize_

        module, tensor_name = get_module_from_name(model, param_name)
        if self.pre_quantized:
            # The weight is already quantized in the checkpoint: just move it to the target device
            # and patch extra_repr so that printing the model shows the quantization layout.
            module._parameters[tensor_name] = torch.nn.Parameter(
                param_value.to(device=target_device), requires_grad=param_value.requires_grad
            )
            if isinstance(module, nn.Linear):
                module.extra_repr = types.MethodType(_linear_extra_repr, module)
        else:
            assert isinstance(self.quantization_config, TorchAoConfig)
            module._parameters[tensor_name] = torch.nn.Parameter(
                param_value, requires_grad=param_value.requires_grad
            ).to(device=target_device)
            # if we are quantizing tied parameters, to avoid tying the quantized weights
            # the correct order to do it is:
            # 1. load the weights into the model
            # 2. run tie_weights to populate the weights
            # 3. quantize
            input_embed = model.get_input_embeddings()
            if self.quantization_config.untie_embedding_weights and id(module) == id(input_embed):
                model.tie_weights()
                setattr(model.config.get_text_config(decoder=True), "tie_word_embeddings", False)

            # handle ModuleFqnToConfig, introduced in torchao 0.12.0+
            if self.quantization_config._get_ao_version() >= version.Version("0.12.0"):
                from torchao.quantization import ModuleFqnToConfig

                config = self.quantization_config.get_apply_tensor_subclass()
                if isinstance(config, ModuleFqnToConfig):
                    module_fqn, _ = param_name.rsplit(".", 1)
                    if module_fqn in config.module_fqn_to_config:
                        c = config.module_fqn_to_config[module_fqn]
                    else:
                        c = config.module_fqn_to_config.get("_default", None)
                    if c is not None:
                        # filter_fn: do not filter out any modules here, the config already selected this one
                        quantize_(module, c, filter_fn=lambda x, fqn: True)
                    return

            quantize_(module, self.quantization_config.get_apply_tensor_subclass())
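    # Sketch (assumed values, for illustration only): the ModuleFqnToConfig branch above allows mixing
    # per-module torchao configs, along the lines of
    #
    #   from torchao.quantization import Int4WeightOnlyConfig, Int8WeightOnlyConfig, ModuleFqnToConfig
    #   quant_type = ModuleFqnToConfig({
    #       "_default": Int8WeightOnlyConfig(),  # fallback used for every module without an explicit entry
    #       "model.layers.0.self_attn.q_proj": Int4WeightOnlyConfig(group_size=128),
    #   })
    #
    # wrapped in a TorchAoConfig; create_quantized_param then picks the entry matching each module's
    # fully qualified name, falling back to "_default".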
    def _process_model_after_weight_loading(self, model, **kwargs):
        """No process required for torchao quantized model"""
        if self.quantization_config.quant_type == "autoquant":
            from torchao import autoquant
            from torchao.quantization import ALL_AUTOQUANT_CLASS_LIST

            model = torch.compile(model, mode="max-autotune")
            model = autoquant(
                model,
                qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST,
                set_inductor_config=False,
                **self.quantization_config.quant_type_kwargs,
            )
            return model

    def is_serializable(self, safe_serialization=None) -> bool:
        if safe_serialization:
            logger.warning(
                "torchao quantized model does not support safe serialization, please set `safe_serialization` to False"
            )
            return False
        _is_torchao_serializable = version.parse(importlib.metadata.version("huggingface_hub")) >= version.parse(
            "0.25.0"
        )
        if not _is_torchao_serializable:
            logger.warning("torchao quantized model is only serializable after huggingface_hub >= 0.25.0")
        if self.offload and self.quantization_config.modules_to_not_convert is None:
            logger.warning(
                "The model contains offloaded modules and these modules are not quantized. We don't recommend saving"
                " the model as we won't be able to reload them. If you want to specify modules to not quantize,"
                " please specify modules_to_not_convert in the quantization_config."
            )
            return False
        return _is_torchao_serializable

    def get_accelerator_warm_up_factor(self):
        """
        This factor is used in caching_allocator_warmup to determine how many bytes to pre-allocate for accelerator warmup.
        - A factor of 2 means we pre-allocate the full memory footprint of the model.
        - A factor of 4 means we pre-allocate half of that, and so on.

        However, when using TorchAO, calculating memory usage with param.numel() * param.element_size() doesn't give
        the correct size for quantized weights (like int4 or int8). That's because TorchAO internally represents
        quantized tensors using subtensors and metadata, and the reported element_size() still corresponds to the
        dtype, not the actual bit-width of the quantized data.

        To correct for this:
        - Use a division factor of 8 for int4 weights
        - Use a division factor of 4 for int8 weights
        """
        if self.quantization_config._get_ao_version() > version.Version("0.9.0"):
            from torchao.core.config import AOBaseConfig

            quant_type = self.quantization_config.quant_type
            if isinstance(quant_type, AOBaseConfig):
                # For the config-object API, infer the bit-width from the config class name.
                config_name = quant_type.__class__.__name__
                size_digit = fuzzy_match_size(config_name)
                if size_digit == "4":
                    return 8
                return 4

        # For the named string quant types, map each one to its warm-up division factor.
        map_to_target_dtype = {
            "int4_weight_only": 8,
            "int8_weight_only": 4,
            "int8_dynamic_activation_int8_weight": 4,
            "autoquant": 4,
        }
        return map_to_target_dtype[self.quantization_config.quant_type]

    @property
    def is_trainable(self) -> bool:
        supported_quant_types_for_training = [
            "int8_weight_only",
            "int8_dynamic_activation_int8_weight",
        ]
        return self.quantization_config.quant_type in supported_quant_types_for_training

    @property
    def is_compileable(self) -> bool:
        return True
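# A minimal end-to-end sketch (assumptions: a CUDA device and a placeholder checkpoint id). The quantizer
# above is never instantiated by hand; it is selected automatically when a TorchAoConfig reaches
# from_pretrained:
#
#   import torch
#   from transformers import AutoModelForCausalLM, TorchAoConfig
#
#   quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
#   model = AutoModelForCausalLM.from_pretrained(
#       "meta-llama/Llama-3.2-1B",            # placeholder checkpoint, any causal LM works
#       torch_dtype=torch.bfloat16,           # int4_weight_only expects bfloat16 (see update_dtype above)
#       device_map="auto",
#       quantization_config=quantization_config,
#   )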
