a
    h|                     @   s"  d dl Z d dlmZ d dlmZ ddlmZ ddlmZm	Z	 e	
eZe rTd dlZdd Zd+ee ed	 ee ed
ef dddZd,ee ed	 ee ed
ef dddZd-ee ed	 ee ed
ef dddZd.ed	ee ed
ef dddZd/ed	ee ed
ef dddZd0ed	ee ed
ef dddZeeeeeedZd1eeeee ee dddZd2eee dddZd3eee ddd Zd4eee dd!d"Zd5eee dd#d$Zd6eee dd%d&Z d7eee dd'd(Z!eeeee e!dZ"d8eee dd)d*Z#dS )9    Nwraps)Optional   )PretrainedConfig)is_torch_availableloggingc                    s,   dd dd  t  fdd}|S )ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    c                 S   s   t |d }t| jdr$| jj}n| jj}||krnt| dsZ| j| j||d d\| _}| jd| jdd n | j	
|| _	| jd| j	dd dS )	zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r    original_max_position_embeddingslong_inv_freqseq_leninv_freqF
persistentN)torchmaxhasattrconfigr	   max_position_embeddingsrope_init_fnr
   register_bufferoriginal_inv_freqto)selfposition_idsdevicer   r	   _ r   \/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_update+   s    

z6dynamic_rope_update.<locals>.longrope_frequency_updatec                 S   s   t |d }|| jkrF| j| j||d\}| _| jd|dd || _|| jk r| j| jkr| j	|| _| jd| jdd | j| _dS )a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r   r   r   Fr   N)
r   r   Zmax_seq_len_cachedr   r   Zattention_scalingr   Zoriginal_max_seq_lenr   r   )r   r   r   r   r   r   r   r   dynamic_frequency_update>   s    
z5dynamic_rope_update.<locals>.dynamic_frequency_updatec                    sB   d| j v r | ||jd n| j dkr6| ||jd | ||S )Ndynamic)r   longrope)	rope_typer   )r   xr   r    r   rope_forwardr   r   wrapperQ   s
    

z$dynamic_rope_update.<locals>.wrapperr   )r&   r'   r   r%   r   dynamic_rope_update   s
    r(   ztorch.deviceztorch.Tensor)r   r   r   returnc           	      C   sv   | j }t| dr| jnd}t| ddp0| j| j }t|| }d}d|tjd|dtj	dj
|tjd|   }||fS )	ax  
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    partial_rotary_factor      ?head_dimNr      dtyper   r/   )
rope_thetar   r*   getattrhidden_sizenum_attention_headsintr   arangeint64r   float)	r   r   r   baser*   r,   dimattention_factorr   r   r   r    _compute_default_rope_parameters\   s    ,r<   c                 C   s*   | j d }t| ||\}}|| }||fS )a  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    factor)rope_scalingr<   )r   r   r   r=   r   r;   r   r   r   '_compute_linear_scaling_rope_parametersz   s    
r?   c                 C   s   | j }t| dr| jnd}t| d| j| j }t|| }| j}| jd }d}	|du r\|}n4t	|t
jrt
|t
j||j|jd}n
t||}||| | |d  ||d    }d|t
jd	|dt
jd
j|t
jd|   }
|
|	fS )a  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    r*   r+   r,   r=   Nr/   r   r   r-   r   r.   r0   )r1   r   r*   r2   r3   r4   r5   r   r>   
isinstancer   ZTensormaximumtensorr/   r   r   r6   r7   r   r8   )r   r   r   r9   r*   r,   r:   r   r=   r;   r   r   r   r   _compute_dynamic_ntk_parameters   s$    

$,rD   c                    s  | j }t| dr| jnd}t| d| j| j }t|| }| jd }| jd}| jd}	| jd}
| jdpx| j	}dd
d}|du r|	r|
rt
|||	|||
 }n||}| jdpd}| jdpd	}dd   fdd}dd }|td|dj|tj
d|  }d| }d||  }| jdd}|||||||\}}d	||||d j|tj
d }|d	|  ||  }||fS )a  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r*   r+   r,   r=   r;   mscalemscale_all_dimr	   r   c                 S   s"   | dkrdS d| t |  d S )Nr   r+   g?)mathlog)scalerE   r   r   r   
get_mscale   s    z,_compute_yarn_parameters.<locals>.get_mscaleN	beta_fast    	beta_slowc                 S   s*   |t || d t j   dt |  S )zPInverse dimension formula to find the dimension based on the number of rotationsr-   )rG   rH   pi)Znum_rotationsr:   r9   r   r   r   r   find_correction_dim   s    z5_compute_yarn_parameters.<locals>.find_correction_dimc                    sL    | |||} ||||}|r4t |}t |}t|dt||d fS )z.Find dimension range bounds based on rotationsr   r   )rG   floorceilr   min)Zlow_rotZhigh_rotr:   r9   r   truncatelowhighrO   r   r   find_correction_range   s    

z7_compute_yarn_parameters.<locals>.find_correction_rangec                 S   s>   | |kr|d7 }t j|t jd|  ||   }t |dd}|S )NgMbP?r.   r   r   )r   r6   float32clamp)rR   r   r:   Zlinear_funcZ	ramp_funcr   r   r   linear_ramp_factor  s
    z4_compute_yarn_parameters.<locals>.linear_ramp_factorr   r-   r0   rS   T)r   )r1   r   r*   r2   r3   r4   r5   r>   getr   r8   r   r6   r   )r   r   r   r9   r*   r,   r:   r=   r;   rE   rF   r	   rJ   rK   rM   rW   rZ   Z	pos_freqsZinv_freq_extrapolationZinv_freq_interpolationrS   rT   rU   Zinv_freq_extrapolation_factorr   r   rV   r   _compute_yarn_parameters   s>    

	
" 
r\   c                 C   s&  | j }t| dr| jnd}t| d| j| j }t|| }| jd }| jd }| jd}	| jd}
t| dr| j	}| j
| j	 }	n| j
}|
d	u r|	dkrd}
ntd
t|	t|  }
|r||krtj|tj|d}ntj|tj|d}tjd|dtj|d | }d|||   }||
fS )a~  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r*   r+   r,   long_factorshort_factorr=   r;   r	   Nr   r@   r   r-   )r1   r   r*   r2   r3   r4   r5   r>   r[   r	   r   rG   sqrtrH   r   rC   rX   r6   r7   r8   )r   r   r   r9   r*   r,   r:   r]   r^   r=   r;   r	   Zext_factorsZinv_freq_shaper   r   r   r   _compute_longrope_parameters  s,    


r`   c                 C   s   t | ||\}}| jd }| jd }| jd }| jd }|| }	|| }
dtj | }t||	k|| |}|| | ||  }d| | | ||  }||
k  ||	k  }t|||}||fS )a<  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r=   low_freq_factorhigh_freq_factorr	   r-   r   )r<   r>   rG   rN   r   where)r   r   r   r   r;   r=   ra   rb   Zold_context_lenZlow_freq_wavelenZhigh_freq_wavelenZwavelenZinv_freq_llamaZsmooth_factorZsmoothed_inv_freqZis_medium_freqr   r   r   _compute_llama3_parametersR  s    



rd   )defaultZlinearr!   yarnr"   Zllama3)r#   received_keysrequired_keysoptional_keysignore_keysc                 C   s   d|v r|dh8 }| d |dur,||8 }|| }|rLtd|  d| |durb|| | }n|| }|rtd|  d|  dS )zYCompare the received keys in `config.rope_scaling` against the expected and optional keystyper#   Nz9Missing required keys in `rope_scaling` for 'rope_type'='z': z5Unrecognized keys in `rope_scaling` for 'rope_type'=')addKeyErrorloggerwarning)r#   rg   rh   ri   rj   Zmissing_keysZunused_keysr   r   r   _check_received_keys  s    	

rp   )r   rj   c                 C   s@   | j }|d|dd }dh}t| }t||||d d S )Nr#   rk   rj   )r>   r[   setkeysrp   )r   rj   r>   r#   rh   rg   r   r   r   !_validate_default_rope_parameters  s
    rt   c                 C   st   | j }|d|dd }ddh}t| }t||||d |d }|d u s`t|tr`|dk rptd|  d S )Nr#   rk   r=   rq   r+   8`rope_scaling`'s factor field must be a float >= 1, got 	r>   r[   rr   rs   rp   rA   r8   rn   ro   )r   rj   r>   r#   rh   rg   r=   r   r   r   (_validate_linear_scaling_rope_parameters  s    rw   c                 C   s|   | j }|d|dd }ddh}dh}t| }t|||||d |d }|d u sht|trh|dk rxtd|  d S )Nr#   rk   r=   r	   rq   r+   ru   rv   )r   rj   r>   r#   rh   ri   rg   r=   r   r   r   )_validate_dynamic_scaling_rope_parameters  s    rx   c              	   C   s  | j }|d|dd }ddh}h d}t| }t|||||d |d }|d u sjt|trj|dk rztd|  |d}|d urt|tr|d	k rtd
|  |d}	|	d urt|	tstd|	  |d}
|
d ur
t|
ts
td|
  |	pd|
pdk r8td|	 d|
 d | j d}|d ur| j	| }||krt
d| d| d| d n
t
d d S )Nr#   rk   r=   >   rM   r;   rF   rK   rS   rE   r	   rq   r+   ru   r;   r   L`rope_scaling`'s attention_factor field must be a float greater than 0, got rK   z6`rope_scaling`'s beta_fast field must be a float, got rM   z6`rope_scaling`'s beta_slow field must be a float, got rL   r   zO`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r	   zHThe explicitly set RoPE scaling factor (config.rope_scaling['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_scaling['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config.a~  config.rope_scaling['original_max_position_embeddings'], the pre-yarn context length, is unset. We will **assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * factor) -- we recommend updating both fields for optimal downstream model usage.)r>   r[   rr   rs   rp   rA   r8   rn   ro   r   warning_once)r   rj   r>   r#   rh   ri   rg   r=   r;   rK   rM   r	   Zimplicit_factorr   r   r   _validate_yarn_parameters  sP    	





r{   c                 C   s  | j }|d|dd }h d}h d}t| }t|||||d t| drX| jnd}t| d| j| j	 }t
|| }	|d	}
t|
tstd
d |
D rtd|
  t|
|	d krtd|	d  dt|
  |d}t|tstdd |D rtd|  t||	d krJtd|	d  dt|  t| drbtd n|d}|d u rtd n&t|tr|dk rtd|  |d}|d urt|tr|dk rtd|  d S )Nr#   rk   >   r]   r#   r^   >   r=   r;   r	   rq   r*   r+   r,   r^   c                 s   s   | ]}t |ttfV  qd S NrA   r5   r8   .0r$   r   r   r   	<genexpr>      z0_validate_longrope_parameters.<locals>.<genexpr>zC`rope_scaling`'s short_factor field must be a list of numbers, got r-   z5`rope_scaling`'s short_factor field must have length z, got r]   c                 s   s   | ]}t |ttfV  qd S r|   r}   r~   r   r   r   r     r   zB`rope_scaling`'s long_factor field must be a list of numbers, got z4`rope_scaling`'s long_factor field must have length r	   aY  This model has set a `original_max_position_embeddings` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.r=   z1Missing required keys in `rope_scaling`: 'factor'ru   r;   g        ry   )r>   r[   rr   rs   rp   r   r*   r2   r3   r4   r5   rA   listallrn   ro   lenrz   r8   )r   rj   r>   r#   rh   ri   rg   r*   r,   r:   r^   r]   r=   r;   r   r   r   _validate_longrope_parameters	  sD    

 



r   c           
      C   s6  | j }|d|dd }h d}t| }t||||d |d }|d u s`t|tr`|dk rptd|  |d }|d	 }|d u st|tstd
|  |d u st|tstd|  ||krtd| d|  |d }	|	d u st|	t	std|	  |	| j
kr2td|	 d| j
  d S )Nr#   rk   >   r=   r#   ra   r	   rb   rq   r=   r+   ru   ra   rb   z<`rope_scaling`'s low_freq_factor field must be a float, got z=`rope_scaling`'s high_freq_factor field must be a float, got zc`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r	   zP`rope_scaling`'s original_max_position_embeddings field must be an integer, got zg`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)r>   r[   rr   rs   rp   rA   r8   rn   ro   r5   r   )
r   rj   r>   r#   rh   rg   r=   ra   rb   r	   r   r   r   _validate_llama3_parameters;  sJ    r   c                 C   sb   t | dd}|du rdS |d|dd}t|}|durL|| |d ntd| d dS )	zO
    Validate the RoPE config arguments, given a `PretrainedConfig` object
    r>   Nr#   rk   re   rq   zTMissing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='')r2   r[   ROPE_VALIDATION_FUNCTIONSrn   ro   )r   rj   r>   r#   Zvalidation_fnr   r   r   rope_config_validationj  s    

r   )NNN)NNN)NNN)N)N)N)NN)N)N)N)N)N)N)N)$rG   	functoolsr   typingr   Zconfiguration_utilsr   utilsr   r   Z
get_logger__name__rn   r   r(   r5   tupler8   r<   r?   rD   r\   r`   rd   ZROPE_INIT_FUNCTIONSstrrr   rp   rt   rw   rx   r{   r   r   r   r   r   r   r   r   <module>   s   
?   
   
   
. 

X 

7 

-  B2&
