import math
from collections import OrderedDict

import torch
from torch import Tensor, nn

from .utils import logging
from .utils.import_utils import is_torchdynamo_compiling


logger = logging.get_logger(__name__)


class PytorchGELUTanh(nn.Module):
    """
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://huggingface.co/papers/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    """

    def forward(self, input: Tensor) -> Tensor:
        return nn.functional.gelu(input, approximate="tanh")


class NewGELUActivation(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    """

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))

class GELUActivation(nn.Module):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://huggingface.co/papers/1606.08415
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = nn.functional.gelu

    def _gelu_python(self, input: Tensor) -> Tensor:
        return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)

class FastGELUActivation(nn.Module):
    """
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))

class QuickGELUActivation(nn.Module):
    """
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        return input * torch.sigmoid(1.702 * input)

class ClippedGELUActivation(nn.Module):
    """
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purposes, as
    it allows mapping negative values in the GeLU spectrum. For more information on this trick, please refer to
    https://huggingface.co/papers/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://huggingface.co/papers/1606.08415
    """

    def __init__(self, min: float, max: float):
        if min > max:
            raise ValueError(f"min should be < max (got min: {min}, max: {max})")

        super().__init__()
        self.min = min
        self.max = max

    def forward(self, x: Tensor) -> Tensor:
        # `gelu` is the module-level shortcut defined at the bottom of this file.
        return torch.clip(gelu(x), self.min, self.max)

class AccurateGELUActivation(nn.Module):
    """
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    """

    def __init__(self):
        super().__init__()
        self.precomputed_constant = math.sqrt(2 / math.pi)

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))

class MishActivation(nn.Module):
    """
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://huggingface.co/papers/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    """

    def __init__(self):
        super().__init__()
        self.act = nn.functional.mish

    def _mish_python(self, input: Tensor) -> Tensor:
        return input * torch.tanh(nn.functional.softplus(input))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)

class LinearActivation(nn.Module):
    """
    Applies the linear activation function, i.e. forwarding input directly to output.
    """

    def forward(self, input: Tensor) -> Tensor:
        return input

class LaplaceActivation(nn.Module):
    """
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://huggingface.co/papers/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    """

    def forward(self, input, mu=0.707107, sigma=0.282095):
        # Equivalent to the Gaussian CDF: 0.5 * (1 + erf((x - mu) / (sigma * sqrt(2)))).
        input = (input - mu).div(sigma * math.sqrt(2.0))
        return 0.5 * (1.0 + torch.erf(input))

class ReLUSquaredActivation(nn.Module):
    """
    Applies the relu^2 activation introduced in https://huggingface.co/papers/2109.08668v2
    """

    def forward(self, input):
        relu_applied = nn.functional.relu(input)
        squared = torch.square(relu_applied)
        return squared


class ClassInstantier(OrderedDict):
    def __getitem__(self, key):
        content = super().__getitem__(key)
        cls, kwargs = content if isinstance(content, tuple) else (content, {})
        return cls(**kwargs)

class XIELUActivation(nn.Module):
    """
    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010

    If the user has installed the nickjbrowning/XIELU wheel, we import the xIELU CUDA kernels.
    Otherwise, we emit a single warning and use the xIELU Python implementation.
    """

    def __init__(
        self,
        alpha_p_init=0.8,
        alpha_n_init=0.8,
        beta=0.5,
        eps=-1e-6,
        dtype=torch.bfloat16,
        with_vector_loads=False,
    ):
        super().__init__()
        self.alpha_p = nn.Parameter(torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - 1).unsqueeze(0))
        self.alpha_n = nn.Parameter(
            torch.log(torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - 1).unsqueeze(0)
        )
        self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
        self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
        self.with_vector_loads = with_vector_loads
        # Scalar copies of beta/eps for the CUDA kernel, taken outside the compiled graph.
        self._beta_scalar = float(self.beta.detach().cpu().item())
        self._eps_scalar = float(self.eps.detach().cpu().item())

        self._xielu_cuda_obj = None
        try:
            import xielu.ops  # noqa: F401

            self._xielu_cuda_obj = torch.classes.xielu.XIELU()
            msg = "Using experimental xIELU CUDA."
            try:
                from torch._dynamo import allow_in_graph

                self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
                msg += " Enabled torch._dynamo for xIELU CUDA."
            except Exception as err:
                msg += f" Could not enable torch._dynamo for xIELU ({err}) - this may result in slower performance."
                self._xielu_cuda_fn = self._xielu_cuda
            logger.warning_once(msg)
        except Exception as err:
            logger.warning_once(
                "CUDA-fused xIELU not available (%s) – falling back to a Python version.\n"
                "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
                str(err),
            )

    def _xielu_python(self, x: Tensor) -> Tensor:
        alpha_p = nn.functional.softplus(self.alpha_p)
        alpha_n = self.beta + nn.functional.softplus(self.alpha_n)
        return torch.where(
            x > 0,
            alpha_p * x * x + self.beta * x,
            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
        )

    def _xielu_cuda(self, x: Tensor) -> Tensor:
        """Firewall function to prevent torch.compile from seeing .item() calls"""
        original_shape = x.shape
        # The CUDA kernel expects a 3D tensor; reshape and warn once if the input differs.
        while x.dim() < 3:
            x = x.unsqueeze(0)
        if x.dim() > 3:
            x = x.view(-1, 1, x.size(-1))
        if original_shape != x.shape:
            logger.warning_once(
                "Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).",
                original_shape,
                x.shape,
            )
        result = self._xielu_cuda_obj.forward(
            x,
            self.alpha_p,
            self.alpha_n,
            self._beta_scalar,
            self._eps_scalar,
            self.with_vector_loads,
        )
        return result.view(original_shape)

    def forward(self, input: Tensor) -> Tensor:
        if self._xielu_cuda_obj is not None and input.is_cuda:
            if not is_torchdynamo_compiling():
                return self._xielu_cuda_fn(input)
            logger.warning_once("torch._dynamo is compiling, using Python version of xIELU.")
        return self._xielu_python(input)

ACT2CLS = {
    "gelu": GELUActivation,
    "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
    "gelu_fast": FastGELUActivation,
    "gelu_new": NewGELUActivation,
    "gelu_python": (GELUActivation, {"use_gelu_python": True}),
    "gelu_pytorch_tanh": PytorchGELUTanh,
    "gelu_accurate": AccurateGELUActivation,
    "laplace": LaplaceActivation,
    "leaky_relu": nn.LeakyReLU,
    "linear": LinearActivation,
    "mish": MishActivation,
    "quick_gelu": QuickGELUActivation,
    "relu": nn.ReLU,
    "relu2": ReLUSquaredActivation,
    "relu6": nn.ReLU6,
    "sigmoid": nn.Sigmoid,
    "silu": nn.SiLU,
    "swish": nn.SiLU,
    "tanh": nn.Tanh,
    "prelu": nn.PReLU,
    "xielu": XIELUActivation,
}
ACT2FN = ClassInstantier(ACT2CLS)


def get_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")


# Module-level shortcuts resolved through ACT2FN.
gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")
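
# A minimal usage sketch (illustrative only, not part of the library's public API surface):
# model configs typically carry an activation name such as "gelu_new" or "silu", and model
# code resolves it through get_activation / ACT2FN. The guarded block below only runs when
# this module is executed directly (e.g. via `python -m transformers.activations`).
if __name__ == "__main__":
    x = torch.randn(2, 4)
    for name in ("gelu_new", "quick_gelu", "silu", "relu2"):
        act = get_activation(name)  # returns a freshly instantiated nn.Module
        print(name, type(act).__name__, tuple(act(x).shape))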