a
    h}                     @   sP  d dl Z d dlmZ d dlZd dlmZ d dlZddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZmZmZmZ d	d
lmZ eeZG dd dejZG dd deZG dd deZdd Zd$ddZG dd deZ G dd deZ!G dd deZ"G dd de"eZ#G dd de
Z$G dd  d eZ%G d!d" d"eZ&g d#Z'dS )%    N)Optional   )logging   )GemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassification)GraniteAttention)LlamaDecoderLayerLlamaMLP
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbedding   )HeliumConfigc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	HeliumRMSNormư>c                    s&   t    tt|| _|| _d S )N)super__init__nn	ParametertorchZonesweightvariance_epsilon)selfhidden_sizeeps	__class__ e/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/helium/modular_helium.pyr   "   s    
zHeliumRMSNorm.__init__c                 C   sR   |j }|tj}|djddd}|t|| j  }| jtj| |S )Nr   T)Zkeepdim)	Zdtypetor   Zfloat32powmeanZrsqrtr   r   )r   Zhidden_statesZinput_dtypeZvariancer   r   r    forward'   s
    zHeliumRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler   shaper   )r   r   r   r    
extra_repr.   s    zHeliumRMSNorm.extra_repr)r   )__name__
__module____qualname__r   r%   r(   __classcell__r   r   r   r    r   !   s   r   c                   @   s   e Zd ZdS )HeliumRotaryEmbeddingNr)   r*   r+   r   r   r   r    r-   2   s   r-   c                   @   s   e Zd ZdS )	HeliumMLPNr.   r   r   r   r    r/   6   s   r/   c                 C   s>   | ddddf }| ddddf }t j| |fdddS )	z*Rotates half the hidden dims of the input..r   Nr   r   r!   dim)r   stackflatten)xx1Zx2r   r   r    rotate_half:   s    r7   c                 C   s   | |}| |}|dd|jd d f jddd}|dd|jd d f jddd}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr!   r   r0   )Z	unsqueezer'   Zrepeat_interleaver7   )qkcossinZposition_idsZunsqueeze_dimZq_embedZk_embedr   r   r    apply_rotary_pos_embA   s    

$$r<   c                       s*   e Zd Zdeee d fddZ  ZS )HeliumAttentionNconfig	layer_idxc                    s:   t  || tj|j|jdd| _dt| j | _	d S )NF)Zbiasr   )
r   r   r   ZLinearr   Zo_projmathsqrtZhead_dimZscalingr   r?   r@   r   r   r    r   c   s    zHeliumAttention.__init__)Nr)   r*   r+   r   r   intr   r,   r   r   r   r    r=   b   s   r=   c                       s*   e Zd Zdeee d fddZ  ZS )HeliumDecoderLayerNr>   c                    s@   t  || t|| _t|j|jd| _t|j|jd| _d S )Nr   )	r   r   r/   Zmlpr   r   rms_norm_epsZinput_layernormZpost_attention_layernormrC   r   r   r    r   j   s    
zHeliumDecoderLayer.__init__)NrD   r   r   r   r    rF   i   s   rF   c                   @   s   e Zd ZdS )HeliumPreTrainedModelNr.   r   r   r   r    rI   r   s   rI   c                       s"   e Zd Zed fddZ  ZS )HeliumModelr?   c                    sZ   t    t fddt jD | _t j j	d| _
t | _d| _|   d S )Nc                    s   g | ]}t  |qS r   )rF   ).0r@   rK   r   r    
<listcomp>z       z(HeliumModel.__init__.<locals>.<listcomp>rG   F)r   r   r   Z
ModuleListrangeZnum_hidden_layersZlayersr   r   rH   Znormr-   Z
rotary_embZgradient_checkpointingZ	post_init)r   r?   r   rK   r    r   w   s    
zHeliumModel.__init__)r)   r*   r+   r   r   r,   r   r   r   r    rJ   v   s   rJ   c                   @   s   e Zd ZdS )HeliumForCausalLMNr.   r   r   r   r    rP      s   rP   c                   @   s   e Zd ZdS )HeliumForSequenceClassificationNr.   r   r   r   r    rQ      s   rQ   c                   @   s   e Zd ZdS )HeliumForTokenClassificationNr.   r   r   r   r    rR      s   rR   )rI   rJ   rP   rQ   rR   )Nr   )(rA   typingr   r   Ztorch.nnr   Ztorch.utils.checkpointutilsr   Zgemma.modeling_gemmar   r   r   Zgranite.modeling_graniter	   Zllama.modeling_llamar
   r   r   r   r   Zconfiguration_heliumr   Z
get_loggerr)   loggerModuler   r-   r/   r7   r<   r=   rF   rI   rJ   rP   rQ   rR   __all__r   r   r   r    <module>   s.   

!	