a
    h                  	   @   s  d dl mZmZmZmZ d dlZd dlm  mZ	 d dlmZ ddl
mZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( e# r
d dl)m*Z* ddl+m,Z, e$-e.Z/G dd deddZ0G dd dej1Z2G dd dej1Z3G dd dej1Z4G dd dej1Z5G d d! d!ej1Z6d"d# Z7d>d$d%Z8ej9e:ej9d&d'd(Z;d?ej1ej9ej9ej9eej9 e<e<d*d+d,Z=G d-d. d.ej1Z>G d/d0 d0eZ?e"G d1d2 d2eZ@G d3d4 d4ej1ZAe"G d5d6 d6e@ZBd@eej9eCej9 df ee: eej9 eej9e:f d8d9d:ZDG d;d< d<e@eZEg d=ZFdS )A    )CallableOptional	TypedDictUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastMoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_torch_flex_attn_availablelogging)deprecate_kwarg   )GraniteMoeSharedConfig)	BlockMask)make_flex_block_causal_maskc                   @   s@   e Zd ZU dZejed< ejed< eed< eed< ejed< dS )GraniteFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    Zcu_seq_lens_qZcu_seq_lens_kZmax_length_qZmax_length_kZseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__intZ	IntTensor r'   r'   z/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/granitemoeshared/modeling_granitemoeshared.pyr   3   s   


r   F)totalc                       s:   e Zd ZdZed fddZejejdddZ  Z	S )GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                    sZ   t    |j| _|j| _t|j | _tj	| j| jd dd| _
tj	| j| jdd| _d S )N   FZbias)super__init__hidden_size
input_sizeshared_intermediate_sizer   
hidden_act
activationr   Linearinput_linearoutput_linearselfr,   	__class__r'   r(   r0   U   s    
zGraniteMoeSharedMLP.__init__)hidden_statesreturnc                 C   s<   |  |}|jddd}| |d |d  }| |}|S )Nr-   dimr   r   )r7   chunkr5   r8   )r:   r=   chunked_hidden_statesr'   r'   r(   forward^   s
    

zGraniteMoeSharedMLP.forward)
r   r    r!   r"   r   r0   r#   TensorrD   __classcell__r'   r'   r;   r(   r*   L   s   	r*   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	GraniteMoeSharedRMSNormư>c                    s&   t    tt|| _|| _dS )zF
        GraniteMoeSharedRMSNorm is equivalent to T5LayerNorm
        N)r/   r0   r   	Parameterr#   Zonesweightvariance_epsilon)r:   r1   epsr;   r'   r(   r0   g   s    
z GraniteMoeSharedRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr-   r?   T)Zkeepdim)	dtypetor#   float32powmeanZrsqrtrK   rJ   )r:   r=   Zinput_dtypeZvariancer'   r'   r(   rD   o   s
    zGraniteMoeSharedRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplerJ   shaperK   )r:   r'   r'   r(   
extra_reprv   s    z"GraniteMoeSharedRMSNorm.extra_repr)rH   )r   r    r!   r0   rD   rT   rF   r'   r'   r;   r(   rG   f   s   rG   c                       s0   e Zd Zeeedd fddZdd Z  ZS )GraniteMoeSharedParallelExpertsN)num_expertsr2   output_sizer>   c                    s6   t    tt|||| _|| _|| _|| _	dS )a  
        Initialize the GraniteMoeSharedParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
r/   r0   r   rI   r#   emptyrJ   rV   r2   rW   )r:   rV   r2   rW   r;   r'   r(   r0   {   s
    
z(GraniteMoeSharedParallelExperts.__init__c                 C   sP   |j |dd}g }t| jD ] }|t|| | j|  qtj|dd}|S )a  
        Forward pass of the GraniteMoeSharedParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   r@   )	splitrangerV   appendFZlinearrJ   r#   cat)r:   inputsexpert_sizeZ
input_listZoutput_listiresultsr'   r'   r(   rD      s    z'GraniteMoeSharedParallelExperts.forwardr   r    r!   r&   r0   rD   rF   r'   r'   r;   r(   rU   z   s   rU   c                       s.   e Zd Zeeed fddZdd Z  ZS )GraniteMoeSharedTopKGatingr2   rV   top_kc                    s2   t    || _|| _|| _tj||dd| _dS )a  
        Initialize the top-k gating mechanism.
        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        Fr.   N)r/   r0   rV   r2   re   r   r6   layer)r:   r2   rV   re   r;   r'   r(   r0      s
    
z#GraniteMoeSharedTopKGating.__init__c                 C   s   |  | }|j| jdd\}}tj|dd|}tj|d| j	g|j
|jd}|d|d}| d}| }| }	|	d\}
}|j| jdd}| }|| }|||||fS )Nr   r@   r   rM   devicetrunc)Zrounding_mode)rf   floattopkre   r#   softmaxZtype_aszerossizerV   rM   rh   Zscatterlongsumtolistflattensortdiv)r:   r=   logitsZtop_k_logitsZtop_k_indicesZtop_k_gatesrm   Zgatesr_   Ztop_k_experts_Zindex_sorted_expertsbatch_indexbatch_gatesr'   r'   r(   rD      s    z"GraniteMoeSharedTopKGating.forwardrb   r'   r'   r;   r(   rc      s   rc   c                       s.   e Zd ZdZed fddZdd Z  ZS )GraniteMoeSharedMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    r+   c                    sl   t    |j| _|j| _t|j | _t|j	| j| jd | _
t|j	| j| j| _t| j|j	|jd| _d S )Nr-   rd   )r/   r0   r1   r2   Zintermediate_sizer   r4   r5   rU   num_local_expertsr7   r8   rc   num_experts_per_tokrouterr9   r;   r'   r(   r0      s    
zGraniteMoeSharedMoE.__init__c                 C   s   |  \}}}|d|}| |\}}}}}	|| }
| |
|}|jddd}| |d |d  }| ||}||dddf  }tj|| | j	f|j
|jd}|d||}|||| j	}||	fS )a  
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
            Tensor:
                Router logits.
        r?   r-   r@   r   r   Nrg   )rn   reshaper|   r7   rB   r5   r8   r#   rm   r2   rM   rh   Z	index_addview)r:   Zlayer_inputbszlengthZemb_sizerv   rw   rx   r_   router_logitsZexpert_inputsr=   rC   Zexpert_outputsrm   Zlayer_outputr'   r'   r(   rD      s    zGraniteMoeSharedMoE.forward)r   r    r!   r"   r   r0   rD   rF   r'   r'   r;   r(   ry      s   ry   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr?   r-   r@   )rS   r#   r]   )xx1Zx2r'   r'   r(   rotate_half  s    r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkcossinposition_idsZunsqueeze_dimZq_embedZk_embedr'   r'   r(   apply_rotary_pos_emb  s
    

r   )r=   n_repr>   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rS   expandr}   )r=   r   batchnum_key_value_headsslenhead_dimr'   r'   r(   	repeat_kv5  s
    0r           )modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d urf|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr-   r   r?   )rA   rM   )ptrainingr   )r   num_key_value_groupsr#   matmul	transposerS   r   
functionalrl   rO   rN   rM   r   r   
contiguous)r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr'   r'   r(   eager_attention_forwardA  s    
&r   c                       s   e Zd ZdZdeee d fddZedddd	de	j
ee	j
 ee	j ee eee	j eee	j
e	j
f  ee	j
ee	j
 eee	j
  f dddZ  ZS )GraniteMoeSharedAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr,   	layer_idxc                    s   t    || _|| _|d u r4td| jj d |j| _|j	| _	|j
| _| j	| j | _|j| _| j| j | _d| _|j| _| j| j | j	krtd| j	 d| j dtj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j	|jd| _d S )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).r.   )r/   r0   r,   r   loggerwarning_oncer<   r   attention_dropoutr1   num_attention_heads	num_headsr   r   r   Z	is_causalZattention_multiplierr   
ValueErrorr   r6   Zattention_biasq_projk_projv_projo_projr:   r,   r   r;   r'   r(   r0   a  s2    

z"GraniteMoeSharedAttention.__init__past_key_valuepast_key_values4.58new_nameversionF)r=   r   r   r   	use_cachecache_positionposition_embeddingsr>   c                 K   sF  |  \}	}
}| |}| |}| |}||	|
| j| jdd}||	|
| j| jdd}||	|
| j| jdd}|d ur|nd\}}|d urt	||||\}}|d ur|||d}|
||| j|\}}t}| jjdkrt| jj }|| ||||f| jsdn| j| jd|\}}||	|
d}| |}||fS )	Nr   r-   )NN)r   r   r   eagerr   )r   r   r?   )rn   r   r   r   r~   r   r   r   r   r   updater   r   r,   _attn_implementationr   r   r   r   r   )r:   r=   r   r   r   r   r   r   r   r   Zq_lenrv   Zquery_statesr   r   r   r   Zcache_kwargsZattention_interfacer   r   r'   r'   r(   rD     s>    




z!GraniteMoeSharedAttention.forward)N)NNNFNN)r   r    r!   r"   r   r   r&   r0   r   r#   rE   r$   r	   boolrR   rD   rF   r'   r'   r;   r(   r   ^  s&          r   c                       s   e Zd Zeed fddZedddddeje	ej e	ej
 e	e e	e e	e e	ej
 e	e e	eejejf  ee eeje	eejejf  f d
ddZ  ZS )GraniteMoeSharedDecoderLayerr   c                    s|   t    |j| _t||d| _|jdkr4t|| _t|j|j	d| _
t|j|j	d| _|j| _|jdkrnd nt|| _d S )Nr   r   rL   )r/   r0   r1   r   	self_attnrz   ry   block_sparse_moerG   rms_norm_epsinput_layernormpost_attention_layernormresidual_multiplierr3   r*   
shared_mlpr   r;   r'   r(   r0     s    


z%GraniteMoeSharedDecoderLayer.__init__r   r   r   r   NF)r=   r   r   r   output_attentionsr   r   output_router_logitsr   r   r>   c
                 K   s   |}|  |}| jf ||||||||	d|
\}}||| j  }|}| |}| |\}}| jdu rn|}n|| | }~||| j  }|f}|r||f7 }|r||f7 }|S )aD  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        )r=   r   r   r   r   r   r   r   N)r   r   r   r   r   r   )r:   r=   r   r   r   r   r   r   r   r   r   ZresidualZself_attn_weightsZmoe_hidden_statesr   outputsr'   r'   r(   rD     s:    '
	




z$GraniteMoeSharedDecoderLayer.forward)NNNFFNFN)r   r    r!   r   r&   r0   r   r#   rE   r   r$   r	   r   rR   r   r   FloatTensorrD   rF   r'   r'   r;   r(   r     s.           r   c                       sF   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZ fddZ  ZS )	GraniteMoeSharedPreTrainedModelr,   modelTr   r   Fc                    s0   t  | t|tr,|jjjd| jjd d S )Nr   )rQ   Zstd)	r/   _init_weights
isinstancerU   rJ   dataZnormal_r,   Zinitializer_range)r:   r   r;   r'   r(   r   "  s    
z-GraniteMoeSharedPreTrainedModel._init_weights)r   r    r!   r   r%   Zbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attnZ_supports_sdpaZ_can_compile_fullgraphr   rF   r'   r'   r;   r(   r     s   
r   c                       sD   e Zd ZU ejed< ded fddZe e	dd Z
  ZS )	GraniteMoeSharedRotaryEmbeddinginv_freqNr+   c                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)r/   r0   hasattrr   r   dictgetr   max_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenr,   r   Zrope_init_fnattention_scalingZregister_bufferr   Zoriginal_inv_freq)r:   r,   rh   r   r;   r'   r(   r0   +  s    
z(GraniteMoeSharedRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|ddV | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   r?   r   ZmpscpuF)device_typeZenabledr-   r@   )rM   )r   rj   r   rS   rN   rh   r   r   strr#   Zautocastr   r]   r   r   r   rM   )
r:   r   r   Zinv_freq_expandedZposition_ids_expandedr   ZfreqsZembr   r   r'   r'   r(   rD   <  s    0&,z'GraniteMoeSharedRotaryEmbedding.forward)N)r   r    r!   r#   rE   r%   r   r0   Zno_gradr   rD   rF   r'   r'   r;   r(   r   (  s
   

r   c                       s   e Zd Zed fddZedeej eej	 eej ee
eeej f  eej ee ee ee ee ee eej e
eef dddZde
ej	d	f ej	ej	eed
ddZeej	eeejej	edddZ  ZS )GraniteMoeSharedModelr+   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _d| _ j| _ j| _ j| _| j| j | _ j| _ j| _ j| _| jdkrt nd | _|   d S )Nc                    s   g | ]}t  |qS r'   )r   ).0r   r+   r'   r(   
<listcomp>U      z2GraniteMoeSharedModel.__init__.<locals>.<listcomp>r   FZrope)r/   r0   Zpad_token_idZpadding_idx
vocab_sizer   Z	Embeddingr1   embed_tokensZ
ModuleListrZ   num_hidden_layerslayersrG   r   normgradient_checkpointingembedding_multiplierr   r   r   r   Z
rope_thetaZposition_embedding_typer   
rotary_emb	post_initr9   r;   r+   r(   r0   N  s$    zGraniteMoeSharedModel.__init__N)	input_idsr   r   r   inputs_embedsr   r   output_hidden_statesr   return_dictr   r>   c                 K   s2  |d ur|n| j j}|d ur |n| j j}|d ur4|n| j j}|
d urH|
n| j j}
|d u |d uA rhtd| jr| jr|rt	d d}|d u r| 
|}|| j }t|td tfstd|r|d u rt| j d}|d u r|d ur| nd}tj|||jd  |jd}|d u r"|d}| |||||}|}d }| jd urT| ||}|r^d	nd }|rld	nd }|	rzd	nd }| jD ]`}|r||f7 }|||||||||	|d
	}|d }|r||d f7 }|	r||d f7 }q| |}|r ||f7 }|
s tdd ||||fD S t|||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBThe `past_key_values` should be either a `Cache` object or `None`.r+   r   r   rh   r'   )r   r   r   r   r   r   r   r   r?   c                 s   s   | ]}|d ur|V  qd S )Nr'   )r   vr'   r'   r(   	<genexpr>  s   z0GraniteMoeSharedModel.forward.<locals>.<genexpr>)Zlast_hidden_stater   r=   
attentionsr   )r,   r   r   r   use_return_dictr   r   r   r   r   r   r   r   r   r	   r
   get_seq_lengthr#   arangerS   rh   r   _update_causal_maskr   r   r   rR   r   )r:   r   r   r   r   r   r   r   r   r   r   r   r   past_seen_tokensr   r=   r   Zall_hidden_statesZall_self_attnsZall_router_logitsZdecoder_layerZlayer_outputsr'   r'   r(   rD   g  s    










zGraniteMoeSharedModel.forwardFr   )r   input_tensorr   r   r   c                 C   sB  | j jdkr(|d ur$|dk r$|S d S | j jdkrLt|tjrHt|}|S |d ur\| nd}|d urn|jnd}| j jdkr|s|st	j
|||| jdrd S |j}|jd }	|r| }
n"t|tjr|jd	 n
||	 d }
| j||	|
|||jd d
}| j jdkr>|d ur>|jjdv r>|s>t|j}t	||}|S )NZflash_attention_2r   Zflex_attentionr   FZsdpa)r   Zpast_key_values_lengthZis_trainingr   r?   )sequence_lengthtarget_lengthrM   r   
batch_size)cudaZxpuZnpu)r,   r   anyr   r#   rE   r   r   Zis_compileabler   Z_ignore_causal_mask_sdpar   rM   rS   Zget_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrh   r   finfominZ_unmask_unattended)r:   r   r   r   r   r   r   Zusing_compilable_cacherM   r  r  r   	min_dtyper'   r'   r(   r     sZ    






	z)GraniteMoeSharedModel._update_causal_mask)r   r  r  rM   r   r  c                 K   sF  | dur|   dkr| }n&t|j}tj||f|||jd}|dkrVtj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| durB|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )Z
fill_valuerM   rh   r   )Zdiagonalr   r?   r   )rA   r#   r  r  fullrh   Ztriur   r}   r   clonerS   rN   Zmasked_fill)r   r  r  rM   r   r  r   r   r	  Zmask_lengthZpadding_maskr'   r'   r(   r    s*     $

6  zKGraniteMoeSharedModel._prepare_4d_causal_attention_mask_with_cache_position)NNNNNNNNNNN)F)r   r    r!   r   r0   r   r   r#   r$   rE   r   r	   listr   r   rR   r   rD   r   staticmethodr&   rM   r  rF   r'   r'   r;   r(   r   L  sV              
t Dr   r-   )gate_logitsrV   r   r>   c                    s  | du st | tsdS t | trF| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rtj
| dd}	tj
|dd}
n|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||jd fd|jd  }tj|| ddtj|dd }
|jjdur|jjnd}|jd t| }t|	dd|||jd  f |
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS r'   )rN   )r   Z
layer_gateZcompute_devicer'   r(   r   t  r   z,load_balancing_loss_func.<locals>.<listcomp>r@   r?   r   )r   rR   rh   r#   r]   r   r   rl   rk   Zone_hotrQ   rj   rS   r   r}   rN   rp   indexr&   r   )r  rV   re   r   Zconcatenated_gate_logitsZrouting_weightsrv   Zselected_expertsZexpert_maskZtokens_per_expertZrouter_prob_per_expertr  r  r   Zexpert_attention_maskZ router_per_expert_attention_maskZdevice_indexZrankZoverall_lossr'   r  r(   load_balancing_loss_funcR  sR    



&r  c                       s   e Zd ZdgZed fddZed
eej	 eej
 eej	 eeeeej f  eej eej	 ee ee ee ee ee eej	 eeej
f eeef ddd	Z  ZS )GraniteMoeSharedForCausalLMzlm_head.weightr+   c                    sX   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _|j| _|   d S )NFr.   )r/   r0   r   r   r   r   r6   r1   lm_headrouter_aux_loss_coefrz   rV   r{   r   r9   r;   r'   r(   r0     s    
z$GraniteMoeSharedForCausalLM.__init__Nr   )r   r   r   r   r   labelsr   r   r   r   r   r   logits_to_keepr>   c                 K   s  |dur|n| j j}|
dur |
n| j j}
|	dur4|	n| j j}	|durH|n| j j}| jf ||||||||	|
||d|}|d }t|trt| dn|}| 	|dd|ddf }|| j j
 }d}|dur| }| j||fd| j ji|}d}|
r>t|r
|jn|d | j| j|}|dur>|| j||j 7 }|s~|f|dd  }|
rf|f| }|durz|f| S |S t||||j|j|j|jdS )ax  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeSharedForCausalLM

        >>> model = GraniteMoeSharedForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r   r   r   r   r   r   r   r   r   r   r   r   r   r?   r   )lossaux_lossru   r   r=   r   r   )r,   r   r   r   r   r   r   r&   slicer  Zlogits_scalingrj   Zloss_functionr   r  r   rV   r{   r  rN   rh   r   r   r=   r   )r:   r   r   r   r   r   r  r   r   r   r   r   r   r  r   r   r=   Zslice_indicesru   r  r  outputr'   r'   r(   rD     sx    (

z#GraniteMoeSharedForCausalLM.forward)NNNNNNNNNNNNr   )r   r    r!   Z_tied_weights_keysr   r0   r   r   r#   r$   rE   r   r	   r  r   r   r&   rR   r   rD   rF   r'   r'   r;   r(   r    s@                
r  )r  r   r   )Nr   )r   )Nr-   N)Gtypingr   r   r   r   r#   Ztorch.nn.functionalr   r   r\   Zactivationsr   Zcache_utilsr	   r
   Z
generationr   Zmodeling_attn_mask_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   Zutils.deprecationr   Zconfiguration_granitemoesharedr   Z!torch.nn.attention.flex_attentionr   Zintegrations.flex_attentionr   Z
get_loggerr   r   r   Moduler*   rG   rU   rc   ry   r   r   rE   r&   r   rj   r   r   r   r   r   r   rR   r  r  __all__r'   r'   r'   r(   <module>   st   
-0<
 Wa$  	   V