from typing import Optional, TypedDict

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...processing_utils import Unpack
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg
from ..granitemoe.modeling_granitemoe import (
    GraniteMoeDecoderLayer,
    GraniteMoeForCausalLM,
    GraniteMoeModel,
    GraniteMoePreTrainedModel,
)
from .configuration_granitemoeshared import GraniteMoeSharedConfig


logger = logging.get_logger(__name__)


class GraniteFlashAttentionKwargs(TypedDict, total=False):
    """
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`):
            Cumulative sequence lengths for the query state.
        cu_seq_lens_k (`torch.LongTensor`):
            Cumulative sequence lengths for the key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor`):
            Index of each packed sequence.
    Zcu_seq_lens_qZcu_seq_lens_kZmax_length_qZmax_length_kZseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__intZ	IntTensor r   r   y/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/granitemoeshared/modular_granitemoeshared.pyr   &   s   


r   F)totalc                       s:   e Zd ZdZed fddZejejdddZ  Z	S )GraniteMoeSharedMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    """

    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__()

        self.input_size = config.hidden_size
        self.hidden_size = config.shared_intermediate_size
        self.activation = ACT2FN[config.hidden_act]
        # A single input projection produces both the gate and the up projection (2x width).
        self.input_linear = nn.Linear(self.input_size, self.hidden_size * 2, bias=False)
        self.output_linear = nn.Linear(self.hidden_size, self.input_size, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.input_linear(hidden_states)
        # Split into (gate, up) halves and apply the gated activation.
        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
        hidden_states = self.output_linear(hidden_states)
        return hidden_states
r   r   r   r   r   r"   r   Tensorr0   __classcell__r   r   r)   r   r   ?   s   	r   c                       s   e Zd Zeed fddZedddddeje	ej e	ej
 e	e e	e e	e e	ej
 e	e e	eejejf  ee eeje	eejejf  f d
ddZ  ZS )GraniteMoeSharedDecoderLayer)r    	layer_idxc                    s*   t  || |jdkrd nt|| _d S )Nr   )r!   r"   r#   r   
shared_mlp)r(   r    r4   r)   r   r   r"   Z   s    z%GraniteMoeSharedDecoderLayer.__init__Zpast_key_valuepast_key_valuesz4.58)new_nameversionNF)r+   attention_maskposition_idsr6   output_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingskwargsr,   c
                 K   s   |}|  |}| jf ||||||||	d|
\}}||| j  }|}| |}| |\}}| jdu rn|}n|| | }~||| j  }|f}|r||f7 }|r||f7 }|S )aD  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or to improve `torch.compile` performance.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states * self.residual_multiplier

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)

        if self.shared_mlp is None:
            hidden_states = moe_hidden_states
        else:
            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
        del moe_hidden_states

        hidden_states = residual + hidden_states * self.residual_multiplier

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs
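
# Layer data flow, in brief (a summary of the forward pass above, not extra behavior):
#     h = h + residual_multiplier * attn(input_layernorm(h))
#     h = h + residual_multiplier * (block_sparse_moe(post_ln(h)) + shared_mlp(post_ln(h)))
# where post_ln is post_attention_layernorm and the shared_mlp term is dropped when
# config.shared_intermediate_size == 0.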
   r   r1   r   r   r   booltupler   r   ZFloatTensorr0   r2   r   r   r)   r   r3   Y   s.           r3   c                   @   s   e Zd ZU eed< dgZdS )GraniteMoeSharedPreTrainedModelr    r3   N)r   r   r   r   r   Z_no_split_modulesr   r   r   r   rD      s   


class GraniteMoeSharedModel(GraniteMoeModel):
    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [GraniteMoeSharedDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )


class GraniteMoeSharedForCausalLM(GraniteMoeForCausalLM):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.model = GraniteMoeSharedModel(config)
        # Initialize weights and apply final processing
        self.post_init()


__all__ = ["GraniteMoeSharedForCausalLM", "GraniteMoeSharedModel", "GraniteMoeSharedPreTrainedModel"]
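
# Minimal usage sketch (illustrative only; the checkpoint id below is a placeholder,
# substitute a real GraniteMoeShared checkpoint):
#
#     from transformers import AutoModelForCausalLM, AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("ibm-granite/<granitemoeshared-checkpoint>")
#     model = AutoModelForCausalLM.from_pretrained("ibm-granite/<granitemoeshared-checkpoint>")
#     inputs = tokenizer("Hello", return_tensors="pt")
#     generated = model.generate(**inputs, max_new_tokens=20)
#     print(tokenizer.decode(generated[0], skip_special_tokens=True))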