from typing import Optional, Union

import torch
from torch import nn

from ...cache_utils import Cache
from ...modeling_outputs import BaseModelOutputWithPast, MoeModelOutputWithPast
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging
from ...utils.deprecation import deprecate_kwarg
from ..bamba.configuration_bamba import BambaConfig
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedPreTrainedModel   )GraniteMoeHybridConfigc                       s$   e Zd Zeed fddZ  ZS )GraniteMoeHybridAttentionconfig	layer_idxc                    s   t  || d S Nsuper__init__selfr   r   	__class__ y/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr#   ,   s    z"GraniteMoeHybridAttention.__init____name__
__module____qualname__r   intr#   __classcell__r(   r(   r&   r)   r   +   s   r   c                       s$   e Zd Zeed fddZ  ZS )GraniteMoeHybridMambaLayerr   c                    s   t  t|| d S r    )r"   r#   r   r$   r&   r(   r)   r#   1   s    z#GraniteMoeHybridMambaLayer.__init__r*   r(   r(   r&   r)   r0   0   s   r0   c                       s   e Zd Zd fdd	Z  ZS )GraniteMoeHybridRMSNormGatedư>c                    s   t  || d S r    r!   )r%   Zhidden_sizeepsr&   r(   r)   r#   6   s    z%GraniteMoeHybridRMSNormGated.__init__)r2   )r+   r,   r-   r#   r/   r(   r(   r&   r)   r1   5   s   r1   c                       s"   e Zd Zed fddZ  ZS )GraniteMoeHybridMLPr   c                    s   t  | d S r    r!   r%   r   r&   r(   r)   r#   ;   s    zGraniteMoeHybridMLP.__init__)r+   r,   r-   r   r#   r/   r(   r(   r&   r)   r4   :   s   r4   c                       s   e Zd Zeed fddZedddddeje	ej e	e
 e	e e	e e	ej e	e e	eejejf  ee eeje	eejejf  f d

ddZ  ZS )GraniteMoeHybridDecoderLayerr   c                    sn   t  || t|| _d | _d | _|j| dkr@t||| _nt||| _|j| | _	t
|dddk| _d S )NmambaZnum_local_expertsr   )r"   r#   r4   
shared_mlp	self_attnr8   Zlayers_block_typer0   r   
layer_typegetattrhas_expertsr$   r&   r(   r)   r#   @   s    
z%GraniteMoeHybridDecoderLayer.__init__Zpast_key_valuepast_key_valuesz4.58)new_nameversionNF)
    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        output_router_logits: Optional[bool] = False,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[GraniteFlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        N)rA   rE   Zcache_paramsrB   )rA   rB   r>   rC   rD   rE   rG   )Zinput_layernormr8   r:   Zresidual_multiplierZpost_attention_layernormr=   Zblock_sparse_moer9   )r%   rA   rB   r>   rC   rD   rE   rF   rG   rH   ZresidualZself_attn_weightsZmoe_hidden_statesrouter_logitsoutputsr(   r(   r)   forwardP   sL    &






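
# Illustrative sketch (not part of the original module): how the mamba/attention
# dispatch in `GraniteMoeHybridDecoderLayer.__init__` plays out for a hypothetical
# config whose `layers_block_type` is ["mamba", "attention"]. Exactly one of the
# two sub-layers exists per layer, and `forward` routes through whichever is set:
#
#     layer0 = GraniteMoeHybridDecoderLayer(config, layer_idx=0)
#     layer0.layer_type, layer0.self_attn  # -> ("mamba", None)
#     layer1 = GraniteMoeHybridDecoderLayer(config, layer_idx=1)
#     layer1.layer_type, layer1.mamba      # -> ("attention", None)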


class GraniteMoeHybridPreTrainedModel(GraniteMoeSharedPreTrainedModel):
    config: GraniteMoeHybridConfig
    _no_split_modules = ["GraniteMoeHybridDecoderLayer"]
    _is_stateful = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, GraniteMoeHybridMambaLayer):
            module.dt_bias.data.fill_(1.0)
            module.A_log.data = torch.log(torch.arange(1, module.num_heads + 1))
            module.D.data.fill_(1.0)
        elif isinstance(module, GraniteMoeHybridRMSNormGated):
            module.weight.data.fill_(1.0)
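
# Illustrative note (not part of the original module): for a mamba layer with
# `num_heads = 4`, `_init_weights` above fills `dt_bias` and `D` with ones and
# sets `A_log` to the log of 1..num_heads:
#
#     torch.log(torch.arange(1, 4 + 1))  # tensor([0.0000, 0.6931, 1.0986, 1.3863])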


class GraniteMoeHybridModel(GraniteMoeSharedModel):
    def __init__(self, config: GraniteMoeHybridConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [GraniteMoeHybridDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[GraniteFlashAttentionKwargs],
    ) -> Union[tuple, MoeModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        inputs_embeds = inputs_embeds * self.embedding_multiplier

        if use_cache and past_key_values is None:
            logger.warning_once(
                "GraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. "
                "Because one was not provided, no cache will be returned."
            )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )
        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)

        hidden_states = inputs_embeds

        # Create the position embeddings once and share them across decoder layers.
        position_embeddings = None
        if self.rotary_emb is not None:
            position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None

        for decoder_layer in self.layers:
            # Mamba layers consume the 2D mask, attention layers the 4D causal mask.
            layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask

            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=layer_mask,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                output_router_logits=output_router_logits,
                position_embeddings=position_embeddings,
                **kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions and layer_outputs[1] is not None:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits and layer_outputs[-1] is not None:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        # Add hidden states from the last decoder layer.
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if past_key_values and not past_key_values.has_previous_state:
            past_key_values.has_previous_state = True

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    def _update_mamba_mask(self, attention_mask, cache_position):
        """
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        """
        mamba_mask = attention_mask
        if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
            mamba_mask = None
        return mamba_mask
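
# Illustrative sketch (not part of the original module): `_update_mamba_mask` above
# only keeps the mask for an uncached (prefill) forward that contains real padding:
#
#     mask = torch.tensor([[1, 1, 0]])
#     model._update_mamba_mask(mask, torch.tensor([0, 1, 2]))              # mask kept (padding present)
#     model._update_mamba_mask(torch.ones(1, 3), torch.tensor([0, 1, 2]))  # None (attending to all inputs)
#     model._update_mamba_mask(mask, torch.tensor([5, 6]))                 # None (cached forward)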

class GraniteMoeHybridForCausalLM(GraniteMoeSharedForCausalLM):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: GraniteMoeHybridConfig):
        super().__init__(config)
        self.model = GraniteMoeHybridModel(config)
        # Initialize weights and apply final processing
        self.post_init()

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # Overwritten -- this model uses a unique cache type, `HybridMambaAttentionDynamicCache`

        empty_past_kv = past_key_values is None

        # If we have cache: slice `input_ids` through `cache_position` to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if not empty_past_kv:
            if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else" is Exception 2)
                input_ids = input_ids[:, cache_position]
        else:
            past_key_values = HybridMambaAttentionDynamicCache(
                self.config, input_ids.shape[0], self.dtype, device=self.device
            )

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if not empty_past_kv:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        if inputs_embeds is not None and empty_past_kv:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
            }
        )
        return model_inputs


__all__ = ["GraniteMoeHybridForCausalLM", "GraniteMoeHybridModel", "GraniteMoeHybridPreTrainedModel"]
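
# Usage sketch (illustrative, not part of the original module). During generation,
# `prepare_inputs_for_generation` builds a fresh `HybridMambaAttentionDynamicCache`
# on the first step and afterwards slices `input_ids` via `cache_position`. The
# checkpoint name below is a placeholder, not a real model id:
#
#     from transformers import AutoModelForCausalLM, AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("<granitemoehybrid-checkpoint>")
#     model = AutoModelForCausalLM.from_pretrained("<granitemoehybrid-checkpoint>")
#     inputs = tokenizer("Hello", return_tensors="pt")
#     output_ids = model.generate(**inputs, max_new_tokens=20)
#     print(tokenizer.decode(output_ids[0], skip_special_tokens=True))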