"""PyTorch MiniMax model."""

from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...configuration_utils import layer_type_validation
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import MoeModelOutputWithPast
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import OutputRecorder
from ..mixtral.configuration_mixtral import MixtralConfig
from ..mixtral.modeling_mixtral import (
    MixtralAttention,
    MixtralDecoderLayer,
    MixtralForCausalLM,
    MixtralForQuestionAnswering,
    MixtralForSequenceClassification,
    MixtralForTokenClassification,
    MixtralModel,
    MixtralPreTrainedModel,
    MixtralRMSNorm,
    MixtralSparseMoeBlock,
)


logger = logging.get_logger(__name__)


class MiniMaxConfig(MixtralConfig):
    r"""
    This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate a
    MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MiniMax.

    [MiniMaxAI/MiniMax-Text-01-hf](https://huggingface.co/MiniMaxAI/MiniMax-Text-01-hf)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the MiniMax model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`MiniMaxModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. MiniMax's sliding window attention
            allows sequences of up to `4096*32` tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to route per token; this can also be interpreted as the `top-k` routing
            parameter.
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss. See [here]() for more details.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Amount of noise to add to the router.
        layer_types (`list`, *optional*):
            Attention pattern for each layer, either `"full_attention"` or `"linear_attention"`.
        block_size (`int`, *optional*, defaults to 256):
            The length of each attention block, determining how queries, keys, and values
            are grouped and processed for intra- and inter-block attention.
        full_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after normal attention.
        full_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after normal attention.
        linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after lightning attention.
        linear_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after lightning attention.
        mlp_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after MLP.
        mlp_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after MLP.

    ```python
    >>> from transformers import MiniMaxModel, MiniMaxConfig

    >>> # Initializing a MiniMax style configuration
    >>> configuration = MiniMaxConfig()

    >>> # Initializing a model from the MiniMax style configuration
    >>> model = MiniMaxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
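
    >>> # Illustrative addition (not part of the original docstring): a small hypothetical variant that
    >>> # alternates lightning ("linear_attention") and softmax ("full_attention") layers explicitly
    >>> # and shrinks the lightning-attention block size
    >>> custom_configuration = MiniMaxConfig(
    ...     num_hidden_layers=4,
    ...     layer_types=["linear_attention", "full_attention"] * 2,
    ...     block_size=128,
    ... )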
    ```"""

    def __init__(
        self,
        layer_types=None,
        block_size=256,
        full_attn_alpha_factor=1,
        full_attn_beta_factor=1,
        linear_attn_alpha_factor=1,
        linear_attn_beta_factor=1,
        mlp_alpha_factor=1,
        mlp_beta_factor=1,
        **super_kwargs,
    ):
        super().__init__(**super_kwargs)
        self.layer_types = layer_types
        self.block_size = block_size
        self.full_attn_alpha_factor = full_attn_alpha_factor
        self.full_attn_beta_factor = full_attn_beta_factor
        self.linear_attn_alpha_factor = linear_attn_alpha_factor
        self.linear_attn_beta_factor = linear_attn_beta_factor
        self.mlp_alpha_factor = mlp_alpha_factor
        self.mlp_beta_factor = mlp_beta_factor

        if self.layer_types is None:
            # alternate full (softmax) and linear (lightning) attention when no explicit pattern is given
            self.layer_types = [
                "full_attention" if bool((i + 1) % 2) else "linear_attention" for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)


class MiniMaxRMSNorm(MixtralRMSNorm):
    pass


class MiniMaxCache(DynamicCache):
    def __init__(self):
        super().__init__()
        self.linear_cache = []

    def set_linear_cache(self, layer_idx, linear_cache):
        # pad the per-layer list with empty placeholders for any layers not seen yet
        for _ in range(len(self.linear_cache), layer_idx + 1):
            self.linear_cache.append([])
        self.linear_cache[layer_idx] = linear_cache

    def get_linear_cache(self, layer_idx: int):
        if layer_idx < len(self):
            return self.linear_cache[layer_idx]
        return None

    def __len__(self):
        return max(super().__len__(), len(self.linear_cache))

    def __getitem__(self, layer_idx: int):
        if layer_idx < len(self.linear_cache) and self.linear_cache[layer_idx] != []:
            return (self.linear_cache[layer_idx],)
        return super().__getitem__(layer_idx)

    def __iter__(self):
        for layer_idx in range(len(self)):
            yield self[layer_idx]

    def batch_repeat_interleave(self, repeats: int):
        for layer_idx in range(len(self)):
            if self.linear_cache[layer_idx] != []:
                self.linear_cache[layer_idx] = self.linear_cache[layer_idx].repeat_interleave(repeats, dim=0)
            else:
                self.layers[layer_idx].batch_repeat_interleave(repeats)

    def batch_select_indices(self, indices: torch.Tensor):
        for layer_idx in range(len(self)):
            if self.linear_cache[layer_idx] != []:
                self.linear_cache[layer_idx] = self.linear_cache[layer_idx][indices, ...]
            else:
                self.layers[layer_idx].batch_select_indices(indices)

    def crop(self, max_length: int):
        raise RuntimeError("MiniMaxCache does not support `crop` method")


class MiniMaxLightningAttention(nn.Module):
    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        self.num_attention_heads = config.num_attention_heads
        self.num_hidden_layers = config.num_hidden_layers
        self.block_size = config.block_size

        self.act_fn = ACT2FN[config.hidden_act]
        self.norm = MiniMaxRMSNorm(self.head_dim * self.num_attention_heads)
        self.qkv_proj = nn.Linear(config.hidden_size, self.num_attention_heads * self.head_dim * 3, bias=False)
        self.out_proj = nn.Linear(self.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.output_gate = nn.Linear(config.hidden_size, self.num_attention_heads * self.head_dim, bias=False)

        slope_rate = self.get_slope_rate()
        query_decay, key_decay, diagonal_decay = self.decay_factors(slope_rate)

        self.register_buffer("slope_rate", slope_rate)
        self.register_buffer("query_decay", query_decay)
        self.register_buffer("key_decay", key_decay)
        self.register_buffer("diagonal_decay", diagonal_decay)

    def get_slope_rate(self):
        # per-head decay rate, scaled down for deeper layers
        base = 1 / (2 ** (8 / self.num_attention_heads))
        exponent = torch.arange(self.num_attention_heads) + 1
        factor = 1 - self.layer_idx / (self.num_hidden_layers - 1 + 1e-5) + 1e-5

        rate = base**exponent
        rate = rate * factor
        rate = rate[:, None, None]

        return rate

    def decay_factors(self, slope_rate):
        block_size_range = torch.arange(self.block_size) + 1

        query_decay = torch.exp(-slope_rate * block_size_range[:, None])
        key_decay = torch.exp(-slope_rate * (self.block_size - block_size_range[:, None]))

        diagonal_decay = block_size_range[:, None] - block_size_range[None, :]
        diagonal_decay = diagonal_decay[None, None, :, :]
        diagonal_decay = slope_rate * diagonal_decay
        diagonal_decay = torch.where(diagonal_decay >= 0, -diagonal_decay, float("-inf"))
        diagonal_decay = torch.exp(diagonal_decay)

        return query_decay, key_decay, diagonal_decay

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        batch_size, seq_len, hidden_size = hidden_states.shape
        num_blocks = (seq_len + self.block_size - 1) // self.block_size

        qkv_states = self.act_fn(self.qkv_proj(hidden_states))
        qkv_states = qkv_states.reshape(batch_size, seq_len, self.num_attention_heads, 3 * self.head_dim)

        query_states, key_states, value_states = torch.split(qkv_states, self.head_dim, dim=3)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_weights_inter = None
        if past_key_values is not None:
            attn_weights_inter = past_key_values.get_linear_cache(self.layer_idx)

        if attn_weights_inter is None:
            # prefill: process the sequence block by block
            attn_weights_inter = torch.zeros(batch_size, self.num_attention_heads, self.head_dim, self.head_dim).to(
                value_states
            )

            # apply attention_mask to the values so padded positions do not enter the recurrent state
            if attention_mask is not None:
                attention_mask = attention_mask.to(dtype=torch.bool)
                value_states = value_states.masked_fill(~attention_mask.unsqueeze(1).unsqueeze(-1), 0)

            attn_output = []
            for i in range(num_blocks):
                start_idx = i * self.block_size
                end_idx = min(start_idx + self.block_size, seq_len)
                current_block_size = end_idx - start_idx

                current_query_states = query_states[:, :, start_idx:end_idx]
                current_key_states = key_states[:, :, start_idx:end_idx]
                current_value_states = value_states[:, :, start_idx:end_idx]

                current_query_decay = self.query_decay[:, :current_block_size]
                current_key_decay = self.key_decay[:, -current_block_size:]
                current_diagonal_decay = self.diagonal_decay[:, :, :current_block_size, :current_block_size]
                block_decay = torch.exp(-self.slope_rate * current_block_size)

                # intra-block attention: causal, decayed attention within the block
                attn_weights_intra = torch.matmul(current_query_states, current_key_states.transpose(-1, -2))
                attn_output_intra = torch.matmul(attn_weights_intra * current_diagonal_decay, current_value_states)

                # inter-block attention: read the recurrent key/value state accumulated from previous blocks
                attn_output_inter = torch.matmul(current_query_states * current_query_decay, attn_weights_inter)

                current_attn_output = attn_output_inter + attn_output_intra
                attn_output.append(current_attn_output)

                # update the recurrent state for the next block (or for the cache)
                next_attn_weights_inter = torch.matmul(
                    (current_key_states * current_key_decay).transpose(-1, -2), current_value_states
                )
                attn_weights_inter = attn_weights_inter * block_decay + next_attn_weights_inter

        else:
            # decoding: update the cached recurrent state token by token
            ratio = torch.exp(-self.slope_rate)
            attn_output = []
            for i in range(seq_len):
                current_query_states = query_states[:, :, i : i + 1]
                current_key_states = key_states[:, :, i : i + 1]
                current_value_states = value_states[:, :, i : i + 1]

                current_attn_weights_inter = torch.matmul(current_key_states.transpose(-1, -2), current_value_states)
                attn_weights_inter = ratio * attn_weights_inter + current_attn_weights_inter
                current_attn_output = torch.matmul(current_query_states, attn_weights_inter)

                attn_output.append(current_attn_output)

        # concatenate attention outputs over all blocks
        attn_output = torch.cat(attn_output, dim=-2)

        # final output projection with a sigmoid gate computed from the layer input
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(batch_size, seq_len, self.num_attention_heads * self.head_dim)
        attn_output = self.norm(attn_output)
        attn_output = F.sigmoid(self.output_gate(hidden_states)) * attn_output
        attn_output = self.out_proj(attn_output)

        # update cache with the recurrent state
        if past_key_values is not None:
            past_key_values.set_linear_cache(self.layer_idx, attn_weights_inter)

        return attn_output, attn_weights_inter


class MiniMaxAttention(MixtralAttention):
    pass


class MiniMaxSparseMoeBlock(MixtralSparseMoeBlock):
    pass


class MiniMaxDecoderLayer(MixtralDecoderLayer, GradientCheckpointingLayer):
    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__(config, layer_idx)

        self.layer_idx = layer_idx
        self.layer_type = config.layer_types[layer_idx]
        self.mlp_alpha_factor = config.mlp_alpha_factor
        self.mlp_beta_factor = config.mlp_beta_factor

        if self.layer_type == "linear_attention":
            self.self_attn = MiniMaxLightningAttention(config, layer_idx)
            self.attn_alpha_factor = config.linear_attn_alpha_factor
            self.attn_beta_factor = config.linear_attn_beta_factor
        else:
            self.self_attn = MiniMaxAttention(config, layer_idx)
            self.attn_alpha_factor = config.full_attn_alpha_factor
            self.attn_beta_factor = config.full_attn_beta_factor

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model.
        """
        # MiniMax uses weighted residual connections: `alpha` scales the residual stream,
        # `beta` scales the block output, and the residual is taken after the layernorm
        hidden_states = self.input_layernorm(hidden_states)
        residual = hidden_states

        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual * self.attn_alpha_factor + hidden_states * self.attn_beta_factor

        hidden_states = self.post_attention_layernorm(hidden_states)
        residual = hidden_states
        hidden_states, _ = self.block_sparse_moe(hidden_states)
        hidden_states = residual * self.mlp_alpha_factor + hidden_states * self.mlp_beta_factor

        return hidden_states


class MiniMaxPreTrainedModel(MixtralPreTrainedModel):
    _can_compile_fullgraph = False
    _can_record_outputs = {
        "router_logits": OutputRecorder(MiniMaxSparseMoeBlock, index=1),
        "hidden_states": MiniMaxDecoderLayer,
        "attentions": [MiniMaxAttention, MiniMaxLightningAttention],
    }


class MiniMaxModel(MixtralModel):
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[MiniMaxCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = MiniMaxCache()
        elif use_cache and not isinstance(past_key_values, MiniMaxCache):
            raise ValueError(
                f"MiniMax uses cache of its own and is not compatible with `past_key_values` of type {type(past_key_values)}."
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
        causal_mask = mask_function(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers:
            if decoder_layer.layer_type == "full_attention":
                input_attention_mask = causal_mask
            else:
                # lightning attention layers consume the original (unexpanded) attention mask
                input_attention_mask = attention_mask

            hidden_states = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=input_attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


class MiniMaxForCausalLM(MixtralForCausalLM):
    def forward(self, **super_kwargs):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
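
        >>> # Illustrative addition (not part of the original example): the MoE router logits can also be
        >>> # requested; `output_router_logits=True` is assumed to be forwarded to the Mixtral-style forward
        >>> outputs = model(**inputs, output_router_logits=True)
        >>> router_logits = outputs.router_logits  # one tensor of routing logits per decoder layer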

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        return super().forward(**super_kwargs)


class MiniMaxForSequenceClassification(MixtralForSequenceClassification):
    pass


class MiniMaxForTokenClassification(MixtralForTokenClassification):
    pass


class MiniMaxForQuestionAnswering(MixtralForQuestionAnswering):
    pass


__all__ = [
    "MiniMaxConfig",
    "MiniMaxForCausalLM",
    "MiniMaxForQuestionAnswering",
    "MiniMaxModel",
    "MiniMaxPreTrainedModel",
    "MiniMaxForSequenceClassification",
    "MiniMaxForTokenClassification",
]