"""PyTorch Doge model."""

import math
from typing import Callable, Optional, Union

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...configuration_utils import PretrainedConfig
from ...integrations.flex_attention import compile_friendly_flex_attention
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from ...modeling_rope_utils import rope_config_validation
from ...modeling_utils import AttentionInterface, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, is_torch_flex_attn_available
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import OutputRecorder
from ..llama.modeling_llama import (
    LlamaForSequenceClassification,
    LlamaMLP,
    LlamaPreTrainedModel,
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
    repeat_kv,
)
from ..mixtral.modeling_mixtral import MixtralForCausalLM, MixtralModel


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask


class DogeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate a Doge
    model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`DogeModel`].
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for each sequence transformation and state transformation module.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings.
            NOTE: if you apply a new rope type and you expect the model to work on longer `max_position_embeddings`, we
            recommend you update this value accordingly.
            The Doge family of small models uses `{'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048}` as the default value.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
                    In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'.
                    The original max position embeddings used during pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation.
                    If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>`original_max_position_embeddings`).
                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention.
            If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
            When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by mean-pooling all the original heads within that group.
            For more details, check out [this paper](https://huggingface.co/papers/2305.13245).
            If it is not specified, will default to `num_attention_heads`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `None`.
        keep_window_size (`int`, *optional*, defaults to 2048):
            The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
        is_moe (`bool`, *optional*, defaults to `False`):
            Whether to use the Cross-Domain Mixture of Experts (CDMoE). If `True`, the MoE inherits the MLP structure for its initialization.
        num_experts (`int`, *optional*, defaults to 16384):
            Number of routed experts in the model. This is only used when `is_moe=True`.
        num_experts_per_tok (`int`, *optional*, defaults to 64):
            Number of selected experts to route per-token.
        norm_topk_prob (`bool`, *optional*, defaults to `False`):
            Whether to normalize the top-k routing probabilities.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.

    ```python
    >>> from transformers import DogeConfig, DogeModel

    >>> # Initializing a Doge-320M style configuration
    >>> configuration = DogeConfig()

    >>> # Initializing a model from the Doge-320M style configuration
    >>> model = DogeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Zdogepast_key_valuesZcolwiseZrowwiseZsequence_parallelZcolwise_repZrowwise_rep)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.dt_projzlayers.*.self_attn.o_projzlayers.*.input_layernorm.weightzlayers.*.input_residual.weightz(layers.*.post_attention_layernorm.weightz'layers.*.post_attention_residual.weightznorm.weightzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projzlayers.*.mlp.router_gatezlayers.*.mlp.down_embedzlayers.*.mlp.up_embed	input_idsinputs_embedshidden_statesattention_mask)Zembed_tokensZlayersZnorm                     silu{Gz?ư>TF     @N    @  @   MbP?c                    s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| jd urd| jv r| jd | jd< t|  |d u r|| _t jf d|
i| d S )NtypeZ	rope_typetie_word_embeddings)
vocab_sizehidden_sizeintermediate_sizenum_hidden_layershidden_dropout
hidden_actinitializer_rangerms_norm_eps	use_cachemax_position_embeddings
rope_thetarope_scalingnum_attention_headsnum_key_value_headsattention_biasattention_dropoutmlp_biassliding_windowkeep_window_sizeis_moenum_expertsnum_experts_per_toknorm_topk_proboutput_router_logitsrouter_aux_loss_coefr   super__init__)selfr7   r8   r9   r:   r;   r<   r=   r>   r?   r6   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   kwargs	__class__ a/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/doge/modular_doge.pyrQ      sF    
zDogeConfig.__init__)r(   r)   r*   r+   r,   r-   r.   r/   TFr*   r0   Nr1   NFr,   FNr*   Fr2   r3   FFr4   )
__name__
__module____qualname____doc__Z
model_typeZkeys_to_ignore_at_inferenceZbase_model_tp_planZbase_model_pp_planrQ   __classcell__rV   rV   rT   rW   r"   7   sf   p

                          r"   c                   @   s   e Zd ZdS )DogeRMSNormNrX   rY   rZ   rV   rV   rV   rW   r]     s   r]   c                   @   s   e Zd ZdS )DogeRotaryEmbeddingNr^   rV   rV   rV   rW   r_     s   r_   r!   )	modulequerykeyvaluer'   scalingsoftcap	head_maskreturnc              
      s   d }	d  t |tr|}	n|  d urJ d d d d d d d |jd f   fdd}
t||||
|	d|dd\}}||j}|dd }||fS )Nc                    s^   d urt |   }  d ur:|  | | | |  } d urZ| | | d d  } | S )Nr   )torchtanh)ZscoreZ	batch_idxZhead_idxZq_idxZkv_idxZcausal_maskrf   re   rV   rW   	score_mod*  s    z)flex_attention_forward.<locals>.score_modT)rl   
block_maskZ
enable_gqascaleZ
return_lse   r   )
isinstancer!   shaper
   todtype	transpose
contiguous)r`   ra   rb   rc   r'   rd   re   rf   rS   rm   rl   attn_outputZattention_weightsrV   rk   rW   flex_attention_forward  s*    
&	
rw   Zdoge_flex_attentionc                       s   e Zd Zdeee d fddZedddddej	e
ej	ej	f eej	 ee eej e
ej	eej	 ee
ej	  f d	d
dZdej	ej	eeej	 dddZ  ZS )DogeAttentionNconfig	layer_idxc                    s(  t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _|j| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tt|j| _tj|j| j |j|jd| _tj|j| j |j|jd| _t| j|jd| _t| j|jd| _d S )Nhead_dimg      ࿩Zbiaseps)rP   rQ   rz   r{   getattrr8   rC   r|   rD   num_key_value_groupsrd   rF   rI   r   LinearrE   q_projk_projv_proj	Parameterri   zerosAdt_projo_projr]   r>   q_normk_normrR   rz   r{   rT   rV   rW   rQ   K  s4    
zDogeAttention.__init__past_key_valuer#   4.58new_nameversion)r&   position_embeddingsr'   r#   cache_positionrg   c                 K   s  |j d d }g |d| jR }| | ||dd}	| | ||dd}
| ||dd}|\}}t	|	|
||\}	}
|d ur|||d}|
|
|| j|\}
}| |dd|j d |j d d}t| jt| dd}| j||| j|d}t|| j}t}| jjdkr>t| jj }|| |	|
|f|| jsXd	n| j| jd
|\}}|jg |dR   }| |}||fS )Nro   r   )sincosr   r   rh   r&   	dt_statesrI   r'   eagerr,   )r'   dropoutrd   ) rq   r|   r   r   viewrt   r   r   r   r   updater{   r   reshaperi   expr   FZsoftplusprepare_dynamic_maskrI   r   r   r   rz   Z_attn_implementationALL_ATTENTION_FUNCTIONStrainingrF   rd   ru   r   )rR   r&   r   r'   r#   r   rS   Zinput_shapeZhidden_shapeZquery_statesZ
key_statesZvalue_statesr   r   Zcache_kwargsr   	attn_maskZattention_interfacerv   Zattn_weightsrV   rV   rW   forwardi  sN    
 

zDogeAttention.forwardr*   r   c           
   	   C   s  t |jj}|j}|dddddddf dd|jd d}|durt|ts|jt jkr|j}t 	|t j
d|j|d|}||ddddddd|jd f dk|}|jd |kr
t j|||jd}t j||ddd	d
j}	|d|	d}||dk|}|S )a8  
        The core idea of DMA is to calculate a dynamic attention mask that masks out the least relevant tokens, so as to form sparse attention.

        Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

        Args:
            hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
            dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
            keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
            attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
        """
        min_dtype = torch.finfo(hidden_states.dtype).min
        dtype = hidden_states.dtype
        attn_mask = dt_states[:, :, None, :].expand(
            -1, -1, hidden_states.shape[1], -1
        )  # [batch_size, num_heads, query_len, key_len]
        if attention_mask is not None and not isinstance(attention_mask, BlockMask):
            if attention_mask.dtype == torch.bool:
                attention_mask = torch.where(
                    attention_mask, torch.tensor(0.0, device=attention_mask.device, dtype=dtype), min_dtype
                )
            attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : attn_mask.shape[-1]] != 0, min_dtype)
        if attn_mask.shape[-1] > keep_window_size:
            # dynamic masking is only performed when the key length exceeds the keep window
            active_mask = torch.zeros_like(attn_mask, dtype=dtype, device=attn_mask.device)
            topk_indices = torch.topk(attn_mask, keep_window_size, dim=-1, largest=True, sorted=False).indices
            active_mask = active_mask.scatter(-1, topk_indices, 1.0)
            attn_mask = attn_mask.masked_fill(active_mask == 0.0, min_dtype)
        return attn_mask
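

# A self-contained sketch of the dynamic-mask selection implemented in
# `prepare_dynamic_mask` above (illustrative only; the concrete shapes are assumptions):
#
#     import torch
#
#     batch, num_heads, q_len, k_len, keep_window_size = 1, 2, 4096, 4096, 2048
#     dt_states = torch.randn(batch, num_heads, k_len)  # per-key-token gate values
#     attn_mask = dt_states[:, :, None, :].expand(-1, -1, q_len, -1)
#     topk = torch.topk(attn_mask, keep_window_size, dim=-1, largest=True, sorted=False)
#     active = torch.zeros_like(attn_mask).scatter(-1, topk.indices, 1.0)
#     attn_mask = attn_mask.masked_fill(active == 0.0, torch.finfo(attn_mask.dtype).min)
#
# The result is an additive bias: each query row keeps the `keep_window_size` keys with
# the largest `dt_states` values and sends every other position to -inf before softmax.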
rR   r&   r   rI   r'   Z	min_dtypers   r   Zactive_maskZtopk_indicesrV   rV   rW   r     s$    2z"DogeAttention.prepare_dynamic_mask)N)NNN)r*   N)rX   rY   rZ   r"   r   intrQ   r   ri   Tensortupler   
LongTensorr   r   r\   rV   rV   rT   rW   rx   J  s*      <  rx   c                   @   s   e Zd ZdS )DogeMLPNr^   rV   rV   rV   rW   r     s   r   c                       s6   e Zd Zed fddZejejdddZ  ZS )	DogeCDMoE)rz   c                    s   t    |j| _|j| _t|j | _|j| _t	t
| j| _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _tj| j| jd dd| _t| j| j| _t| j| j| _d S )Nr}   r   F)rP   rQ   r8   r9   r   r<   act_fnrK   mathfloorsqrtnum_keysrL   top_krM   r   r   rG   	gate_projup_proj	down_projrouter_gateZ	Embedding
down_embedup_embedrR   rz   rT   rV   rW   rQ     s    
zDogeCDMoE.__init__)r&   rg   c                 K   s  |j \}}}| |d|| d}|j| jdd\\}}\}	}
|d|d }|	d| j |
d }|jg |j d d dR  }|jg |j d d dR  }|j| jdd\}}|d|}tj	|dd}| j
r||jddd }| |}| |}t|||| dd|| d}| || }t||| dd|||d}| | | || | }|| }||fS )Nr   r   r   rh   T)r   Zkeepdimro   )rq   r   r   r   r   	unsqueezer   gatherr   softmaxrM   sumr   r   ri   matmulr   r   r   r   )rR   r&   rS   ZbszZseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesZscoresposition_indicesr   routing_weightsr   r   Zexperts_weightsZexperts_statesrV   rV   rW   r     s(    

&$ zDogeCDMoE.forward)	rX   rY   rZ   r"   rQ   ri   r   r   r\   rV   rV   rT   rW   r     s   r   c                       s   e Zd Zdeee d fddZedddddej	e
ej	ej	f eej	 eej ee
ej	  ee eej ee e
ejee
ejejf  f d
	ddZ  ZS )DogeDecoderLayerNry   c                    s   t    |j| _t|j|jd| _t||d| _t	
t|j| _t|j|jd| _|jsft|nt|| _t	
t|j| _d S )Nr~   ry   )rP   rQ   r;   r]   r8   r>   input_layernormrx   	self_attnr   r   ri   Zonesinput_residualpost_attention_layernormrJ   r   r   mlppost_attention_residualr   rT   rV   rW   rQ     s    
zDogeDecoderLayer.__init__r   r#   r   r   F)	r&   r   r'   position_idsr#   r?   r   rS   rg   c              
   K   s   |}	|  |}| jf |||||||d|\}}
tj|| j| jd}| j|	 | }|}	| |}| |}tj|| j| jd}| j	|	 | }|S )N)r&   r   r'   r   r#   r?   r   )pr   )
r   r   r   r   r;   r   r   r   r   r   )rR   r&   r   r'   r   r#   r?   r   rS   ZresidualZself_attn_weightsrV   rV   rW   r     s*    




zDogeDecoderLayer.forward)N)NNNFN)rX   rY   rZ   r"   r   r   rQ   r   ri   r   r   r   r   r   r   FloatTensorr   r\   rV   rV   rT   rW   r     s$        r   c                   @   s0   e Zd ZdZdZeeddeedZ	dd Z
dS )DogePreTrainedModelFro   )index)r   r&   
attentionsc                 C   sl   t | | t|tr.t|drh|jj  n:t|trht|drP|j	j
d t|drh|jj
d dS )zInitialize the weightsr   r   r   r   N)r   _init_weightsrp   rx   hasattrr   dataZzero_r   r   Zfill_r   )rR   r`   rV   rV   rW   r   A  s    




z!DogePreTrainedModel._init_weightsN)rX   rY   rZ   Z_supports_flash_attnZ_can_compile_fullgraphr   r   r   rx   Z_can_record_outputsr   rV   rV   rV   rW   r   8  s   
r   c                   @   s   e Zd ZdS )	DogeModelNr^   rV   rV   rV   rW   r   N  s   r   )gate_logitsrK   r   r   r'   rg   c                 C   sx  | du st | tsdS | d j}| d j}g }g }| D ]}	|	|}	|	j|dd\\}
}\}}|
d|d }|d| |d }|jg |jdd dR  }|jg |jdd dR  }|j|dd\}}|	d|}t
j|dd}|| || q6tj|dd}tj|dd}|du r|d}tj|||d}tj|||d}|d|||jd  }tj|dd}n|j\}}t| }|ddddddf ||||fd|}|d|  }tj|||d}tj|||d}|d||t| }|ddddddf ||||fd||}tj|| ddtj|dd }t|| }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in PyTorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [2, batch_size * sequence_length, num_keys].
        num_experts:
            Number of experts
        num_keys:
            Number of keys
        top_k:
            The number of experts to route per token; can also be interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    compute_dtype = gate_logits[0].dtype
    compute_device = gate_logits[0].device
    all_expert_indices = []
    all_routing_weights = []

    for layer_gate_logits in gate_logits:
        layer_gate_logits = layer_gate_logits.to(compute_device)

        # compose the two product-key score/index grids, as in `DogeCDMoE.forward`
        (scores_x, scores_y), (indices_x, indices_y) = layer_gate_logits.topk(num_keys, dim=-1)

        all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)
        all_indices = indices_x.unsqueeze(-1) * num_keys + indices_y.unsqueeze(-2)
        all_scores = all_scores.view(*all_scores.shape[:-2], -1)
        all_indices = all_indices.view(*all_indices.shape[:-2], -1)

        _, position_indices = all_scores.topk(top_k, dim=-1)
        expert_indices = all_indices.gather(-1, position_indices)
        routing_weights = F.softmax(all_scores, dim=-1)

        all_expert_indices.append(expert_indices)
        all_routing_weights.append(routing_weights)

    all_expert_indices = torch.cat(all_expert_indices, dim=0)
    all_routing_weights = torch.cat(all_routing_weights, dim=0)

    if attention_mask is None:
        # Compute the percentage of tokens routed to each expert
        all_expert_indices = all_expert_indices.view(-1)
        tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device)
        pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device)
        tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / all_expert_indices.shape[0]

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(all_routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = len(gate_logits)

        # Compute the mask that masks all padding tokens as 0 with the same shape as the expert indices
        expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k))
            .reshape(-1, top_k)
            .to(compute_device)
        )

        # Compute the percentage of tokens routed to each expert, ignoring padding tokens
        tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device)
        pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device) * expert_attention_mask
        tokens_per_expert = tokens_per_expert.scatter_add_(
            0, all_expert_indices.view(-1), pad.view(-1)
        ) / expert_attention_mask.sum()

        # Compute the mask that masks all padding tokens as 0 with the same shape as the routing weights
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_keys * num_keys))
            .reshape(-1, num_keys * num_keys)
            .to(compute_device)
        )

        # Compute the average probability of routing to these experts, ignoring padding tokens
        router_prob_per_expert = torch.sum(all_routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert)
    return overall_loss * num_experts


class DogeForCausalLM(MixtralForCausalLM):
    def __init__(self, config: DogeConfig):
        super().__init__(config)
        self.model = DogeModel(config)
        self.num_experts = config.num_experts

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        output_router_logits: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DogeForCausalLM

        >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
        >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        outputs: MoeModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits,
                self.num_experts,
                math.floor(math.sqrt(self.num_experts)),
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                # make sure the auxiliary loss resides on the same device as the loss
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )


class DogeForSequenceClassification(LlamaForSequenceClassification):
    pass


__all__ = [
    "DogeConfig",
    "DogeForCausalLM",
    "DogeForSequenceClassification",
    "DogeModel",
    "DogePreTrainedModel",
]