a
    h                    @   s  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	m
  mZ d dlZd dlm
Z
 d dlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ e% r,d dl,m-Z- d dl.m/Z/ d dl0m1Z1 ne2Z/e&3e4Z5G dd deZ6dNej7ej7eej7 eej7 e8ej7ej7ej7e9eej7 eej7 f dddZ:ej7ej7e9e9ej7dddZ;G dd dej<j=Z>dOeej7 ee9 ddd Z?G d!d" d"e/Z@G d#d$ d$e
jAZBG d%d& d&e
jAZCG d'd( d(e*ZDdPd*ej7ej7ej7eejE e8e9e9f e9e9eeF ee8ej7ej7f e8ej7 f d+
d,d-ZGejHfd*ej7e@ej7e9e8e9e9f e9e9ejIe8ej7 d.
d/d0ZJd*ej7ej7ej7eejE e8e9e9f e9e9e8ej7 d1	d2d3ZKeJeGeKd4ZLG d5d* d*e
jAZMG d6d7 d7eZNe$G d8d9 d9e"ZOe$G d:d; d;eOZPG d<d= d=e
jAZQe$d>d?G d@dA dAeOZRe$dBd?G dCdD dDeOZSe$dEd?G dFdG dGeOZTe$G dHdI dIeOZUe$dJd?G dKdL dLeOZVg dMZWdS )Q    N)nullcontext)LiteralOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)PretrainedConfig)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringis_flash_attn_2_availablelogging)is_triton_available   )GemmaRotaryEmbeddingapply_rotary_pos_emb) flash_attn_varlen_qkvpacked_func)RotaryEmbedding)apply_rotaryc                #       sJ   e Zd ZdZdZddiZdgZd#ed d fdd Z fd!d"Z	  Z
S )$ModernBertConfiga  
    This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the ModernBERT-base.
    e.g. [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50368):
            Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ModernBertModel`]
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 22):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer decoder.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu"`
            if not specified.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
            The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        norm_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the normalization layers.
        pad_token_id (`int`, *optional*, defaults to 50283):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 50282):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 50281):
            Beginning of stream token id.
        cls_token_id (`int`, *optional*, defaults to 50281):
            Classification token id.
        sep_token_id (`int`, *optional*, defaults to 50282):
            Separation token id.
        global_rope_theta (`float`, *optional*, defaults to 160000.0):
            The base period of the global RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        global_attn_every_n_layers (`int`, *optional*, defaults to 3):
            The number of layers between global attention layers.
        local_attention (`int`, *optional*, defaults to 128):
            The window size for local attention.
        local_rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the local RoPE embeddings.
        embedding_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the MLP layers.
        mlp_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the MLP layers.
        decoder_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the decoder layers.
        classifier_pooling (`str`, *optional*, defaults to `"cls"`):
            The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
            CLS token doesn't attend to all tokens on long sequences.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        classifier_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the classifier.
        classifier_activation (`str`, *optional*, defaults to `"gelu"`):
            The activation function for the classifier.
        deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
            Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
        sparse_prediction (`bool`, *optional*, defaults to `False`):
            Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
        sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
            The index to ignore for the sparse prediction.
        reference_compile (`bool`, *optional*):
            Whether to compile the layers of the model which were compiled during pretraining. If `None`, then parts of
            the model will be compiled if 1) `triton` is installed, 2) the model is not on MPS, 3) the model is not
            shared between devices, and 4) the model is not resized after initialization. If `True`, then the model may
            be faster in some scenarios.
        repad_logits_with_grad (`bool`, *optional*, defaults to `False`):
            When True, ModernBertForMaskedLM keeps track of the logits' gradient when repadding for output. This only
            applies when using Flash Attention 2 with passed labels. Otherwise output logits always have a gradient.

    Examples:

    ```python
    >>> from transformers import ModernBertModel, ModernBertConfig

    >>> # Initializing a ModernBert style configuration
    >>> configuration = ModernBertConfig()

    >>> # Initializing a model from the modernbert-base style configuration
    >>> model = ModernBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Z
modernbert
rope_thetaglobal_rope_thetaZpast_key_values             gelu    {Gz?       @h㈵>Fk  j  i       A        r
           @TclsNr4   mean)classifier_poolingc$           %         s   t  jf |||||d|$ || _|| _|| _|| _|| _|| _|| _|	| _	|
| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| | _|!| _|"| _|#| _| jdvrt d| j dd S )N)pad_token_idbos_token_ideos_token_idcls_token_idsep_token_idr6   zQInvalid value for `classifier_pooling`, should be either "cls" or "mean", but is .)!super__init__
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsinitializer_rangeinitializer_cutoff_factornorm_eps	norm_biasr"   attention_biasattention_dropouthidden_activationglobal_attn_every_n_layerslocal_attentionlocal_rope_thetaembedding_dropoutmlp_biasmlp_dropoutdecoder_biasr8   classifier_dropoutclassifier_biasclassifier_activationdeterministic_flash_attnsparse_predictionsparse_pred_ignore_indexreference_compilerepad_logits_with_grad
ValueError)%selfrA   rC   rD   rE   rF   rM   rB   rG   rH   rI   rJ   r9   r;   r:   r<   r=   r"   rK   rL   rN   rO   rP   rQ   rR   rS   rT   r8   rU   rV   rW   rX   rY   rZ   r[   r\   kwargs	__class__ m/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/modernbert/modular_modernbert.pyr@      sV    '
zModernBertConfig.__init__c                    s   t   }|dd  |S )Nr[   )r?   to_dictpopr^   outputr`   rb   rc   rd      s    
zModernBertConfig.to_dict)#r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   Fr-   r.   r/   r/   r.   r0   Fr1   r
   r2   r3   r1   Fr1   Tr4   r1   Fr(   FFr5   NF)__name__
__module____qualname____doc__Z
model_typeZattribute_mapZkeys_to_ignore_at_inferencer   r@   rd   __classcell__rb   rb   r`   rc   r    8   sV   g                                   Sr    )inputsattention_maskposition_idslabelsreturnc                 C   s   |j dtjd}tj| dd }t|  }tjj	
tj|dtjdd}|  dkrn|  | }n*| j^}	}
}|	|
 }| j|g|R  | }|dur| | nd}|dur| | nd}||||||fS )	a  
    Remove padding from input sequences.

    Args:
        inputs: (batch, seqlen, ...) or (batch, seqlen)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        position_ids: (batch, seqlen), int, position ids
        labels: (batch, seqlen), int, labels

    Returns:
        unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        cu_seqlens: (batch + 1), the cumulative sequence lengths
        max_seqlen_in_batch: int
        unpadded_position_ids: (total_nnz) or None
        unpadded_labels: (total_nnz) or None
    dimdtypeF)as_tupler   )   r   r   N)sumtorchZint32Znonzeroflattenintmaxitemr   
functionalpadZcumsumrt   shapeview)rm   rn   ro   rp   Zseqlens_in_batchindicesZmax_seqlen_in_batch
cu_seqlensZunpadded_inputsbatchseqlenrestr   Zunpadded_position_idsZunpadded_labelsrb   rb   rc   _unpad_modernbert_input   s    r   )rm   r   r   r   rq   c                 C   s   |   dkr:tj|| | j| jd}| ||< |||}nH| j^}}tj|| g|R | j| jd}| ||< |j||g|R  }|S )aQ  
    Add padding to sequences.

    Args:
        inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        batch: int, batch size
        seqlen: int, max sequence length

    Returns:
        padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
    rw   )ru   device)rt   ry   Zzerosru   r   r   r   )rm   r   r   r   rg   Zpadded_inputs_r   rb   rb   rc   _pad_modernbert_output&  s    
"r   c                   @   s8   e Zd Zedeej ee dddZedd Z	dS )ApplyRotaryEmbUnpadNr   
max_seqlenc              
   C   sd   |  }|j\}}}}	|d d d df |d|	}
t|
||d||ddd | ||| || _|S )Nr   rr   r   FT)seqlen_offsetsr   r   interleavedinplace)
contiguousr   r   r   Zsave_for_backwardr   )ctxqkvcossinr   r   	total_nnz_three_nheadsheaddimZqkrb   rb   rc   forwardF  s     
zApplyRotaryEmbUnpad.forwardc           
      C   sn   | j \}}}| }|j\}}}}|d d d df |d|}	t|	||d|| jdddd	 |d d d d d d fS )Nr   rr   r   FT)r   r   r   r   r   	conjugate)Zsaved_tensorsr   r   r   r   r   )
r   dor   r   r   r   r   r   r   Zdqkrb   rb   rc   backwarde  s     zApplyRotaryEmbUnpad.backward)NN)
rh   ri   rj   staticmethodr   ry   Tensorr{   r   r   rb   rb   rb   rc   r   E  s     r   r   c                 C   s   t | ||||S )a  
    Arguments:
        qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (total_nnz, dim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    )r   apply)r   r   r   r   r   rb   rb   rc   apply_rotary_unpadded|  s    r   c                	       s   e Zd ZdZdeeee eej eej	 d fddZ
dejejee eejeejejf f ddd	Zed
ddZ  ZS )!ModernBertUnpaddedRotaryEmbeddingzP
    The rotary position embeddings applied directly to unpadded sequences.
    r3   N)rt   baser   r   ru   c                    sF   t  j|||dd || _|durB|durB|durB| j|||d dS )a  
        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
            the cos_sin_cache will be recomputed during the forward pass.
        F)rt   r   r   r   Nr   ru   )r?   r@   r   _update_cos_sin_cache)r^   rt   r   r   r   ru   r`   rb   rc   r@     s    z*ModernBertUnpaddedRotaryEmbedding.__init__)r   r   r   rq   c                 C   s6   |dur| j ||j|jd t|| j| j||d}|S )z
        Apply rotary embedding *inplace* to qkv.
        qkv: (total_nnz, 3, nheads, headdim)
        cu_seqlens: (batch + 1,) cumulative sequence lengths
        max_seqlen: int max seq length in the batch
        Nr   r   )r   r   ru   r   Z_cos_cachedZ_sin_cached)r^   r   r   r   rb   rb   rc   r     s    z)ModernBertUnpaddedRotaryEmbedding.forward)rq   c                 C   s   d| j  d| j d| j S )Nzdim=z, base=z, scale_base=)rt   r   Z
scale_baser^   rb   rb   rc   
extra_repr  s    z,ModernBertUnpaddedRotaryEmbedding.extra_repr)r3   NNN)N)rh   ri   rj   rk   r{   floatr   ry   r   ru   r@   r   r   tupler   strr   rl   rb   rb   r`   rc   r     s(        r   c                       sh   e Zd ZdZed fddZejddejej	ddd	Z
deej eej	 ej	dddZ  ZS )ModernBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    configc                    sR   t    || _tj|j|j|jd| _tj	|j|j
|jd| _t|j| _d S )N)Zpadding_idxepsbias)r?   r@   r   r   Z	EmbeddingrA   rC   r9   tok_embeddings	LayerNormrI   rJ   normDropoutrQ   dropr^   r   r`   rb   rc   r@     s
    
zModernBertEmbeddings.__init__TZdynamic)	input_idsrq   c                 C   s   |  | | |S N)r   r   r   )r^   r   rb   rb   rc   compiled_embeddings  s    z(ModernBertEmbeddings.compiled_embeddingsN)r   inputs_embedsrq   c                 C   sF   |d ur|  | |}n(| jjr,| |n|  | | |}|S r   )r   r   r   r[   r   r   )r^   r   r   hidden_statesrb   rb   rc   r     s    zModernBertEmbeddings.forward)NN)rh   ri   rj   rk   r    r@   ry   compile
LongTensorr   r   r   r   rl   rb   rb   r`   rc   r     s   
 r   c                       s:   e Zd ZdZed fddZejejdddZ  Z	S )ModernBertMLPa6  Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    r   c                    sf   t    || _tj|jt|jd |jd| _	t
|j | _t|j| _tj|j|j|jd| _d S )Nr   r   )r?   r@   r   r   LinearrC   r{   rD   rR   Wir   rM   actr   rS   r   Wor   r`   rb   rc   r@     s    
 zModernBertMLP.__init__r   rq   c                 C   s2   |  |jddd\}}| | | || S )Nr   rr   rt   )r   chunkr   r   r   )r^   r   inputZgaterb   rb   rc   r     s    zModernBertMLP.forward)
rh   ri   rj   rk   r    r@   ry   r   r   rl   rb   rb   r`   rc   r     s   r   c                   @   s   e Zd ZdS )ModernBertRotaryEmbeddingN)rh   ri   rj   rb   rb   rb   rc   r     s   r   FModernBertAttention)
moduler   rn   sliding_window_maskro   rO   bsrt   output_attentionsrq   c	                 K   s   | j ||d\}
}|ddjdd\}}}t|||
|\}}| jd }t||dd| }|dkrl|}|| }tjj	|dtj
d	|j}tjj|| j| jd
}t||}|dd }||d|}|r||fS |fS )Nro   r
   rw   r   r         ࿩rr   rr   rr   rs   )ptraining)
rotary_emb	transposeunbindr   head_dimry   matmulr   r~   ZsoftmaxZfloat32toru   ZdropoutrL   r   r   r   )r   r   rn   r   ro   rO   r   rt   r   _kwargsr   r   querykeyvaluescaleZattn_weightsattn_outputrb   rb   rc   eager_attention_forward  s     
r   )
r   r   r   r   r   rO   r   rt   target_dtyperq   c	                 K   s   ||||d}|j tjtjfv}
|
rb|j }||}t|||| jrH| jnd| j|d}||}n"t|||| jrv| jnd| j|d}|	||fS )Nr   r1   )r   r   	dropout_pZdeterministicZwindow_size)
ru   ry   Zfloat16bfloat16r   r   r   rL   rX   r   )r   r   r   r   r   rO   r   rt   r   r   Zconvert_dtypeZ
orig_dtypeattnrb   rb   rc   flash_attention_forward&  s.    
r   )	r   r   rn   r   ro   rO   r   rt   rq   c                 K   s   | j ||d\}	}
|ddjdd\}}}t|||	|
\}}|dkrJ|}tj|||| jr`| jnd|ddd }|	|d	|}|fS )
Nr   r
   rw   r   r   r   r1   )r   Z	attn_maskrr   )
r   r   r   r   FZscaled_dot_product_attentionr   rL   r   r   )r   r   rn   r   ro   rO   r   rt   r   r   r   r   r   r   r   rb   rb   rc   sdpa_attention_forwardQ  s"    r   )flash_attention_2eagersdpac                       sJ   e Zd ZdZd
eee d fddZdej	ee
 ej	ddd	Z  ZS )r   a  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    Nr   layer_idc                    sf  t    || _|| _|j|j dkr@td|j d|j d|j| _|j| _|j| _	|j|j | _
| j
| j	 | _tj|jd| j |jd| _||j dkr|jd |jd f| _|jd ur|jn|j}|j}nd| _|j}|j}|jd	krt| j
||d
| _nt|}||_t|d| _tj|j|j|jd| _|jdkrPt|jnt | _t  | _!d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r
   r   r   r   r   )rt   r   r   r   r1   )"r?   r@   r   r   rC   rF   r]   rL   rX   	num_headsr   all_head_sizer   r   rK   WqkvrN   rO   rP   r"   rB   _attn_implementationr   r   copydeepcopyr!   r   r   r   Identityout_dropsetZpruned_heads)r^   r   r   r!   rB   config_copyr`   rb   rc   r@     s<    


"zModernBertAttention.__init__F)r   r   rq   c              	   K   s   |  |}|jd }| jjdkr6|dd| j| j}n||dd| j| j}t| jj | f|| j| j	|| j
|d|}|d }| | |}|f|dd   S )Nr   r   rr   r
   )r   r   rO   r   rt   r   rw   )r   r   r   r   r   r   r   MODERNBERT_ATTENTION_FUNCTIONr   rO   r   r   r   )r^   r   r   r_   r   r   attn_outputsrb   rb   rc   r     s(    



zModernBertAttention.forward)N)F)rh   ri   rj   rk   r    r   r{   r@   ry   r   boolr   rl   rb   rb   r`   rc   r   {  s   	* c                
       s   e Zd Zdeee d fddZejddej	ej	ddd	Z
dej	eej	 eej	 eej eej	 ee ee ej	dddZ  ZS )ModernBertEncoderLayerNr   c                    sp   t    || _|dkr$t | _ntj|j|j|j	d| _t
||d| _tj|j|j|j	d| _t|| _d S )Nr   r   r   )r?   r@   r   r   r   	attn_normr   rC   rI   rJ   r   r   mlp_normr   mlp)r^   r   r   r`   rb   rc   r@     s    
zModernBertEncoderLayer.__init__Tr   r   c                 C   s   |  | |S r   )r   r   r^   r   rb   rb   rc   compiled_mlp  s    z#ModernBertEncoderLayer.compiled_mlpF)r   rn   r   ro   r   r   r   rq   c           
   	   C   sf   | j | |||||||d}||d  }| jjr<| |n| | |}	||	 }|f|dd   S )Nrn   r   ro   r   r   r   r   rw   )r   r   r   r[   r   r   r   )
r^   r   rn   r   ro   r   r   r   r   Z
mlp_outputrb   rb   rc   r     s     
	zModernBertEncoderLayer.forward)N)NNNNNF)rh   ri   rj   r    r   r{   r@   ry   r   r   r   r   r   r   rl   rb   rb   r`   rc   r     s&   
      r   c                       sv   e Zd ZU eed< dZdZddgZdZdZ	dZ
ejddd	Zdee eed
 fddZdd Z fddZ  ZS )ModernBertPreTrainedModelr   modelTr   r   F)r   c                    sj  | j j  d u rd tjtd fdd}| j j| j jtd| j j  | j j| j j	d d}t
|trz||j|d  nt
|tr||j|d	  ||j|d
  nt
|tr||j|d	  ||j|d
  nt
|tr||j|d
  nxt
|tr||j|d
  nZt
|ttttfr2||j|d  n4t
|tjrf|jjd |jd urf|jj   d S )Nr
   r   stdc                    sJ   t jj| jd|  |  | d t| t jrF| jd urFt j| j d S )Nr1   )r7   r  ab)r   initZtrunc_normal_weight
isinstancer   r   Zzeros_r  Zcutoff_factorrb   rc   init_weight  s    
z<ModernBertPreTrainedModel._init_weights.<locals>.init_weightr+   r   )inout	embedding	final_outr  r  r  r  g      ?)!r   rH   r   Moduler   rG   mathsqrtrE   rC   r
  r   r   r   r   r   r   r   ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierr   r	  dataZfill_r   Zzero_)r^   r   r  Zstdsrb   r  rc   _init_weights  sD    




	z'ModernBertPreTrainedModel._init_weights)attn_implementationis_init_checkrq   c              	      sD   z|du r|   rdn|}W n ttfy2   Y n0 t j||dS )zR
        Checks and dispatches to hhe requested attention implementation.
        Nr   )r  r   )Z_flash_attn_2_can_dispatchr]   ImportErrorr?   %_check_and_adjust_attn_implementation)r^   r  r   r`   rb   rc   r"  6  s    z?ModernBertPreTrainedModel._check_and_adjust_attn_implementationc                 C   s   | j jdu rd S t| drBt| jdkrB| j jr:td d| j _| jjdkrh| j jr`td d| j _| jjdkr| j jrtd d| j _| j jd u rt	 | j _d S )	NFhf_device_maprw   zqIf `accelerate` split the model across devices, `torch.compile` will not work. Falling back to non-compiled mode.Zmpsz|Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.cpuz|Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.)
r   r[   hasattrlenr#  loggerwarning_oncer   typer   r   rb   rb   rc   _maybe_set_compileM  s,    z,ModernBertPreTrainedModel._maybe_set_compilec                    s<   t  j|i |}| jjdv r8| jjr0td d| j_|S )N>   TNzcResizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode.F)r?   resize_token_embeddingsr   r[   r'  r(  )r^   argsr_   Zmodel_embedsr`   rb   rc   r+  l  s    z1ModernBertPreTrainedModel.resize_token_embeddings)F)rh   ri   rj   r    __annotations__Zbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnr   r  r  r   r   r   r"  r*  r+  rl   rb   rb   r`   rc   r    s   
5 r  c                       s   e Zd Zed fddZdd Zdd Zedee	j
 ee	j ee	j ee	j
 ee	j ee	j ee	j ee ee ee ee ee ee eee	jd	f ef d
ddZe	jee	jdddZ  ZS )ModernBertModelr   c                    sf   t     | _t | _t fddt jD | _	tj
 j j jd| _d| _|   d S )Nc                    s   g | ]}t  |qS rb   )r   ).0r   r   rb   rc   
<listcomp>      z,ModernBertModel.__init__.<locals>.<listcomp>r   F)r?   r@   r   r   
embeddingsr   Z
ModuleListrangerE   layersr   rC   rI   rJ   
final_normZgradient_checkpointing	post_initr   r`   r   rc   r@   {  s    
zModernBertModel.__init__c                 C   s   | j jS r   r2  r   r   rb   rb   rc   get_input_embeddings  s    z$ModernBertModel.get_input_embeddingsc                 C   s   || j _d S r   r7  )r^   r   rb   rb   rc   set_input_embeddings  s    z$ModernBertModel.set_input_embeddingsN.)r   rn   r   ro   r   r   r   r   
batch_sizeseq_lenr   output_hidden_statesreturn_dictrq   c              
      s  |dur|n| j j}|dur |n| j j}|dur4|n| j j}|du |duA rTtd|r\dnd}|rhdnd}|   |dur| ||  du rƈdu r|dur|jdd \ n|jdd \ |dur|jn|j}|du rt	j
 f|t	jd}d}| j jdkrdu r|du r|du rd}|du rxt	 & t||d	^}}}}W d   n1 sl0    Y  nt||d	^}}}}n0|du rt	j|d
d}| j||d\}}| j||d}| jD ]V}|r||f }||||||||d}|d }|rt|dkr||d f }q|r<||f }| |}|rt| d}|durt fdd|D }|stdd |||fD S t|||dS )  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nz:You must specify exactly one of input_ids or inputs_embedsrb   r   r   Fr   T)rm   rn   )r   r   )r   )r   r   r  rw   rm   r   r   r   c                 3   s   | ]}t | d V  qdS )r?  N)r   )r/  hsr:  r   r;  rb   rc   	<genexpr>  s   z*ModernBertModel.forward.<locals>.<genexpr>c                 s   s   | ]}|d ur|V  qd S r   rb   )r/  vrb   rb   rc   rB     r1  )last_hidden_stater   
attentions)r   r   r<  use_return_dictr]   r*  %warn_if_padding_and_no_attention_maskr   r   ry   onesr   r   no_gradr   arange	unsqueeze_update_attention_maskr2  r4  r&  r5  r   r   r   )r^   r   rn   r   ro   r   r   r   r   r:  r;  r   r<  r=  Zall_hidden_statesZall_self_attentionsr   Zrepadr   r   Zencoder_layerZlayer_outputsrb   rA  rc   r     s    !

2



	


zModernBertModel.forward)rn   r   rq   c                 C   s   |rF| j jdkr$td d| j _n"| j jdkrFtd| j j d t|| j}t|jd 	d}t
||j }|| j jd k	d	d|j}|| t| jj}||fS )Nr   zOutputting attentions is only supported with the 'eager' attention implementation, not with "sdpa". Falling back to `attn_implementation="eager"`.r   zZOutputting attentions is only supported with the eager attention implementation, not with zT. Consider setting `attn_implementation="eager"`. Setting `output_attentions=False`.r   r   )r   r   r'  r(  r   ru   ry   rJ  r   rK  absTrO   r   r   Zmasked_fillZlogical_notZfinfomin)r^   rn   r   Zglobal_attention_maskrowsZdistanceZwindow_maskr   rb   rb   rc   rL    s&    
"z&ModernBertModel._update_attention_mask)NNNNNNNNNNNNN)rh   ri   rj   r    r@   r8  r9  r   r   ry   r   r   r{   r   r   r   r   r   rL  rl   rb   rb   r`   rc   r.  y  sD                zr.  c                       s6   e Zd Zed fddZejejdddZ  ZS )r  r   c                    sN   t    || _t|j|j|j| _t|j	 | _
tj|j|j|jd| _d S )Nr   )r?   r@   r   r   r   rC   rV   r  r   rW   r   r   rI   rJ   r   r   r`   rb   rc   r@   (  s
    
z!ModernBertPredictionHead.__init__r   c                 C   s   |  | | |S r   )r   r   r  r   rb   rb   rc   r   /  s    z ModernBertPredictionHead.forward)	rh   ri   rj   r    r@   ry   r   r   rl   rb   rb   r`   rc   r  '  s   r  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )Zcustom_introc                       s   e Zd ZdgZed fddZdd Zejddd	Z	e
jd
de
je
jdddZedee
j ee
j ee
j ee
j ee
j ee
j ee
j ee
j ee ee ee ee ee ee eee
j ef dddZ  ZS )r  zdecoder.weightr   c                    s^   t  | || _t|| _t|| _tj|j	|j
|jd| _| jj| _| jj| _|   d S )Nr   )r?   r@   r   r.  r  r  headr   r   rC   rA   rT   r  rY   rZ   r6  r   r`   rb   rc   r@   ;  s    



zModernBertForMaskedLM.__init__c                 C   s   | j S r   r  r   rb   rb   rc   get_output_embeddingsH  s    z+ModernBertForMaskedLM.get_output_embeddings)new_embeddingsc                 C   s
   || _ d S r   rR  )r^   rT  rb   rb   rc   set_output_embeddingsK  s    z+ModernBertForMaskedLM.set_output_embeddingsTr   )rg   rq   c                 C   s   |  | |S r   )r  rQ  rf   rb   rb   rc   compiled_headN  s    z#ModernBertForMaskedLM.compiled_headNr   rn   r   ro   r   rp   r   r   r   r:  r;  r   r<  r=  rq   c                 K   s  |dur|n| j j}|   | j jdkr$|du r$|du r$|	du r$|
du r|du r|durt|jdd \}
}n|jdd \}
}|dur|jn|j}|du rtj|
|f|tjd}|du rt	 , t
||||d\}}}}	}}W d   n1 s0    Y  nt
||||d\}}}}	}}| j||||||||	|
||||d}|d }| jr|dur|d}||jd d}|| jk}|| }|| }| j jr| |n| | |}d}|dur| j||fd	| j ji|}| j jdkrH| j js|du rt nt	   t|||
|d
}W d   n1 s>0    Y  |sl|f}|durh|f| S |S t|||j|jdS )r>  Nr   r   r   )rm   rn   ro   rp   r   rn   r   ro   r   r   r   r   r:  r;  r   r<  r=  r   rr   rA   r?  losslogitsr   rE  )r   rF  r*  r   r   r   ry   rH  r   rI  r   r  rY   r   rZ   r[   rV  r  rQ  loss_functionrA   r\   r   r   r   r   rE  )r^   r   rn   r   ro   r   rp   r   r   r   r:  r;  r   r<  r=  r_   r   outputsrD  Zmask_tokensr[  rZ  rg   rb   rb   rc   r   R  sx    #

2


$0zModernBertForMaskedLM.forward)NNNNNNNNNNNNNN)rh   ri   rj   Z_tied_weights_keysr    r@   rS  r   r   rU  ry   r   r   rV  r   r   r   r{   r   r   r   r   r   rl   rb   rb   r`   rc   r  3  sL   
              r  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c                       s   e Zd Zed fddZedeej eej	 eej	 eej	 eej	 eej	 eej	 eej	 ee
 ee
 ee
 ee ee ee eeej	 ef dddZ  ZS )	r  r   c                    s\   t  | |j| _|| _t|| _t|| _tj	
|j| _t	|j|j| _|   d S r   )r?   r@   
num_labelsr   r.  r  r  rQ  ry   r   r   rU   r   r   rC   r  r6  r   r`   rb   rc   r@     s    

z,ModernBertForSequenceClassification.__init__NrW  c                 K   sl  |dur|n| j j}|   |dur0| || |
du rn|du rn|dur\|jdd \}
}n|jdd \}
}|dur||jn|j}|du rtj|
|f|tjd}| j	||||||||	|
||||d}|d }| j j
dkr|dddf }n2| j j
dkr||d jd	d
|jd	dd }| |}| |}| |}d}|dur4| j jdu r| jd	krld| j _n:| jd	kr|jtjks|jtjkrd| j _nd| j _| j jdkrt }| jd	kr|| | }n
|||}nN| j jdkrt }||d| j|d}n| j jdkr4t }|||}|sX|f}|durT|f| S |S t|||j|jdS )aB  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nr   r   rX  r   r4   r7   rr   rw   r   Trt   ZkeepdimZ
regressionZsingle_label_classificationZmulti_label_classificationrY  )r   rF  r*  rG  r   r   ry   rH  r   r  r8   rK  rx   rQ  r   r  Zproblem_typer^  ru   longr{   r	   squeezer   r   r   r   r   rE  )r^   r   rn   r   ro   r   rp   r   r   r   r:  r;  r   r<  r=  r_   r   r]  rD  pooled_outputr[  rZ  loss_fctrg   rb   rb   rc   r     s    '




(

z+ModernBertForSequenceClassification.forward)NNNNNNNNNNNNNN)rh   ri   rj   r    r@   r   r   ry   r   r   r{   r   r   r   r   r   rl   rb   rb   r`   rc   r    sB                 r  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                       s   e Zd Zed fddZedeej eej	 eej	 eej	 eej	 eej	 eej	 eej	 ee
 ee
 ee
 ee ee ee eeej	 ef dddZ  ZS )	r  r   c                    sV   t  | |j| _t|| _t|| _tj	|j
| _t|j|j| _|   d S r   r?   r@   r^  r.  r  r  rQ  ry   r   r   rU   r   r   rC   r  r6  r   r`   rb   rc   r@   R  s    

z)ModernBertForTokenClassification.__init__NrW  c                 C   s   |dur|n| j j}|   | j||||||||	|
||||d}|d }| |}| |}| |}d}|durt }||d| j	|d}|s|f|dd  }|dur|f| S |S t
|||j|jdS )a  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        NrX  r   rr   rw   rY  )r   rF  r*  r  rQ  r   r  r   r   r^  r   r   rE  )r^   r   rn   r   ro   r   rp   r   r   r   r:  r;  r   r<  r=  r]  rD  r[  rZ  rc  rg   rb   rb   rc   r   ^  sD    $


z(ModernBertForTokenClassification.forward)NNNNNNNNNNNNNN)rh   ri   rj   r    r@   r   r   ry   r   r   r{   r   r   r   r   r   rl   rb   rb   r`   rc   r  L  sB                 r  c                       s   e Zd Zed fddZedeej eej eej eej eej eej eej eej ee	 ee	 ee	 ee
 ee
 ee
 eeej ef dddZ  ZS )	r  r   c                    sV   t  | |j| _t|| _t|| _tj	|j
| _t|j|j| _|   d S r   rd  r   r`   rb   rc   r@     s    

z'ModernBertForQuestionAnswering.__init__N)r   rn   r   ro   start_positionsend_positionsr   r   r   r:  r;  r   r<  r=  rq   c                 K   s  |dur|n| j j}|   | j|||||||	|
||||d}|d }| |}| |}| |}|jddd\}}|d	 }|d	 }d}|dur|dur| j
||||fi |}|s||f|dd  }|dur|f| S |S t||||j|jdS )r>  N)rn   r   ro   r   r   r   r:  r;  r   r<  r=  r   rw   rr   r   )rZ  start_logits
end_logitsr   rE  )r   rF  r*  r  rQ  r   r  splitra  r   r\  r   r   rE  )r^   r   rn   r   ro   re  rf  r   r   r   r:  r;  r   r<  r=  r_   r]  rD  r[  rg  rh  rZ  rg   rb   rb   rc   r     sH    #


z&ModernBertForQuestionAnswering.forward)NNNNNNNNNNNNN)rh   ri   rj   r    r@   r   r   ry   r   r{   r   r   r   r   r   rl   rb   rb   r`   rc   r    s@                r  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c                       s   e Zd Zed fddZedeej eej	 eej	 eej	 eej	 eej	 eej	 eej	 ee
 ee
 ee
 ee ee ee eeej	 ef dddZ  ZS )	r  r   c                    sR   t  | || _t|| _t|| _tj	|j
| _t|jd| _|   d S )Nrw   )r?   r@   r   r.  r  r  rQ  ry   r   r   rU   r   r   rC   r  r6  r   r`   rb   rc   r@     s    

z$ModernBertForMultipleChoice.__init__NrW  c                 K   s  |dur|n| j j}|dur&|jd n|jd }|durJ|d|dnd}|durh|d|dnd}|dur|d|dnd}|dur|d|d|dnd}|   | j||||||||	|
||||d}|d }| j jdkr|dddf }n2| j jdkr6||d j	dd	|j	dd
d }| 
|}| |}| |}|d|}d}|durt }|||}|s|f|dd  }|dur|f| S |S t|||j|jdS )a  
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        Nrw   rr   rX  r   r4   r7   r   Tr_  rY  )r   rF  r   r   sizer*  r  r8   rK  rx   rQ  r   r  r   r   r   r   rE  )r^   r   rn   r   ro   r   rp   r   r   r   r:  r;  r   r<  r=  r_   Znum_choicesr]  rD  rb  r[  Zreshaped_logitsrZ  rc  rg   rb   rb   rc   r     sb    &




z#ModernBertForMultipleChoice.forward)NNNNNNNNNNNNNN)rh   ri   rj   r    r@   r   r   ry   r   r   r{   r   r   r   r   r   rl   rb   rb   r`   rc   r    sB                 r  )r    r.  r  r  r  r  r  r  )NN)NN)F)Xr   r  
contextlibr   typingr   r   r   ry   Ztorch.nn.functionalr   r~   r   Ztorch.utils.checkpointZtorch.nnr   r   r	   Zactivationsr   Zconfiguration_utilsr   Zmodeling_attn_mask_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   r   r   r   r   Zmodeling_utilsr   utilsr   r   r   Zutils.import_utilsr   Zgemma.modeling_gemmar   r   Zflash_attn.flash_attn_interfacer   Zflash_attn.layers.rotaryr   Zflash_attn.ops.triton.rotaryr   objectZ
get_loggerrh   r'  r    r   r   r{   r   r   ZautogradFunctionr   r   r   r  r   r   r   r   r   r   r   ru   r   r   r   r   r   r  r.  r  r  r  r  r  r  __all__rb   rb   rb   rc   <module>   s    
 I  $*;  5 
.
,
$O.  .  Z[p