# transformers/models/modernbert/modeling_modernbert.py
import copy
import math
from contextlib import nullcontext
from typing import Optional, Union

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_flash_attn_2_available, logging
from ...utils.import_utils import is_triton_available
from .configuration_modernbert import ModernBertConfig

if is_flash_attn_2_available():
    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
    from flash_attn.layers.rotary import RotaryEmbedding
    from flash_attn.ops.triton.rotary import apply_rotary
else:
    RotaryEmbedding = object

logger = logging.get_logger(__name__)
class ApplyRotaryEmbUnpad(torch.autograd.Function):
    @staticmethod
    def forward(ctx, qkv, cos, sin, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None):
        # qkv: (total_nnz, 3, nheads, headdim). The rotary embedding is applied in-place
        # to the packed query/key slice via the Triton `apply_rotary` kernel.
        qkv = qkv.contiguous()
        total_nnz, _three, _nheads, headdim = qkv.shape
        # View query and key as a single (total_nnz, 2 * nheads, headdim) tensor so that
        # both are rotated with one kernel call.
        qk = qkv[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            qk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            interleaved=False,
            inplace=True,
        )
        ctx.save_for_backward(cos, sin, cu_seqlens)
        ctx.max_seqlen = max_seqlen
        return qkv

    @staticmethod
    def backward(ctx, do):
        cos, sin, cu_seqlens = ctx.saved_tensors
        do = do.contiguous()
        total_nnz, _three, _nheads, headdim = do.shape
        dqk = do[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            dqk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=ctx.max_seqlen,
            interleaved=False,
            inplace=True,
            conjugate=True,
        )
        return do, None, None, None, None
def apply_rotary_unpadded(
    qkv,
    cos,
    sin,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """
    Arguments:
        qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (total_nnz, dim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    """
    return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, cu_seqlens, max_seqlen)
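
# Illustrative sketch (not part of the library): how the unpadded, packed-QKV layout that
# `apply_rotary_unpadded` expects can be built. The shapes and helper name below are
# assumptions chosen for the example; the real call additionally requires the `flash_attn`
# Triton ops to be installed, so it is left commented out.
def _example_packed_qkv_layout():
    nheads, headdim = 4, 16
    seqlens = torch.tensor([3, 5])                                  # two sequences of 3 and 5 tokens
    total_nnz = int(seqlens.sum())                                  # 8 non-padding tokens in the batch
    cu_seqlens = F.pad(seqlens.cumsum(0), (1, 0)).to(torch.int32)   # (batch + 1,) -> [0, 3, 8]
    max_seqlen = int(seqlens.max())                                 # 5
    qkv = torch.randn(total_nnz, 3, nheads, headdim)                # packed Q/K/V with no padding rows
    cos = torch.randn(max_seqlen, headdim // 2)                     # cos/sin caches: (seqlen, rotary_dim / 2)
    sin = torch.randn(max_seqlen, headdim // 2)
    # apply_rotary_unpadded(qkv, cos, sin, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
    return qkv, cu_seqlens, max_seqlen, cos, sin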
class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
    """
    The rotary position embeddings applied directly to unpadded sequences.
    """

    def __init__(
        self,
        dim: int,
        base: float = 10000.0,
        max_seqlen: Optional[int] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        """
        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
            the cos_sin_cache will be recomputed during the forward pass.
        """
        super().__init__(dim=dim, base=base, device=device, interleaved=False)
        self.max_seqlen = max_seqlen

        if max_seqlen is not None and device is not None and dtype is not None:
            self._update_cos_sin_cache(max_seqlen, device=device, dtype=dtype)

    def forward(
        self,
        qkv: torch.Tensor,
        cu_seqlens: torch.Tensor,
        max_seqlen: Optional[int] = None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        """
        Apply rotary embedding *inplace* to qkv.
        qkv: (total_nnz, 3, nheads, headdim)
        cu_seqlens: (batch + 1,) cumulative sequence lengths
        max_seqlen: int max seq length in the batch
        """
        if max_seqlen is not None:
            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)

        qkv = apply_rotary_unpadded(
            qkv,
            self._cos_cached,
            self._sin_cached,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )
        return qkv

    def extra_repr(self) -> str:
        return f"dim={self.dim}, base={self.base}, scale_base={self.scale_base}"


class ModernBertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.drop = nn.Dropout(config.embedding_dropout)

    @torch.compile(dynamic=True)
    def compiled_embeddings(self, input_ids: torch.LongTensor) -> torch.Tensor:
        return self.drop(self.norm(self.tok_embeddings(input_ids)))

    def forward(
        self, input_ids: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        if inputs_embeds is not None:
            hidden_states = self.drop(self.norm(inputs_embeds))
        else:
            hidden_states = (
                self.compiled_embeddings(input_ids)
                if self.config.reference_compile
                else self.drop(self.norm(self.tok_embeddings(input_ids)))
            )
        return hidden_states


class ModernBertMLP(nn.Module):
    """Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    """

    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.Wi = nn.Linear(config.hidden_size, int(config.intermediate_size) * 2, bias=config.mlp_bias)
        self.act = ACT2FN[config.hidden_activation]
        self.drop = nn.Dropout(config.mlp_dropout)
        self.Wo = nn.Linear(config.intermediate_size, config.hidden_size, bias=config.mlp_bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input, gate = self.Wi(hidden_states).chunk(2, dim=-1)
        return self.Wo(self.drop(self.act(input) * gate))
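
# Illustrative sketch (not part of the library): the gated-linear-unit split performed by
# ModernBertMLP.forward. One projection produces both halves; the activated half gates the
# other. All sizes below are made up, and F.gelu stands in for config.hidden_activation.
def _example_glu_split():
    hidden_size, intermediate_size = 8, 12
    Wi = nn.Linear(hidden_size, intermediate_size * 2)
    Wo = nn.Linear(intermediate_size, hidden_size)
    x = torch.randn(2, 5, hidden_size)                    # (batch, seq_len, hidden)
    inp, gate = Wi(x).chunk(2, dim=-1)                    # each half: (2, 5, intermediate_size)
    return Wo(F.gelu(inp) * gate)                         # back to (2, 5, hidden)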
class ModernBertRotaryEmbedding(nn.Module):
    inv_freq: torch.Tensor

    def __init__(self, config: ModernBertConfig, device=None):
        super().__init__()
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
    )	unsqueezer   )qkr-   r.   r   Zunsqueeze_dimZq_embedZk_embedr3   r3   r4   apply_rotary_pos_emb   s
    

r   FModernBertAttention)
moduler,   attention_masksliding_window_maskr   local_attentionbsrD   output_attentionsrO   c	                 K   s   | j ||d\}
}|ddjdd\}}}t|||
|\}}| jd }t||dd| }|dkrl|}|| }tjj	|dtj
d	|j}tjj|| j| jd
}t||}|dd }||d|}|r||fS |fS )Nr   r	   r   r#   rw         ࿩r$   r$   r$   rD   rG   )ptraining)
rotary_embr   unbindr   head_dimr=   matmulr   
functionalZsoftmaxZfloat32r   rG   Zdropoutattention_dropoutr   r(   r*   )r   r,   r   r   r   r   r   rD   r   _kwargsr-   r.   querykeyvaluescaleZattn_weightsattn_outputr3   r3   r4   eager_attention_forward;  s     
r   )
r   r,   r   r!   r"   r   r   rD   target_dtyperO   c	                 K   s   ||||d}|j tjtjfv}
|
rb|j }||}t|||| jrH| jnd| j|d}||}n"t|||| jrv| jnd| j|d}|	||fS )Nr            )r!   r"   	dropout_pZdeterministicZwindow_size)
rG   r=   Zfloat16bfloat16r   r   r   r   deterministic_flash_attnr*   )r   r,   r   r!   r"   r   r   rD   r   r   Zconvert_dtypeZ
orig_dtypeattnr3   r3   r4   flash_attention_forward`  s.    
r   )	r   r,   r   r   r   r   r   rD   rO   c                 K   s   | j ||d\}	}
|ddjdd\}}}t|||	|
\}}|dkrJ|}tj|||| jr`| jnd|ddd }|	|d	|}|fS )
Nr   r	   r   r#   rw   r   r   )r   Z	attn_maskr$   )
r   r   r   r   FZscaled_dot_product_attentionr   r   r(   r*   )r   r,   r   r   r   r   r   rD   r   r-   r.   r   r   r   r   r3   r3   r4   sdpa_attention_forward  s"    r   )flash_attention_2eagersdpac                       sJ   e Zd ZdZd
eee d fddZdej	ee
 ej	ddd	Z  ZS )r   a  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
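
# Illustrative sketch (not part of the library): the eager path computes
# softmax(Q K^T / sqrt(d) + mask) V explicitly; this toy check shows that it matches
# F.scaled_dot_product_attention given the same additive mask. All sizes are made up.
def _example_eager_vs_sdpa():
    bs, heads, seq_len, head_dim = 1, 2, 5, 8
    q = torch.randn(bs, heads, seq_len, head_dim)
    k = torch.randn(bs, heads, seq_len, head_dim)
    v = torch.randn(bs, heads, seq_len, head_dim)
    mask = torch.zeros(bs, 1, seq_len, seq_len)
    mask[..., -1] = torch.finfo(mask.dtype).min            # forbid attending to the last position

    scores = (q @ k.transpose(2, 3)) * head_dim**-0.5 + mask
    eager_out = torch.softmax(scores, dim=-1) @ v
    sdpa_out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
    return torch.allclose(eager_out, sdpa_out, atol=1e-5)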
class ModernBertAttention(nn.Module):
    """Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    """

    def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_id = layer_id

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads ({config.num_attention_heads})"
            )

        self.attention_dropout = config.attention_dropout
        self.deterministic_flash_attn = config.deterministic_flash_attn
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.head_dim * self.num_heads
        self.Wqkv = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=config.attention_bias)

        if layer_id % config.global_attn_every_n_layers != 0:
            self.local_attention = (config.local_attention // 2, config.local_attention // 2)
            rope_theta = config.local_rope_theta if config.local_rope_theta is not None else config.global_rope_theta
            max_position_embeddings = config.local_attention
        else:
            self.local_attention = (-1, -1)
            rope_theta = config.global_rope_theta
            max_position_embeddings = config.max_position_embeddings

        if config._attn_implementation == "flash_attention_2":
            self.rotary_emb = ModernBertUnpaddedRotaryEmbedding(
                dim=self.head_dim, max_seqlen=max_position_embeddings, base=rope_theta
            )
        else:
            config_copy = copy.deepcopy(config)
            config_copy.rope_theta = rope_theta
            self.rotary_emb = ModernBertRotaryEmbedding(config=config_copy)

        self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
        self.out_drop = nn.Dropout(config.attention_dropout) if config.attention_dropout > 0.0 else nn.Identity()
        self.pruned_heads = set()

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> torch.Tensor:
        qkv = self.Wqkv(hidden_states)

        bs = hidden_states.shape[0]
        if self.config._attn_implementation == "flash_attention_2":
            qkv = qkv.view(-1, 3, self.num_heads, self.head_dim)
        else:
            qkv = qkv.view(bs, -1, 3, self.num_heads, self.head_dim)

        attn_outputs = MODERNBERT_ATTENTION_FUNCTION[self.config._attn_implementation](
            self,
            qkv=qkv,
            rotary_emb=self.rotary_emb,
            local_attention=self.local_attention,
            bs=bs,
            dim=self.all_head_size,
            output_attentions=output_attentions,
            **kwargs,
        )
        hidden_states = attn_outputs[0]
        hidden_states = self.out_drop(self.Wo(hidden_states))

        return (hidden_states,) + attn_outputs[1:]  # add attentions if outputted


class ModernBertEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None):
        super().__init__()
        self.config = config
        if layer_id == 0:
            self.attn_norm = nn.Identity()
        else:
            self.attn_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.attn = ModernBertAttention(config=config, layer_id=layer_id)
        self.mlp_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.mlp = ModernBertMLP(config)

    @torch.compile(dynamic=True)
    def compiled_mlp(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.mlp(self.mlp_norm(hidden_states))

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        output_attentions: Optional[bool] = False,
    ) -> torch.Tensor:
        attn_outputs = self.attn(
            self.attn_norm(hidden_states),
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + attn_outputs[0]
        mlp_output = (
            self.compiled_mlp(hidden_states)
            if self.config.reference_compile
            else self.mlp(self.mlp_norm(hidden_states))
        )
        hidden_states = hidden_states + mlp_output

        return (hidden_states,) + attn_outputs[1:]  # add attentions if outputted


@auto_docstring
class ModernBertPreTrainedModel(PreTrainedModel):
    config: ModernBertConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ModernBertEmbeddings", "ModernBertEncoderLayer"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = False

    def _init_weights(self, module: nn.Module):
        cutoff_factor = self.config.initializer_cutoff_factor
        if cutoff_factor is None:
            cutoff_factor = 3

        def init_weight(module: nn.Module, std: float):
            nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-cutoff_factor * std, b=cutoff_factor * std)
            if isinstance(module, nn.Linear) and module.bias is not None:
                nn.init.zeros_(module.bias)

        stds = {
            "in": self.config.initializer_range,
            "out": self.config.initializer_range / math.sqrt(2.0 * self.config.num_hidden_layers),
            "embedding": self.config.initializer_range,
            "final_out": self.config.hidden_size**-0.5,
        }

        if isinstance(module, ModernBertEmbeddings):
            init_weight(module.tok_embeddings, stds["embedding"])
        elif isinstance(module, ModernBertMLP):
            init_weight(module.Wi, stds["in"])
            init_weight(module.Wo, stds["out"])
        elif isinstance(module, ModernBertAttention):
            init_weight(module.Wqkv, stds["in"])
            init_weight(module.Wo, stds["out"])
        elif isinstance(module, ModernBertPredictionHead):
            init_weight(module.dense, stds["out"])
        elif isinstance(module, ModernBertForMaskedLM):
            init_weight(module.decoder, stds["out"])
        elif isinstance(
            module,
            (
                ModernBertForSequenceClassification,
                ModernBertForMultipleChoice,
                ModernBertForTokenClassification,
                ModernBertForQuestionAnswering,
            ),
        ):
            init_weight(module.classifier, stds["final_out"])
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            if module.bias is not None:
                module.bias.data.zero_()

    def _check_and_adjust_attn_implementation(
        self, attn_implementation: Optional[str], is_init_check: bool = False
    ) -> str:
        """
        Checks and dispatches to the requested attention implementation.
        """
        # Prefer flash_attention_2 when nothing was requested and it can be dispatched.
        try:
            attn_implementation = (
                "flash_attention_2"
                if attn_implementation is None and self._flash_attn_2_can_dispatch()
                else attn_implementation
            )
        except (ValueError, ImportError):
            pass
        return super()._check_and_adjust_attn_implementation(attn_implementation, is_init_check=is_init_check)

    def _maybe_set_compile(self):
        if self.config.reference_compile is False:
            return

        if hasattr(self, "hf_device_map") and len(self.hf_device_map) > 1:
            if self.config.reference_compile:
                logger.warning_once(
                    "If `accelerate` split the model across devices, `torch.compile` will not work. "
                    "Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        if self.device.type == "mps":
            if self.config.reference_compile:
                logger.warning_once(
                    "Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. "
                    "Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        if self.device.type == "cpu":
            if self.config.reference_compile:
                logger.warning_once(
                    "Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. "
                    "Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        if self.config.reference_compile is None:
            self.config.reference_compile = is_triton_available()

    def resize_token_embeddings(self, *args, **kwargs):
        model_embeds = super().resize_token_embeddings(*args, **kwargs)

        if self.config.reference_compile in {True, None}:
            if self.config.reference_compile:
                logger.warning_once(
                    "Resizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode."
                )
            self.config.reference_compile = False

        return model_embeds
def _unpad_modernbert_input(
    inputs: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, Optional[torch.Tensor], Optional[torch.Tensor]]:
    """
    Remove padding from input sequences.

    Args:
        inputs: (batch, seqlen, ...) or (batch, seqlen)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        position_ids: (batch, seqlen), int, position ids
        labels: (batch, seqlen), int, labels

    Returns:
        unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        cu_seqlens: (batch + 1), the cumulative sequence lengths
        max_seqlen_in_batch: int
        unpadded_position_ids: (total_nnz) or None
        unpadded_labels: (total_nnz) or None
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = int(seqlens_in_batch.max().item())
    cu_seqlens = torch.nn.functional.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

    if inputs.dim() == 2:
        unpadded_inputs = inputs.flatten()[indices]
    else:
        batch, seqlen, *rest = inputs.shape
        shape = batch * seqlen
        unpadded_inputs = inputs.view(shape, *rest)[indices]

    unpadded_position_ids = position_ids.flatten()[indices] if position_ids is not None else None
    unpadded_labels = labels.flatten()[indices] if labels is not None else None

    return unpadded_inputs, indices, cu_seqlens, max_seqlen_in_batch, unpadded_position_ids, unpadded_labels


def _pad_modernbert_output(
    inputs: torch.Tensor,
    indices: torch.Tensor,
    batch: int,
    seqlen: int,
) -> torch.Tensor:
    """
    Add padding to sequences.

    Args:
        inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        batch: int, batch size
        seqlen: int, max sequence length

    Returns:
        padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
    """
    if inputs.dim() == 1:
        output = torch.zeros(batch * seqlen, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen)
    else:
        _, *rest = inputs.shape
        output = torch.zeros(batch * seqlen, *rest, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen, *rest)

    return padded_inputs
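
# Illustrative sketch (not part of the library): a toy round trip through the two helpers
# above on a right-padded (2, 4) batch. The token ids and mask are made up for the example.
def _example_unpad_pad_roundtrip():
    input_ids = torch.tensor([[5, 6, 7, 0], [8, 9, 0, 0]])
    attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
    unpadded, indices, cu_seqlens, max_seqlen, _, _ = _unpad_modernbert_input(input_ids, attention_mask)
    # unpadded   -> tensor([5, 6, 7, 8, 9]); indices -> flat positions of the real tokens
    # cu_seqlens -> tensor([0, 3, 5], dtype=torch.int32); max_seqlen -> 3
    repadded = _pad_modernbert_output(unpadded, indices, batch=2, seqlen=4)
    # repadded matches input_ids wherever attention_mask == 1 and is 0 elsewhere
    return torch.equal(repadded * attention_mask, input_ids * attention_mask)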
@auto_docstring
class ModernBertModel(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = ModernBertEmbeddings(config)
        self.layers = nn.ModuleList(
            [ModernBertEncoderLayer(config, layer_id) for layer_id in range(config.num_hidden_layers)]
        )
        self.final_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.tok_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.tok_embeddings = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None, indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None, seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor, ...], BaseModelOutput]:
        r"""
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        self._maybe_set_compile()

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)

        if batch_size is None and seq_len is None:
            if inputs_embeds is not None:
                batch_size, seq_len = inputs_embeds.shape[:2]
            else:
                batch_size, seq_len = input_ids.shape[:2]
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

        repad = False
        if self.config._attn_implementation == "flash_attention_2":
            if indices is None and cu_seqlens is None and max_seqlen is None:
                repad = True
                if inputs_embeds is None:
                    with torch.no_grad():
                        input_ids, indices, cu_seqlens, max_seqlen, *_ = _unpad_modernbert_input(
                            inputs=input_ids, attention_mask=attention_mask
                        )
                else:
                    inputs_embeds, indices, cu_seqlens, max_seqlen, *_ = _unpad_modernbert_input(
                        inputs=inputs_embeds, attention_mask=attention_mask
                    )
        else:
            if position_ids is None:
                position_ids = torch.arange(seq_len, device=device).unsqueeze(0)
            attention_mask, sliding_window_mask = self._update_attention_mask(
                attention_mask, output_attentions=output_attentions
            )

        hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds)

        for encoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                sliding_window_mask=sliding_window_mask,
                position_ids=position_ids,
                cu_seqlens=cu_seqlens,
                max_seqlen=max_seqlen,
                output_attentions=output_attentions,
            )
            hidden_states = layer_outputs[0]
            if output_attentions and len(layer_outputs) > 1:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        hidden_states = self.final_norm(hidden_states)

        if repad:
            hidden_states = _pad_modernbert_output(
                inputs=hidden_states, indices=indices, batch=batch_size, seqlen=seq_len
            )
            if all_hidden_states is not None:
                all_hidden_states = tuple(
                    _pad_modernbert_output(inputs=hs, indices=indices, batch=batch_size, seqlen=seq_len)
                    for hs in all_hidden_states
                )

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_attention_mask(self, attention_mask: torch.Tensor, output_attentions: bool) -> torch.Tensor:
        if output_attentions:
            if self.config._attn_implementation == "sdpa":
                logger.warning_once(
                    "Outputting attentions is only supported with the 'eager' attention implementation, "
                    'not with "sdpa". Falling back to `attn_implementation="eager"`.'
                )
                self.config._attn_implementation = "eager"
            elif self.config._attn_implementation != "eager":
                logger.warning_once(
                    "Outputting attentions is only supported with the eager attention implementation, "
                    f'not with {self.config._attn_implementation}. Consider setting `attn_implementation="eager"`.'
                    " Setting `output_attentions=False`."
                )

        global_attention_mask = _prepare_4d_attention_mask(attention_mask, self.dtype)

        # Create position indices and the pairwise distance between positions
        rows = torch.arange(global_attention_mask.shape[2]).unsqueeze(0)
        distance = torch.abs(rows - rows.T)

        # Sliding window mask: True for positions within the local window, False outside
        window_mask = (
            (distance <= self.config.local_attention // 2).unsqueeze(0).unsqueeze(0).to(attention_mask.device)
        )
        # Combine with the existing (padding) mask
        sliding_window_mask = global_attention_mask.masked_fill(window_mask.logical_not(), torch.finfo(self.dtype).min)

        return global_attention_mask, sliding_window_mask
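
# Illustrative sketch (not part of the library): the local-attention window built by
# _update_attention_mask. Positions farther apart than local_attention // 2 receive an
# additive -inf-like penalty. local_attention=4 below is made up for the example.
def _example_sliding_window_mask():
    seq_len, local_attention = 6, 4
    rows = torch.arange(seq_len).unsqueeze(0)
    distance = torch.abs(rows - rows.T)                    # |i - j| for every position pair
    window_mask = distance <= local_attention // 2         # True inside the +/- 2 token window
    min_value = torch.finfo(torch.float32).min
    return torch.zeros(seq_len, seq_len).masked_fill(~window_mask, min_value)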
class ModernBertPredictionHead(nn.Module):
    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size, config.classifier_bias)
        self.act = ACT2FN[config.classifier_activation]
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.norm(self.act(self.dense(hidden_states)))


@auto_docstring(
    custom_intro="""
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    """
)
class ModernBertForMaskedLM(ModernBertPreTrainedModel):
    _tied_weights_keys = ["decoder.weight"]

    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.config = config
        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=config.decoder_bias)

        self.sparse_prediction = self.config.sparse_prediction
        self.sparse_pred_ignore_index = self.config.sparse_pred_ignore_index

        self.post_init()

    def get_output_embeddings(self):
        return self.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear):
        self.decoder = new_embeddings

    @torch.compile(dynamic=True)
    def compiled_head(self, output: torch.Tensor) -> torch.Tensor:
        return self.decoder(self.head(output))

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None, cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None, batch_size: Optional[int] = None, seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None, **kwargs,
    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        sliding_window_mask, indices, cu_seqlens, max_seqlen, batch_size, seq_len:
            see `ModernBertModel.forward`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        if self.config._attn_implementation == "flash_attention_2":
            if indices is None and cu_seqlens is None and max_seqlen is None:
                if batch_size is None and seq_len is None:
                    if inputs_embeds is not None:
                        batch_size, seq_len = inputs_embeds.shape[:2]
                    else:
                        batch_size, seq_len = input_ids.shape[:2]
                device = input_ids.device if input_ids is not None else inputs_embeds.device

                if attention_mask is None:
                    attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

                if inputs_embeds is None:
                    with torch.no_grad():
                        input_ids, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_modernbert_input(
                            inputs=input_ids, attention_mask=attention_mask, position_ids=position_ids, labels=labels
                        )
                else:
                    inputs_embeds, indices, cu_seqlens, max_seqlen, position_ids, labels = _unpad_modernbert_input(
                        inputs=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, labels=labels
                    )

        outputs = self.model(
            input_ids=input_ids, attention_mask=attention_mask, sliding_window_mask=sliding_window_mask,
            position_ids=position_ids, inputs_embeds=inputs_embeds, indices=indices, cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen, batch_size=batch_size, seq_len=seq_len, output_attentions=output_attentions,
            output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if self.sparse_prediction and labels is not None:
            # flatten labels and output first
            labels = labels.view(-1)
            last_hidden_state = last_hidden_state.view(labels.shape[0], -1)

            # then filter out the non-masked tokens
            mask_tokens = labels != self.sparse_pred_ignore_index
            last_hidden_state = last_hidden_state[mask_tokens]
            labels = labels[mask_tokens]

        logits = (
            self.compiled_head(last_hidden_state)
            if self.config.reference_compile
            else self.decoder(self.head(last_hidden_state))
        )

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        if self.config._attn_implementation == "flash_attention_2":
            with nullcontext() if self.config.repad_logits_with_grad or labels is None else torch.no_grad():
                logits = _pad_modernbert_output(inputs=logits, indices=indices, batch=batch_size, seqlen=seq_len)

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
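
# Illustrative sketch (not part of the library): the sparse-prediction trick used by
# ModernBertForMaskedLM.forward. Only rows whose label is not the ignore index are kept
# before the large vocabulary projection. The tensors below are made up for the example.
def _example_sparse_mlm_selection():
    hidden_size, ignore_index = 4, -100
    last_hidden_state = torch.randn(2, 3, hidden_size)     # (batch, seq_len, hidden)
    labels = torch.tensor([[-100, 7, -100], [-100, -100, 2]])
    labels = labels.view(-1)
    flat_states = last_hidden_state.view(labels.shape[0], -1)
    mask_tokens = labels != ignore_index                    # keep only the masked positions
    return flat_states[mask_tokens], labels[mask_tokens]    # 2 rows instead of 6 reach the decoder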
@auto_docstring(
    custom_intro="""
    The ModernBert Model with a sequence classification head on top that performs pooling.
    """
)
class ModernBertForSequenceClassification(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None, cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None, batch_size: Optional[int] = None, seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None, **kwargs,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        sliding_window_mask, indices, cu_seqlens, max_seqlen, batch_size, seq_len:
            see `ModernBertModel.forward`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)

        if batch_size is None and seq_len is None:
            if inputs_embeds is not None:
                batch_size, seq_len = inputs_embeds.shape[:2]
            else:
                batch_size, seq_len = input_ids.shape[:2]
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_len), device=device, dtype=torch.bool)

        outputs = self.model(
            input_ids=input_ids, attention_mask=attention_mask, sliding_window_mask=sliding_window_mask,
            position_ids=position_ids, inputs_embeds=inputs_embeds, indices=indices, cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen, batch_size=batch_size, seq_len=seq_len, output_attentions=output_attentions,
            output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if self.config.classifier_pooling == "cls":
            last_hidden_state = last_hidden_state[:, 0]
        elif self.config.classifier_pooling == "mean":
            last_hidden_state = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(
                dim=1, keepdim=True
            )

        pooled_output = self.head(last_hidden_state)
        pooled_output = self.drop(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    """
)
class ModernBertForTokenClassification(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None, cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None, batch_size: Optional[int] = None, seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        sliding_window_mask, indices, cu_seqlens, max_seqlen, batch_size, seq_len:
            see `ModernBertModel.forward`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids=input_ids, attention_mask=attention_mask, sliding_window_mask=sliding_window_mask,
            position_ids=position_ids, inputs_embeds=inputs_embeds, indices=indices, cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen, batch_size=batch_size, seq_len=seq_len, output_attentions=output_attentions,
            output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        last_hidden_state = self.head(last_hidden_state)
        last_hidden_state = self.drop(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class ModernBertForQuestionAnswering(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None, end_positions: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None, cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None, batch_size: Optional[int] = None, seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None, **kwargs,
    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        sliding_window_mask, indices, cu_seqlens, max_seqlen, batch_size, seq_len:
            see `ModernBertModel.forward`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids, attention_mask=attention_mask, sliding_window_mask=sliding_window_mask,
            position_ids=position_ids, indices=indices, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen,
            batch_size=batch_size, seq_len=seq_len, output_attentions=output_attentions,
            output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        last_hidden_state = self.head(last_hidden_state)
        last_hidden_state = self.drop(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    """
)
class ModernBertForMultipleChoice(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.config = config

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, 1)

        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None, cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None, batch_size: Optional[int] = None, seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None, **kwargs,
    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
        sliding_window_mask, indices, cu_seqlens, max_seqlen, batch_size, seq_len:
            see `ModernBertModel.forward`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        self._maybe_set_compile()

        outputs = self.model(
            input_ids, attention_mask=attention_mask, sliding_window_mask=sliding_window_mask,
            position_ids=position_ids, inputs_embeds=inputs_embeds, indices=indices, cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen, batch_size=batch_size, seq_len=seq_len, output_attentions=output_attentions,
            output_hidden_states=output_hidden_states, return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if self.config.classifier_pooling == "cls":
            last_hidden_state = last_hidden_state[:, 0]
        elif self.config.classifier_pooling == "mean":
            last_hidden_state = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(
                dim=1, keepdim=True
            )

        pooled_output = self.head(last_hidden_state)
        pooled_output = self.drop(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "ModernBertModel",
    "ModernBertPreTrainedModel",
    "ModernBertForMaskedLM",
    "ModernBertForSequenceClassification",
    "ModernBertForTokenClassification",
    "ModernBertForQuestionAnswering",
    "ModernBertForMultipleChoice",
]