"""PyTorch UMT5 model."""

import copy
import math
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    DUMMY_INPUTS,
    DUMMY_MASK,
    auto_docstring,
    is_torch_flex_attn_available,
    is_torch_fx_proxy,
    is_torchdynamo_compiling,
    logging,
)
from ...utils.deprecation import deprecate_kwarg
from .configuration_umt5 import UMT5Config


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class UMT5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ b/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/umt5/modeling_umt5.pyr'   ?   s    
zUMT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv rR| | jj}| j| S )N   T)Zkeepdim)tor)   Zfloat32powmeanZrsqrtr,   r+   dtypefloat16Zbfloat16)r-   hidden_statesZvariancer2   r2   r3   forwardG   s
    zUMT5LayerNorm.forward)r%   )__name__
__module____qualname__r'   r<   __classcell__r2   r2   r0   r3   r$   >   s   r$   c                       s*   e Zd Zed fddZdd Z  ZS )UMT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r&   r'   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr	   dense_act_fnactr-   rC   r0   r2   r3   r'   Y   s
    
zUMT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr^|j| jjjkr^| jjjtj	kr^|
| jjj}| |}|S N)rJ   rP   rN   
isinstancerK   r+   r)   Tensorr9   int8r6   r-   r;   r2   r2   r3   r<   `   s    



zUMT5DenseActDense.forwardr=   r>   r?   r!   r'   r<   r@   r2   r2   r0   r3   rA   X   s   rA   c                       s*   e Zd Zed fddZdd Z  ZS )UMT5DenseGatedActDenserB   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S rD   )r&   r'   r   rG   rH   rI   wi_0wi_1rK   rL   rM   rN   r	   rO   rP   rQ   r0   r2   r3   r'   p   s    
zUMT5DenseGatedActDense.__init__c                 C   sz   |  | |}| |}|| }| |}t| jjtjrl|j	| jjj	krl| jjj	tj
krl|| jjj	}| |}|S rR   )rP   rY   rZ   rN   rS   rK   r+   r)   rT   r9   rU   r6   )r-   r;   Zhidden_geluZhidden_linearr2   r2   r3   r<   x   s    


zUMT5DenseGatedActDense.forwardrW   r2   r2   r0   r3   rX   o   s   rX   c                       s*   e Zd Zed fddZdd Z  ZS )UMT5LayerFFrB   c                    sJ   t    |jrt|| _n
t|| _t|j|jd| _	t
|j| _d S )Nr/   )r&   r'   Zis_gated_actrX   DenseReluDenserA   r$   rH   layer_norm_epsilon
layer_normr   rL   rM   rN   rQ   r0   r2   r3   r'      s    

zUMT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rR   )r_   r]   rN   )r-   r;   Zforwarded_statesr2   r2   r3   r<      s    

zUMT5LayerFF.forwardrW   r2   r2   r0   r3   r[      s   
r[   c                	       s   e Zd ZdZdee d fddZejejddd	Z	d
d Z
dddZedddddejeej eeej  eej eej eej dddZ  ZS )UMT5Attentionz7
    T5's attention using relative_attention_bias.
    """

    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended "
                "and will lead to errors during the forward call, if caching is used. Please make sure to provide "
                "a `layer_idx` when creating this class."
            )

        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()

    def _shape(self, projection: torch.Tensor) -> torch.Tensor:
        new_projection_shape = projection.size()[:-1] + (self.n_heads, self.key_value_proj_dim)
        # move heads to 2nd position: (batch, seq, heads * dim) -> (batch, heads, seq, dim)
        new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
        return new_projection

    def _relative_position_bucket(self, relative_position):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        num_buckets = self.relative_attention_num_buckets
        max_distance = self.relative_attention_max_distance
        if not self.is_decoder:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # the other half of the buckets are for logarithmically bigger bins up to max_distance
        log_ratio = torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact)
        log_ratio = log_ratio * (num_buckets - max_exact)
        relative_position_if_large = max_exact + log_ratio.to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        if cache_position is None:
            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        else:
            context_position = cache_position[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(relative_position)
        values = self.relative_attention_bias(relative_position_bucket)  # (query_length, key_length, n_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # (1, n_heads, query_length, key_length)
        return values

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
    ):
        batch_size, seq_length = hidden_states.shape[:2]

        # if encoder_hidden_states are provided, this layer is used as a cross-attention layer of the decoder
        is_cross_attention = encoder_hidden_states is not None

        query_states = self.q(hidden_states)
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        is_updated = False
        curr_past_key_value = None
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value states from the cache
                    curr_past_key_value = past_key_values.cross_attention_cache
                else:
                    curr_past_key_value = past_key_values.self_attention_cache
            else:
                curr_past_key_value = past_key_values

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse key/value states from the cross-attention cache
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k(current_states)
            value_states = self.v(current_states)
            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value states to the cache for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # flag that this layer's cross-attention cache is filled so it can be re-used
                if is_cross_attention:
                    past_key_values.is_updated[self.layer_idx] = True

        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        real_seq_length = cache_position[-1] + 1 if cache_position is not None else seq_length
        key_length = key_states.shape[-2]
        if not self.has_relative_attention_bias:
            position_bias = torch.zeros(
                (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
            )
        else:
            position_bias = self.compute_bias(
                real_seq_length, key_length, device=scores.device, cache_position=cache_position
            )
            position_bias = position_bias[:, :, -seq_length:, :]

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            position_bias = position_bias + causal_mask

        if self.pruned_heads:
            mask = torch.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        scores += position_bias_masked

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
        attn_output = self.o(attn_output)
        return attn_output, attn_weights
     r`   c                       s@   e Zd Zdee d fddZedddddd	d
Z  ZS )UMT5LayerSelfAttentionNra   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NTrd   rb   r\   )r&   r'   r`   SelfAttentionr$   rH   r^   r_   r   rL   rM   rN   r-   rC   rb   r0   r2   r3   r'   f  s    
zUMT5LayerSelfAttention.__init__r   r   r   r   c           	      C   sF   |  |}| j|||||d}|| |d  }|f|dd   }|S )Nr   r   r   r   r   r    )r_   r   rN   )	r-   r;   r   r   r   r   normed_hidden_statesattention_outputoutputsr2   r2   r3   r<   l  s    	
zUMT5LayerSelfAttention.forward)N)NNNN	r=   r>   r?   r   r   r'   r   r<   r@   r2   r2   r0   r3   r   e  s       r   c                       s@   e Zd Zdee d fddZedddddd	d
Z  ZS )UMT5LayerCrossAttentionNra   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFr   r\   )r&   r'   r`   EncDecAttentionr$   rH   r^   r_   r   rL   rM   rN   r   r0   r2   r3   r'     s    
z UMT5LayerCrossAttention.__init__r   r   r   r   c                 C   sH   |  |}| j||||||d}|| |d  }	|	f|dd   }
|
S )Nr   r   r   r   r   r   r    )r_   r   rN   )r-   r;   r   r   r   r   r   r   r   Zlayer_outputr   r2   r2   r3   r<     s    

zUMT5LayerCrossAttention.forward)N)NNNNNr   r2   r2   r0   r3   r     s        r   c                
       s@   e Zd Zdee d fddZedddddd
dZ  ZS )	UMT5BlockNra   c                    s^   t    |j| _t | _| jt||d | jrJ| jt||d | jt	| d S )Nra   )
r&   r'   rc   r   
ModuleListlayerappendr   r   r[   r   r0   r2   r3   r'     s    

zUMT5Block.__init__r   r   r   r   Fc                 C   sT  | j d |||||
d\}}|jtjkrdt|jj}tt| |d |}tj	|| |d}d }| j
ot|d u}|r| j d ||||||
d\}}|jtjkrt|jj}tt| |d |}tj	|| |d}| j d |}|jtjkr8t|jj}tt| |d |}tj	|| |d}|f}|	rP|||f7 }|S )Nr   r   i  )r}   maxr    r   r5   )r   r9   r)   r:   finfor   r   isinfanyclamprc   )r-   r;   r   r   encoder_attention_maskr   cross_attn_layer_head_maskr   	use_cacheoutput_attentionsr   Zself_attn_weightsZ	max_dtypeZclamp_valueZcross_attn_weightsZdo_cross_attentionr   r2   r2   r3   r<     sF    
	
	zUMT5Block.forward)N)	NNNNNNFFNr   r2   r2   r0   r3   r     s   
         r   c                       s:   e Zd ZdZed fddZejejdddZ  Z	S )UMT5ClassificationHeadz-Head for sentence-level classification tasks.rB   c                    sB   t    t|j|j| _tj|jd| _t|j|j	| _
d S )N)r   )r&   r'   r   rG   rH   denserL   classifier_dropoutrN   
num_labelsout_projrQ   r0   r2   r3   r'     s    
zUMT5ClassificationHead.__init__)r;   rv   c                 C   s6   |  |}| |}t|}|  |}| |}|S rR   )rN   r   r)   tanhr   rV   r2   r2   r3   r<     s    




zUMT5ClassificationHead.forward)
r=   r>   r?   r   r!   r'   r)   rT   r<   r@   r2   r2   r0   r3   r     s   r   c                   @   sJ   e Zd ZU eed< dZdZdZdgZdgZ	e
dd Zdd	 Zd
d ZdS )UMT5PreTrainedModelrC   transformerTr   rK   c                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r)   Ztensorr   r   )r-   r   Z
input_maskdummy_inputsr2   r2   r3   r     s    

z UMT5PreTrainedModel.dummy_inputsc                 C   s  | j j}t|tr(|jj|d  nt|ttt	t
fr|jjjjd|d d t|dr|| j js||jjjjd|d d t|dr|jjjjd|| j jd  d |jjj  n2t|trt|dr|jjjjd|d d |jjj  nt|tr|jjjjd|| j jd  d t|jdrL|jjd	urL|jjj  |jjjjd|| j jd  d t|jdr|jjd	ur|jjj  nPt|tr>|jjjjd|| j jd  d t|jdr|jjd	ur|jjj  |jjjjd|| j jd  d t|jdr|jjd	ur|jjj  nt|tr*|jjjjd|| j jd  d t|jdr|jjd	ur|jjj  |jjjjd|| j jd  d t|jdr|jjd	ur|jjj  |jjjjd|| j jd  d t|jdr|jjd	ur|jjj  nt|t r| j j}| j j!}| j j"}|j#jjjd||| d  d |j$jjjd||d  d |j%jjjd||d  d |j&jjjd||| d  d |j'r|j(jjjd||d  d d	S )
zInitialize the weights      ?        )r8   Zstdlm_head
qa_outputs      
classifierrF   N))rC   Zinitializer_factorrS   r$   r+   dataZfill_	UMT5ModelUMT5ForConditionalGenerationUMT5EncoderModelUMT5ForQuestionAnsweringsharedZnormal_hasattrtie_word_embeddingsr   r   rH   rF   Zzero_UMT5ForTokenClassificationr   r   r   r   rA   rJ   rK   rI   rX   rY   rZ   r`   rg   ri   rm   rn   ro   rp   rd   rr   )r-   modulefactorrH   rh   rj   r2   r2   r3   _init_weights  sn    

 

         z!UMT5PreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d u r tdt|rbt|jd d d |}tj||dd df gdd}n4|	|j}|dd df 
 |ddd f< ||d< |d u rtd||d	k| |S )
Nzself.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the pad_token_id. See UMT5 docs for more information.r5   )r    .r   r    ).r   z1self.model.config.pad_token_id has to be defined.)rC   decoder_start_token_idpad_token_id
ValueErrorr   r)   fullr   catZ	new_zeroscloneZmasked_fill_)r-   r   r   r   Zshifted_input_idsr2   r2   r3   _shift_rightU  s       z UMT5PreTrainedModel._shift_rightN)r=   r>   r?   r!   __annotations__Zbase_model_prefixZsupports_gradient_checkpointingZ_can_compile_fullgraphZ_no_split_modulesZ_keep_in_fp32_modulespropertyr   r   r   r2   r2   r2   r3   r     s   


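# Worked example of `_shift_right` above (editorial sketch, not part of the original module):
# with decoder_start_token_id=0 and pad_token_id=0,
#     labels         = [[42, 17, -100]]
#     shifted inputs = [[ 0, 42,   17]]
# tokens move one slot to the right, the start token fills slot 0, and the -100
# ignore-label marker is replaced by the pad token.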
class UMT5Stack(UMT5PreTrainedModel):
    def __init__(self, config, embed_tokens=None):
        super().__init__(config)
        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder
        self.block = nn.ModuleList([UMT5Block(config, layer_idx=i) for i in range(config.num_layers)])
        self.final_layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            if self.embed_tokens is None:
                raise ValueError("You have to initialize the model with valid token embeddings")
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        if use_cache is True and not self.is_decoder:
            raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")

        if self.is_decoder and use_cache and past_key_values is None:
            if self.config.is_encoder_decoder:
                past_key_values = EncoderDecoderCache(
                    DynamicCache(config=self.config), DynamicCache(config=self.config)
                )
            else:
                past_key_values = DynamicCache(config=self.config)
        elif not self.is_decoder:
            past_key_values = None

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None and not is_torchdynamo_compiling():
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        if self.config.is_decoder:
            causal_mask = self._update_causal_mask(
                attention_mask,
                inputs_embeds,
                cache_position,
                past_key_values.self_attention_cache
                if isinstance(past_key_values, EncoderDecoderCache)
                else past_key_values,
                output_attentions,
            )
        elif attention_mask is not None:
            causal_mask = attention_mask[:, None, None, :]
            causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
            causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min
        else:
            causal_mask = None

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.is_decoder else None

        hidden_states = self.dropout(inputs_embeds)

        for i, layer_module in enumerate(self.block):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                causal_mask,
                encoder_hidden_states,
                encoder_extended_attention_mask,
                layer_head_mask=layer_head_mask,
                cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                past_key_values=past_key_values,
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions += (layer_outputs[1],)
                if self.is_decoder:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_attentions, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we rely on its `is_causal` argument instead of its `attn_mask` argument,
        # in order to dispatch on Flash Attention 2.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, we generate a causal 4D mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, e.g. the relevant first rows when using
            # left padding; required by F.scaled_dot_product_attention's memory-efficient path.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # The mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

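# Shape sketch for the causal-mask helper above (editorial note, not part of the original
# module): with batch_size=2, sequence_length=3 and target_length=5, the returned mask has
# shape (2, 1, 3, 5); entry (q, k) is 0 where key k may be attended from query q, and the
# dtype's minimum value elsewhere (future positions and padding).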
 3 Dr   c                       s   e Zd ZU dZdZeed< ddgZ fddZdd	 Z	d
d Z
dd Zdd Zdd Zedeej eej eej eej eej eej eej eeeej   ee eej eej ee ee ee ee eej eeej ef dddZ  ZS )r   ao  
    Examples:

    ```python
    >>> from transformers import UMT5Model, AutoTokenizer

    >>> model = UMT5Model.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
    >>> label = "<extra_id_0> verhandelt"
    >>> inputs = tokenizer(noisy_text, return_tensors="pt")
    >>> labels = tokenizer(text_target=label, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```"""

    model_type = "umt5"
    config: UMT5Config
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.tie_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.tie_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   Z	attentionprune_headsr-   Zheads_to_pruner   Zheadsr2   r2   r3   _prune_heads  s    zUMT5Model._prune_headsN)r   r   r   r   r  decoder_head_maskr  encoder_outputsr   r   decoder_inputs_embedsr   r   r   r  r   rv   c                 C   s   |dur|n| j j}|dur |n| j j}|du rJ| j|||
||||d}nH|rt|tst|d t|dkrt|d ndt|dkr|d ndd}|d }| j||||	|||||||||d}|s|| S t|j	|j
|j|j|j|j	|j|jdS )	a+  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5Model.from_pretrained("google/umt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
        >>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr   r   r   r  r   r   r  r   r    r4   r   r;   r   r   r   r   r   r   r   r  r  r   r   r   r  r   )r   r   decoder_hidden_statesdecoder_attentionsr   encoder_last_hidden_stater   encoder_attentions)rC   r   r  r  rS   r   lenr  r   r   r   r;   r   r   )r-   r   r   r   r   r  r0  r  r1  r   r   r2  r   r   r   r  r   r;   decoder_outputsr2   r2   r3   r<     s\    Q	zUMT5Model.forward)NNNNNNNNNNNNNNNN)r=   r>   r?   r   
model_typer!   r   _tied_weights_keysr'   r$  r   r(  r*  r/  r   r   r)   
LongTensorFloatTensor
BoolTensorrT   r   r
   r   r   r   r<   r@   r2   r2   r0   r3   r     s\   
                r   z<
    UMT5 Model with a `language modeling` head on top.
    )Zcustom_introc                       s  e Zd ZdZdZg dZ fddZdd Zdd	 Zd
d Z	dd Z
edeej eej eej eej eej eej eej eeeej   ee eej eej eej ee ee ee ee eej eeej ef dddZejdddZ  ZS )r   a  
    Examples:

    ```python
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```"""

    model_type = "umt5"
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model_dim = config.d_model
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.tie_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.tie_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
        >>> outputs = model.generate(input_ids)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab (Mesh TensorFlow convention)
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to correct device to enable PP
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)


@auto_docstring
class UMT5EncoderModel(UMT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import UMT5EncoderModel, AutoTokenizer

    >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
    ```"""

    model_type = "umt5"
    _tied_weights_keys = ["encoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs


@auto_docstring(
    custom_intro="""
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """
)
class UMT5ForSequenceClassification(UMT5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: UMT5Config):
        super().__init__(config)
        self.transformer = UMT5Model(config)
        self.classification_head = UMT5ClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()
        self.model_parallel = False

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        # Unlike other models, T5/UMT5 automatically creates decoder_input_ids from input_ids
        # if no decoder inputs are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        eos_mask = input_ids.eq(self.config.eos_token_id).to(sequence_output.device)

        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        batch_size, _, hidden_size = sequence_output.shape
        sentence_representation = sequence_output[eos_mask, :].view(batch_size, -1, hidden_size)[:, -1, :]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

@auto_docstring
class UMT5ForTokenClassification(UMT5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _tied_weights_keys = ["transformer.encoder.embed_tokens.weight"]

    def __init__(self, config: UMT5Config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = UMT5EncoderModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        N)r   r  r   r   r   r  r   r5   r4   )rE  rF  r;   r   )rC   r  r   rN   r   r   rx   r   r   r;   r   )r-   r   r   r  r   rB  r   r   r  r   r;   rF  rE  rH  rI  r2   r2   r3   r<     s4    


@auto_docstring
class UMT5ForQuestionAnswering(UMT5PreTrainedModel):
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model_dim = config.d_model
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        self.num_labels = config.num_labels
        self.qa_outputs = nn.Linear(config.d_model, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.Tensor]]] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqQuestionAnsweringModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if start_positions is not None and end_positions is not None:
            use_cache = False

        # Unlike other models, T5/UMT5 automatically creates decoder_input_ids from input_ids
        # if no decoder inputs are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=None,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1).to(start_logits.device)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1).to(end_logits.device)
            # sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + decoder_outputs[1:] + encoder_outputs
            return ((total_loss,) + output) if total_loss is not None else output

        return Seq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


__all__ = [
    "UMT5EncoderModel",
    "UMT5ForConditionalGeneration",
    "UMT5ForQuestionAnswering",
    "UMT5ForSequenceClassification",
    "UMT5ForTokenClassification",
    "UMT5Model",
    "UMT5PreTrainedModel",
]