a
    h                    @   s  d Z ddlZddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- e& r$ddl.m/Z/ ddl0m1Z1 e)2e3Z4dXej5e6e6e6ej5dddZ7ej5e6e6ej5dddZ8dYej5e6e6e6ej5dddZ9e6ej5ddd Z:ej5e6ej5d!d"d#Z;ej5e6ej<ej5d$d%d&Z=ej5e6e>ej5ej5f d'd(d)Z?ej5e6ej5d'd*d+Z@ej5ej5e6ej5d,d-d.ZAG d/d0 d0e	jBZCzdd1lDmEZE eEZCe4Fd2 W n0 eGyT   Y n eHyp   e4Id3 Y n0 G d4d5 d5e	jBZJG d6d7 d7e	jBZKG d8d9 d9e	jBZLG d:d; d;e	jBZMG d<d= d=e	jBZNG d>d? d?e	jBZOG d@dA dAe	jBZPG dBdC dCe	jBZQG dDdE dEe	jBZRG dFdG dGe	jBZSG dHdI dIeZTe%G dJdK dKeZUG dLdM dMeUZVdNZWe%G dOdP dPeUZXe%dQdRG dSdT dTeUeZYe%G dUdV dVeUZZg dWZ[dS )ZzPyTorch LongT5 model.    N)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging)deprecate_kwarg   )LongT5Config)	BlockMask)make_flex_block_causal_mask)x	block_lendim	pad_valuereturnc                 C   s   | j |  | }t| j sDt| j }||  |7  < tj|| jdS dg| j }d|f||< t|ddd d}tj	j
| |d|d} | S )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr(   ndimsumr   
functionalr.   )r"   r#   r$   r%   Zpad_lenZ	new_shaper.   r+   r+   f/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multiple@   s    

r:   )r"   r#   r$   r&   c                 C   s~   | j | | dkr"t| ||dd} | j | | }| j d| ||f | j |d d  }d|v rttj|| j| jdS | |S )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
    r   )r%   Nr   r(   device)r1   r:   r4   emptyr(   r<   reshape)r"   r#   r$   
num_blocksZoutput_shaper+   r+   r9   _split_into_blocksP   s    (r@   )r"   	block_dimsequence_dimr%   r&   c           	      C   s   | j | }dg| j }d||< t|ddd d}tjj| |d|d} g }tdD ]>}td	dg| j }t||| ||< t|}|	| |  qRt
j||d
S )zConcatenate three consecutive blocks for each input block for local attentiont.

    For more information, see: https://huggingface.co/papers/2112.07916.
    r)   )r   r   Nr*   r+   r,   r-   r   r   r$   )r1   r6   r7   r   r8   r.   rangeslicetupleappendr4   cat)	r"   rA   rB   r%   r?   r.   Zblocks_listiindicesr+   r+   r9   _concatenate_3_blocks_   s    
rK   )r#   r&   c                 C   s:   t jd|  t jd}|| |   }|d|d }|S )z:Makes 3-blocked relative position ids for local attention.r   r'   r   r   )r4   arangeZint32	unsqueeze)r#   Zposition_idsZcenter_position_idsrelative_position_idsr+   r+   r9   "_make_3block_relative_position_idsx   s    rO   )local_attention_maskr#   r&   c                 C   sF   t |}t||k }|ddddddf }|| j}t| |S )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)rO   r4   abstor<   logical_and)rP   r#   rN   Zlocality_maskr+   r+   r9   _mask_local_attention_mask   s
    rT   )attention_maskr#   r<   r&   c                 C   sV   t | |dd}t|ddd}|d}|d}t||}t||}|d|S )z;Prepare attention mask to be applied for a local attention.r   rC      rA   rB   r*   )r@   rK   rM   r4   rS   rT   rR   )rU   r#   r<   Z_blocked_attention_maskZ_3blocked_attention_maskrP   r+   r+   r9   _get_local_attention_mask   s    


rY   )rU   global_block_sizer&   c                    s\  | j dd \}tjtjd fdd}tj| | jd  }tj|dd| }t| d	kd
d| j}t	|| d
 | j}tj
d|j|jd}t||k||}||  | d  }||}  }|dkrtj|ddj|ddd}	ntj|d|j|jd}	tjt||ddd }
|
| j}
t|
|	kdd}
|tj|
tjfS )a  Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformr implementation adopted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
    the whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
    NrV   )	block_idsr&   c                    sd   t    d k}|| j}t || dk}|dd| jd }t 	| |k | |} | S )Nr   r   r*   )
r4   rL   rR   r<   rS   r7   rM   typer(   where)r[   Z
block_endsZtrue_block_endsZfull_blocksrZ   Zseq_lenr+   r9   handle_orphan_tokens   s    z:_make_global_fixed_block_ids.<locals>.handle_orphan_tokensr<   r   )Zaxis              ?g     @r*   r;   r   rC   )r1   r4   TensorZ	ones_liker<   Zcumsumr]   r\   r(   floortensormaxvaluesrepeat	transposer5   onesrR   int)rU   rZ   
batch_sizer_   Zfixed_block_maskmaskZglobal_block_idsZ_global_block_ids_lower_boundZnum_globalsZ_sequence_block_ids_maxglobal_segment_idsr+   r^   r9   _make_global_fixed_block_ids   s,    
"ro   c                 C   s@   t | |\}}|jd }tj||jd}||d  }|tjS )zBCreate the relative position tensor for local -> global attention.r*   r`   .N)ro   r1   r4   rL   r<   r\   int64)rU   rZ   r[   rn   global_seq_lenZglobal_positionsside_relative_positionr+   r+   r9    _make_side_relative_position_ids   s
    
rt   )hidden_statesr[   rr   r&   c                 C   sf   | |dktj||j|jd}tj|tj	|d ddddddf }t
d| || jS )zFCompute individual block aggregates by summing over individual blocks.r   r;   r   Nr*   z...nd,...ng->...gd)r]   r4   re   r(   r<   r   r8   Zone_hotr\   rq   einsum)ru   r[   rr   Zone_hot_block_idsr+   r+   r9   _create_global_aggregates   s
    0rw   c                       s&   e Zd Zd fdd	Zdd Z  ZS )LongT5LayerNormư>c                    s&   t    tt|| _|| _dS )zg
        Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parameterr4   rj   weightvariance_epsilon)selfZhidden_sizeeps	__class__r+   r9   r{      s    
zLongT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv rR| | jj}| j| S )NrV   r*   T)Zkeepdim)rR   r4   Zfloat32powmeanZrsqrtr~   r}   r(   float16Zbfloat16)r   ru   Zvariancer+   r+   r9   forward   s
    zLongT5LayerNorm.forward)ry   )__name__
__module____qualname__r{   r   __classcell__r+   r+   r   r9   rx      s   rx   )FusedRMSNormzSDiscovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNormzFdiscovered apex but it failed to load, falling back to LongT5LayerNormc                       s*   e Zd Zed fddZdd Z  ZS )LongT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)rz   r{   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr   r   r   r+   r9   r{   
  s
    
zLongT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr^|j| jjjkr^| jjjtj	kr^|
| jjj}| |}|S N)r   r   r   
isinstancer   r}   r4   rc   r(   Zint8rR   )r   ru   r+   r+   r9   r     s    



zLongT5DenseActDense.forwardr   r   r   r   r{   r   r   r+   r+   r   r9   r   	  s   r   c                       s*   e Zd Zed fddZdd Z  ZS )LongT5DenseGatedActDenser   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S r   )rz   r{   r   r   r   r   wi_0wi_1r   r   r   r   r   r   r   r   r   r+   r9   r{      s    
z!LongT5DenseGatedActDense.__init__c                 C   s:   |  | |}| |}|| }| |}| |}|S r   )r   r   r   r   r   )r   ru   Zhidden_geluZhidden_linearr+   r+   r9   r   (  s    


z LongT5DenseGatedActDense.forwardr   r+   r+   r   r9   r     s   r   c                       s*   e Zd Zed fddZdd Z  ZS )LongT5LayerFFr   c                    sJ   t    |jrt|| _n
t|| _t|j|jd| _	t
|j| _d S )Nr   )rz   r{   Zis_gated_actr   DenseReluDenser   rx   r   layer_norm_epsilon
layer_normr   r   r   r   r   r   r+   r9   r{   3  s    

zLongT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S r   )r   r   r   )r   ru   Zforwarded_statesr+   r+   r9   r   =  s    

zLongT5LayerFF.forwardr   r+   r+   r   r9   r   2  s   
r   c                
       sb   e Zd Zdeee d fddZdd ZedddZ	dddZ
edddddddZ  ZS )LongT5AttentionFN)r   	layer_idxc                    s  t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r| jrtd| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrt| j| j
| _t | _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )rz   r{   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biassetpruned_headsgradient_checkpointingr   r   r   r   r   r+   r9   r{   F  s.    
zLongT5Attention.__init__c                 C   s   t |dkrd S t|| j| j| j\}}t| j|| _t| j|| _t| j|| _t| j	|dd| _	| jt | | _| j| j | _
| j|| _d S Nr   r   rC   lenr   r   r   r   r   r   r   r   r   r   unionr   headsindexr+   r+   r9   prune_headsi  s    zLongT5Attention.prune_headsT       c                 C   s   d}|r4|d }|| dk tj| 7 }t| } nt| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   rV   r   rR   r4   longrQ   minZ
zeros_likelogfloatmathZ	full_liker]   relative_positionbidirectionalnum_bucketsmax_distanceZrelative_bucketsZ	max_exactZis_smallZrelative_position_if_larger+   r+   r9   _relative_position_buckety  s,    z)LongT5Attention._relative_position_bucketc           
      C   s   |du r| j jj}|du r:tj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|| j | j	| j
d}|  |}	|	g dd}	|	S )%Compute binned relative position biasNr;   r   r   r   rV   r   r   r   )r   r}   r<   r4   rL   r   rR   r   r   r   r   permuterM   )
r   query_length
key_lengthr<   cache_positioncontext_positionmemory_positionr   relative_position_bucketrg   r+   r+   r9   compute_bias  s     
 
zLongT5Attention.compute_biaspast_key_valuepast_key_values4.58new_nameversionc                 C   s  |j dd \}}|du}| |}||d| j| jdd}|durtt|trt|j	| j
}|rl|j}qx|j}n|}|r|n|}|r|dur|r|j| j
 j}|j| j
 j}n| |}| |}||d| j| jdd}||d| j| jdd}|durB|s|
nd}
|||| j
d|
i\}}|rBd|j| j
< t||dd}|du r0|j d }|durx|n
|
d d }| jstjd| j||f|j|jd	}| jr| jrd|_n6| j|||j|
d
}|dddd| dddf }|dur0|ddddddd|j d f }|| }| jrlt|j d }d|t| j< |dd|  f }n|}||7 }t!j"j#|$ dd%|}t!j"j&|| j&| jd}|dur|| }t||}|dd' }||d| j(}| )|}||f}|	r||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        NrV   r*   r   r   Tr   rX   r<   r(   )r<   r   r   rC   ptraining)*r1   r   viewr   r   ri   r   r   
is_updatedgetr   Zcross_attention_cacheself_attention_cacheZlayerskeysrg   r   r   updater4   matmulr   r5   r<   r(   r   r   requires_gradr   r   rj   r3   boolr   r8   softmaxr   type_asr   
contiguousr   r   )r   ru   rm   key_value_statesposition_biasr   layer_head_maskr   	use_cacheoutput_attentionsr   rl   
seq_lengthZis_cross_attentionquery_statesr   Zcurr_past_key_valueZcurrent_states
key_statesvalue_statesscoresr   Zreal_seq_lengthcausal_maskZposition_bias_maskedattn_weightsattn_outputoutputsr+   r+   r9   r     sx    






"
&


zLongT5Attention.forward)FN)Tr   r   )NN)	NNNNNNFFN)r   r   r   r   r   rk   r{   r   staticmethodr   r   r   r   r   r+   r+   r   r9   r   E  s*     #/
         r   c                       sV   e Zd Zdeedd fddZdd ZedddZe	dddZ
dddZ  ZS )LongT5LocalAttentionFNr   r   r&   c                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| jd | _|j| _| j
| j | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrt| j| j
| _t | _d| _d S )Nr   Fr   )rz   r{   r   r   r   r   r   r   r   r   r   local_radiusr#   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r+   r9   r{   .  s(    
zLongT5LocalAttention.__init__c                 C   s   t |dkrd S t|| j| j| j\}}t| j|| _t| j|| _t| j|| _t| j	|dd| _	| jt | | _| j| j | _
| j|| _d S r   r   r   r+   r+   r9   r   H  s    z LongT5LocalAttention.prune_headsTr   r   c                 C   s   d}|r4|d }|| dk tj| 7 }t| } nt| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S r   r   r   r+   r+   r9   r   X  s,    z.LongT5LocalAttention._relative_position_bucketblock_lengthc                 C   s   | j jjjdkr| j jjnd}tjd| tj|d}|||  }|dddf |dddf  }| j|| j | j	| j
d}|  |}|g ddd}|S r   metaNr   r;   r   r   r   r   r}   r<   r\   r4   rL   r   r   r   r   r   r   rM   r   r  Ztarget_devicer   r   r   r   rg   r+   r+   r9   r     s      
z!LongT5LocalAttention.compute_biasc                    s  |j d d \ } fdd} fdd}||}	||}
||}t|	jdd}	t|
jdd}
t|jdd}t|
ddd}
t|ddd}td	|	|
}|d u r6j	stj
ddjjd
j f|j|jd}jrjrd|_nj}|d ur6t|dkdd}||dd }||7 }tjj| dd|}tjj|jjd}|d ur|| }||j}|td||}|d d d |d d f }|}||f}|r||f }|S )NrV   c                    s   |   djjS Z
projectionr*   r   r   r   Zstatesrl   r   r+   r9   r1     s    z+LongT5LocalAttention.forward.<locals>.shapec                    s   |    djS r>   r*   r   r   r   r  r  r+   r9   unshape  s    z-LongT5LocalAttention.forward.<locals>.unshaper   rC   rW   ...qhd,...khd->...hqkr   r   Tr   ra       _r*   r   ...hqk,...khd->...qhd)r1   r   r   r   r@   r#   rK   r4   rv   r   r5   r   r<   r(   r   r   r   r   r]   ri   r   r8   r   r   r   r   r\   r   )r   ru   rm   r   r   r   r   r1   r  r   r   r   r   r  r  r  r+   r  r9   r     sP    




zLongT5LocalAttention.forward)F)Tr   r   )NNNF)r   r   r   r   r   r{   r   r  r   rk   r   r   r   r+   r+   r   r9   r  -  s   /    r  c                       sn   e Zd Zdeedd fddZdd ZedddZe	dddZ
ejejejdddZdddZ  ZS )LongT5TransientGlobalAttentionFNr  c                    s  t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| jd | _|j| _|j| _| j
| j | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrt| j| j
| _t | _| jrt| j| j
| _t|j|jd| _d S )Nr   Fr   r   )rz   r{   r   r   r   r   r   r   r   r   r   r  r#   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   global_relative_attention_biasrx   r   global_input_layer_normr	  r   r+   r9   r{     s.    
z'LongT5TransientGlobalAttention.__init__c                 C   s   t |dkrd S t|| j| j| j\}}t| j|| _t| j|| _t| j|| _t| j	|dd| _	| jt | | _| j| j | _
| j|| _d S r   r   r   r+   r+   r9   r     s    z*LongT5TransientGlobalAttention.prune_headsTr   r   c                 C   s   d}|r4|d }|| dk tj| 7 }t| } nt| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S r   r   r   r+   r+   r9   r     s,    z8LongT5TransientGlobalAttention._relative_position_bucketr
  c                 C   s   | j jjjdkr| j jjnd}tjd| tj|d}|||  }|dddf |dddf  }| j|| j | j	| j
d}|  |}|g ddd}|S r  r  r  r+   r+   r9   r   P  s      
z+LongT5TransientGlobalAttention.compute_bias)rm   rn   r&   c                 C   s   t |d |d d d d d f d d d df }t |dkdd}t|| j}| j|| j | j| jd}| 	|}|
g d}|| }|S )Nrp   .r   ra   r  r   )r   r   r   rV   )r4   eqr]   rt   rZ   r   r   r   r   r  r   )r   rm   rn   Zside_attention_maskZattention_side_biasrs   Zside_relative_position_bucketZ	side_biasr+   r+   r9   compute_side_biash  s    0
z0LongT5TransientGlobalAttention.compute_side_biasc                    s:  |j d d \ } fdd} fdd}t|d ur<|nt|j d d j\}	}
|
j d }t||	|}|}||}||}|	|}||}|	|}t
|jdd}t
|jdd}t
|jdd}t|ddd	}t|ddd	}dg|jd  }|j d |d< |d|}|d|}tj||gdd}tj||gdd}td
||}|d urt|j|j}t|dkdd}nd }|d u rjs tjddjjdj f|j|jd}jrjrd|_nj}|d ur&||dd }||j}|d u rHt |}||
}t
|jdddd}||j |j}tj||gdd}||7 }t!j"j#|$ dd%|}t!j"j&|j&jd}|d ur|| }||j}|td||}|d d d |d d f }'|}||f}|r6||f }|S )NrV   c                    s   |   djjS r  r  r  r  r+   r9   r1     s    z5LongT5TransientGlobalAttention.forward.<locals>.shapec                    s   |    djS r  r  r  r  r+   r9   r    s    z7LongT5TransientGlobalAttention.forward.<locals>.unshaper*   r   rC   rW   r  r   ra   r  r   r   TrX   r   r  )(r1   ro   r4   rj   rZ   rw   r  r   r   r   r@   r#   rK   r6   rM   rh   rH   rv   rY   r<   r]   r   r5   r   r(   r   r   r   r   ri   r\   r  rR   r   r8   r   r   r   r   r   )r   ru   rm   r   r   r   r   r1   r  r[   rn   Z_global_seq_lenZglobal_inputsr   r   r   Zside_key_statesZside_value_statesZrepsr   rP   Zside_position_biasr  r  r  r+   r  r9   r   }  s~    








z&LongT5TransientGlobalAttention.forward)F)Tr   r   )NNNF)r   r   r   r   r   r{   r   r  r   rk   r   r4   rc   r  r   r   r+   r+   r   r9   r    s   /    r  c                       s@   e Zd Zdee d fddZedddd	dd
dZ  ZS )LongT5LayerSelfAttentionFNr   c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nr   r   r   )rz   r{   r   SelfAttentionrx   r   r   r   r   r   r   r   r   r   r+   r9   r{     s    
z!LongT5LayerSelfAttention.__init__r   r   r   r   c	              
   C   sL   |  |}	| j|	|||||||d}
|| |
d  }|f|
dd   }|S )N)rm   r   r   r   r   r   r   r   r   )r   r"  r   )r   ru   rU   r   r   r   r   r   r   normed_hidden_statesattention_outputr  r+   r+   r9   r     s    

z LongT5LayerSelfAttention.forward)FN)NNNNFFN	r   r   r   r   rk   r{   r   r   r   r+   r+   r   r9   r    s          r  c                       s<   e Zd ZdZd
ee d fddZdeddd	Z  Z	S )LongT5LayerLocalSelfAttentionz$Local self attention used in encoderFNr   c                    s<   t    t||d| _t|j|jd| _t	|j
| _d S N)r   r   )rz   r{   r  LocalSelfAttentionrx   r   r   r   r   r   r   r   r   r   r+   r9   r{     s    
z&LongT5LayerLocalSelfAttention.__init__kwargsc           
      K   sF   |  |}| j|||||d}|| |d  }|f|dd   }	|	S N)rm   r   r   r   r   r   )r   r(  r   
r   ru   rU   r   r   r   r*  r#  r$  r  r+   r+   r9   r   $  s    	
z%LongT5LayerLocalSelfAttention.forward)FN)NNNF
r   r   r   __doc__r   rk   r{   r   r   r   r+   r+   r   r9   r&    s   	    r&  c                       s<   e Zd ZdZd
ee d fddZdeddd	Z  Z	S )'LongT5LayerTransientGlobalSelfAttentionz/Transient-Global self attention used in encoderFNr   c                    s<   t    t||d| _t|j|jd| _t	|j
| _d S r'  )rz   r{   r  TransientGlobalSelfAttentionrx   r   r   r   r   r   r   r   r   r   r+   r9   r{   =  s    
z0LongT5LayerTransientGlobalSelfAttention.__init__r)  c           
      K   sF   |  |}| j|||||d}|| |d  }|f|dd   }	|	S r+  )r   r0  r   r,  r+   r+   r9   r   E  s    	
z/LongT5LayerTransientGlobalSelfAttention.forward)FN)NNNFr-  r+   r+   r   r9   r/  :  s       r/  c                	       s@   e Zd Zdee d fddZedddddd
dZ  ZS )LongT5LayerCrossAttentionNr   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFr!  r   )rz   r{   r   EncDecAttentionrx   r   r   r   r   r   r   r   )r   r   r   r   r+   r9   r{   ]  s    
z"LongT5LayerCrossAttention.__init__r   r   r   r   Fc                 C   sP   |  |}| j|||||||||	|
d
}|| |d  }|f|dd   }|S )N)	rm   r   r   r   r   r   r   r   r   r   r   )r   r2  r   )r   ru   r   rU   r   r   r   r   r   r   r   r#  r$  Zlayer_outputr  r+   r+   r9   r   c  s     
z!LongT5LayerCrossAttention.forward)N)NNNNFNFNr%  r+   r+   r   r9   r1  \  s           r1  c                       s@   e Zd Zdee d fddZedddd	dddZ  ZS )LongT5BlockFNr   c                    s   t    |j| _|jrt}n2|jdkr.t}n"|jdkr>t}ntd|j dt	 | _
| j
||||d | jr| j
t||d | j
t| d S )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .r!  r   )rz   r{   r   r  encoder_attention_typer&  r/  
ValueErrorr   
ModuleListlayerrG   r1  r   )r   r   r   r   Zattention_layerr   r+   r9   r{     s(    



zLongT5Block.__init__r   r   r   r   Tc                 C   s^  | j d |||||	|
||d}|d }|dd  }|jtjkrpt| rpt|jjd }tj|| |d}| j	o||d u}|r| j d ||||||	|d d |
||d
}|d }|jtjkrt| rt|jjd }tj|| |d}||dd   }| j d |}|jtjkrTt| rTt|jjd }tj|| |d}|f| S )Nr   )rU   r   r   r   r   r   r   r   i  )r   rf   r*   )	r   rU   r   r   r   r   r   r   r   )
r9  r(   r4   r   isinfanyfinforf   clampr   )r   ru   rU   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   cross_attn_layer_head_maskr   r   r   return_dictr   Zself_attention_outputsZattention_outputsZclamp_valueZdo_cross_attentionZcross_attention_outputsr+   r+   r9   r     sP    

zLongT5Block.forward)FN)NNNNNNNNFFTNr%  r+   r+   r   r9   r3    s               r3  c                   @   sD   e Zd ZU eed< dZdZdgZdZe	dd Z
dd	 Zd
d ZdS )LongT5PreTrainedModelr   ZtransformerTr3  Fc                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r4   re   r   r   )r   rE  Z
input_maskdummy_inputsr+   r+   r9   rG    s    

z"LongT5PreTrainedModel.dummy_inputsc                 C   s  | j j}t|tr(|jj|d  nt|ttt	fr~|j
jjjd|d d t|drz| j jsz|jjjjd|d d nzt|tr|jjjjd|| j jd  d t|jdr|jjdur|jjj  |jjjjd|| j jd  d t|jdr|jjdur|jjj  nt|tr
|jjjjd|| j jd  d t|jdrt|jjdurt|jjj  |jjjjd|| j jd  d t|jdr|jjdur|jjj  |jjjjd|| j jd  d t|jdr|jjdur|jjj  nt|tttfr| j j}| j j}| j j}|jjjjd||| d  d |jjjjd||d  d |j jjjd||d  d |j!jjjd||| d  d |j"r|j#jjjd||d  d t|tr|j$jjjd||d  d dS )zInitialize the weightsrb   ra   )r   Zstdlm_head      r   N)%r   Zinitializer_factorr   rx   r}   dataZfill_LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelsharedZnormal_hasattrtie_word_embeddingsrH  r   r   r   r   Zzero_r   r   r   r   r   r   r  r  r   r   r   r   r   r   r   r   r  )r   modulefactorr   r   r   r+   r+   r9   _init_weights  sL    
       
z#LongT5PreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d u r tdt|rbt|jd d d |}tj||dd df gdd}n4|	|j}|dd df 
 |ddd f< ||d< |d u rtd||d	k| |S )
Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information.r*   )r   .rC   r   ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idr7  r   r4   fullr1   rH   Z	new_zeroscloneZmasked_fill_)r   rE  rU  rV  Zshifted_input_idsr+   r+   r9   _shift_right)  s       z"LongT5PreTrainedModel._shift_rightN)r   r   r   r   __annotations__Zbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_can_compile_fullgraphpropertyrG  rS  rY  r+   r+   r+   r9   rC    s   


1rC  c                       sx   e Zd Zd fdd	Zdd ZdddZdeejd	f ejeje	e
d
ddZeejeeejejedddZ  ZS )LongT5StackNc                    s   t    t j j| _|d ur0|j| j_ j| _ j	| _	| j	d | _
t fddt jD | _t j jd| _t j| _d| _|   d S )Nr   c                    s"   g | ]}t  t|d k|dqS )r   r!  )r3  r   ).0rI   r   r+   r9   
<listcomp>R  s   z(LongT5Stack.__init__.<locals>.<listcomp>r   F)rz   r{   r   r   
vocab_sizer   embed_tokensr}   r   r  r#   r8  rD   
num_layersblockrx   r   final_layer_normr   r   r   r   	post_init)r   r   r`  r   r   r9   r{   F  s     

zLongT5Stack.__init__c                 C   s
   || _ d S r   )r`  r   Znew_embeddingsr+   r+   r9   set_input_embeddings`  s    z LongT5Stack.set_input_embeddingsc           %      C   s
  |	d ur|	n| j j}	|
d ur |
n| j j}
|d ur4|n| j j}|d urH|n| j j}|d ur|d ur| jrjdnd}td| d| dn`|d ur| }|d|d }n>|d ur| d d }n$| jrdnd}td| d| d	| j	r
| j
r
|	r
td
 d}	|d u r2| jd us(J d| |}|\}}| jr|	r|d u r| j jrxtt| j dt| j d}nt| j d}n| jsd }|d ur| nd}|d u rtj||| |jd}|d u rt s|| }tj|||jd}| jr"| |||t|tr|jn||
}n$| j jdkrBt|| j|j}n|}| jr|d ur| \}}}||f}|d u rtj||jd}| |}nd }| || j j}| || j j}|rdnd }|
rdnd }|
r| jrdnd }d }d }|  |}t!| j"D ]\} }!||  }"||  }#|r.||f }|!|||||||"|#||	|
||d}$|$d }|$d }| jr|d ur|$|
r~dnd }|
r||$d f }| jr||$d f }q| #|}|  |}|r||f }|st$dd |||||fD S t%|||||dS )NZdecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer*   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsr   r   r`   r4  r+   )r   rA  r   r   r   rB  r   r   r   rV      c                 s   s   | ]}|d ur|V  qd S r   r+   )r]  r   r+   r+   r9   	<genexpr>  s   z&LongT5Stack.forward.<locals>.<genexpr>)last_hidden_stater   ru   
attentionscross_attentions)&r   r   r   output_hidden_statesuse_return_dictr   r7  sizer   r   r   r   r   r`  Zis_encoder_decoderr   r
   get_seq_lengthr4   rL   r<   r   rj   _update_causal_maskr   r   r6  rY   r#   Zinvert_attention_maskZget_head_maskra  r   	enumeraterb  rc  rF   r   )%r   rE  rU   r>  r?  rh  	head_maskcross_attn_head_maskr   r   r   rn  rB  r   Zerr_msg_prefixZinput_shaperl   r   past_key_values_lengthZmask_seq_lengthr  Zencoder_batch_sizeZencoder_sequence_length_Zencoder_hidden_shapeZencoder_extended_attention_maskZall_hidden_statesZall_attentionsZall_cross_attentionsr   r@  ru   rI   Zlayer_moduler   rA  Zlayer_outputsr+   r+   r9   r   c  s    











zLongT5Stack.forwardFr    )rU   input_tensorr   r   r   c                 C   sB  | j jdkr(|d ur$|dk r$|S d S | j jdkrLt|tjrHt|}|S |d ur\| nd}|d urn|jnd}| j jdkr|s|st	j
|||| jdrd S |j}|jd }	|r| }
n"t|tjr|jd	 n
||	 d }
| j||	|
|||jd d
}| j jdkr>|d ur>|jjdv r>|s>t|j}t	||}|S )NZflash_attention_2ra   Zflex_attentionr   FZsdpa)rh  rv  Zis_trainingr   r*   )sequence_lengthtarget_lengthr(   r   rl   )cudaZxpuZnpu)r   Z_attn_implementationr;  r   r4   rc   r!   rq  Zis_compileabler   Z_ignore_causal_mask_sdpar   r(   r1   Zget_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr<   r\   r<  r   Z_unmask_unattended)r   rU   rx  r   r   r   Zpast_seen_tokensZusing_compilable_cacher(   ry  rz  r  	min_dtyper+   r+   r9   rr    sZ    






	zLongT5Stack._update_causal_mask)rU   ry  rz  r(   r   rl   c                 K   sF  | dur|   dkr| }n&t|j}tj||f|||jd}|dkrVtj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| durB|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nri  )Z
fill_valuer(   r<   r   )Zdiagonalr`   r*   r   )r$   r4   r<  r   rW  r<   ZtriurL   r>   expandrX  r1   rR   Zmasked_fill)rU   ry  rz  r(   r   rl   r*  r  r}  Zmask_lengthZpadding_maskr+   r+   r9   r|  [  s*     $

6  zALongT5Stack._prepare_4d_causal_attention_mask_with_cache_position)N)NNNNNNNNNNNNN)F)r   r   r   r{   rf  r   r   r4   rc   r	   r   rr  r  rk   r(   r|  r   r+   r+   r   r9   r\  E  sB                
 ; Dr\  a_  
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
c                       s   e Zd ZdgZddgZed fddZdd Zd	d
 Zdd Z	dd Z
dd Zedeej eej eej eej eej eej eej eeeej   ee eej eej ee ee ee ee eej eeej ef dddZ  ZS )rK  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                    s   t  | t|j|j| _t|}d|_	d|_
d|_t|| j| _t|}d|_	d|_|j|_t|| j| _|   d S )NFT)rz   r{   r   r   r_  r   rN  copydeepcopyr   r   tie_encoder_decoderr\  encodernum_decoder_layersra  decoderrd  r   r   encoder_configZdecoder_configr   r+   r9   r{     s    

zLongT5Model.__init__c                 C   s   | j S r   rN  r   r+   r+   r9   get_input_embeddings  s    z LongT5Model.get_input_embeddingsc                 C   s"   || _ | j| | j| d S r   rN  r  rf  r  re  r+   r+   r9   rf    s    z LongT5Model.set_input_embeddingsc                 C   s0   | j jr,| | jj| j | | jj| j d S r   r   rP  _tie_or_clone_weightsr  r`  rN  r  r  r+   r+   r9   _tie_weights  s    zLongT5Model._tie_weightsc                 C   s   | j S r   r  r  r+   r+   r9   get_encoder  s    zLongT5Model.get_encoderc                 C   s*   |  D ]\}}| jj| j| qdS z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        Nitemsr  r9  Z	attentionr   r   Zheads_to_pruner9  r   r+   r+   r9   _prune_heads  s    zLongT5Model._prune_headsN)rE  rU   rD  rF  rt  decoder_head_maskru  encoder_outputsr   rh  decoder_inputs_embedsr   r   rn  rB  r   r&   c                 C   s"  |dur|n| j j}|dur |n| j j}|durX|du rX| j j| j jkrXttt |}|du rz| j	|||
||||d}nH|rt
|tst|d t|dkr|d ndt|dkr|d ndd}|d }| j||||	|||||||||d}|s|| S t|j|j|j|j|j|j|j|jdS )	ax  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

        >>> # Let's try a very long encoder input.
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1

        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NrE  rU   rh  rt  r   rn  rB  r   r   rV   rk  ru   rl  rE  rU   rh  r   r>  r?  rt  ru  r   r   rn  rB  r   )rk  r   decoder_hidden_statesdecoder_attentionsrm  encoder_last_hidden_stater>  encoder_attentions)r   r   ro  ra  r  warningswarnZ#_LongT5Model__HEAD_MASK_WARNING_MSGFutureWarningr  r   r   r   r  r   rk  r   ru   rl  rm  )r   rE  rU   rD  rF  rt  r  ru  r  r   rh  r  r   r   rn  rB  r   ru   decoder_outputsr+   r+   r9   r     sd    Q	zLongT5Model.forward)NNNNNNNNNNNNNNNN)r   r   r   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r{   r  rf  r  r  r  r   r   r4   
LongTensorFloatTensor
BoolTensorrc   rF   r	   r   r   r   r   r   r+   r+   r   r9   rK    sZ                   rK  z>
    LONGT5 Model with a `language modeling` head on top.
    )Zcustom_introc                       s  e Zd ZdgZg dZed fddZdd Zdd	 Zd
d Z	dd Z
edeej eej eej eej eej eej eej eeeej   ee eej eej eej ee ee ee ee eej eeej ef dddZejdddZ  ZS )rL  r  )r  r  zlm_head.weightr   c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _tj|j|jdd| _|   d S )NFTr   )rz   r{   r   	model_dimr   r   r_  rN  r  r  r   r   r  r\  r  r  ra  r  r   rH  rd  r  r   r+   r9   r{   h  s    

z'LongT5ForConditionalGeneration.__init__c                 C   s   | j S r   r  r  r+   r+   r9   r    s    z3LongT5ForConditionalGeneration.get_input_embeddingsc                 C   s"   || _ | j| | j| d S r   r  re  r+   r+   r9   rf    s    z3LongT5ForConditionalGeneration.set_input_embeddingsc                 C   s0   | j jr,| | jj| j | | jj| j d S r   r  r  r+   r+   r9   r    s    z+LongT5ForConditionalGeneration._tie_weightsc                 C   s   | j S r   r  r  r+   r+   r9   r    s    z*LongT5ForConditionalGeneration.get_encoderN)rE  rU   rD  rF  rt  r  ru  r  r   rh  r  labelsr   r   rn  rB  r   r&   c                 C   s  |dur|n| j j}|dur |n| j j}|durX|du rX| j j| j jkrXttt |}|du rz| j	|||
||||d}nH|rt
|tst|d t|dkr|d ndt|dkr|d ndd}|d }|dur|du r|du r| |}| j||||	|||||||||d}|d }| j jr0|| jd  }| |}d}|dur|td	d
}||j}||d|d|d}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
        >>> model = LongT5ForConditionalGeneration.from_pretrained(
        ...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
        ... )

        >>> # Let's try a very long input.
        >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
        >>> input_ids = inputs.input_ids

        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        abstractthe aim of this article is to provide an overview of the literature on the role of dog
        ```Nr  r   r   rV   r  r  rI  rT  )Zignore_indexr*   )	lossZlogitsr   r  r  rm  r  r>  r  )r   r   ro  ra  r  r  r  Z6_LongT5ForConditionalGeneration__HEAD_MASK_WARNING_MSGr  r  r   r   r   rY  r  rP  r  rH  r   rR   r<   r   rp  r   r   ru   rl  rm  rk  )r   rE  rU   rD  rF  rt  r  ru  r  r   rh  r  r  r   r   rn  rB  r   ru   r  Zsequence_outputZ	lm_logitsr  Zloss_fctoutputr+   r+   r9   r     s~    U	




z&LongT5ForConditionalGeneration.forward)r  c                 C   s
   |  |S r   )rY  )r   r  r+   r+   r9   %prepare_decoder_input_ids_from_labels8  s    zDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNNNNN)r   r   r   r  r  r   r{   r  rf  r  r  r   r   r4   r  r  r  rc   rF   r	   r   r   r   r   r  r   r+   r+   r   r9   rL  ]  s`                     )rL  c                       s   e Zd ZdgZdgZed fddZdd Zdd	 Zd
d Z	dd Z
dd Zedeej eej eej eej ee ee ee eeej ef dddZ  ZS )rM  r  r  r   c                    sN   t  | t|j|j| _t|}d|_	d|_
t|| j| _|   d S )NF)rz   r{   r   r   r_  r   rN  r  r  r   r  r\  r  rd  )r   r   r  r   r+   r9   r{   A  s    
zLongT5EncoderModel.__init__c                 C   s   | j S r   r  r  r+   r+   r9   r  M  s    z'LongT5EncoderModel.get_input_embeddingsc                 C   s   || _ | j| d S r   )rN  r  rf  re  r+   r+   r9   rf  P  s    z'LongT5EncoderModel.set_input_embeddingsc                 C   s   | j jr| | jj| j d S r   )r   rP  r  r  r`  rN  r  r+   r+   r9   r  T  s    zLongT5EncoderModel._tie_weightsc                 C   s   | j S r   r  r  r+   r+   r9   r  X  s    zLongT5EncoderModel.get_encoderc                 C   s*   |  D ]\}}| jj| j| qdS r  r  r  r+   r+   r9   r  [  s    zLongT5EncoderModel._prune_headsN)rE  rU   rt  rh  r   rn  rB  r&   c           	   	   C   s0   |dur|n| j j}| j|||||||d}|S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  )r   ro  r  )	r   rE  rU   rt  rh  r   rn  rB  r  r+   r+   r9   r   c  s    #
zLongT5EncoderModel.forward)NNNNNNN)r   r   r   r  r  r   r{   r  rf  r  r  r  r   r   r4   r  r  r   r   rF   r   r   r   r+   r+   r   r9   rM  <  s4          rM  )rM  rL  rK  rC  )r   )r   )\r.  r  r   r  typingr   r   r   r4   r   Ztorch.nnr   Zactivationsr   Zcache_utilsr	   r
   r   Z
generationr   Zmodeling_attn_mask_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   utilsr   r   r   r   r   r   r   Zutils.deprecationr   Zconfiguration_longt5r   Z!torch.nn.attention.flex_attentionr    Zintegrations.flex_attentionr!   Z
get_loggerr   r   rc   rk   r:   r@   rK   rO   rT   r<   rY   rF   ro   rt   rw   Modulerx   Zapex.normalizationr   infoImportError	Exceptionwarningr   r   r   r   r  r  r  r&  r/  r1  r3  rC  r\  Z__HEAD_MASK_WARNING_MSGrK  rL  rM  __all__r+   r+   r+   r9   <module>   s   $	
		1


 i C  	%"'`a  R @ [X