transformers/models/t5gemma/modeling_t5gemma.py, byte-compiled module cache (CPython 3.9). The binary payload itself is not human-readable; the recoverable structure, imports, and docstrings follow.

Module-level imports (recovered from the string table):

    from typing import Callable, Optional, Union
    import torch
    from torch import nn
    from ...activations import ACT2FN
    from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
    from ...generation import GenerationMixin
    from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
    from ...modeling_flash_attention_utils import FlashAttentionKwargs
    from ...modeling_layers import GradientCheckpointingLayer
    from ...modeling_outputs import (BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions,
        Seq2SeqLMOutput, Seq2SeqModelOutput, SequenceClassifierOutput, TokenClassifierOutput)
    from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
    from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
    from ...processing_utils import Unpack
    from ...utils import (TransformersKwargs, auto_docstring, can_return_tuple,
        is_torchdynamo_compiling, logging)
    from ...utils.deprecation import deprecate_kwarg
    from ...utils.generic import OutputRecorder, check_model_inputs
    from .configuration_t5gemma import T5GemmaConfig, T5GemmaModuleConfig

    logger = logging.get_logger(__name__)
class T5GemmaRMSNorm(nn.Module): zero-centered RMS normalization.
__init__(dim: int, eps: float = 1e-6) stores eps and a zero-initialized nn.Parameter weight of size dim.
_norm(x) multiplies x by torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps); forward(x) applies _norm in
float32, scales by (1.0 + weight.float()), and casts back with type_as(x). extra_repr() reports
tuple(weight.shape) and eps.
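The normalization arithmetic is compact enough to restate. A minimal sketch of the recoverable logic (the class name RMSNormSketch is illustrative, not part of the library):

import torch
from torch import nn

class RMSNormSketch(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        # Divide by the root mean square over the last dimension.
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = self._norm(x.float())
        out = out * (1.0 + self.weight.float())  # Gemma-style (1 + w) scaling; identity scale at init
        return out.type_as(x)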
class T5GemmaMLP(nn.Module): gated feed-forward block.
__init__(config) builds gate_proj and up_proj (hidden_size to intermediate_size) and down_proj
(intermediate_size back to hidden_size), all nn.Linear with bias=False, plus
act_fn = ACT2FN[config.hidden_activation] and nn.Dropout(config.dropout_rate).
forward(x) returns down_proj(dropout(act_fn(gate_proj(x)) * up_proj(x))).
S )T5GemmaRotaryEmbeddinginv_freqNc                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultrW   F)
persistent)r*   r+   hasattr
isinstancerX   dictgetrY   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenrJ   r   Zrope_init_fnattention_scalingZregister_bufferrW   Zoriginal_inv_freq)r0   rJ   devicerW   r1   r3   r4   r+   _   s    
zT5GemmaRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|ddV | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   r6   r"   ZmpscpuF)device_typeZenabledr5   r'   dtype)rW   r;   expandr?   torb   r^   rZ   strr.   Zautocast	transposecatcosra   sinrg   )
r0   r9   position_idsZinv_freq_expandedZposition_ids_expandedrd   ZfreqsZembrm   rn   r3   r3   r4   r=   p   s    0&,zT5GemmaRotaryEmbedding.forward)N)rB   rC   rD   r.   Tensor__annotations__r+   Zno_gradr   r=   rF   r3   r3   r1   r4   rV   \   s
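A minimal sketch of the cos/sin table construction, assuming the common rotary base of 10000 and ignoring rope-scaling variants and the attention_scaling factor (the real values come from the rope configuration):

import torch

def rope_cos_sin(position_ids: torch.Tensor, head_dim: int, base: float = 10000.0):
    # One inverse frequency per pair of channels -> shape (head_dim // 2,)
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    # Outer product of positions and inverse frequencies -> (batch, seq_len, head_dim // 2)
    freqs = position_ids[..., None].float() * inv_freq
    emb = torch.cat((freqs, freqs), dim=-1)  # duplicate so every head_dim channel gets an angle
    return emb.cos(), emb.sin()

# Example: cos/sin for one sequence of length 8 with 64-dim heads -> shapes (1, 8, 64).
cos, sin = rope_cos_sin(torch.arange(8)[None, :], head_dim=64)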
   

rV   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr6   r5   re   )r?   r.   rl   )r9   x1Zx2r3   r3   r4   rotate_half   s    rs   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezers   )qkrm   rn   ro   Zunsqueeze_dimZq_embedZk_embedr3   r3   r4   apply_rotary_pos_emb   s
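A minimal sketch of rotate_half and apply_rotary_pos_emb matching the docstring above (assumed equivalent to the recovered logic, not the library functions themselves):

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim: int = 1):
    # cos/sin arrive as (batch, seq_len, head_dim); unsqueeze so they broadcast over the
    # heads dimension of q/k, which are (batch, heads, seq_len, head_dim).
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed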
    

rw   )rU   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)r?   rh   reshape)rU   rx   batchnum_key_value_headsslenhead_dimr3   r3   r4   	repeat_kv   s
    0r           )	modulequerykeyvalueattention_maskrS   scalingsoftcapry   c                 K   s   |d u r| j d }t|| j}	t|| j}
t||	dd| }|d urd|| }t|}|| }|d ur|d d d d d d d |	jd f }|| }tj	j
|dtjd|j}tj	j||| jd}t||
}|dd }||fS )	N      r5   r   r6   )r'   rg   )ptrainingr"   )r~   r   num_key_value_groupsr.   matmulrk   tanhr?   r,   Z
functionalZsoftmaxZfloat32ri   rg   rS   r   
contiguous)r   r   r   r   r   rS   r   r   kwargs
key_statesvalue_statesattn_weightsZcausal_maskattn_outputr3   r3   r4   eager_attention_forward   s"    
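A minimal, stand-alone sketch of repeat_kv plus the eager attention path, including the optional logit soft-capping (illustrative functions, not the library code; the mask is assumed to be an additive float mask as in the description above):

import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_kv_heads * n_rep, slen, head_dim)

def eager_attention(query, key, value, attention_mask=None, n_rep=1, scaling=None,
                    softcap=None, dropout_p=0.0, training=False):
    if scaling is None:
        scaling = query.shape[-1] ** -0.5
    key = repeat_kv(key, n_rep)
    value = repeat_kv(value, n_rep)
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if softcap is not None:
        # Soft-cap: bound the logits to (-softcap, softcap) while keeping gradients smooth.
        attn_weights = torch.tanh(attn_weights / softcap) * softcap
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask[:, :, :, : key.shape[-2]]
    attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = torch.nn.functional.dropout(attn_weights, p=dropout_p, training=training)
    attn_output = torch.matmul(attn_weights, value).transpose(1, 2).contiguous()
    return attn_output, attn_weights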

class T5GemmaSelfAttention(nn.Module): "Multi-headed attention from 'Attention Is All You Need' paper".
__init__(config: T5GemmaModuleConfig, layer_idx: int) derives head_dim (falling back to
hidden_size // num_attention_heads), num_key_value_groups = num_attention_heads // num_key_value_heads,
scaling = config.query_pre_attn_scalar ** -0.5, attention_dropout, and is_causal = config.is_decoder; it
builds q_proj / k_proj / v_proj / o_proj nn.Linear layers honoring config.attention_bias and stores
attn_logit_softcapping plus a per-layer sliding_window that is only set when
config.layer_types[layer_idx] == "sliding_attention".
forward(hidden_states, position_embeddings, attention_mask, past_key_values=None, cache_position=None,
**kwargs) reshapes the projections to (..., -1, head_dim) views, applies apply_rotary_pos_emb with the
provided cos/sin, updates past_key_values (cache_kwargs carry sin, cos, cache_position), dispatches to
eager_attention_forward or ALL_ATTENTION_FUNCTIONS[config._attn_implementation], and returns
(o_proj(attn_output), attn_weights). The past_key_value keyword is deprecated in favor of
past_key_values (deprecate_kwarg, version 4.58).

class T5GemmaCrossAttention(nn.Module): same projection layout, but is_causal is False, there is no
sliding window or rotary embedding, and k_proj / v_proj read from config.cross_attention_hidden_size,
raising ValueError "Cross-attention needs cross_attention_hidden_size to be specified." when that value
is unset. forward(hidden_states, attention_mask, past_key_values, encoder_hidden_states, **kwargs)
raises ValueError "Encoder hidden state is required for cross attention." when encoder_hidden_states is
None; the encoder keys and values are projected once, stored in past_key_values.cross_attention_cache
with the layer's is_updated flag set, and reused from the cache on subsequent decoding steps.
class T5GemmaEncoderLayer(GradientCheckpointingLayer): "Encoder sub-layer."
It holds a T5GemmaSelfAttention (attention_type = config.layer_types[layer_idx]), a T5GemmaMLP,
nn.Dropout(config.dropout_rate), and four T5GemmaRMSNorm layers (pre_self_attn_layernorm,
post_self_attn_layernorm, pre_feedforward_layernorm, post_feedforward_layernorm, all with
eps=config.rms_norm_eps). forward(hidden_states, position_embeddings, attention_mask=None,
position_ids=None, **kwargs) runs pre-norm, self-attention (with past_key_values=None), post-norm,
dropout, and a residual add, then the same pre-norm / MLP / post-norm / dropout / residual pattern for
the feed-forward block.
ddZ  ZS )T5GemmaDecoderLayerz2Decoder sub-layer: an extra cross-attention layer.r   c                    sD   t  || t||d| _t|j|jd| _t|j|jd| _d S r   )	r*   r+   r   
cross_attnr%   rK   r   pre_cross_attn_layernormpost_cross_attn_layernormr   r1   r3   r4   r+     s    zT5GemmaDecoderLayer.__init__r   r   r   r   NF)
rU   r   r   ro   r   	use_cacher   r   encoder_attention_maskry   c
              
   K   s   |}|  |}| jf |||||d ur*|jnd ||d|
\}}| |}|| | }|}| |}| jf |||	||d|
\}}| |}|| | }|}| |}| 	|}| 
|}|| | }|S )N)rU   r   r   ro   r   r   r   )rU   r   r   r   r   )r   r   self_attention_cacher   rS   r   r   r   r   r   r   )r0   rU   r   r   ro   r   r   r   r   r   r   r   r   r3   r3   r4   r=     sD    









zT5GemmaDecoderLayer.forward)NNNFNNN)rB   rC   rD   r   rE   r+   r   r.   rp   r>   r   r   r	   boolr   r=   rF   r3   r3   r1   r4   r     s,          r   c                       s@   e Zd ZdZd	eeed fddZejejdddZ	  Z
S )
class T5GemmaClassificationHead(nn.Module): "Head for sentence-level classification tasks."
__init__(hidden_size: int, num_labels: int, classifier_dropout_rate: float = 0.0) builds nn.Dropout and
out_proj = nn.Linear(hidden_size, num_labels); forward(hidden_states) applies dropout then out_proj.
class T5GemmaLMHead(nn.Module): "Head for language modeling (generation) tasks."
__init__(hidden_size: int, vocab_size: int, bias: bool = False) wraps out_proj =
nn.Linear(hidden_size, vocab_size, bias=bias); forward(hidden_states) returns its logits.

class T5GemmaAttention(nn.Module): "Multi-headed attention from 'Attention Is All You Need' paper".
Structurally the same as T5GemmaSelfAttention (q/k/v/o projections, query_pre_attn_scalar ** -0.5
scaling, attn_logit_softcapping, per-layer sliding_window for "sliding_attention" layer types, the
deprecated past_key_value keyword) but parameterized by T5GemmaConfig and with is_causal fixed to True.

class T5GemmaPreTrainedModel(PreTrainedModel): config_class = T5GemmaConfig, base_model_prefix =
"model", supports_gradient_checkpointing, _no_split_modules = ["T5GemmaBlock"],
_skip_keys_device_placement = ["past_key_values"], flash / SDPA / flex attention and attention-backend
support, fullgraph compilation, and _can_record_outputs for hidden_states and attentions.
_init_weights(module) applies the standard initializer_range normal initialization and additionally
rescales T5GemmaClassificationHead.out_proj (zeroing its bias) and, when tie_word_embeddings is
disabled, T5GemmaLMHead.out_proj by a factor derived from the projection width.
T5GemmaPreTrainedModel._shift_right(input_ids):
    Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
    pad_token_id replacement for labels that were -100.
    This is a common preparation step for decoder inputs in sequence-to-sequence models.
It raises ValueError "self.model.config.decoder.bos_token_id has to be defined." when the decoder start
token is missing and "self.model.config.decoder.pad_token_id has to be defined." when the pad token is
missing; label positions equal to -100 are replaced with the pad token id via masked_fill_.
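A minimal stand-alone sketch of the shift-right preparation described above (the token ids in the example are made up for illustration; the real values come from the decoder config):

import torch

def shift_right(input_ids: torch.Tensor, decoder_start_token_id: int, pad_token_id: int) -> torch.Tensor:
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[..., 1:] = input_ids[..., :-1].clone()        # move every token one slot to the right
    shifted[..., 0] = decoder_start_token_id              # prepend the decoder start token
    shifted.masked_fill_(shifted == -100, pad_token_id)   # -100 label padding becomes the real pad id
    return shifted

# Labels [[5, 6, -100]] become decoder inputs [[2, 5, 6]].
print(shift_right(torch.tensor([[5, 6, -100]]), decoder_start_token_id=2, pad_token_id=0))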
def bidirectional_mask_function(attention_mask: Optional[torch.Tensor]) -> Callable:
    This creates bidirectional attention mask.
It returns an inner_mask(batch_idx, head_idx, q_idx, kv_idx) closure: when attention_mask is None the
closure returns a constant boolean True (torch.ones((), dtype=torch.bool)); otherwise it returns
attention_mask[batch_idx, kv_idx] cast to bool, so every query may attend to every non-padded key.
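A minimal sketch of the mask-closure pattern: the returned callable answers whether query position q_idx may attend to key position kv_idx for a given batch and head, here purely from the padding mask (illustrative stand-alone function):

import torch
from typing import Callable, Optional

def bidirectional_mask_function(attention_mask: Optional[torch.Tensor]) -> Callable:
    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> torch.Tensor:
        if attention_mask is None:
            return torch.ones((), dtype=torch.bool)              # nothing is masked
        return attention_mask[batch_idx, kv_idx].to(torch.bool)  # attend only to non-padded keys
    return inner_mask

# Example: key 2 of sequence 0 is padding, so no query may attend to it.
mask_fn = bidirectional_mask_function(torch.tensor([[1, 1, 0]]))
print(mask_fn(0, 0, 1, 2))   # tensor(False)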
def sliding_window_bidirectional_mask_function(sliding_window: int) -> Callable:
    This creates bidirectional attention mask with sliding window.
Its inner_mask closure keeps a key position when q_idx - sliding_window < kv_idx < q_idx + sliding_window.

def make_default_2d_attention_mask(token_ids, hidden_states, pad_token_id) -> torch.Tensor:
    "Construct the default attention mask." With token_ids given, the mask is (token_ids != pad_token_id)
as long integers on the input device, raising ValueError "`pad_token_id` is required for padding
information." when the pad id is missing; otherwise it is an all-ones (batch, seq_len) mask on
hidden_states.device.

class T5GemmaEncoder(T5GemmaPreTrainedModel): the embedding stack. It owns embed_tokens =
nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id), a final T5GemmaRMSNorm, a
T5GemmaRotaryEmbedding, an nn.ModuleList of config.num_hidden_layers T5GemmaEncoderLayer modules, and
nn.Dropout(config.dropout_rate); _can_record_outputs records per-layer hidden_states
(T5GemmaEncoderLayer) and attentions (T5GemmaSelfAttention).
forward(input_ids=None, attention_mask=None, position_ids=None, inputs_embeds=None, **kwargs) requires
exactly one of input_ids / inputs_embeds ("You must specify exactly one of input_ids or inputs_embeds"),
builds cache_position and position_ids, falls back to make_default_2d_attention_mask when no mask is
given, and prepares a non-causal mask per layer type: "full_attention" uses create_causal_mask with
bidirectional_mask_function(attention_mask) as or_mask_function, while "sliding_attention" uses
create_sliding_window_causal_mask with the sliding-window bidirectional function and-combined with the
padding mask. Embeddings are multiplied by a normalizer of hidden_size ** 0.5 (a tensor in the embedding
dtype), passed through dropout, all layers, the final norm, and dropout again; the result is wrapped in
a BaseModelOutput.

class T5GemmaDecoder(T5GemmaEncoder): the layers are T5GemmaDecoderLayer modules and _can_record_outputs
additionally records cross_attentions via OutputRecorder(T5GemmaCrossAttention).
forward(input_ids=None, attention_mask=None, position_ids=None, past_key_values=None,
inputs_embeds=None, use_cache=None, cache_position=None, encoder_hidden_states=None,
encoder_attention_mask=None, **kwargs) requires encoder_hidden_states ("`encoder_hidden_states` must be
given in decoder"), creates an EncoderDecoderCache wrapping two DynamicCache instances when use_cache is
set without a cache, derives cache_position from the number of already-seen tokens, builds the causal
self-attention mask mapping ("full_attention" and "sliding_attention") plus a cross-attention mask
mapping that wraps bidirectional_mask_function(encoder_attention_mask), scales the embeddings by
hidden_size ** 0.5, runs every decoder layer against the encoder states, applies the final norm and
dropout, and returns BaseModelOutputWithPastAndCrossAttentions(last_hidden_state, past_key_values).

class T5GemmaModel(T5GemmaPreTrainedModel): encoder-decoder wrapper. __init__ requires
config.is_encoder_decoder ("T5GemmaModel only support encoder-decoder modeling. Use
`T5GemmaEncoderModel` instead.") and builds a T5GemmaEncoder and a T5GemmaDecoder; get_encoder,
get_input_embeddings, and set_input_embeddings delegate to the encoder.
forward(input_ids, attention_mask, position_ids, decoder_input_ids, decoder_attention_mask,
decoder_position_ids, encoder_outputs, past_key_values, inputs_embeds, decoder_inputs_embeds, use_cache,
cache_position, **kwargs) is documented as:
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence token in the position embeddings. Selected in the
            range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)

When encoder_outputs is None the encoder is run first; its last_hidden_state feeds the decoder, and the
combined result is returned as Seq2SeqModelOutput(last_hidden_state, past_key_values,
decoder_hidden_states, decoder_attentions, cross_attentions, encoder_last_hidden_state,
encoder_hidden_states, encoder_attentions).

class T5GemmaEncoderModel(T5GemmaPreTrainedModel): encoder-only wrapper. __init__ rejects
encoder-decoder configurations ("T5GemmaEncoderModel only supports encoder-only model. Use
`T5GemmaModel` instead.") and builds a single T5GemmaEncoder; forward(input_ids, attention_mask,
position_ids, inputs_embeds, **kwargs) returns the encoder output unchanged.

class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin): seq2seq language-model
head. _tied_weights_keys ties lm_head.out_proj.weight to model.decoder.embed_tokens.weight, and _tp_plan
marks lm_head.out_proj as colwise_rep. __init__ forces config.is_encoder_decoder = True, builds a
T5GemmaModel and a T5GemmaLMHead over the decoder hidden size and vocab_size, and sets loss_type =
"ForMaskedLM". get_output_embeddings / set_output_embeddings expose lm_head.out_proj; _tie_weights ties
the LM head to the decoder input embeddings when config.tie_word_embeddings is set; get_encoder /
get_decoder expose the underlying stacks. forward adds labels and logits_to_keep to the usual seq2seq
arguments and is documented as:
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence token in the position embeddings. Selected in the
            range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

When training with a non-eager attention implementation the model warns: "It is strongly recommended to
train T5Gemma models with the `eager` attention implementation instead of `<implementation>`. Use
`eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
If labels are given without decoder inputs, decoder_input_ids are derived through _shift_right(labels).
The decoder's last hidden state (optionally restricted by logits_to_keep) goes through lm_head; when the
decoder config sets final_logit_softcapping the logits are soft-capped (divided by the cap, tanh,
multiplied back); the loss comes from loss_function(logits, labels, vocab_size) and everything is packed
into a Seq2SeqLMOutput. prepare_decoder_input_ids_from_labels(labels) simply calls _shift_right.

class T5GemmaForSequenceClassification(T5GemmaPreTrainedModel): sequence-level classifier.
__init__(config, is_encoder_decoder=None):
    is_encoder_decoder (`Optional`, *optional*):
        Whether to use encoder_decoder for sequence classification. When set to False, only the encoder is used.
Depending on that flag it builds a T5GemmaModel or a T5GemmaEncoderModel, then a
T5GemmaClassificationHead over the relevant hidden size with config.num_labels outputs and
classifier_dropout_rate (default 0.1); get_input_embeddings / set_input_embeddings delegate to the
wrapped model. Its forward is documented as:
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence token in the position embeddings. Selected in the
            range `[0, config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

In encoder-decoder mode, passing inputs_embeds raises NotImplementedError ("Passing input embeddings is
currently not supported for <class name> in encoder-decoder mode."), and when no decoder inputs are
given input_ids must be present ("If no `decoder_input_ids` or `decoder_inputs_embeds` are passed,
`input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or
`decoder_inputs_embeds`.") so they can be shifted right into decoder_input_ids. The classification head
scores every position and the logits are pooled at the last non-padding token of each sequence (the
index is offset by one and clamped in encoder-decoder mode); batch sizes larger than one without a pad
token raise "Cannot handle batch sizes > 1 if no padding token is defined.", and with inputs_embeds the
model warns that it "will not detect padding tokens in `inputs_embeds`. Results may be unexpected if
using padding tokens in conjunction with `inputs_embeds.`". The loss (when labels are given) and pooled
logits are returned in a SequenceClassifierOutput together with the hidden states and attentions.

class T5GemmaForTokenClassification(T5GemmaPreTrainedModel): token-level classifier with the same
__init__(config, is_encoder_decoder=None) layout ("Whether to use encoder_decoder for token
classification. When set to False, only the encoder is used."), the same encoder-decoder / encoder-only
branching and input checks in forward, and a T5GemmaClassificationHead applied to every position; the
per-token logits and the optional cross-entropy loss are returned in a TokenClassifierOutput.

__all__ = [
    "T5GemmaForConditionalGeneration",
    "T5GemmaModel",
    "T5GemmaEncoderModel",
    "T5GemmaPreTrainedModel",
    "T5GemmaForSequenceClassification",
    "T5GemmaForTokenClassification",
]