from typing import Any, Callable, Optional, Union

import torch
from torch import nn

from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...configuration_utils import PretrainedConfig
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    is_torchdynamo_compiling,
    logging,
)
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import OutputRecorder, check_model_inputs
from ..gemma2.configuration_gemma2 import Gemma2Config
from ..gemma2.modeling_gemma2 import (
    Gemma2Attention,
    Gemma2MLP,
    Gemma2PreTrainedModel,
    Gemma2RMSNorm,
    Gemma2RotaryEmbedding,
    create_causal_mask,
    create_sliding_window_causal_mask,
    eager_attention_forward,
)


_CHECKPOINT_FOR_DOC = "google/t5gemma-2b-2b-prefixlm-it"

logger = logging.get_logger(__name__)


class T5GemmaModuleConfig(Gemma2Config):
    pass


class T5GemmaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`T5GemmaModel`]. It is used to instantiate a T5Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to a hypothetical balanced Gemma2 encoder-decoder model.
    e.g. [google/t5gemma-2b-2b-prefixlm-it](https://huggingface.co/google/t5gemma-2b-2b-prefixlm-it)
    ```python
    >>> from transformers import T5GemmaConfig, T5GemmaModel
    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
    >>> model = T5GemmaModel(t5gemma_config)
    ```
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        encoder (`Union[T5GemmaModuleConfig, dict]`, *optional*):
            Configuration for the encoder.
        decoder (`Union[T5GemmaModuleConfig, dict]`, *optional*):
            Configuration for the decoder.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The ratio for all dropout layers (following T5).
        classifier_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier (following T5).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie input and output word embeddings.
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the T5Gemma model (the same as Gemma 2).
        kwargs (additional keyword arguments, *optional*):
            Will be passed to the `PretrainedConfig` base class.
    """

    model_type = "t5gemma"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "encoder.layers.*.self_attn.q_proj": "colwise",
        "encoder.layers.*.self_attn.k_proj": "colwise",
        "encoder.layers.*.self_attn.v_proj": "colwise",
        "encoder.layers.*.self_attn.o_proj": "rowwise",
        "encoder.layers.*.mlp.gate_proj": "colwise",
        "encoder.layers.*.mlp.up_proj": "colwise",
        "encoder.layers.*.mlp.down_proj": "rowwise",
        "decoder.layers.*.self_attn.q_proj": "colwise",
        "decoder.layers.*.self_attn.k_proj": "colwise",
        "decoder.layers.*.self_attn.v_proj": "colwise",
        "decoder.layers.*.self_attn.o_proj": "rowwise",
        "decoder.layers.*.cross_attn.q_proj": "colwise",
        "decoder.layers.*.cross_attn.k_proj": "colwise",
        "decoder.layers.*.cross_attn.v_proj": "colwise",
        "decoder.layers.*.cross_attn.o_proj": "rowwise",
        "decoder.layers.*.mlp.gate_proj": "colwise",
        "decoder.layers.*.mlp.up_proj": "colwise",
        "decoder.layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "encoder.embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "encoder.layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "encoder.norm": (["hidden_states"], ["hidden_states"]),
        "decoder.embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "decoder.layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "decoder.norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        encoder: Optional[Union[T5GemmaModuleConfig, dict[str, Any]]] = None,
        decoder: Optional[Union[T5GemmaModuleConfig, dict[str, Any]]] = None,
        is_encoder_decoder: bool = True,
        dropout_rate: float = 0.0,
        classifier_dropout_rate: float = 0.0,
        attention_dropout: float = 0.0,
        tie_word_embeddings: bool = True,
        vocab_size: int = 256000,
        **kwargs,
    ):
        # Normalize the sub-configs: accept dicts, config objects, or None.
        if isinstance(encoder, dict):
            encoder = T5GemmaModuleConfig(**encoder)
        elif encoder is None:
            encoder = T5GemmaModuleConfig()
        else:
            assert isinstance(encoder, T5GemmaModuleConfig), f"{type(encoder)} is not supported."

        if isinstance(decoder, dict):
            decoder = T5GemmaModuleConfig(**decoder)
        elif decoder is None:
            # A balanced setup by default: the decoder mirrors the encoder.
            decoder = encoder
        else:
            assert isinstance(decoder, T5GemmaModuleConfig), f"{type(decoder)} is not supported."

        encoder = T5GemmaModuleConfig(**encoder.to_dict())
        decoder = T5GemmaModuleConfig(**decoder.to_dict())

        encoder.is_decoder = False
        encoder.dropout_rate = dropout_rate
        encoder.attention_dropout = attention_dropout
        self.encoder = encoder

        decoder.is_decoder = True
        decoder.use_cache = True
        decoder.dropout_rate = dropout_rate
        decoder.attention_dropout = attention_dropout
        decoder.cross_attention_hidden_size = encoder.hidden_size
        self.decoder = decoder

        # Inherit special tokens from the decoder unless explicitly overridden.
        for special_token_key in ["bos_token_id", "pad_token_id", "eos_token_id"]:
            if special_token_key not in kwargs:
                kwargs[special_token_key] = getattr(decoder, special_token_key)

        super().__init__(**kwargs)

        self.is_encoder_decoder = is_encoder_decoder
        self.use_cache = kwargs.get("use_cache", decoder.use_cache)
        self.initializer_range = kwargs.get("initializer_range", decoder.initializer_range)
        self.dropout_rate = dropout_rate
        self.classifier_dropout_rate = classifier_dropout_rate
        self.attention_dropout = attention_dropout
        self.tie_word_embeddings = tie_word_embeddings
        self.vocab_size = vocab_size

    def __setattr__(self, key, value):
        shared_attr_with_submodules = [
            "output_hidden_states",
            "output_attentions",
            "_attn_implementation",
            "dropout_rate",
            "attention_dropout",
            "classifier_dropout_rate",
        ]
        # Keep shared attributes in sync with the encoder and decoder sub-configs.
        if key in shared_attr_with_submodules:
            setattr(self.encoder, key, value)
            setattr(self.decoder, key, value)
        super().__setattr__(key, value)

    def get_text_config(self, *args, **kwargs):
        # `self` acts as the text config (there is no nested text sub-config).
        return self


class T5GemmaRMSNorm(Gemma2RMSNorm):
    pass
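
# Illustrative configuration usage (a sketch; the tiny sizes below are hypothetical and
# chosen only to keep the example small):
#
#     >>> from transformers import T5GemmaConfig
#     >>> module = dict(hidden_size=256, intermediate_size=512, num_hidden_layers=2,
#     ...               num_attention_heads=4, num_key_value_heads=2, head_dim=64)
#     >>> config = T5GemmaConfig(encoder=module, decoder=module, dropout_rate=0.1)
#     >>> (config.encoder.is_decoder, config.decoder.is_decoder)
#     (False, True)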


class T5GemmaMLP(Gemma2MLP):
    def __init__(self, config):
        super().__init__(config)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x):
        hidden_states = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        hidden_states = self.dropout(hidden_states)
        down_proj = self.down_proj(hidden_states)
        return down_proj


class T5GemmaRotaryEmbedding(Gemma2RotaryEmbedding):
    def __init__(self, config, device=None):
        super().__init__(config, device)


class T5GemmaSelfAttention(Gemma2Attention):
    def __init__(self, config: T5GemmaModuleConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        # Self-attention is only causal in the decoder.
        self.is_causal = config.is_decoder


class T5GemmaCrossAttention(Gemma2Attention):
    def __init__(self, config: T5GemmaModuleConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        # Cross-attention never uses sliding windows or causal masking.
        del self.sliding_window
        self.is_causal = False

        if config.cross_attention_hidden_size is None:
            raise ValueError("Cross-attention needs cross_attention_hidden_size to be specified.")

        # Keys and values are projected from the encoder hidden states.
        self.k_proj = nn.Linear(
            config.cross_attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.cross_attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        encoder_hidden_states: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        if encoder_hidden_states is None:
            raise ValueError("Encoder hidden state is required for cross attention.")

        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if past_key_values is not None:
            is_updated = past_key_values.is_updated.get(self.layer_idx)
            curr_past_key_value = past_key_values.cross_attention_cache

        if past_key_values is None or not is_updated:
            # Project encoder states once; cache them for subsequent decoding steps.
            encoder_input_shape = encoder_hidden_states.shape[:-1]
            encoder_hidden_shape = (*encoder_input_shape, -1, self.head_dim)
            key_states = self.k_proj(encoder_hidden_states).view(encoder_hidden_shape).transpose(1, 2)
            value_states = self.v_proj(encoder_hidden_states).view(encoder_hidden_shape).transpose(1, 2)
            if past_key_values is not None:
                key_states, value_states = curr_past_key_value.update(key_states, value_states, self.layer_idx)
                past_key_values.is_updated[self.layer_idx] = True
        else:
            # Reuse the cached encoder keys/values; only self-attention grows per step.
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=None,
            softcap=self.attn_logit_softcapping,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


def bidirectional_mask_function(attention_mask: Optional[torch.Tensor]) -> Callable:
    """
    This creates bidirectional attention mask.
    """

    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
        if attention_mask is None:
            return torch.ones((), dtype=torch.bool)
        return attention_mask[batch_idx, kv_idx].to(torch.bool)

    return inner_mask


def sliding_window_bidirectional_mask_function(sliding_window: int) -> Callable:
    """
    This creates bidirectional attention mask with sliding window.
    """

    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
        return (q_idx - sliding_window < kv_idx) & (kv_idx < q_idx + sliding_window)

    return inner_mask
T5GemmaEncoderLayerzEncoder sub-layer.ro   c                    s   t    |j| _|| _|| _|j| | _t||d| _t	|j|j
d| _t	|j|j
d| _t|| _t	|j|j
d| _t	|j|j
d| _t|j| _d S Nrn   eps)rL   rM   rI   rg   ro   Zlayer_typesattention_typerm   	self_attnra   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrb   mlppre_feedforward_layernormpost_feedforward_layernormrc   rd   r;   re   rq   rQ   r.   r/   rM   U  s    

zT5GemmaEncoderLayer.__init__N)r4   position_embeddingsr5   position_idsr   c                 K   sz   |}|  |}| jf ||||d d|\}}| |}|| | }|}| |}| |}| |}|| | }|S )N)r4   r   r5   r   r1   )r   r   r   re   r   r   r   )rO   r4   r   r5   r   rP   residual_r.   r.   r/   rj   i  s&    





zT5GemmaEncoderLayer.forward)NN)r+   r,   r-   r\   r_   rM   r   r   r   r   
LongTensorFloatTensorrj   r`   r.   r.   rQ   r/   r   R  s     
r   c                       s   e Zd ZdZed fddZedddddeje	ejejf e
ej e
ej e
e e
e e
ej e
ej e
ej ejd
ddZ  ZS )T5GemmaDecoderLayerz2Decoder sub-layer: an extra cross-attention layer.r   c                    sD   t  || t||d| _t|j|jd| _t|j|jd| _d S r   )	rL   rM   rr   
cross_attnra   rI   r   pre_cross_attn_layernormpost_cross_attn_layernormrq   rQ   r.   r/   rM     s    zT5GemmaDecoderLayer.__init__r{   r1   r|   r}   NF)
r4   r   r5   r   r1   rB   cache_positionr   encoder_attention_maskr   c
              
   K   s   |}|  |}| jf |||||d ur*|jnd ||d|
\}}| |}|| | }|}| |}| jf |||	||d|
\}}| |}|| | }|}| |}| 	|}| 
|}|| | }|S )N)r4   r   r5   r   r1   rB   r   )r4   r   r5   r1   rB   )r   r   self_attention_cacher   re   r   r   r   r   r   r   )rO   r4   r   r5   r   r1   rB   r   r   r   rP   r   r   r.   r.   r/   rj     sD    









zT5GemmaDecoderLayer.forward)NNNFNNN)r+   r,   r-   r\   r_   rM   r   r   r   r   r   r   r	   r]   r   rj   r`   r.   r.   rQ   r/   r     s,          r   c                       s@   e Zd ZdZd	eeed fddZejejdddZ	  Z
S )
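
# Dataflow of the two layers above, as a sketch: every sub-layer (self-attention,
# cross-attention in the decoder, and the MLP) is wrapped Gemma-2 style with a pre-norm
# and a post-norm, plus a T5-style dropout on the residual branch:
#
#     y = x + dropout(post_norm(sublayer(pre_norm(x))))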


class T5GemmaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, hidden_size: int, num_labels: int, classifier_dropout_rate: float = 0.0):
        super().__init__()
        self.dropout = nn.Dropout(p=classifier_dropout_rate)
        self.out_proj = nn.Linear(hidden_size, num_labels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


class T5GemmaLMHead(nn.Module):
    """Head for language modeling (generation) tasks."""

    def __init__(self, hidden_size: int, vocab_size: int, bias: bool = False):
        super().__init__()
        self.out_proj = nn.Linear(hidden_size, vocab_size, bias=bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        logits = self.out_proj(hidden_states)
        return logits
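
# Shape sketch for the heads above (values illustrative):
#
#     >>> head = T5GemmaClassificationHead(hidden_size=8, num_labels=3, classifier_dropout_rate=0.0)
#     >>> head(torch.zeros(2, 8)).shape
#     torch.Size([2, 3])
#     >>> lm_head = T5GemmaLMHead(hidden_size=8, vocab_size=16)
#     >>> lm_head(torch.zeros(2, 4, 8)).shape
#     torch.Size([2, 4, 16])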


class T5GemmaPreTrainedModel(Gemma2PreTrainedModel):
    config: T5GemmaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["T5GemmaBlock"]

    def _init_weights(self, module):
        super()._init_weights(module)
        std = self.config.initializer_range
        if isinstance(module, T5GemmaClassificationHead):
            # Scale down the classifier init by the fan-in (following T5).
            scale = module.out_proj.weight.shape[-1] ** -0.5
            module.out_proj.weight.data.normal_(mean=0.0, std=std * scale)
            if hasattr(module.out_proj, "bias") and module.out_proj.bias is not None:
                module.out_proj.bias.data.zero_()
        elif isinstance(module, T5GemmaLMHead):
            if not self.config.tie_word_embeddings:
                scale = module.out_proj.weight.shape[-1] ** -0.5
                module.out_proj.weight.data.normal_(mean=0.0, std=std * scale)

    def _shift_right(self, input_ids):
        """
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        """
        decoder_start_token_id = self.config.decoder.bos_token_id
        pad_token_id = self.config.decoder.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError("self.model.config.decoder.bos_token_id has to be defined. ")

        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
        shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.decoder.pad_token_id has to be defined.")
        # Replace possible -100 values in labels by `pad_token_id`.
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
        return shifted_input_ids
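
# Worked example of `_shift_right` (illustrative): with `decoder.bos_token_id = 2` and
# `decoder.pad_token_id = 0`, labels `[[5, 6, -100]]` become decoder inputs `[[2, 5, 6]]`:
# the sequence is shifted right, BOS is prepended, and any remaining -100 (ignored label
# positions) is replaced by the pad token.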


def make_default_2d_attention_mask(
    token_ids: Optional[torch.Tensor],
    hidden_states: torch.Tensor,
    pad_token_id: Optional[int],
) -> torch.Tensor:
    """Construct the default attention mask."""
    if token_ids is not None:
        if pad_token_id is None:
            raise ValueError("`pad_token_id` is required for padding information.")
        attention_mask = (token_ids != pad_token_id).to(hidden_states.device, torch.long)
    else:
        attention_mask = torch.ones(
            (hidden_states.shape[0], hidden_states.shape[1]), device=hidden_states.device, dtype=torch.long
        )
    return attention_mask


class T5GemmaEncoder(T5GemmaPreTrainedModel):
    _can_record_outputs = {
        "attentions": T5GemmaSelfAttention,
        "hidden_states": T5GemmaEncoderLayer,
    }

    def __init__(self, config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.norm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = T5GemmaRotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        self.layers = nn.ModuleList(
            [T5GemmaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.dropout = nn.Dropout(config.dropout_rate)
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
        # The encoder is cache-free; drop any stray `past_key_values` kwarg.
        kwargs.pop("past_key_values", None)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        cache_position = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device)
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        if attention_mask is None:
            attention_mask = make_default_2d_attention_mask(input_ids, inputs_embeds, self.config.pad_token_id)

        if not isinstance(self_attn_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": None,
                "position_ids": position_ids,
            }
            # Both mask variants are bidirectional in the encoder.
            self_attn_mask_mapping = {
                "full_attention": create_causal_mask(
                    **mask_kwargs,
                    or_mask_function=bidirectional_mask_function(attention_mask),
                ),
                "sliding_attention": create_sliding_window_causal_mask(
                    **mask_kwargs,
                    or_mask_function=sliding_window_bidirectional_mask_function(self.config.sliding_window),
                    and_mask_function=bidirectional_mask_function(attention_mask),
                ),
            }

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # Scale embeddings as in Gemma 2.
        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer
        hidden_states = self.dropout(hidden_states)

        for layer_module in self.layers[: self.config.num_hidden_layers]:
            hidden_states = layer_module(
                hidden_states,
                position_embeddings,
                self_attn_mask_mapping[layer_module.attention_type],
                position_ids,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return BaseModelOutput(last_hidden_state=hidden_states)


class T5GemmaDecoder(T5GemmaEncoder):
    _can_record_outputs = {
        "attentions": OutputRecorder(T5GemmaSelfAttention, index=1),
        "cross_attentions": OutputRecorder(T5GemmaCrossAttention, index=1),
        "hidden_states": T5GemmaDecoderLayer,
    }

    def __init__(self, config):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [T5GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPastAndCrossAttentions:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
        if encoder_hidden_states is None:
            raise ValueError("`encoder_hidden_states` must be given in decoder")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if not self.training and use_cache and past_key_values is None:
            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        if attention_mask is None and past_key_values is None:
            attention_mask = make_default_2d_attention_mask(input_ids, inputs_embeds, self.config.pad_token_id)

        if not isinstance(self_attn_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values.self_attention_cache if past_key_values is not None else None,
                "position_ids": position_ids,
            }
            self_attn_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
            }

        if not isinstance(cross_attn_mask_mapping := encoder_attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": encoder_attention_mask,
                "cache_position": cache_position,
                "past_key_values": None,
                "position_ids": None,
            }
            # Cross-attention sees the full encoder sequence, bidirectionally.
            cross_attn_mask_mapping = {
                "full_attention": create_causal_mask(
                    **mask_kwargs,
                    or_mask_function=bidirectional_mask_function(encoder_attention_mask),
                ),
            }

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer
        hidden_states = self.dropout(hidden_states)

        for layer_module in self.layers[: self.config.num_hidden_layers]:
            hidden_states = layer_module(
                hidden_states,
                position_embeddings,
                self_attn_mask_mapping[layer_module.attention_type],
                position_ids,
                past_key_values,
                use_cache,
                cache_position,
                encoder_hidden_states,
                cross_attn_mask_mapping["full_attention"],
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


@auto_docstring
class T5GemmaModel(T5GemmaPreTrainedModel):
    def __init__(self, config: T5GemmaConfig):
        super().__init__(config)
        if not config.is_encoder_decoder:
            raise ValueError("T5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.")
        self.encoder = T5GemmaEncoder(config.encoder)
        self.decoder = T5GemmaDecoder(config.decoder)
        self.post_init()

    def get_encoder(self):
        return self.encoder

    def get_input_embeddings(self):
        return self.encoder.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.encoder.set_input_embeddings(new_embeddings)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        decoder_position_ids: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[BaseModelOutput] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Seq2SeqModelOutput:
        r"""
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        """
        if encoder_outputs is None:
            encoder_outputs: BaseModelOutput = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                inputs_embeds=inputs_embeds,
                **kwargs,
            )

        encoder_hidden_states = encoder_outputs.last_hidden_state

        decoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=attention_mask,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring
class T5GemmaEncoderModel(T5GemmaPreTrainedModel):
    def __init__(self, config: T5GemmaConfig):
        super().__init__(config)
        if config.is_encoder_decoder:
            raise ValueError("T5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.")
        self.encoder = T5GemmaEncoder(config.encoder)
        self.post_init()

    def get_input_embeddings(self):
        return self.encoder.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.encoder.set_input_embeddings(new_embeddings)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )
        return encoder_outputs


@auto_docstring
class T5GemmaForConditionalGeneration(T5GemmaPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["model.decoder.embed_tokens.weight", "lm_head.out_proj.weight"]
    _tp_plan = {"lm_head.out_proj": "colwise_rep"}
    _pp_plan = {"lm_head.out_proj": (["hidden_states"], ["logits"])}

    def __init__(self, config: T5GemmaConfig):
        config.is_encoder_decoder = True
        super().__init__(config)
        self.model = T5GemmaModel(config)
        self.vocab_size = config.decoder.vocab_size
        self.lm_head = T5GemmaLMHead(config.decoder.hidden_size, self.vocab_size)
        self.loss_type = "ForMaskedLM"
        self.post_init()

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.out_proj = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head.out_proj

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.lm_head.out_proj, self.get_decoder().get_input_embeddings())

    def get_encoder(self):
        return self.model.encoder

    def get_decoder(self):
        return self.model.decoder

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        decoder_position_ids: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[BaseModelOutput] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        if self.training and self.config._attn_implementation != "eager":
            msg = (
                "It is strongly recommended to train T5Gemma models with the `eager` attention implementation "
                f"instead of `{self.config._attn_implementation}`. Use `eager` with "
                "`AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
            )
            if is_torchdynamo_compiling():
                raise ValueError(msg)
            else:
                logger.warning_once(msg)

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # Build decoder inputs by shifting the labels to the right.
            decoder_input_ids = self._shift_right(labels)

        outputs: Seq2SeqModelOutput = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute logits for the requested trailing positions (saves memory in generation).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        decoder_config = self.get_decoder().config
        if decoder_config.final_logit_softcapping is not None:
            logits = logits / decoder_config.final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * decoder_config.final_logit_softcapping

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        return Seq2SeqLMOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)


@auto_docstring
class T5GemmaForSequenceClassification(T5GemmaPreTrainedModel):
    def __init__(self, config: T5GemmaConfig, is_encoder_decoder: Optional[bool] = None):
        r"""
        is_encoder_decoder (`Optional`, *optional*):
            Whether to use the encoder-decoder architecture for sequence classification. When set to `False`, only the encoder is used.
        """
        if is_encoder_decoder is not None:
            config.is_encoder_decoder = is_encoder_decoder
        super().__init__(config)
        self.num_labels = config.num_labels

        if config.is_encoder_decoder:
            self.model = T5GemmaModel(config)
        else:
            self.model = T5GemmaEncoderModel(config)

        hidden_size = config.encoder.hidden_size
        if config.is_encoder_decoder:
            hidden_size = config.decoder.hidden_size

        classifier_dropout = getattr(config, "classifier_dropout_rate", 0.1)
        self.score = T5GemmaClassificationHead(hidden_size, self.num_labels, classifier_dropout)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        decoder_position_ids: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[BaseModelOutput] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> SequenceClassifierOutput:
        r"""
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if self.config.is_encoder_decoder and (input_ids is None and inputs_embeds is not None):
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__} in encoder-decoder mode."
            )
        if self.config.is_encoder_decoder and decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        if self.config.is_encoder_decoder:
            outputs: Seq2SeqModelOutput = self.model(
                input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                decoder_position_ids=decoder_position_ids,
                encoder_outputs=encoder_outputs,
                inputs_embeds=inputs_embeds,
                decoder_inputs_embeds=decoder_inputs_embeds,
                use_cache=False,
                **kwargs,
            )
            last_hidden_state = outputs.last_hidden_state
            hidden_states = outputs.decoder_hidden_states
            attentions = outputs.decoder_attentions
        else:
            outputs: BaseModelOutput = self.model(
                input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                inputs_embeds=inputs_embeds,
                **kwargs,
            )
            last_hidden_state = outputs.last_hidden_state
            hidden_states = outputs.hidden_states
            attentions = outputs.attentions

        logits = self.score(last_hidden_state)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # Find the position of the last non-padding token in each sequence.
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
            if self.config.is_encoder_decoder:
                # Account for the right shift applied to build the decoder inputs.
                last_non_pad_token += 1
                last_non_pad_token = torch.clamp(last_non_pad_token, max=decoder_input_ids.shape[-1] - 1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutput(
            loss=loss,
            logits=pooled_logits,
            hidden_states=hidden_states,
            attentions=attentions,
        )


@auto_docstring
class T5GemmaForTokenClassification(T5GemmaPreTrainedModel):
    def __init__(self, config: T5GemmaConfig, is_encoder_decoder: Optional[bool] = None):
        r"""
        is_encoder_decoder (`Optional`, *optional*):
            Whether to use the encoder-decoder architecture for token classification. When set to `False`, only the encoder is used.
        """
        if is_encoder_decoder is not None:
            config.is_encoder_decoder = is_encoder_decoder
        super().__init__(config)
        self.num_labels = config.num_labels

        if config.is_encoder_decoder:
            self.model = T5GemmaModel(config)
        else:
            self.model = T5GemmaEncoderModel(config)

        hidden_size = config.encoder.hidden_size
        if config.is_encoder_decoder:
            hidden_size = config.decoder.hidden_size

        classifier_dropout = getattr(config, "classifier_dropout_rate", 0.1)
        self.score = T5GemmaClassificationHead(hidden_size, self.num_labels, classifier_dropout)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        decoder_position_ids: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[BaseModelOutput] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> TokenClassifierOutput:
        r"""
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        if self.config.is_encoder_decoder and (input_ids is None and inputs_embeds is not None):
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__} in encoder-decoder mode."
            )
        if self.config.is_encoder_decoder and decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        if self.config.is_encoder_decoder:
            outputs: Seq2SeqModelOutput = self.model(
                input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                decoder_position_ids=decoder_position_ids,
                encoder_outputs=encoder_outputs,
                inputs_embeds=inputs_embeds,
                decoder_inputs_embeds=decoder_inputs_embeds,
                use_cache=False,
                **kwargs,
            )
            last_hidden_state = outputs.last_hidden_state
            hidden_states = outputs.decoder_hidden_states
            attentions = outputs.decoder_attentions
        else:
            outputs: BaseModelOutput = self.model(
                input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                inputs_embeds=inputs_embeds,
                **kwargs,
            )
            last_hidden_state = outputs.last_hidden_state
            hidden_states = outputs.hidden_states
            attentions = outputs.attentions

        logits = self.score(last_hidden_state)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=hidden_states,
            attentions=attentions,
        )


__all__ = [
    "T5GemmaConfig",
    "T5GemmaModuleConfig",
    "T5GemmaForConditionalGeneration",
    "T5GemmaModel",
    "T5GemmaEncoderModel",
    "T5GemmaPreTrainedModel",
    "T5GemmaForSequenceClassification",
    "T5GemmaForTokenClassification",
]