a
    h                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZ d dlZd dl	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6 e-7e8Z9ee+ddG dd deZ:ee+ddG dd de)Z;G dd  d e
j<Z=G d!d" d"e
j>Z?G d#d$ d$e
j>Z@G d%d& d&e
j>ZAd'd( ZBdJd)d*ZCejDeEejDd+d,d-ZFdKe
j>ejDejDejDeejD eGeeG eeG eHejDejDf d/	d0d1ZIG d2d3 d3e
j>ZJG d4d5 d5eZKe+G d6d7 d7e%ZLe+G d8d9 d9eLZMe+G d:d; d;eLeZNG d<d= d=e
j>ZOeejD eejD eEee d>d?d@ZPe+dAdG dBdC dCeLZQe+dDdG dEdF dFeLeZRG dGdH dHeLZSg dIZTdS )L    N)Callable)	dataclass)OptionalUnion   )ACT2FN)CacheDynamicCache)PretrainedConfig)GenerationMixin)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)check_model_inputs   )	AutoModel   )Gemma3ConfigGemma3TextConfigzK
    Base class for Gemma3 outputs, with hidden states and attentions.
    )Zcustom_introc                   @   s$   e Zd ZU dZdZeej ed< dS )Gemma3ModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r&   r   torchFloatTensor__annotations__ r.   r.   f/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/gemma3/modeling_gemma3.pyr%   3   s   
r%   zR
    Base class for Gemma3 causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeeej ef  ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	Gemma3CausalLMOutputWithPastaa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr&   )r'   r(   r)   r*   r1   r   r+   r,   r-   r2   r3   r   listr   r4   tupler5   r&   r.   r.   r.   r/   r0   I   s   
r0   c                       sB   e Zd ZdZd	eeeed fddZejd fddZ	  Z
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?)num_embeddingsembedding_dimpadding_idxembed_scalec                    s*   t  ||| | jdt|dd d S )Nr=   F
persistent)super__init__register_bufferr+   tensor)selfr:   r;   r<   r=   	__class__r.   r/   rA   m   s    z&Gemma3TextScaledWordEmbedding.__init__)	input_idsc                    s   t  || j| jj S N)r@   forwardr=   toweightdtype)rD   rG   rE   r.   r/   rI   q   s    z%Gemma3TextScaledWordEmbedding.forward)r9   )r'   r(   r)   r*   intfloatrA   r+   TensorrI   __classcell__r.   r.   rE   r/   r8   h   s   r8   c                       s*   e Zd Zed fddZdd Z  ZS )	Gemma3MLPconfigc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFZbias)r@   rA   rS   hidden_sizeZintermediate_sizennLinear	gate_projup_proj	down_projr   Zhidden_activationact_fnrD   rS   rE   r.   r/   rA   v   s    
zGemma3MLP.__init__c                 C   s$   |  | | || | }|S rH   )r[   r\   rY   rZ   )rD   xr[   r.   r.   r/   rI      s     zGemma3MLP.forward)r'   r(   r)   r$   rA   rI   rP   r.   r.   rE   r/   rQ   u   s   
rQ   c                       s>   e Zd Zdeed fddZdd Zdd Zd	d
 Z  Z	S )Gemma3RMSNormư>dimepsc                    s&   t    || _tt|| _d S rH   )r@   rA   rc   rW   	Parameterr+   zerosrK   )rD   rb   rc   rE   r.   r/   rA      s    
zGemma3RMSNorm.__init__c                 C   s$   |t |djddd| j  S )Nr    T)Zkeepdim)r+   Zrsqrtpowmeanrc   )rD   r^   r.   r.   r/   _norm   s    zGemma3RMSNorm._normc                 C   s*   |  | }|d| j   }||S )Nr9   )ri   rN   rK   type_as)rD   r^   outputr.   r.   r/   rI      s    zGemma3RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r7   rK   shaperc   rD   r.   r.   r/   
extra_repr   s    zGemma3RMSNorm.extra_repr)r`   )
r'   r(   r)   rM   rN   rA   ri   rI   rn   rP   r.   r.   rE   r/   r_      s   r_   c                       sD   e Zd ZU ejed< ded fddZe e	dd Z
  ZS )	Gemma3RotaryEmbeddinginv_freqNrR   c                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultrp   Fr>   )r@   rA   hasattr
isinstancerq   dictgetrr   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenrS   r   Zrope_init_fnattention_scalingrB   rp   Zoriginal_inv_freq)rD   rS   devicerp   rE   r.   r/   rA      s    
zGemma3RotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|ddV | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   rf   r"   ZmpscpuF)device_typeZenabledr    rb   )rL   )rp   rN   expandrl   rJ   rz   rv   rs   strr+   Zautocast	transposecatcosry   sinrL   )
rD   r^   position_idsZinv_freq_expandedZposition_ids_expandedr|   ZfreqsZembr   r   r.   r.   r/   rI      s    0&,zGemma3RotaryEmbedding.forward)N)r'   r(   r)   r+   rO   r-   r$   rA   Zno_gradr   rI   rP   r.   r.   rE   r/   ro      s
   

ro   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nrf   r    r}   )rl   r+   r   )r^   x1Zx2r.   r.   r/   rotate_half   s    r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   Zunsqueeze_dimZq_embedZk_embedr.   r.   r/   apply_rotary_pos_emb   s
    

r   )r4   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)rl   r~   reshape)r4   r   batchnum_key_value_headsslenhead_dimr.   r.   r/   	repeat_kv   s
    0r           )	modulequerykeyvalueattention_maskdropoutscalingsoftcapr   c                 K   s   |d u r| j d }t|| j}	t|| j}
t||	dd| }|d urd|| }t|}|| }|d ur|d d d d d d d |	jd f }|| }tj	j
|dtjd|j}tj	j||| jd}t||
}|dd }||fS )	N      r    r   rf   )rb   rL   )ptrainingr"   )r   r   num_key_value_groupsr+   matmulr   tanhrl   rW   
functionalZsoftmaxZfloat32rJ   rL   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsZcausal_maskattn_outputr.   r.   r/   eager_attention_forward   s"    

&r   c                       s   e Zd ZdZeed fddZedddddej	ej	e
ej	 e
e e
ej ee eej	e
ej	 e
eej	  f d
ddZ  ZS )Gemma3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrS   	layer_idxc                    s"  t    |j| dk| _|| _|| _t|d|j|j | _	|j|j
 | _|jd | _| jj| _d| _tj|j|j| j	 |jd| _tj|j|j
| j	 |jd| _tj|j|j
| j	 |jd| _tj|j| j	 |j|jd| _| jj| _| jr|jnd | _t|j	|jd| _t|j	|jd| _d S )Nsliding_attentionr   r   TrU   ra   )r@   rA   layer_types
is_slidingrS   r   getattrrV   Znum_attention_headsr   r   r   Zquery_pre_attn_scalarr   attention_dropoutZ	is_causalrW   rX   Zattention_biasq_projk_projv_projo_projZattn_logit_softcappingsliding_windowr_   rms_norm_epsq_normk_normrD   rS   r   rE   r.   r/   rA     s2    


zGemma3Attention.__init__past_key_valuer3   4.58new_nameversionN)r4   position_embeddingsr   r3   cache_positionr   r   c                 K   s<  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| |	}	| |
}
|\}}t	|	|
||\}	}
|d ur|||d}|
|
|| j|\}
}t}| jjdkrt| jj }|| |	|
||f| jr| jnd| j| jd|\}}|jg |dR   }| |}||fS )Nrf   r"   r    )r   r   r   eagerr   )r   r   r   )rl   r   r   viewr   r   r   r   r   r   updater   r   rS   _attn_implementationr   r   r   r   r   r   r   r   )rD   r4   r   r   r3   r   r   Zinput_shapeZhidden_shapeZquery_statesr   r   r   r   Zcache_kwargsZattention_interfacer   r   r.   r.   r/   rI   .  s>    


	

zGemma3Attention.forward)NN)r'   r(   r)   r*   r$   rM   rA   r   r+   rO   r   r   
LongTensorr   r   r7   rI   rP   r.   r.   rE   r/   r     s     r   c                       s   e Zd Zeed fddZedddddejejeje	ej e	ej
 e	e e	e e	e e	ej
 eeje	eejejf  f d

ddZ  ZS )Gemma3DecoderLayerr   c                    s   t    || _|j| _|| _|j| | _t||d| _t	|| _
t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _d S )Nr   rc   )r@   rA   rS   rV   r   r   attention_typer   	self_attnrQ   mlpr_   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   rE   r.   r/   rA   `  s    

zGemma3DecoderLayer.__init__r   r3   r   r   NF)
r4   position_embeddings_globalposition_embeddings_localr   r   r3   output_attentions	use_cacher   r   c
                 K   s   |}|  |}| jjr|}n|}| jf ||||||||	d|
\}}| |}|| }|}| |}| |}| |}|| }|f}|r||f7 }|S )N)r4   r   r   r   r3   r   r   r   )r   r   r   r   r   r   r   )rD   r4   r   r   r   r   r3   r   r   r   r   Zresidualr   Zself_attn_weightsoutputsr.   r.   r/   rI   m  s8    
	





zGemma3DecoderLayer.forward)NNNFFN)r'   r(   r)   r$   rM   rA   r   r+   rO   r   r   r   boolr7   r,   rI   rP   r.   r.   rE   r/   r   _  s(         r   c                       sZ   e Zd ZU eed< dZdZg dZdgZdZ	dZ
dZdZdZeedZ fddZ  ZS )	Gemma3PreTrainedModelrS    T)r   ZSiglipVisionEmbeddingsZSiglipEncoderLayerZ#SiglipMultiheadAttentionPoolingHeadr3   )r4   r5   c                    s&   t  | t|tr"|jj  d S rH   )r@   _init_weightsrv   Gemma3MultiModalProjectormm_input_projection_weightdataZzero_)rD   r   rE   r.   r/   r     s    
z#Gemma3PreTrainedModel._init_weights)r'   r(   r)   r#   r-   base_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnZ_can_compile_fullgraphZ_supports_attention_backendr   r   Z_can_record_outputsr   rP   r.   r.   rE   r/   r     s   
r   c                       s   e Zd ZU eed< ed fddZeed	ee	j
 ee	j ee	j
 ee ee	j ee ee ee ee	j
 ee edddZ  ZS )
Gemma3TextModelrS   rR   c                    s   t     j| _ j| _t j j| j| jjd d| _t	
 fddt jD | _t j jd| _t d| _d| _t   j _dd	i _t d| _|   d S )
N      ?)r=   c                    s   g | ]}t  |qS r.   )r   ).0r   rR   r.   r/   
<listcomp>      z,Gemma3TextModel.__init__.<locals>.<listcomp>r   rR   Frr   rt   )r@   rA   pad_token_idr<   
vocab_sizer8   rV   rS   embed_tokensrW   Z
ModuleListrangenum_hidden_layerslayersr_   r   normro   
rotary_embgradient_checkpointingcopydeepcopyZrope_local_base_freqZ
rope_thetarq   rotary_emb_local	post_initr]   rE   rR   r/   rA     s"    

zGemma3TextModel.__init__N)rG   r   r   r3   inputs_embedsr   r   output_hidden_statesr   r   r   c
                 K   s  |d ur|n| j j}|d ur |n| j j}|d ur4|n| j j}|d u |d uA rTtd| jrr| jrr|rrtd d}|d u r| 	|}|r|d u r| jst
| j d}|	d u r|d ur| nd}tj|||jd  |jd}	|d u r|	d}t| }ts.| j |||	||d}tf i |tf i |d	}|}| ||}| ||}|rTd
nd }|rbd
nd }| jd | j j D ]Z}|r||f7 }||f||||j |||||	d|
}|d }|rx||d f7 }qx| |}|r||f7 }t||||dS )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FrR   r   r"   rz   rS   input_embedsr   r   r3   r   Zfull_attentionr   r.   )r   r   r   r   r3   r   r   r   )last_hidden_stater3   r4   r5   )rS   r   r   r   
ValueErrorr   r   loggerwarning_oncer   r	   get_seq_lengthr+   arangerl   rz   r   rv   rw   r   r   r   r   r   r   r   r   r   )rD   rG   r   r   r3   r   r   r   r   r   r   past_seen_tokenscausal_mask_mappingmask_kwargsr4   r   r   Zall_hidden_statesZall_self_attnsZdecoder_layerZlayer_outputsr.   r.   r/   rI     s    






zGemma3TextModel.forward)	NNNNNNNNN)r'   r(   r)   r$   r-   rA   r   r   r   r+   r   rO   r   r,   r   r   r   r   rI   rP   r.   r.   rE   r/   r     s4   
         r   c                       s   e Zd ZU dgZddiZddgdgfiZeed< dZed fd	d
Z	e
edeej eej eej ee eej eej ee ee ee eej eeejf edddZ  ZS )Gemma3ForCausalLMlm_head.weightlm_headZcolwise_repr4   r2   rS   language_modelrR   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S rT   )
r@   rA   r   modelr   rW   rX   rV   r  r   r]   rE   r.   r/   rA   R  s
    
zGemma3ForCausalLM.__init__Nr   )rG   r   r   r3   r   labelsr   r   r   r   logits_to_keepr   c                 K   s  | j r(| jjdkr(td| jj d |dur4|n| jj}|	durH|	n| jj}	| jf ||||||||	|
d	|}|j}t	|t
rt| dn|}| |dd|ddf }| jjdur|| jj }t|}|| jj }d}|dur| j||| jfi |}t|||j|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3ForCausalLM

        >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```r   zhIt is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)	rG   r   r   r3   r   r   r   r   r   r1   r2   r3   r4   r5   )r   rS   r   r   r   r   r   r  r   rv   rM   slicer  Zfinal_logit_softcappingr+   r   loss_functionr   r   r3   r4   r5   )rD   rG   r   r   r3   r   r  r   r   r   r   r  r   r   r4   slice_indicesr2   r1   r.   r.   r/   rI   [  sN    #


zGemma3ForCausalLM.forward)NNNNNNNNNNr   )r'   r(   r)   _tied_weights_keysZ_tp_planZ_pp_planr$   r-   r   rA   r   r   r   r+   r   rO   r   r,   r   r   rM   r   rI   rP   r.   r.   rE   r/   r   J  sB   
	           r   c                       s2   e Zd Zed fddZejdddZ  ZS )r   rR   c                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )Nr   r   )kernel_sizeZstride)r@   rA   rW   rd   r+   re   vision_configrV   text_configr   r_   Zlayer_norm_epsmm_soft_emb_normrM   Z
image_sizeZ
patch_sizepatches_per_imagemm_tokens_per_imageZtokens_per_sider  Z	AvgPool2davg_poolr]   rE   r.   r/   rA     s    
z"Gemma3MultiModalProjector.__init__)vision_outputsc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )Nr"   r    )rl   r   r   r  r   r  flattenr  r+   r   r   rj   )	rD   r  
batch_size_Z
seq_lengthZreshaped_vision_outputsZpooled_vision_outputsZnormed_vision_outputsZprojected_vision_outputsr.   r.   r/   rI     s    


z!Gemma3MultiModalProjector.forward)	r'   r(   r)   r#   rA   r+   rO   rI   rP   r.   r.   rE   r/   r     s   r   )token_type_idsimage_group_idstokens_per_imager   c                    s,   du rdS t t t t td fdd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    N)	batch_idxhead_idxq_idxkv_idxr   c           	         s   t |jd k |d}| |f }t |jd k |d} | |f }t | jd k |d}| |f dk|dk@ } | |f |k}||@ S )Nr"   r   rf   )r+   whererl   )	r  r  r  r  Zsafe_idxZtoken_type_ids_at_kv_idxZimage_group_ids_at_kv_idxZis_image_blockZsame_image_blockr  r  r.   r/   
inner_mask  s    z0token_type_ids_mask_function.<locals>.inner_mask)rM   r   )r  r  r  r   r.   r  r/   token_type_ids_mask_function  s    
r!  zx
    The Base Gemma3 model which consists of a vision backbone and a language model withou language modeling head.,
    c                       s   e Zd ZddiZdZed fddZdd Zd	d
 Zdd Z	dd Z
ejejdddZejejejdddZeedejejeej eej eeeej ef  eej eej eej eej ee ee ee ee eeef dddZ  ZS )Gemma3Modelzlanguage_model.modelr  FrR   c                    sj   t  | tj|jd| _t|| _|jj	| _	tj|jd}|| _
| jjd urX| jjnd| _|   d S )NrR   rf   )r@   rA   r!   from_configr  vision_towerr   multi_modal_projectorr  r   r  rS   r   r   )rD   rS   r  rE   r.   r/   rA     s    

zGemma3Model.__init__c                 C   s
   | j  S rH   )r  get_input_embeddingsrm   r.   r.   r/   r&    s    z Gemma3Model.get_input_embeddingsc                 C   s   | j | d S rH   )r  set_input_embeddingsrD   r   r.   r.   r/   r'  	  s    z Gemma3Model.set_input_embeddingsc                 C   s
   || _ d S rH   r  rD   decoderr.   r.   r/   set_decoder  s    zGemma3Model.set_decoderc                 C   s   | j S rH   r)  rm   r.   r.   r/   get_decoder  s    zGemma3Model.get_decoder)pixel_valuesr   c                 C   s   | j |dj}| |}|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r.  )r$  r   r%  )rD   r.  r  image_featuresr.   r.   r/   get_image_features  s    

zGemma3Model.get_image_features)rG   r   r/  c                 C   s   |du r8||   tj| jjtj|jdk}|d}n|| jjk}| }|	d
||j}|jd |jd  }||  | krtd| d| |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)rL   rz   rf   r   r"   z6Image features and image tokens do not match: tokens: z, features )r&  r+   rC   rS   image_token_idlongrz   allsumr   Z	expand_asrJ   rl   Znumelr   )rD   rG   r   r/  special_image_maskZn_image_tokensZn_image_featuresr.   r.   r/   get_placeholder_mask   s    z Gemma3Model.get_placeholder_maskN)rG   r.  r   r   r3   r  r   r   r  r   r   r   return_dictr   c                 K   sT  |du |duA rt d|dur$|n| jj}|dur8|n| jj}|durL|n| jj}|dur| jj| jkr|| jjk}| }d||< n|}|du r|  |}|du r|dur|	 nd}t
j|||jd  |jd}|dur| |}||j|j}| j|||d}|||}t| }ts| j |||||d}|dur|jd dkr|dk|j}|tjj|ddd	dddd
f  @ }t
j| ddd }t
||t
j|d
|jd}t||j|| jj|d< tf i |t f i |d}| j!f |||||
||d|d	|}t"|j#|
r4|j$nd|j%|j&|durL|nddS )a]  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nr   r   r"   r   )r   r/  r   r"   r   r   rf   r}   or_mask_functionr   T)	r   r   r3   r   r   r   r   r7  r   )r   r3   r4   r5   r&   )'r   rS   r   r   use_return_dictr1  r   cloner&  r   r+   r   rl   rz   r0  rJ   rL   r6  Zmasked_scatterrv   rw   get_text_configrW   r   padcumsumrM   r  	full_liker!  r  r   r   r  r%   r   r3   r4   r5   )rD   rG   r.  r   r   r3   r  r   r   r  r   r   r   r7  	lm_kwargsr5  Zllm_input_idsr   r/  r   r   is_imagenew_image_startr  r   r.   r.   r/   rI   8  s    .


(
zGemma3Model.forward)NNNNNNNNNNNNN)r'   r(   r)   _checkpoint_conversion_mappingZaccepts_loss_kwargsr#   rA   r&  r'  r,  r-  r+   rO   r0  r   r,   r6  r   r   r   r   r6   r   r   r7   r%   rI   rP   r.   r.   rE   r/   r"    sR                
r"  zy
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
    c                       sX  e Zd ZdddddZdgZed fdd	Zd
d Zdd Zdd Z	dd Z
dd Zedd Zedd Zedd Zed%ejejeej eej eeeej ef  eej eej eej eej ee ee ee ee eeejf eeef dddZd& fd d!	Zed'e ejeej ejee eej eej e!d"d#d$Z"  Z#S )(Gemma3ForConditionalGenerationmodel.language_modelmodel.vision_towermodel.multi_modal_projectorr  )^language_model.model^vision_tower^multi_modal_projectorz^language_model.lm_headr  rR   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S rT   )r@   rA   r"  r  rW   rX   r  rV   r   r  r   r]   rE   r.   r/   rA     s    
z'Gemma3ForConditionalGeneration.__init__c                 C   s
   | j  S rH   r  r&  rm   r.   r.   r/   r&    s    z3Gemma3ForConditionalGeneration.get_input_embeddingsc                 C   s   | j | d S rH   r  r'  r(  r.   r.   r/   r'    s    z3Gemma3ForConditionalGeneration.set_input_embeddingsc                 C   s   | j | d S rH   )r  r,  r*  r.   r.   r/   r,    s    z*Gemma3ForConditionalGeneration.set_decoderc                 C   s
   | j  S rH   )r  r-  rm   r.   r.   r/   r-    s    z*Gemma3ForConditionalGeneration.get_decoderc                 C   s   | j |S rH   )r  r0  )rD   r.  r.   r.   r/   r0    s    z1Gemma3ForConditionalGeneration.get_image_featuresc                 C   s   | j jS rH   )r  r  rm   r.   r.   r/   r    s    z-Gemma3ForConditionalGeneration.language_modelc                 C   s   | j jS rH   )r  r$  rm   r.   r.   r/   r$    s    z+Gemma3ForConditionalGeneration.vision_towerc                 C   s   | j jS rH   )r  r%  rm   r.   r.   r/   r%    s    z4Gemma3ForConditionalGeneration.multi_modal_projectorNr   )rG   r.  r   r   r3   r  r   r   r  r   r   r   r7  r  r   c                 K   s  |dur|n| j j}|dur |n| j j}|dur4|n| j j}| jf ||||||||
|	||||d|}|d }t|trt| dn|}| |dd|ddf }d}|	dur|	 }|dddddf }|	dddf }|durB|dd|j
d  df |j}|||jdk  }|||jdk  }n| }| }t }|d| j jj}|d|j}|||}|s|f|dd  }|dur|f| S |S t|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        N)rG   r.  r  r   r   r3   r   r   r  r   r   r7  r   r   .rf   r"   )r1   r2   r3   r4   r5   r&   )rS   r   r   r;  r  rv   rM   r  r  rN   rl   rJ   rz   r   rW   ZCrossEntropyLossr   r  r   r0   r3   r4   r5   r&   )rD   rG   r.  r   r   r3   r  r   r   r  r   r   r   r7  r  rA  r   r4   r
  r2   r1   Zshift_logitsZshift_labelsZshift_attention_maskZloss_fctZflat_logitsZflat_labelsrk   r.   r.   r/   rI     sd    @

$
z&Gemma3ForConditionalGeneration.forwardTc                    s>   t  j|f||||||	|
|d|}|d dkr:||d< |S )N)r3   r   r   r   r   r   r  r  r   r.  )r@   prepare_inputs_for_generation)rD   rG   r3   r   r   r   r.  r   r  r   r  r  r   Zmodel_inputsrE   r.   r/   rN  n  s"    
z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation)rS   r   r   r   r3   r   r  r   c                 K   s   |   |||||d}|d ur|jd dkr|dk|j}	|	tjj|	dddd d d df  @ }
tj|
	 ddd }t
|	|t|d}t||j|| j|d< tf i |S )	Nr   r"   r8  r   r9  rf   r}   r:  )r=  rl   rJ   rz   rW   r   r>  r+   r?  rM   r  r@  r!  r  r   )rS   r   r   r   r3   r   r  r   r   rB  rC  r  r.   r.   r/   r     s     	(z8Gemma3ForConditionalGeneration.create_masks_for_generate)NNNNNNNNNNNNNr   )
NNNNNNNTNN)N)$r'   r(   r)   rD  r  r#   rA   r&  r'  r,  r-  r0  propertyr  r$  r%  r   r+   r   r,   r   rO   r   r6   r   r   rM   r7   r0   rI   rN  staticmethodr
   rw   r   rP   r.   r.   rE   r/   rE    s   


              
           $ rE  c                       s   e Zd ZddddZ fddZdd Zd	d
 Zeede	j
ee	j ee	j ee	j
 ee ee	j ee	j
 ee	j
 ee ee edddZ  ZS )Gemma3ForSequenceClassificationrF  rG  rH  )rI  rJ  rK  c                    sB   t  | |j| _t|| _tj|jj| jdd| _	| 
  d S rT   )r@   rA   Z
num_labelsr"  r  rW   rX   r  rV   scorer   r]   rE   r.   r/   rA     s
    
z(Gemma3ForSequenceClassification.__init__c                 C   s
   | j  S rH   rL  rm   r.   r.   r/   r&    s    z4Gemma3ForSequenceClassification.get_input_embeddingsc                 C   s   | j | d S rH   rM  r(  r.   r.   r/   r'    s    z4Gemma3ForSequenceClassification.set_input_embeddingsN)rG   r.  r   r   r3   r   r  r  r   r   r   c
              
   K   s8  | j |f|||||||	d|
}|j}| |}|durF|jd }n
|jd }| jjjdu rn|dkrntd| jjjdu rd}nd|dur|| jjjk|j	t
j}t
j|jd |j	t
jd}|| d}nd}t| jj d |t
j||j	d	|f }d}|dur | j|||| jd
}t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   r.  r   r3   r   r  r   Nr   r"   z=Cannot handle batch sizes > 1 if no padding token is defined.rf   )rz   rL   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   )r2   r  pooled_logitsrS   r  )r  r   rR  rl   rS   r  r   r   rJ   rz   r+   Zint32r   Zargmaxr   r   rF   r'   r	  r   r3   r4   r5   )rD   rG   r.  r   r   r3   r   r  r  r   r   Ztransformer_outputsr4   r2   r  Zlast_non_pad_tokenZnon_pad_maskZtoken_indicesrS  r1   r.   r.   r/   rI     sR    	


z'Gemma3ForSequenceClassification.forward)	NNNNNNNNN)r'   r(   r)   rD  rA   r&  r'  r   r   r+   r   r   r,   rO   r   r   r   r   r   rI   rP   r.   r.   rE   r/   rQ    s>   	         rQ  )r   r   r   rE  r"  rQ  )Nr"   )r   NN)Ur   collections.abcr   dataclassesr   typingr   r   r+   Ztorch.nnrW   Zactivationsr   Zcache_utilsr   r	   Zconfiguration_utilsr
   Z
generationr   Zmasking_utilsr   r   r   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   r   Zutils.deprecationr   Zutils.genericr   autor!   Zconfiguration_gemma3r#   r$   Z
get_loggerr'   r   r%   r0   Z	Embeddingr8   ModulerQ   r_   ro   r   r   rO   rM   r   rN   r7   r   r   r   r   r   r   r   r!  r"  rE  rQ  __all__r.   r.   r.   r/   <module>   s   
$
   #QB `%! K t^