a
    h                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 d dlZddlmZmZ ddlmZmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z; ddl<m=Z= e(>e?Z@G dd de,eZAG dd deZBG dd de;ZCG dd de8ZDG dd de
jEZFG dd  d e0ZGG d!d" d"e3ZHG d#d$ d$e4ZIG d%d& d&e.ZJG d'd( d(eZKdZLG d)d* d*e2ZMG d+d, d,e1ZNG d-d. d.e/ZOG d/d0 d0e
jPZQeejR eejR eSee d1d2d3ZTG d4d5 d5e:ZUG d6d7 d7e9ZVG d8d9 d9eMZWg d:ZXdS );    N)Callable)AnyOptionalUnion   )CacheDynamicCache)PretrainedConfiglayer_type_validation)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPast SequenceClassifierOutputWithPast)rope_config_validation)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaligemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPast)SiglipVisionConfigc                   @   s8   e Zd ZdZdZdddZedd Zejdd ZdS )Gemma3TextConfigaN   
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In Gemma3Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    Zgemma3_text@   	   $              gelu_pytorch_tanh   {Gz?ư>Tr      r       .AF           N     @c                    s   t jf ||||d| | _|	 _| _| _| _| _| _| _	|
 _
| _| _| _| _| _| _| _| _| _| _| _| _| _t  |dd _ jd u r؇ fddt jD  _t j d S )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingssliding_window_pattern   c                    s&   g | ]}t |d   j rdndqS )r7   sliding_attentionfull_attention)bool_sliding_window_pattern).0iself e/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/gemma3/modular_gemma3.py
<listcomp>   s   z-Gemma3TextConfig.__init__.<locals>.<listcomp>)r	   __init__
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshead_dimnum_key_value_headsinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_biasattention_dropouthidden_activationquery_pre_attn_scalarsliding_windowfinal_logit_softcappingattn_logit_softcappinglayer_typesrope_local_base_freqrope_scalingr   getrE   ranger
   )rI   rN   rP   rQ   rR   rS   rU   rT   r\   rO   rV   rW   rX   r<   r>   r=   r?   rY   rZ   r[   r]   r^   ra   r_   r`   rc   rb   kwargsrJ   rH   rK   rM      sJ    

zGemma3TextConfig.__init__c                 C   s   t dt | jS )NzTThe `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.)warningswarnFutureWarningrE   rH   rJ   rJ   rK   r@      s
    z'Gemma3TextConfig.sliding_window_patternc                 C   s
   || _ d S N)rE   rI   valuerJ   rJ   rK   r@     s    )r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   Tr   r7   r   Tr8   Fr9   r2   r:   NNNNr;   )	__name__
__module____qualname____doc__
model_typerM   propertyr@   setterrJ   rJ   rJ   rK   r+   <   sB   t                          
H
r+   c                	       sv   e Zd ZdZdZddddZeedZde	e
eeeef f  e	e
eeeef f  eeeeed fddZ  ZS )Gemma3Configa  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Zgemma3image_token_indexboi_token_indexeoi_token_index)image_token_idZboi_token_idZeoi_token_id)text_configvision_configNr2         r5   )ry   rz   mm_tokens_per_imagerv   rw   ru   rV   c           	         s   |d u rt  }td nt|tr2t f i |}t|trLtf i |}n|d u rdt }td || _|| _|| _|| _	|| _
|| _|| _t jf i | d S )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.)r+   loggerinfo
isinstancedictr*   ry   rz   r~   rv   rw   ru   rV   superrM   )	rI   ry   rz   r~   rv   rw   ru   rV   rf   	__class__rJ   rK   rM   D  s$    


zGemma3Config.__init__)NNr2   r{   r|   r}   r5   )rm   rn   ro   rp   rq   Zattribute_mapr+   r*   Zsub_configsr   r   r   strr   intfloatrM   __classcell__rJ   rJ   r   rK   rt     s2   0       rt   c                   @   s   e Zd ZdS )Gemma3ModelOutputWithPastNrm   rn   ro   rJ   rJ   rJ   rK   r   f  s   r   c                   @   s   e Zd ZdS )Gemma3CausalLMOutputWithPastNr   rJ   rJ   rJ   rK   r   j  s   r   c                       sB   e Zd ZdZd	eeeed fddZejd fddZ	  Z
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?)num_embeddingsembedding_dimpadding_idxembed_scalec                    s*   t  ||| | jdt|dd d S )Nr   F)
persistent)r   rM   Zregister_buffertorchZtensor)rI   r   r   r   r   r   rJ   rK   rM   s  s    z&Gemma3TextScaledWordEmbedding.__init__)	input_idsc                    s   t  || j| jj S rj   )r   forwardr   toweightdtype)rI   r   r   rJ   rK   r   w  s    z%Gemma3TextScaledWordEmbedding.forward)r   )rm   rn   ro   rp   r   r   rM   r   Tensorr   r   rJ   rJ   r   rK   r   n  s   r   c                       s"   e Zd Zed fddZ  ZS )	Gemma3MLPconfigc                    s   t  | d S rj   r   rM   rI   r   r   rJ   rK   rM   |  s    zGemma3MLP.__init__rm   rn   ro   r+   rM   r   rJ   rJ   r   rK   r   {  s   r   c                       s&   e Zd Zdeed fddZ  ZS )Gemma3RMSNormr6   dimepsc                    s   t  j||d d S )Nr   r   )rI   r   r   r   rJ   rK   rM     s    zGemma3RMSNorm.__init__)r6   )rm   rn   ro   r   r   rM   r   rJ   rJ   r   rK   r     s   r   c                       s$   e Zd Zded fddZ  ZS )Gemma3RotaryEmbeddingNr   c                    s   t  | d S rj   r   )rI   r   devicer   rJ   rK   rM     s    zGemma3RotaryEmbedding.__init__)Nr   rJ   rJ   r   rK   r     s   r   c                       s   e Zd Zeed fddZedddddejeje	ej e	e
 e	ej ee eeje	ej e	eej  f d	d
dZ  ZS )Gemma3Attentionr   	layer_idxc                    sX   |j | dk| _t || | jr*|jnd | _t|j|jd| _t|j|jd| _	d S )NrB   r   )
ra   
is_slidingr   rM   r^   r   rT   rW   q_normk_normrI   r   r   r   rJ   rK   rM     s
    zGemma3Attention.__init__past_key_valuepast_key_values4.58new_nameversionN)hidden_statesposition_embeddingsattention_maskr   cache_positionrf   returnc                 K   s<  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| |	}	| |
}
|\}}t	|	|
||\}	}
|d ur|||d}|
|
|| j|\}
}t}| jjdkrt| jj }|| |	|
||f| jr| jnd| j| jd|\}}|jg |dR   }| |}||fS )Nr7   r   )sincosr   eagerr9   )Zdropoutscalingr^   )shaperT   Zq_projview	transposeZk_projZv_projr   r   r$   updater   r%   r   Z_attn_implementationr   trainingr[   r   r^   reshape
contiguousZo_proj)rI   r   r   r   r   r   rf   Zinput_shapeZhidden_shapeZquery_statesZ
key_statesZvalue_statesr   r   Zcache_kwargsZattention_interfaceZattn_outputZattn_weightsrJ   rJ   rK   r     s>    


	

zGemma3Attention.forward)NN)rm   rn   ro   r+   r   rM   r   r   r   r   r   
LongTensorr   r   tupler   r   rJ   rJ   r   rK   r     s   	  r   c                       s   e Zd Zeed fddZedddddejejeje	ej e	ej
 e	e e	e e	e e	ej
 eeje	eejejf  f d

ddZ  ZS )Gemma3DecoderLayerr   c                    s   t    || _|j| _|| _|j| | _t||d| _t	|| _
t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _d S )Nr   r   )r   rM   r   rP   r   ra   attention_typer   	self_attnr   mlpr   rW   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   r   rJ   rK   rM     s    

zGemma3DecoderLayer.__init__r   r   r   r   NF)
r   position_embeddings_globalposition_embeddings_localr   position_idsr   output_attentionsrX   r   r   c
                 K   s   |}|  |}| jjr|}n|}| jf ||||||||	d|
\}}| |}|| }|}| |}| |}| |}|| }|f}|r||f7 }|S )N)r   r   r   r   r   r   rX   r   )r   r   r   r   r   r   r   )rI   r   r   r   r   r   r   r   rX   r   rf   Zresidualr   Zself_attn_weightsoutputsrJ   rJ   rK   r     s8    
	





zGemma3DecoderLayer.forward)NNNFFN)rm   rn   ro   r+   r   rM   r   r   r   r   r   r   rD   r   FloatTensorr   r   rJ   rJ   r   rK   r     s(         r   c                   @   s    e Zd ZdZg dZdd ZdS )Gemma3PreTrainedModel )r   ZSiglipVisionEmbeddingsZSiglipEncoderLayerZ#SiglipMultiheadAttentionPoolingHeadc                 C   s&   t | | t|tr"|jj  d S rj   )r   _init_weightsr   Gemma3MultiModalProjectormm_input_projection_weightdataZzero_)rI   modulerJ   rJ   rK   r     s    
z#Gemma3PreTrainedModel._init_weightsN)rm   rn   ro   base_model_prefixZ_no_split_modulesr   rJ   rJ   rJ   rK   r     s   r   c                       s   e Zd ZU eed< ed fddZd	eej eej	 eej ee
 eej ee ee ee eej ee edddZ  ZS )
Gemma3TextModelr   r   c                    sX   t  | t|j|j| j| jjd d| _t	|}|j
|_ddi|_t|d| _d S )N      ?)r   Z	rope_typedefaultr   )r   rM   r   rN   rP   r   r   embed_tokenscopydeepcopyrb   rY   rc   r   rotary_emb_localr   r   rJ   rK   rM     s    

zGemma3TextModel.__init__N)r   r   r   r   inputs_embedsrX   r   output_hidden_statesr   rf   r   c
                 K   s  |d ur|n| j j}|d ur |n| j j}|d ur4|n| j j}|d u |d uA rTtd| jrr| jrr|rrtd d}|d u r| 	|}|r|d u r| jst
| j d}|	d u r|d ur| nd}tj|||jd  |jd}	|d u r|	d}t| }ts.| j |||	||d}tf i |tf i |d	}|}| ||}| ||}|rTd
nd }|rbd
nd }| jd | j j D ]Z}|r||f7 }||f||||j |||||	d|
}|d }|rx||d f7 }qx| |}|r||f7 }t||||dS )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   r7   r   r   input_embedsr   r   r   r   rC   rB   rJ   )r   r   r   r   r   r   rX   r   )last_hidden_stater   r   
attentions)r   r   r   rX   
ValueErrorZgradient_checkpointingr   r   warning_oncer   r   get_seq_lengthr   aranger   r   Z	unsqueezer   r   r   r   Z
rotary_embr   ZlayersrR   r   Znormr   )rI   r   r   r   r   r   rX   r   r   r   rf   past_seen_tokenscausal_mask_mappingmask_kwargsr   r   r   Zall_hidden_statesZall_self_attnsZdecoder_layerZlayer_outputsrJ   rJ   rK   r   ,  s    






zGemma3TextModel.forward)	NNNNNNNNN)rm   rn   ro   r+   __annotations__rM   r   r   r   r   r   r   rD   r   r   r   r   r   rJ   rJ   r   rK   r     s0   
         r   c                       s0   e Zd ZU eed< dZed fddZ  ZS )Gemma3ForCausalLMr   language_modelr   c                    s   t  | t|| _d S rj   )r   rM   r   modelr   r   rJ   rK   rM     s    zGemma3ForCausalLM.__init__)rm   rn   ro   r+   r   r   rM   r   rJ   rJ   r   rK   r     s   
r   c                       s2   e Zd Zed fddZejdddZ  ZS )r   r   c                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )Nr   r   )kernel_sizeZstride)r   rM   nn	Parameterr   Zzerosrz   rP   ry   r   r   Zlayer_norm_epsmm_soft_emb_normr   Z
image_sizeZ
patch_sizepatches_per_imager~   Ztokens_per_sider   Z	AvgPool2davg_poolr   r   rJ   rK   rM     s    
z"Gemma3MultiModalProjector.__init__)vision_outputsc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )Nr7   r   )r   r   r   r   r   r   flattenr   r   matmulr   Ztype_as)	rI   r   
batch_size_Z
seq_lengthZreshaped_vision_outputsZpooled_vision_outputsZnormed_vision_outputsZprojected_vision_outputsrJ   rJ   rK   r     s    


z!Gemma3MultiModalProjector.forward)	rm   rn   ro   rt   rM   r   r   r   r   rJ   rJ   r   rK   r     s   r   )token_type_idsimage_group_idstokens_per_imager   c                    s,   du rdS t t t t td fdd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    N)	batch_idxhead_idxq_idxkv_idxr   c           	         s   t |jd k |d}| |f }t |jd k |d} | |f }t | jd k |d}| |f dk|dk@ } | |f |k}||@ S )Nr7   r   r   )r   wherer   )	r  r  r	  r
  Zsafe_idxZtoken_type_ids_at_kv_idxZimage_group_ids_at_kv_idxZis_image_blockZsame_image_blockr  r  rJ   rK   
inner_mask  s    z0token_type_ids_mask_function.<locals>.inner_mask)r   rD   )r  r  r  r  rJ   r  rK   token_type_ids_mask_function  s    
r  c                   @   s   e Zd ZdZejejdddZdd Zee	dej
ejeej eej
 eeeej ef  eej
 eej
 eej eej
 ee ee ee ee eeef dd	d
ZdS )Gemma3ModelF)pixel_valuesr   c                 C   s   | j |dj}| |}|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r  )Zvision_towerr   Zmulti_modal_projector)rI   r  r   image_featuresrJ   rJ   rK   get_image_features  s    

zGemma3Model.get_image_featuresc                 K   s   t dd S NzWe don't want to inherit itAttributeErrorrI   Zsuper_kwargsrJ   rJ   rK   _update_causal_mask  s    zGemma3Model._update_causal_maskN)r   r  r   r   r   r  r   r   labelsrX   r   r   return_dictr   c                 K   sT  |d u |d uA rt d|d ur$|n| jj}|d ur8|n| jj}|d urL|n| jj}|d ur| jj| jkr|| jjk}| }d||< n|}|d u r|  |}|d u r|d ur|	 nd}t
j|||jd  |jd}|d ur| |}||j|j}| j|||d}|||}t| }ts| j |||||d}|d ur|jd dkr|dk|j}|tjj|dddd d d d	f  @ }t
j| dd
d }t
||t
j|d	|jd}t||j|| jj|d< tf i |t f i |d}| j!f |||||
||d|d	|}t"|j#|
r4|j$nd |j%|j&|d urL|nd dS )Nr   r   r7   r   )r   r  r   r7   r   rl   r   r   or_mask_functionr   T)	r   r   r   r   rX   r   r   r  r   )r   r   r   r   image_hidden_states)'r   r   r   r   use_return_dictrx   rN   cloneget_input_embeddingsr   r   r   r   r   r  r   r   Zget_placeholder_maskZmasked_scatterr   r   get_text_configr   
functionalpadcumsumr   r  	full_liker  r~   r   r   r   r   r   r   r   r   )rI   r   r  r   r   r   r  r   r   r  rX   r   r   r  	lm_kwargsZspecial_image_maskZllm_input_idsr   r  r   r   is_imagenew_image_startr  r   rJ   rJ   rK   r     s    


(
zGemma3Model.forward)NNNNNNNNNNNNN)rm   rn   ro   Zaccepts_loss_kwargsr   r   r  r  r   r   r   r   r   r   listr   rD   r   r   r   rJ   rJ   rJ   rK   r    sD                
r  c                       s   e Zd Zedejejeej eej ee	e
ej ef  eej eej eej eej ee ee ee ee e	eejf e	eef dddZd fdd	Zd	d
 Zedeejeej ejee eej eej edddZ  ZS )Gemma3ForConditionalGenerationNr   )r   r  r   r   r   r  r   r   r  rX   r   r   r  logits_to_keepr   c                 K   s  |dur|n| j j}|dur |n| j j}|dur4|n| j j}| jf ||||||||
|	||||d|}|d }t|trt| dn|}| |dd|ddf }d}|	dur|	 }|dddddf }|	dddf }|durB|dd|j
d  df |j}|||jdk  }|||jdk  }n| }| }t }|d| j jj}|d|j}|||}|s|f|dd  }|dur|f| S |S t|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        N)r   r  r  r   r   r   r   rX   r  r   r   r  r   r   .r   r7   )losslogitsr   r   r   r  )r   r   r   r  r   r   r   sliceZlm_headr   r   r   r   r   r   ZCrossEntropyLossr   ry   rN   r   r   r   r   r  )rI   r   r  r   r   r   r  r   r   r  rX   r   r   r  r,  r'  r   r   Zslice_indicesr.  r-  Zshift_logitsZshift_labelsZshift_attention_maskZloss_fctZflat_logitsZflat_labelsoutputrJ   rJ   rK   r   h  sd    @

$
z&Gemma3ForConditionalGeneration.forwardTc                    s>   t  j|f||||||	|
|d|}|d dkr:||d< |S )N)r   r   r   r   r   rX   r,  r  r   r  )r   prepare_inputs_for_generation)rI   r   r   r   r   r   r  r   r  rX   r,  r  rf   Zmodel_inputsr   rJ   rK   r1    s"    
z<Gemma3ForConditionalGeneration.prepare_inputs_for_generationc                 K   s   t dd S r  r  r  rJ   rJ   rK   5_prepare_4d_causal_attention_mask_with_cache_position  s    zTGemma3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position)r   r   r   r   r   r   r  r   c                 K   s   |   |||||d}|d ur|jd dkr|dk|j}	|	tjj|	dddd d d df  @ }
tj|
	 ddd }t
|	|t|d}t||j|| j|d< tf i |S )	Nr   r7   r  r   r  r   r  r  )r"  r   r   r   r   r#  r$  r   r%  r   r  r&  r  r~   r   )r   r   r   r   r   r   r  rf   r   r(  r)  r  rJ   rJ   rK   r     s     	(z8Gemma3ForConditionalGeneration.create_masks_for_generate)NNNNNNNNNNNNNr   )
NNNNNNNTNN)N)rm   rn   ro   r   r   r   r   r   r   r   r*  r   rD   r   r   r   r   r1  r2  staticmethodr	   r   r   r   rJ   rJ   r   rK   r+  g  sr                 
           $ r+  c                       s   e Zd ZddddZ fddZdd Zd	d
 Zeede	j
ee	j ee	j ee	j
 ee ee	j ee	j
 ee	j
 ee ee edddZ  ZS )Gemma3ForSequenceClassificationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projector)z^language_model.modelz^vision_towerz^multi_modal_projectorc                    sB   t  | |j| _t|| _tj|jj| jdd| _	| 
  d S )NF)Zbias)r   rM   Z
num_labelsr  r   r   ZLinearry   rP   scoreZ	post_initr   r   rJ   rK   rM   :  s
    
z(Gemma3ForSequenceClassification.__init__c                 C   s
   | j  S rj   )r   r!  rH   rJ   rJ   rK   r!  C  s    z4Gemma3ForSequenceClassification.get_input_embeddingsc                 C   s   | j | d S rj   )r   set_input_embeddingsrk   rJ   rJ   rK   r6  F  s    z4Gemma3ForSequenceClassification.set_input_embeddingsN)r   r  r   r   r   r   r  r  rX   rf   r   c
              
   K   s8  | j |f|||||||	d|
}|j}| |}|durF|jd }n
|jd }| jjjdu rn|dkrntd| jjjdu rd}nd|dur|| jjjk|j	t
j}t
j|jd |j	t
jd}|| d}nd}t| jj d |t
j||j	d	|f }d}|dur | j|||| jd
}t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   r  r   r   r   r  rX   Nr   r7   z=Cannot handle batch sizes > 1 if no padding token is defined.r   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   )r.  r  pooled_logitsr   )r-  r.  r   r   r   )r   r   r5  r   r   ry   r<   r   r   r   r   Zint32r   Zargmaxr   r   r   rm   Zloss_functionr   r   r   r   )rI   r   r  r   r   r   r   r  r  rX   rf   Ztransformer_outputsr   r.  r  Zlast_non_pad_tokenZnon_pad_maskZtoken_indicesr7  r-  rJ   rJ   rK   r   I  sR    	


z'Gemma3ForSequenceClassification.forward)	NNNNNNNNN)rm   rn   ro   Z_checkpoint_conversion_mappingrM   r!  r6  r   r   r   r   r   r   r   r   rD   r   r   r   r   r   rJ   rJ   r   rK   r4  3  s>   	         r4  )rt   r+   r   r   r   r+  r  r4  )Yr   rg   collections.abcr   typingr   r   r   r   Ztorch.nnr   Ztorch.utils.checkpointZcache_utilsr   r   Zconfiguration_utilsr	   r
   Zmasking_utilsr   r   r   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   Zmodeling_rope_utilsr   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   Zutils.deprecationr   Zgemma2.configuration_gemma2r   Zgemma2.modeling_gemma2r   r   r   r    r!   r"   r#   r$   r%   Zpaligemma.modeling_paligemmar&   r'   r(   r)   Zsiglipr*   Z
get_loggerrm   r   r+   rt   r   r   Z	Embeddingr   r   r   r   r   r   ZGEMMA3_START_DOCSTRINGr   r   r   Moduler   r   r   r  r  r+  r4  __all__rJ   rJ   rJ   rK   <module>   sb   ,
 M^;B~	%!  M^