a
    hT                    @   s  d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlmZ d dlm  mZ ddlmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7 e.8e9Z:ee,ddG dd deZ;ee,ddG dd de*Z<G dd dej=Z>G dd  d ej=Z?G d!d" d"ej=Z@G d#d$ d$ej=ZAG d%d& d&ej=ZBG d'd( d(ej=ZCG d)d* d*ej=ZDG d+d, d,ej=ZEG d-d. d.ej=ZFG d/d0 d0ej=ZGG d1d2 d2e&ZHG d3d4 d4ejIZJG d5d6 d6ej=ZKG d7d8 d8ej=ZLG d9d: d:ej=ZMG d;d< d<ej=ZNd=d> ZOe
jPeQe
jPd?d@dAZRd^ej=e
jPe
jPe
jPee
jP eSeeS eeS eTe
jPe
jPf dC	dDdEZUd_e
jPe
jPe
jPee
jP eQdFdGdHZVG dIdJ dJej=ZWG dKdL dLeZXe,G dMdN dNe&ZYe,dOdG dPdQ dQeYZZe,dRdG dSdT dTeYeZ[G dUdV dVej=Z\e,dWdG dXdY dYeYZ]e,dZdG d[d\ d\eYeZ^g d]Z_dS )`    N)CallableSequence)	dataclass)OptionalUnion   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg   )	AutoModel   )Gemma3nAudioConfigGemma3nConfigGemma3nTextConfigGemma3nVisionConfigzL
    Base class for Gemma3n outputs, with hidden states and attentions.
    )Zcustom_introc                   @   s6   e Zd ZU dZdZeej ed< dZ	eej ed< dS )Gemma3nModelOutputWithPasta   
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nimage_hidden_statesaudio_hidden_states)
__name__
__module____qualname____doc__r%   r   torchFloatTensor__annotations__r&    r.   r.   h/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/gemma3n/modeling_gemma3n.pyr$   3   s   
r$   zS
    Base class for Gemma3n causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeeej ef  ed< dZeeej  ed< dZeeej  ed< dZeej ed< dZeej ed	< dS )
Gemma3nCausalLMOutputWithPasta  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr%   r&   )r'   r(   r)   r*   r1   r   r+   r,   r-   r2   r3   r   listr	   r4   tupler5   r%   r&   r.   r.   r.   r/   r0   N   s   
r0   c                       sL   e Zd Zdeeed fddZdd Zej	ej	dd	d
Z
dd Z  ZS )Gemma3nRMSNormư>Tdimeps
with_scalec                    sJ   t    || _|| _| jr0tt|| _n| j	dt
ddd d S )Nweight      ?F
persistent)super__init__r<   r=   nn	Parameterr+   onesr>   register_buffertensor)selfr;   r<   r=   	__class__r.   r/   rC   s   s    
zGemma3nRMSNorm.__init__c                 C   s$   |t |djddd| j  S )Nr   T)keepdim)r+   sqrtpowmeanr<   )rI   xr.   r.   r/   _norm}   s    zGemma3nRMSNorm._normrQ   returnc                 C   s"   |  | | j  }||S N)rR   floatr>   type_as)rI   rQ   outputr.   r.   r/   forward   s    zGemma3nRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r7   r>   shaper<   rI   r.   r.   r/   
extra_repr   s    zGemma3nRMSNorm.extra_repr)r9   T)r'   r(   r)   intrV   boolrC   rR   r+   TensorrY   r\   __classcell__r.   r.   rJ   r/   r8   r   s   
r8   c                	       sr   e Zd Zed fddZejejejdddZeje	e	e	e	e	e	ejddd	Z
ejejejd
ddZ  ZS )%Gemma3nAudioRelativePositionEmbeddingconfigc                    s   t    || _| jj| _| jj| _| j| j | _td| jj	d | _
| jj| _tj| j| j| j dd| _d}d}| jd }tt|t| t|d d }|tt||   }| jd| dddd	 d S )
Nr   r   Fbiasr?   g     @r   inv_timescalesr@   )rB   rC   rc   conf_num_attention_heads	num_headshidden_sizeZchannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardrD   Linearpos_projmathlogrV   r+   exparangerG   	unsqueeze)rI   rc   Zmin_timescaleZmax_timescaleZnum_timescalesZlog_timescale_incrementrf   rJ   r.   r/   rC      s$    




$z.Gemma3nAudioRelativePositionEmbedding.__init__)positiondtyperT   c                 C   sN   |  d}|| jj|jtjd }tjt|t	|gdd}|
|S )NrL   )devicerx   r;   )rV   rv   rf   tory   r+   float32catsincostype)rI   rw   rx   Zscaled_timeZtiming_signalr.   r.   r/   _get_timing_signal_1d_pos   s    z?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_pos)term_bd_before_shift
batch_sizerh   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1rT   c                 C   sx   |d | }d|f}	t j||	}
|
|||||d  f}|ddddddd|| f }||||||f}|S )aZ  Performs the relative shift.

        Args:
          term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
            (B), num_heads (N), num_query_blocks (U), query_block_size (W),
            key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

        Returns:
          Tensor of shape [B, N, U, W, C].
        r   r   N)rD   
functionalpadreshape)rI   r   r   rh   r   r   r   r   Zpad_amount_last_dimZpadding_tupleZterm_bd_paddedZterm_bd_reshapedZterm_bd_slicedterm_bd_shiftedr.   r.   r/   _relative_shift   s(    

$	z5Gemma3nAudioRelativePositionEmbedding._relative_shift)querieskeysrT   c              	   C   s"  |j \}}}}}|j \}}}	}}tj| j| j d d|jdd}
|
j d }| j|
|jd}| 	|}|
d|| j| jd}|ddddd}|ddddd}t||}|ddddd}|ddd}|
|||| |}t||}|
|||||}| ||||||	|}|| S )	Nr   rL   ry   r   rx   r   r      )rZ   r+   ru   rm   ro   ry   rv   r   rx   rq   r   rh   rj   Zsqueezepermutematmulr   )rI   r   r   r   r   r   rh   rj   _r   Zpos_indicesr   Zsin_emb_timing_signalZprojected_sin_embZsin_embZ	queries_pZkeys_p_tZterm_acZ
q_permutedZ
s_permutedZ
q_reshapedZterm_bd_unshifed_matmulZterm_bd_unshifedr   r.   r.   r/   rY      sJ    

		
z-Gemma3nAudioRelativePositionEmbedding.forward)r'   r(   r)   r    rC   r+   r_   rx   r   r]   r   rY   r`   r.   r.   rJ   r/   ra      s   =ra   c                       sz   e Zd Zed fddZejeeejdddZejejddd	Z	ejejdd
dZ
ejejejdddZ  ZS )Gemma3nAudioAttentionrb   c                    s  t    || _| jj| _| jj| _| j| j | _| jj| _| jj	| _
td| jjd | _| jj| _| j| j | j
 | _t|| _tt| jf| _tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j| j dd| _| jd }dtjjtd }| jd||   ! dd	 tj"tj#| j| jftj$d
ddj%}tj"tj#| j| jftj$d
| j| j
 d}tj#| j| jftj$d
}|| | }| jd|dd	 | jdt| j& dd	 d S )Nr   r   Frd         r?           q_scaler@   r   )Zdiagonallocal_causal_valid_masksoftcap)'rB   rC   rc   rg   rh   ri   rj   Zconf_attention_chunk_size
chunk_sizern   max_future_horizonrk   rl   max_past_horizonZconf_attention_logit_capZattention_logits_soft_capcontext_sizera   relative_position_embeddingrD   rE   r+   zerosper_dim_scalerp   q_projk_projv_projr   softplusrH   rG   clonedetachZtrilrF   r^   TrV   )rI   rc   r   Zr_softplus_0Zlower_causal_maskZupper_causal_maskr   rJ   r.   r/   rC   8  sD    








zGemma3nAudioAttention.__init__)rQ   pad_left	pad_rightrT   c           	      C   sL   |j ^}}}|||g|R }|||g|R }tj|||gdd}|S )Nr   rz   )rZ   Z	new_zerosr+   r}   )	rI   rQ   r   r   batchr   Z
tail_shapeleftrightr.   r.   r/   	_pad_dim1c  s
    zGemma3nAudioAttention._pad_dim1r4   rT   c                 C   sx   |j }|dd \}}|| j d | j }|| j |  }dkrN| |d|}||| jf|dd  }|| }|S )aE  Turns a sequence to non overlapping blocks.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, block_size, ...], with necessary
            paddings,
            where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
        Nr   r   r   )rZ   r   r   r   
contiguous)rI   r4   rZ   btZ
num_blocksZpadding_lenZpermute_dimsr.   r.   r/   _convert_to_blockj  s    z'Gemma3nAudioAttention._convert_to_blockc                 C   sl   | j }| j| j d }| |||}| j}| j}|jd||d}|jdkrd|jdkrdtj|ddd}|	 S )a  Extracts temporal context for every block.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, context_size, ...], with necessary
            paddings,
            where context_size = block_size + left_context + right_context,
            and output[:, i, ...] are x[:, start-left_context:end+right_context,
            ...],
            start = i * block_size, end = (i + 1) * block_size.
        r   )	dimensionsizestepr   r   rL   )sourceZdestination)
r   r   r   r   r   Zunfoldndimr+   Zmovedimr   )rI   r4   r   r   Z	frame_lenZ
frame_stepZ
x_unfoldedr.   r.   r/   _extract_block_context  s    z,Gemma3nAudioAttention._extract_block_context)r4   maskrT   c           "   
   C   s  g |j d d | j| jR }| || }| || }| || }tj	j
| j}ddd| jf}||}	|| j |	 }|j d d \}
}| |}| |}| |}|j d }| }| |}|jdkr|j d |j d  | jkr||
|| j}|j |
|| jfkrNtd|j  d|
 d| d| j d		|dd
}| jddd}t|||j}| ||}| j|j}|| }t|}|| }t||t|jj}tj	j
j |dtj!dj|jd}|j \}}}}}|j d }|"dddddd||}|"dddddd||}t#||} | |||||"ddddd}!|!|
|| j$ | j| jf}!|!d d d |f }!|!S )NrL   r   r   r   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   r;   rx   r   )%rZ   rh   rj   r   r   r   r   r   r+   rD   r   r   r   viewr   r   r   r   r   
ValueErrorrv   r   logical_andr{   ry   r   r   tanhwhereZfinforx   minsoftmaxr|   r   Zbmmr   )"rI   r4   r   Z	qkv_shapequery_states
key_statesvalue_statesZper_dim_scale_spZbroadcast_shapeZper_dim_scale_sp_broadcastr   Zq_timeZquery_blocksZ
key_blocksZvalue_blocksr   Zoriginal_valid_maskZextracted_valid_mask_blocksZcondition_from_input_validityZcondition_from_causalityZfinal_condition_for_wherer2   Zsoftcap_valZprobabilitiesZb_dimZn_dimZu_dimZw_dimZc_dimZh_dimZprob_bunZv_bunZ
result_bmmZcontext_vectorsr.   r.   r/   rY     s     





	

 
 zGemma3nAudioAttention.forward)r'   r(   r)   r    rC   r+   r_   r]   r   r   r   
BoolTensorrY   r`   r.   r.   rJ   r/   r   7  s
   +0r   c                       sD   e Zd ZdZd	eee ed fddZej	ej	dddZ
  ZS )
Gemma3nAudioCumulativeGroupNorma  Applies Group Normalization cumulatively over the time dimension.

    This layer normalizes the input by calculating the mean and variance
    cumulatively over the time dimension (dim 1). The statistics are computed
    over all feature dimensions (specified by `feature_dims` and `num_channels`)
    for elements marked as valid by the optional `mask`.

    If a `mask` is provided (True for valid, False for invalid/padded),
    invalid time steps do not contribute to the statistics calculation, and
    their corresponding output values are zeroed out.

    Scale and bias, if enabled, are applied per-channel (last dimension).
    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
    and `cumulative=True`.
    MbP?num_channelsfeature_dimsr<   c                    sT   t    || _t|| _|| _tt	|| _
ttddt| j d | _d S )Nr   r   )rB   rC   r   r7   r   r<   rD   rE   r+   rF   r>   rangelenreduction_axes)rI   r   r   r<   rJ   r.   r/   rC   (  s    

z(Gemma3nAudioCumulativeGroupNorm.__init__r   c                 C   sL  | j | jf }|jdd |kr>td|jdd  d| |j}tj}||}tj||d}tj	|| j
dd}tj|dd	}tj	|| j
dd}	tj|	dd	}
tj|
d
d}|| }|| d}tj	|| j
dd}tj|dd	}|| }|| t|| j  }| j|}dg| d  | jg }||| }|| }||S )zApplies cumulative group norm, optionally using a mask.

        Args:
          hidden_states: Input tensor, shape [B, T, *feature_dims, C].

        Returns:
          Normalized tensor with the same shape as x.
        r   NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r   Tr;   rM   r   rz   r?   )r   )r   r   rZ   r   rx   r+   r|   r{   Z	ones_likesumr   ZcumsumclamprO   rsqrtr<   r>   r;   r   )rI   r4   Zexpected_input_suffixZinput_dtypeZ
calc_dtypeZx_calcZ	mask_calcZsum_values_at_tZcum_sum_valuesZelements_in_group_at_tZcum_count_elementsZsafe_cum_count_elementsZcum_meanZsquared_diff_from_meanZsum_sq_diff_at_tZcum_sum_sq_diffZcum_varianceZnormalized_xscaleZscale_view_shapeZfinal_outputr.   r.   r/   rY   :  s6    	
z'Gemma3nAudioCumulativeGroupNorm.forward)r   )r'   r(   r)   r*   r]   r   rV   rC   r+   r_   rY   r`   r.   r.   rJ   r/   r     s    r   c                	       sN   e Zd ZdZd	eeeeeeeef d fddZej	ej	dddZ
  ZS )
Gemma3nAudioSSCPConvBlockzA single convolution block for the SubSampleConvProjection.

    This block consists of a 2D convolution, followed by CumulativeGroupNorm,
    and a ReLU activation. It handles manual padding for the convolution.
    r   r   r   r   )rc   idxinput_freq_dimmanual_paddingc                    s   t    || _|| _|dkr"dn| jj|d  }| jj| }| jj| \}}| jj| \}	}
tj||||f|	|
fddd| _	|| jd  | jd  }|| |
 d }t
||f| jjd| _t | _d S )Nr   r   )r   r   F)in_channelsout_channelskernel_sizestridepaddingre   r   )rB   rC   rc   r   sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizerD   ZConv2dconvr   Zsscp_conv_group_norm_epsnormZReLU
activation)rI   rc   r   r   r   r   r   kernel_hkernel_wstride_hstride_wf_in_paddedZ
f_out_convrJ   r.   r/   rC     s2    
z"Gemma3nAudioSSCPConvBlock.__init__audio_encodingsrT   c                 C   sf   t j|| jddd| jjj}| |}|dddd }| 	|}|dddd }| 
|S )NZconstantr   )modevaluer   r   r   r   )Fr   r   r{   r   r>   rx   r   r   r   r   )rI   r   Zaudio_encodings_paddedZaudio_encodings_convZ
x_for_normZx_normedZaudio_encodings_normedr.   r.   r/   rY     s    

z!Gemma3nAudioSSCPConvBlock.forward)r   )r'   r(   r)   r*   r    r]   r7   rC   r+   r_   rY   r`   r.   r.   rJ   r/   r     s    +r   c                       s6   e Zd Zed fddZejejdddZ  ZS )#Gemma3nAudioSubSampleConvProjectionrb   c                    s  t    || _|j}g }g }tdD ]t}|j| \}}|j| \}}	d}
|d }d}d}|||
|f}|| || | }|| |	 d }|| |}q&td|j||d d| _	td|d ||d d| _
|jd }|d }|| | _tj| j| jjdd| _d S )Nr   r   r   )r   r   rc   r   rL   Frd   )rB   rC   rc   Zinput_feat_sizer   r   r   appendr   conv_0conv_1r   Zinput_proj_in_featuresrD   rp   ri   input_proj_linear)rI   rc   Zcurrent_f_for_block_inputZcalculated_block_paddingZcalculated_f_out_dimsir   r   r   r   Z	pad_t_topZpad_t_bottomZ
pad_f_leftZpad_f_rightZmanual_padding_tupler   Zf_out_after_convZfinal_c_outZfinal_f_outrJ   r.   r/   rC     sL    




z,Gemma3nAudioSubSampleConvProjection.__init__r   c                 C   s`   | d}| |}| |}|j\}}}}|dddd }||||| }	| |	}
|
S )Nr   r   r   r   )rv   r   r   rZ   r   r   r   r   )rI   r   audio_encodings_reshapedrQ   r   Zc_outZt_outZf_outZ
x_permutedZoutput_flattenedrX   r.   r.   r/   rY     s    



z+Gemma3nAudioSubSampleConvProjection.forward	r'   r(   r)   r    rC   r+   r_   rY   r`   r.   r.   rJ   r/   r     s   9r   c                       s:   e Zd Zed fddZejejejdddZ  Z	S )Gemma3nAudioConformerAttentionrb   c                    sv   t    || _| jj| _| jdt| jjdd t	| jj| _
t|| _tj| j| jjdd| _t	| jj| _d S )Ngradient_clippingFr@   rd   )rB   rC   rc   ri   Zpost_in_featuresrG   r+   rH   r   r8   pre_attn_normr   attnrD   rp   post	post_normrI   rc   rJ   r.   r/   rC     s    


z'Gemma3nAudioConformerAttention.__init__r   audio_mel_maskrT   c                 C   sz   |}t || j | j}| |}| ||}|j\}}}}	|||||	 }
| |
}t || j | j}|| | S rU   )	r+   r   r   r   r   rZ   r   r   r   )rI   r   r   Zaudio_encodings_input_to_attnZaudio_encodings_normZaudio_encodings_attn_outr   r   rh   rj   r   r.   r.   r/   rY     s    

z&Gemma3nAudioConformerAttention.forward
r'   r(   r)   r    rC   r+   r_   r   rY   r`   r.   r.   rJ   r/   r     s   
r   c                       s6   e Zd Zed fddZejejdddZ  ZS ) Gemma3nAudioConformerFeedForwardrb   c                    s   t    || _| jdt| jjdd t| jj| _	t
j| jj| jjd dd| _t
j| jjd | jjdd| _t| jj| _t| jj| _d S )Nr   Fr@   r   rd   )rB   rC   rc   rG   r+   rH   r   r8   ri   pre_layer_normrD   rp   ffw_layer_1ffw_layer_2post_layer_normZconf_residual_weightpost_layer_scaler   rJ   r.   r/   rC   /  s    
z)Gemma3nAudioConformerFeedForward.__init__r   c                 C   sn   |}t || j | j}| |}| |}tj|}| |}t || j | j}| 	|}||| j
  S rU   )r+   r   r   r  r  rD   r   silur  r  r  )rI   r   Zresidualr.   r.   r/   rY   ;  s    



z(Gemma3nAudioConformerFeedForward.forwardr   r.   r.   rJ   r/   r  .  s   r  c                       s6   e Zd Zed fddZejejdddZ  ZS ) Gemma3nAudioConformerLightConv1drb   c              	      s   t    || _t| jj| jjd| _tj| jj| jjd dd| _	tj
| jj| jj| jjdd| jjdd| _| jdt| jjdd	 t| jj| jjd| _tj| jj| jjdd| _| jjd | _d S )
Nr<   r   Frd   r   r   )r   r   r   r   r   groupsre   r   r@   )rB   rC   rc   r8   ri   rms_norm_epsr  rD   rp   linear_startZConv1dZconf_conv_kernel_sizedepthwise_conv1drG   r+   rH   r   	conv_norm
linear_endcausal_paddingr   rJ   r.   r/   rC   H  s"    
	z)Gemma3nAudioConformerLightConv1d.__init__r   c                 C   s   |}|  |}| |}tjjj|dd}|ddd}t|| j	df}| 
|}|ddd}t|| j | j}| |}tj|}| |}|| }|S )NrL   rz   r   r   r   )r  r  r+   rD   r   Zglur   r   r   r  r  r   r   r  r  r  )rI   r   Zaudio_encodings_residualZaudio_encodings_permutedZaudio_encodings_permuted_paddedrX   r.   r.   r/   rY   ]  s    




z(Gemma3nAudioConformerLightConv1d.forwardr   r.   r.   rJ   r/   r  G  s   r  c                       s:   e Zd Zed fddZejejejdddZ  Z	S )Gemma3nAudioConformerBlockrb   c                    sl   t    || _t| j| _t| j| _t| j| _t| j| _	| j
dt| jjdd t| jj| _d S )Nr   Fr@   )rB   rC   rc   r  ffw_layer_startr   	attentionr  lconv1dffw_layer_endrG   r+   rH   r   r8   ri   r   r   rJ   r.   r/   rC   s  s    
z#Gemma3nAudioConformerBlock.__init__r   c                 C   sh   |  |}| ||}| }||d|j }| |}| |}t|| j	 | j	}| 
|}|S )NrL   )r  r  rv   r{   rx   r  r  r+   r   r   r   )rI   r   r   Zvalidity_mask_for_lconvZaudio_encodings_for_lconv_inputrX   r.   r.   r/   rY   ~  s    



z"Gemma3nAudioConformerBlock.forwardr   r.   r.   rJ   r/   r  r  s   r  c                       sV   e Zd ZU dZeed< dZed fddZej	ej
eej	ej
f ddd	Z  ZS )
Gemma3nAudioEncoderzx
    An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
    rc   	audio_melrb   c                    s@   t     | _t | _t fddt jD | _	d S )Nc                    s   g | ]}t  qS r.   )r  .0r   rb   r.   r/   
<listcomp>      z0Gemma3nAudioEncoder.__init__.<locals>.<listcomp>)
rB   rC   rc   r   subsample_conv_projectionrD   
ModuleListr   Zconf_num_hidden_layers	conformerr   rJ   rb   r/   rC     s    
zGemma3nAudioEncoder.__init__)r  r   rT   c           
      C   s\  |  |}|jd }d}tt| jjD ]}|| jj| d 9 }q(tj||jd| }tj	||jd d d}|j
dkr|j
dkr|d|jd d}n@|j
|j
kr|jd dkr|jd dkr||jd kr|d}t|d|}| jD ]}	|	||}q| jjdkrB|dddd| jjf }|dddd| jjf }||dd}||fS )a  Encodes a batch of MELs.

        Args:
            audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
              mel_bins].

        Returns:
            audio_encodings: a torch.Tensor of shape
                `[batch_size, self.config.audio_soft_tokens_per_image,
                self.config.audio_config.hidden_size]`
            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
        r   r   r   )rk   rL   Nr   )r  rZ   r   r   rc   r   r+   ru   ry   r   r   rv   expandZgatherr  Zconf_reduction_factorZmasked_fill)
rI   r  r   r   Zt_subZtime_stride_productZstride_pair_idxindicesZcurrent_maskblockr.   r.   r/   rY     s4    




zGemma3nAudioEncoder.forward)r'   r(   r)   r*   r    r-   Zmain_input_namerC   r+   r_   r   r7   rY   r`   r.   r.   rJ   r/   r    s   

r  c                       sB   e Zd ZdZd	eeeed fddZejd fddZ	  Z
S )
Gemma3nTextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    r?   )num_embeddingsembedding_dimpadding_idxembed_scalec                    s*   t  ||| | jdt|dd d S )Nr&  Fr@   )rB   rC   rG   r+   rH   )rI   r#  r$  r%  r&  rJ   r.   r/   rC     s    z'Gemma3nTextScaledWordEmbedding.__init__	input_idsc                    s   t  || j| jj S rU   )rB   rY   r&  r{   r>   rx   rI   r(  rJ   r.   r/   rY     s    z&Gemma3nTextScaledWordEmbedding.forward)r?   )r'   r(   r)   r*   r]   rV   rC   r+   r_   rY   r`   r.   r.   rJ   r/   r"    s   r"  c                       s:   e Zd ZdZed fddZejejdddZ  Z	S )Gemma3nTextLaurelBlockz Learned Augmented Residual Layerrb   c                    s^   t    || _tj| jj| jjdd| _tj| jj| jjdd| _t	| jj| jj
d| _d S )NFrd   r	  )rB   rC   rc   rD   rp   ri   Zlaurel_ranklinear_leftlinear_rightr8   r  post_laurel_normr   rJ   r.   r/   rC     s
    
zGemma3nTextLaurelBlock.__init__r   c                 C   s&   |  |}| |}| |}|| S rU   )r+  r,  r-  )rI   r4   Zlaurel_hidden_statesZnormed_laurel_hidden_statesr.   r.   r/   rY     s    


zGemma3nTextLaurelBlock.forward)
r'   r(   r)   r*   r"   rC   r+   r_   rY   r`   r.   r.   rJ   r/   r*    s   r*  c                       sN   e Zd Zdeed fddZejejdddZejejdd	d
Z	  Z
S )Gemma3nTextMLPr   rc   	layer_idxc                    s   t    || _|j| _|j| | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _|j| | _d S NFrd   )rB   rC   rc   ri   Zintermediate_sizerD   rp   	gate_projup_proj	down_projr   hidden_activationact_fnZactivation_sparsity_patternactivation_sparsityrI   rc   r0  rJ   r.   r/   rC     s    
zGemma3nTextMLP.__init__r   c                 C   sD   |  |}| jdkr| |}| |}| |}| || }|S )Nr   )r2  r7  _gaussian_topkr6  r3  r4  )rI   r4   r2  activationsr3  r4  r.   r.   r/   rY     s    




zGemma3nTextMLP.forward)inputsrT   c                 C   sz   t j| jt j|jd}t jjdd}||}|	|j
}t j|ddd}t j|dddd}|||  }tj|| S )	Nrx   ry   r   r   rL   Tr   F)r;   rM   Zunbiased)r+   rH   r7  r|   ry   distributionsnormalZNormalZicdfr   rx   rP   ZstdrD   r   Zrelu)rI   r;  Ztarget_sparsity_tensorZnormal_distZstd_multiplierZinputs_meanZ
inputs_stdZcutoff_xr.   r.   r/   r9    s    
zGemma3nTextMLP._gaussian_topk)r   )r'   r(   r)   r"   r]   rC   r+   r_   rY   r9  r`   r.   r.   rJ   r/   r.    s   	r.  c                       s   e Zd ZdZed fddZejejdddZejejdd	d
Z	ejejejdddZ
ejejdddZejejdddZ  ZS )Gemma3nTextAltUpa  Alternating Updates (AltUp)

    The AltUp module wraps transformer layers. The `predict` step modifies the
    input to the transformer layer, and the `correct` step propagates the output
    of the transformer layer to the sparsely updated dimensions.

    See more in the research paper:

    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
    rb   c                    s   t    || _tt| jj| _tj	| jj
| jj
dd| _tj	| jj
| jj
d dd| _tj	| jj| jj
dd| _t| jj| jjd| _| jdt| jjd dd d S )NFrd   r   r	  router_input_scaleg      r@   )rB   rC   rc   rD   rE   r+   r   ri   correct_output_scalerp   altup_num_inputscorrection_coefsprediction_coefsmodality_routerr8   r  router_normrG   rH   r   rJ   r.   r/   rC   (  s    
zGemma3nTextAltUp.__init__rS   c                 C   s.   |  || j }| |}t| |S rU   )rF  r@  rE  r+   r   rV   rW   )rI   rQ   Zrouter_inputsZroutedr.   r.   r/   compute_router_modalities2  s    
z*Gemma3nTextAltUp.compute_router_modalitiesr   c                 C   s   |  || jj }| jr@| jjdur@| jjj| jj | jj | |j	g |j
dd | jj| jjR  dddd}t|dddd|}|dddd}||7 }| |S )a  Predicts the output of a layer using a trainable map.

        Args:
            hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
        NrL   r   r   r   r   )rG  rc   altup_active_idxtrainingaltup_coef_cliprD  r>   dataclamp_r   rZ   rB  r   r+   r   r   rW   )rI   r4   
modalities	all_coefspredictionsr.   r.   r/   predict7  s"    
zGemma3nTextAltUp.predict)rO  	activatedrT   c                 C   s   |  |}||| jj  }|| jjddd}| jjdurV| jjj	| jj | jj | |d }|
dddd}t||}||7 }| |S )a_  Corrects the predictions relative to the

        Args:
            predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
            activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
                predictions relative to the activated input embeddings.
        r   Nr?   r   r   rL   )rG  rc   rH  repeatrB  rJ  rC  r>   rK  rL  r   rv   r+   mulr   rW   )rI   rO  rQ  rM  Z
innovationrN  	correctedr.   r.   r/   correctS  s    
zGemma3nTextAltUp.correct)rT  rT   c                 C   s   | | j| j  |S )a	  
        This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
        (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
        `scale_corrected_output`
        )rW   rA  rI   rT  r.   r.   r/   rY   p  s    zGemma3nTextAltUp.forwardc                 C   s
   |  |S )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)rY   rV  r.   r.   r/   scale_corrected_outputx  s    z'Gemma3nTextAltUp.scale_corrected_output)r'   r(   r)   r*   r"   rC   r+   r_   rG  rP  rU  rY   rW  r`   r.   r.   rJ   r/   r?    s   
r?  c                       sD   e Zd ZU ejed< ded fddZe e	dd Z
  ZS )	Gemma3nTextRotaryEmbeddinginv_freqNrb   c                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typer   defaultrY  Fr@   )rB   rC   hasattr
isinstancerZ  dictgetr[  Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenrc   r   Zrope_init_fnattention_scalingrG   rY  Zoriginal_inv_freq)rI   rc   ry   rY  rJ   r.   r/   rC     s    
z#Gemma3nTextRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|ddV | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   rL   r   ZmpscpuF)device_typeZenabledr   rz   r   )rY  rV   r  rZ   r{   ry   r^  r   strr+   Zautocast	transposer}   r   ra  r~   rx   )
rI   rQ   position_idsZinv_freq_expandedZposition_ids_expandedrc  ZfreqsZembr   r~   r.   r.   r/   rY     s    0&,z"Gemma3nTextRotaryEmbedding.forward)N)r'   r(   r)   r+   r_   r-   r"   rC   Zno_gradr   rY   r`   r.   r.   rJ   r/   rX  }  s
   

rX  c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrL   r   rz   )rZ   r+   r}   )rQ   x1Zx2r.   r.   r/   rotate_half  s    rh  )r4   n_reprT   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rZ   r  r   )r4   ri  r   num_key_value_headsslenrj   r.   r.   r/   	repeat_kv  s
    0rl  r   )	modulequerykeyr   attention_maskdropoutscalingr   rT   c                 K   s   |d u r| j d }t|| j}	t|| j}
t||	dd| }|d urd|| }t|}|| }|d ur|d d d d d d d |	jd f }|| }tj	j
|dtjd|j}tj	j||| jd}t||
}|dd }||fS )	Nr   r   r   r   rL   r   )prI  r   )rj   rl  num_key_value_groupsr+   r   re  r   rZ   rD   r   r   r|   r{   rx   rq  rI  r   )rm  rn  ro  r   rp  rq  rr  r   kwargsr   r   attn_weightscausal_maskattn_outputr.   r.   r/   eager_attention_forward  s"    

&ry  rQ   r   r~   rf  unsqueeze_dimc                 C   s(   | |}| |}| | t| |  S )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )rv   rh  rz  r.   r.   r/   apply_rotary_pos_emb  s    

r|  c                       s   e Zd ZdZeed fddZedddddej	ej	e
ej	 e
e e
ej ee eej	e
ej	 e
eej	  f d
ddZ  ZS )Gemma3nTextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr/  c                    s  t    |j| dk| _|| _|| _t|d|j|j | _	|j|j
 | _| jj| _d| _tj|j|j| j	 |jd| _tj|j|j
| j	 |jd| _tj|j|j
| j	 |jd| _tj|j| j	 |j|jd| _| jr|jnd | _t|j	|jd| _t|j	|jd| _t|j	|jdd| _| jj| jj }||  ko@dkn  | _|jd | }| jrt|d	 |d d d
 |j|  | _ d| _!n2d | _ |t|d	 |d d d
 |j|  k| _!d S )Nsliding_attentionrj   Trd   )r;   r<   Fr:   r   r   rL   )"rB   rC   layer_types
is_slidingrc   r0  getattrri   Znum_attention_headsrj   rj  rt  attention_dropoutZ	is_causalrD   rp   Zattention_biasr   r   r   o_projsliding_windowr8   r  q_normk_normv_normnum_hidden_layersZnum_kv_shared_layersis_kv_shared_layerr   indexkv_shared_layer_indexstore_full_length_kv)rI   rc   r0  Zfirst_kv_shared_layer_idxZprev_layersrJ   r.   r/   rC     sD    

(zGemma3nTextAttention.__init__past_key_valuer3   4.58new_nameversionN)r4   position_embeddingsrp  r3   cache_positionru  rT   c                 K   s  |j d d }g |d| jjR }|\}	}
| ||}| |}t||	|
dd}|dd}| jr|d ur|j	| j
 \}}||j}||j}n\| ||}| |}t||	|
dd}|dd}| ||}| |}|dd}|d urX|
|	|| jd}| js.|||| j|\}}| jrXt|dsHi |_	||f|j	| j< t}| jjdkrvt| jj }|| ||||f| jr| jndd	| jd
|\}}|jg |dR   }| |}||fS )NrL   r   )r{  r   )r~   r   r  r  shared_layerseagerr   r?   )rq  rr  r  )rZ   rc   rj   r   r   r  r|  re  r  r  r  r{   ry   r   r  r   r  r  updater0  r  r]  ry  _attn_implementationr   rI  r  r   r   r  )rI   r4   r  rp  r3   r  ru  Zinput_shapeZhidden_shaper   r~   r   r   r   Zcache_kwargsZattention_interfacerx  rv  r.   r.   r/   rY   "  sf    





	

zGemma3nTextAttention.forward)NN)r'   r(   r)   r*   r"   r]   rC   r   r+   r_   r   r	   
LongTensorr   r   r7   rY   r`   r.   r.   rJ   r/   r}    s   *  r}  c                       s   e Zd Zeed fddZedddddejejejeje	ej e	ej
 e	e e	e e	e e	ej
 eeje	eejejf  f d
ddZ  ZS )Gemma3nTextDecoderLayerr/  c                    s   t    || _|j| _|| _|j| | _t||| _t	||d| _
t| j|jd| _t| j|jd| _t| j|jd| _t| j|jd| _|j| _t|j | _t|| _t|| _tj| j| jdd| _tj| j| jdd| _t| j|jd| _d S )N)r0  r	  Frd   )rB   rC   rc   ri   r0  r  attention_typer}  	self_attnr.  mlpr8   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormhidden_size_per_layer_inputr   r5  r6  r?  altupr*  laurelrD   rp   per_layer_input_gateper_layer_projectionpost_per_layer_input_normr8  rJ   r.   r/   rC   m  s$    


z Gemma3nTextDecoderLayer.__init__r  r3   r  r  NF)r4   position_embeddings_globalposition_embeddings_localper_layer_inputrp  rf  r3   output_attentions	use_cacher  rT   c                 K   sB  | j |}|| jj }| |}| |}| jjr:|}n|}| jf |||||||	|
d|\}}| |}|| }|| t	
d }| |}| |}| |}|| }| j ||}|| jj  }| jjr| j |}| |}| |}t||}| |}| |}|dd   |7  < |f}|r>||f7 }|S )N)r4   r  rp  rf  r3   r  r  r  r   r   )r  rP  rc   rH  r  r  r  r  r  rr   rN   r  r  r  rU  r   Zaltup_correct_scalerW  r  r6  r+   multiplyr  r  )rI   r4   r  r  r  rp  rf  r3   r  r  r  ru  rO  Zactive_predictionZactive_prediction_normedZlaurel_outputr  r   Zself_attn_weightsZ
attn_gatedZattn_laurelZ	attn_normZattn_ffwZattn_ffw_normZattn_ffw_laurel_gatedZcorrected_predictionsZfirst_predictionoutputsr.   r.   r/   rY     sP    

	









zGemma3nTextDecoderLayer.forward)NNNFFN)r'   r(   r)   r"   r]   rC   r   r+   r_   r   r  r	   r^   r7   r,   rY   r`   r.   r.   rJ   r/   r  l  s*         r  c                       sX   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZ fddZ  ZS )	Gemma3nPreTrainedModelrc    Tr  r3   )r4   r5   c                    sX   t  | t|tr&|jjd n.t|tr>|jj	  nt|t
rT|jj	  d S )Nr?   )rB   _init_weightsr^  r   r>   rK  Zfill_r   r   Zzero_r?  rA  )rI   rm  rJ   r.   r/   r    s    


z$Gemma3nPreTrainedModel._init_weights)r'   r(   r)   r!   r-   base_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnZ_can_compile_fullgraphZ_supports_attention_backendr  r}  Z_can_record_outputsr  r`   r.   r.   rJ   r/   r    s   
r  zBThe base Gemma 3n language model without a language modeling head.c                       s   e Zd ZU eed< ed fddZeedee	j
 ee	j ee	j ee	j
 ee ee	j ee ee ee ee	j
 ee edddZe	j
e	jd	d
dZde	jee	j e	jdddZ  ZS )Gemma3nTextModelrc   rb   c                    s  t     j_ j_t j jjjjd d_t	
 fddt jD _t j jd_t d_d_t   j _dd	i _t d_ j_ j_t j j j j jd d_t	jj j j dd
_t j jd_t	
fddtdjj D _!t	
fddtdjj D _"j#dt$%jd dd j#dt$&t$%ddd '  d S )N      ?)r&  c                    s   g | ]}t  |qS r.   )r  )r  r0  rb   r.   r/   r    r  z-Gemma3nTextModel.__init__.<locals>.<listcomp>r	  rb   Fr[  r\  rd   c                    s    g | ]}t j j jd dqS Frd   rD   rp   ri   r  r[   r.   r/   r    r  r   c                    s    g | ]}t j j jd dqS r  r  r  r[   r.   r/   r    r  per_layer_projection_scaler   r@   per_layer_input_scaleg       @)(rB   rC   pad_token_idr%  
vocab_sizer"  ri   rc   embed_tokensrD   r  r   r  layersr8   r  r   rX  
rotary_embgradient_checkpointingcopydeepcopyZrope_local_base_freqZ
rope_thetarZ  rotary_emb_localr  vocab_size_per_layer_inputembed_tokens_per_layerrp   per_layer_model_projectionper_layer_projection_normrB  altup_projectionsaltup_unembed_projectionsrG   r+   rH   r   	post_initr   rJ   )rc   rI   r/   rC     sN    



zGemma3nTextModel.__init__N)r(  per_layer_inputsrp  rf  r3   inputs_embedsr  r  output_hidden_statesr  ru  rT   c           !   
   K   s  |dur|n| j j}|	dur |	n| j j}	|dur4|n| j j}|du |duA rTtd| jrr| jrr|rrtd d}|dur| 	|}| 
|}| ||}|r|du r| jst| j d}|
du r|dur| nd}tj|||jd  |jd}
|du r|
d}t| }tsF| j |||
||d	}tf i |tf i |d
}|}| ||}| ||}tj|d dddd }td}|g}td| j jD ]n}| j|d  |}|j|j|jd}tj|d ddd}t t!|||j}|| | }|"| qtj#|dd}|	r dnd}|r.dnd}| j$d| j j% D ]~}|	rX||f7 }||j& }|dddd|j'ddf }|||||f||||||
d|}|d }|rD||d f7 }qD|	r||f7 }tj|d d dddd }|d g}td| j jD ]r}| j(|d  || } | j|j|jd}tj|d ddd}t t!|||j}|| | }|"| qt#|}tj|dd}| )|}t*||||dS )z
        per_layer_inputs (torch.Tensor, *optional*, defaults to None):
            Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
        N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Frb   r   r   r   )rc   Zinput_embedsrp  r  r3   rf  )Zfull_attentionr~  r   rL   Tr   r  gh㈵>r<  rz   r.   )rp  rf  r3   r  r  r  )last_hidden_stater3   r4   r5   )+rc   r  r  r  r   r  rI  loggerwarning_oncer  get_per_layer_inputsproject_per_layer_inputsr
   Zget_seq_lengthr+   ru   rZ   ry   rv   r^  r_  r   r   r  r  rP   rH   r   rB  r  r{   rx   rN   maximumr   stackr  r  r  r0  r  r   r   )!rI   r(  r  rp  rf  r3   r  r  r  r  r  ru  Zpast_seen_tokensZcausal_mask_mappingZmask_kwargsZhidden_states_0r  r  Ztarget_magnitudeZepsilon_tensorZtemp_hidden_statesr   Z
altup_projZcurrent_hidden_stateZnew_magnituder4   Zall_hidden_statesZall_self_attnsZdecoder_layerrw  r  Zlayer_outputsZaltup_unemb_projr.   r.   r/   rY   #  s    











zGemma3nTextModel.forward)r(  rT   c                 C   s&   |  |jg |j| jj| jR  S rU   )r  r   rZ   rc   r  r  r)  r.   r.   r/   r    s    z%Gemma3nTextModel.get_per_layer_inputs)r  r  rT   c                 C   s   |  |}|| jj|j|jd9 }|jg |jd d | jj| j	R  }| 
|}|d u r`|S |j|jkr|dd | jjd d f }|| | jj|j|jd S )Nr<  rL   .)r  r  r{   rx   ry   r   rZ   rc   r  r  r  r  )rI   r  r  r  r.   r.   r/   r    s&    

z)Gemma3nTextModel.project_per_layer_inputs)
NNNNNNNNNN)N)r'   r(   r)   r"   r-   rC   r   r   r   r+   r  r_   r	   r,   r^   r   r   r   rY   r  r  r`   r.   r.   rJ   r/   r    sH   
9           
 r  z?The base Gemma 3n language model with a language modeling head.c                       s   e Zd ZU dgZddiZddgdgfiZeed< dZddiZ	ed	 fd
dZ
eedeej eej eej ee eej eej ee ee ee eej eeejf edddZ  ZS )Gemma3nForCausalLMlm_head.weightlm_headZcolwise_repr4   r2   rc   modelzmodel.language_modelrb   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S r1  )
rB   rC   r  r  r  rD   rp   ri   r  r  r   rJ   r.   r/   rC     s
    
zGemma3nForCausalLM.__init__Nr   )r(  rp  rf  r3   r  labelsr  r  r  r  logits_to_keeprT   c                 K   s  | j r(| jjdkr(td| jj d |dur4|n| jj}|	durH|	n| jj}	| jf ||||||||	|
d	|}|j}t	|t
rt| dn|}| |dd|ddf }| jjdur|| jj }t|}|| jj }d}|dur| j||| jfi |}t|||j|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3nForCausalLM

        >>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```r  ziIt is strongly recommended to train Gemma3n models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)	r(  rp  rf  r3   r  r  r  r  r  )r1   r2   r3   r4   r5   )rI  rc   r  r  r  r  r  r  r  r^  r]   slicer  final_logit_softcappingr+   r   Zloss_functionr  r   r3   r4   r5   )rI   r(  rp  rf  r3   r  r  r  r  r  r  r  ru  r  r4   slice_indicesr2   r1   r.   r.   r/   rY     sN    #


zGemma3nForCausalLM.forward)NNNNNNNNNNr   )r'   r(   r)   _tied_weights_keysZ_tp_planZ_pp_planr"   r-   r  _checkpoint_conversion_mappingrC   r   r   r   r+   r  r_   r	   r,   r^   r   r]   r   rY   r`   r.   r.   rJ   r/   r    sD   
	           r  c                       sR   e Zd ZdZeeef ed fddZd	e	e
j e	e
j e
jdddZ  ZS )
Gemma3nMultimodalEmbedderzQEmbeds token ids or soft tokens for multimodal content into language model space.)multimodal_configtext_configc                    s   t    |j| _|j| _|j| _|j| _|j| _t	
| j| j| _t| j| jd| _t| j| jd| _t	j| j| jdd| _t| j| jdd| _d S )Nr	  Frd   )r<   r=   )rB   rC   ri   Zmultimodal_hidden_sizer  r<   vocab_offsetr  Ztext_hidden_sizerD   	Embedding	embeddingr8   hard_embedding_normsoft_embedding_normrp   embedding_projectionembedding_post_projection_norm)rI   r  r  rJ   r.   r/   rC   C  s    
z"Gemma3nMultimodalEmbedder.__init__N)r(  r  rT   c                 C   sZ   |du |duA rt d|dur,| |}n| || j }| |}| |}| |S )a  Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        Nr  )r   r  r  r  r  r  r  )rI   r(  r  Zemb_normZhard_embZemb_norm_projr.   r.   r/   rY   V  s    

z!Gemma3nMultimodalEmbedder.forward)NN)r'   r(   r)   r*   r   r    r#   r"   rC   r   r+   r  r_   rY   r`   r.   r.   rJ   r/   r  @  s   
  r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c                       s8  e Zd Zi ZdZed fddZdd Zdd Zd	d
 Z	dd Z
ejejdddZdeej eej eej eej dddZedeej eej eej eej eej eej eeeej ef  eej eej eej eej ee ee ee edddZejejeejejf dddZ  ZS )Gemma3nModelFrb   c                    s   t  | tj|jd| _|jj| _tj|jd}|| _| j	j
d urN| j	j
nd| _
|jj| _t|j| _t|j|j| _t|j|j| _|   d S )Nrb   rL   )rB   rC   r   from_configvision_configvision_towerr  r  language_modelrc   r  r  Zaudio_configaudio_towerr  embed_visionembed_audior  )rI   rc   r  rJ   r.   r/   rC   }  s    

zGemma3nModel.__init__c                 C   s
   | j  S rU   )r  get_input_embeddingsr[   r.   r.   r/   r    s    z!Gemma3nModel.get_input_embeddingsc                 C   s   | j | d S rU   )r  set_input_embeddingsrI   r   r.   r.   r/   r    s    z!Gemma3nModel.set_input_embeddingsc                 C   s
   || _ d S rU   r  rI   decoderr.   r.   r/   set_decoder  s    zGemma3nModel.set_decoderc                 C   s   | j S rU   r  r[   r.   r.   r/   get_decoder  s    zGemma3nModel.get_decoder)pixel_valuesrT   c                 C   sX   | j |dddj}||jd | jjj| jjddd}|| jjjd 9 }| j	|dS )	a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.

        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        FT)r  Z
do_poolingreturn_dictr   r   r   r  r  )
r  r  r   rZ   rc   r  ri   Zvision_soft_tokens_per_imager   r  )rI   r  Zvision_outputsr.   r.   r/   get_image_features  s    zGemma3nModel.get_image_featuresN)r(  r  image_featuresaudio_featuresc           	      C   sF  |du rb||   tj| jjtj|jdk}|d}||   tj| jjtj|jdkd}n|| jjk}|| jjk}|	 }|
d||j}|dur||  | krtd| d|jd |jd   |	 }|
d||j}|dur>||  | kr>td| d|jd |jd   ||fS )	z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr<  rL   z6Image features and image tokens do not match: tokens: z, features r   r   z6Audio features and image tokens do not match: tokens: )r  r+   rH   rc   Zimage_token_idlongry   allZaudio_token_idr   rv   	expand_asr{   Znumelr   rZ   )	rI   r(  r  r  r  special_image_maskspecial_audio_maskZn_image_tokensZn_audio_tokensr.   r.   r/   get_placeholder_mask  s8    
 z!Gemma3nModel.get_placeholder_mask)r(  r  input_featuresrp  input_features_maskrf  r3   token_type_idsr  r  r  r  r  r  rT   c           *      K   s  |du |
duA rt d|dur$|n| jj}|dur8|n| jj}|dur^|  |}
t|dk|| jk }t||t	|}| j
|}t|| jjk|| jjk }| jj| jj d }t||||
j}| j|d}|d|
}t|||
}
|| jjk}| jj| jj d }t||||
j}| j|d}|d|
}t|||
}
nd}|dur| |}||
j|
j}| j||
|d\}}|
||}
|durr|durr| || \} }tj| jd ggtj| jd}!| j|!d}"t|d|"| } | j\}#}$}%| jj|$ }&|"|#|&|%}'tj| |'fdd	} | |
j|
j} | j||
| d
\}}(|
|(| }
| j
f d|||||
|||d|	d|})t|)j |r|)j!nd|)j"|)j#|dur|nd|dur| nddS )aj  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

        >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```
        Nr  r   r   r'  rL   )r  r  r<  rz   )r  r  T)r(  r  rp  rf  r3   r  r  r  r  r  r  )r  r3   r4   r5   r%   r&   )$r   rc   r  r  r  r+   r   r  r   Z
zeros_liker  r  r  r  r  r  r{   ry   rv   r   r  rx   r  Zmasked_scatterget_audio_featuresrH   r  rZ   Zaudio_soft_tokens_per_imager  r}   r$   r  r3   r4   r5   )*rI   r(  r  r  rp  r  rf  r3   r  r  r  r  r  r  r  	lm_kwargsZper_layer_inputs_maskZper_layer_inputs_tokensr  Zvision_maskZdummy_vision_token_idZvision_input_idsZvision_embedsZexpanded_vision_mask
audio_maskZdummy_audio_token_idZaudio_input_idsZaudio_embedsZexpanded_audio_maskr  r  r   r  Zaudio_padding_toksZaudio_padding_embsZaudio_batch_sizeZaudio_seq_lenZaudio_embed_dimZextra_padding_tokensZextra_padding_featuresr  r  r.   r.   r/   rY     s    /




zGemma3nModel.forward)r  r  rT   c                 C   s    |  ||\}}| j|d|fS )a-  
        Projects the last hidden state from the audio encoder into language model space.

        Args:
            input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
               The tensors corresponding to the input audio.
            input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
               The attention mask for the input audio.

        Returns:
            audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_images, audio_length, embed_dim)`).
        r  )r  r  )rI   r  r  Zaudio_outputsr	  r.   r.   r/   r  g  s    zGemma3nModel.get_audio_features)NNNN)NNNNNNNNNNNNNN)r'   r(   r)   r  Zaccepts_loss_kwargsr!   rC   r  r  r  r  r+   r_   r  r   r  r,   r  r   r   r6   r	   r^   r0   rY   r7   r  r`   r.   r.   rJ   r/   r  r  sl       *               r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c                       s0  e Zd Zi ZdgZdZed fddZdd Zdd	 Z	d
d Z
dd Zdd Zedd Zedd Zedd Zeed eej eej eej eej eej eej eeeej ef  eej eej eej eej ee ee ee eeejf edddZd! fdd	Zedd Z   Z!S )"Gemma3nForConditionalGenerationr  r  rb   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S r1  )rB   rC   r  r  rD   rp   r  ri   r  r  r  r   rJ   r.   r/   rC     s    
z(Gemma3nForConditionalGeneration.__init__c                 C   s
   | j  S rU   )r  r  r[   r.   r.   r/   r    s    z4Gemma3nForConditionalGeneration.get_input_embeddingsc                 C   s   | j | d S rU   )r  r  r  r.   r.   r/   r    s    z4Gemma3nForConditionalGeneration.set_input_embeddingsc                 C   s   | j | d S rU   )r  r  r  r.   r.   r/   r    s    z+Gemma3nForConditionalGeneration.set_decoderc                 C   s
   | j  S rU   )r  r  r[   r.   r.   r/   r    s    z+Gemma3nForConditionalGeneration.get_decoderc                 C   s   | j |S rU   )r  r  )rI   r  r.   r.   r/   r    s    z2Gemma3nForConditionalGeneration.get_image_featuresc                 C   s   | j jS rU   )r  r  r[   r.   r.   r/   r    s    z.Gemma3nForConditionalGeneration.language_modelc                 C   s   | j jS rU   )r  r  r[   r.   r.   r/   r    s    z,Gemma3nForConditionalGeneration.vision_towerc                 C   s   t dd S )Nz2Use embed_vision instead of multi_modal_projector.)AttributeErrorr[   r.   r.   r/   multi_modal_projector  s    z5Gemma3nForConditionalGeneration.multi_modal_projectorNr   )r(  r  r  rp  r  rf  r3   r  r  r  r  r  r  r  r  rT   c                 K   s  |dur|n| j j}|dur |n| j j}| jf |||||||||	|
||||dd|}|j}t|trtt| dn|}| |dd|ddf }| j 	 j
 }dur|| }t|}|| }d}|dur| }|dddddf }|dddf }|dur^|dd|jd  df |j}|||jdk  }|||jdk  }n| }| }t }|d| j jj}|d|j}|||}t|||j|j|j|j|jdS )	al  
        input_features_mask (torch.Tensor, *optional*, defaults to None):
            The attention mask for the input audio.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenizer=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        NT)r(  r  r  rp  r  rf  r3   r  r  r  r  r  r  r  r  .rL   r   r   )r1   r2   r3   r4   r5   r%   r&   )rc   r  r  r  r  r^  r]   r  r  Zget_text_configr  r+   r   rV   rZ   r{   ry   r   rD   ZCrossEntropyLossr   r  r  r0   r3   r4   r5   r%   r&   )rI   r(  r  r  rp  r  rf  r3   r  r  r  r  r  r  r  r  r  r  r4   r  r2   r  r1   Zshift_logitsZshift_labelsZshift_attention_maskZloss_fctZflat_logitsZflat_labelsr.   r.   r/   rY     sj    D


$
z'Gemma3nForConditionalGeneration.forwardTc                    sN   t  j|f||||||||
d|}|d dkrJ||d< ||d< |	|d< |S )N)r3   r  rp  rf  r  r  r  r  r   r  r  r  )rB   prepare_inputs_for_generation)rI   r(  r3   r  r  rf  r  r  rp  r  r  r  r  r  ru  Zmodel_inputsrJ   r.   r/   r  ,	  s&    
z=Gemma3nForConditionalGeneration.prepare_inputs_for_generationc                 C   s   | j jS rU   )r  r  r[   r.   r.   r/   r  U	  s    z+Gemma3nForConditionalGeneration.audio_tower)NNNNNNNNNNNNNNr   )NNNNNNNNNTNN)"r'   r(   r)   r  r  r  r!   rC   r  r  r  r  r  propertyr  r  r  r   r   r   r+   r  r,   r_   r   r6   r	   r^   r]   r0   rY   r  r  r`   r.   r.   rJ   r/   r
  z  s   


                            )r
  )r  r  r
  r  r  r  )r   NN)Nr   )`r  rr   collections.abcr   r   dataclassesr   typingr   r   r+   Ztorch.nnrD   Ztorch.nn.functionalr   r   r:  r   Zcache_utilsr	   r
   Z
generationr   Zmasking_utilsr   r   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   r   Zutils.deprecationr   autor   Zconfiguration_gemma3nr    r!   r"   r#   Z
get_loggerr'   r  r$   r0   Moduler8   ra   r   r   r   r   r   r  r  r  r  r  r"  r*  r.  r?  rX  rh  r_   r]   rl  rV   r7   ry  r|  r}  r  r  r  r  r  r  r
  __all__r.   r.   r.   r/   <module>   s   
 + amEI+J&a$   '  w^ xa2   [