from typing import Callable, Optional, Union

import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import (
    GenericForSequenceClassification,
    GenericForTokenClassification,
    GradientCheckpointingLayer,
)
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from .configuration_gemma2 import Gemma2Config


logger = logging.get_logger(__name__)


class Gemma2RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float())
        # The weight is zero-initialized and applied as (1.0 + weight), so a freshly
        # constructed module acts as plain RMS normalization.
        output = output * (1.0 + self.weight.float())
        return output.type_as(x)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


class Gemma2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_activation]

    def forward(self, x):
        # Gated MLP: down_proj(act(gate_proj(x)) * up_proj(x))
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
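
    Example (an illustrative sketch of the broadcasting described above; the tensor shapes are arbitrary):

    ```python
    >>> q = torch.randn(1, 8, 5, 64)   # [batch_size, num_heads, seq_len, head_dim]
    >>> k = torch.randn(1, 4, 5, 64)   # fewer key/value heads, as in grouped-query attention
    >>> cos = torch.randn(1, 5, 64)    # [batch_size, seq_len, head_dim]
    >>> sin = torch.randn(1, 5, 64)
    >>> q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)  # unsqueeze_dim=1 broadcasts over heads
    >>> q_embed.shape, k_embed.shape
    (torch.Size([1, 8, 5, 64]), torch.Size([1, 4, 5, 64]))
    ```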
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
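    For example (illustrative), a `[2, 4, 10, 64]` key/value tensor repeated with `n_rep=3` becomes `[2, 12, 10, 64]`.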
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    if scaling is None:
        scaling = module.head_dim**-0.5

    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling

    if softcap is not None:
        attn_weights = attn_weights / softcap
        attn_weights = torch.tanh(attn_weights)
        attn_weights = attn_weights * softcap
    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    # upcast attention to fp32 for the softmax, then cast back to the query dtype
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights
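
# Note on the logit soft-capping implemented above: when `softcap` (the config's
# `attn_logit_softcapping`) is set, the raw attention logits are squashed to
# softcap * tanh(logits / softcap), bounding them to (-softcap, softcap) before
# the causal mask and the softmax are applied.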


class Gemma2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = config.query_pre_attn_scalar**-0.5
        self.attention_dropout = self.config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.attn_logit_softcapping = self.config.attn_logit_softcapping
        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            softcap=self.attn_logit_softcapping,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Gemma2DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.config = config
        self.attention_type = config.layer_types[layer_idx]
        self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
        self.mlp = Gemma2MLP(config)
        self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self attention (pre- and post-norm around the attention block)
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        # Feed-forward block, also wrapped in pre- and post-norms
        residual = hidden_states
        hidden_states = self.pre_feedforward_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class Gemma2RotaryEmbedding(nn.Module):
    inv_freq: torch.Tensor  # buffer registered in __init__

    def __init__(self, config: Gemma2Config, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


@auto_docstring
class Gemma2PreTrainedModel(PreTrainedModel):
    config: Gemma2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Gemma2DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": Gemma2DecoderLayer,
        "attentions": Gemma2Attention,
    }


@auto_docstring
class Gemma2Model(Gemma2PreTrainedModel):
    def __init__(self, config: Gemma2Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Gemma2RotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None and not self.training:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # The masks may already have been prepared (e.g. by `generate`), in which case
        # `attention_mask` is a per-layer-type mapping rather than a tensor.
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
            }

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # Gemma2 scales the embeddings by sqrt(hidden_size), materialized in the embedding dtype
        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


@auto_docstring
class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = Gemma2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM

        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```"""
        if self.training and self.config._attn_implementation != "eager":
            logger.warning_once(
                "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
            )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if self.config.final_logit_softcapping is not None:
            logits = logits / self.config.final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * self.config.final_logit_softcapping

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class Gemma2ForSequenceClassification(GenericForSequenceClassification, Gemma2PreTrainedModel):
    pass


class Gemma2ForTokenClassification(GenericForTokenClassification, Gemma2PreTrainedModel):
    pass


__all__ = [
    "Gemma2ForCausalLM",
    "Gemma2Model",
    "Gemma2PreTrainedModel",
    "Gemma2ForSequenceClassification",
    "Gemma2ForTokenClassification",
]
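
# Illustrative usage sketch (mirrors the docstring example and the eager-attention recommendation in
# `Gemma2ForCausalLM.forward`; the checkpoint name and generation length are taken from that docstring):
#
#     from transformers import AutoTokenizer, Gemma2ForCausalLM
#
#     tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
#     model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b", attn_implementation="eager")
#     inputs = tokenizer("What is your favorite condiment?", return_tensors="pt")
#     generate_ids = model.generate(inputs.input_ids, max_length=30)
#     print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0])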