a
    h\                  
   @   s  d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) G dd dej*Z+G dd dej*Z,ej-e.ej-dddZ/d/ej*ej-ej-ej-eej- e0e0ee! dddZ1dd Z2d0d d!Z3G d"d# d#ej*Z4G d$d% d%ej*Z5G d&d' d'eZ6e"G d(d) d)eZ7e"G d*d+ d+e7Z8e"G d,d- d-e7eZ9g d.Z:dS )1    )CallableOptionalUnionN   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )Cohere2Configc                       sD   e Zd ZU ejed< ded fddZe e	dd Z
  ZS )	Cohere2RotaryEmbeddinginv_freqNconfigc                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)super__init__hasattr
isinstancer    dictgetr!   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenr   r   Zrope_init_fnattention_scalingZregister_bufferr   Zoriginal_inv_freq)selfr   devicer   	__class__ h/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/cohere2/modeling_cohere2.pyr&   .   s    
zCohere2RotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd}|d d d d d f  }t|jjtrd|jjdkrd|jjnd}tj	|ddT | |  
dd}tj|ddd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   r   ZmpscpuF)device_typeZenabled   dimdtype)r   floatexpandshaper(   r-   r"   strtorchZautocast	transposeZrepeat_interleavecosr+   sintor9   )
r,   xposition_idsZinv_freq_expandedZposition_ids_expandedr4   ZfreqsZembr@   rA   r0   r0   r1   forward?   s    (&,zCohere2RotaryEmbedding.forward)N)__name__
__module____qualname__r>   Tensor__annotations__r   r&   Zno_gradr   rE   __classcell__r0   r0   r.   r1   r   +   s
   

r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	Cohere2LayerNormNh㈵>Fc                    s&   t    tt|| _|| _dS )zcThe hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dimN)r%   r&   nn	Parameterr>   Zonesweightvariance_epsilon)r,   hidden_sizeepsbiasr.   r0   r1   r&   P   s    
zCohere2LayerNorm.__init__c                 C   sl   |j }|tj}|jddd}|| djddd}|| t|| j  }| jtj| }||S )Nr2   T)Zkeepdimr5   )	r9   rB   r>   float32meanpowZrsqrtrQ   rP   )r,   hidden_statesZinput_dtyperV   Zvariancer0   r0   r1   rE   V   s    zCohere2LayerNorm.forward)NrM   FrF   rG   rH   r&   rE   rK   r0   r0   r.   r1   rL   O   s   rL   )rX   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r<   r;   reshape)rX   rZ   batchnum_key_value_headsslenhead_dimr0   r0   r1   	repeat_kv`   s
    0ra           )modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d urf|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr5   r   r2   )r7   r9   )ptrainingr   )ra   num_key_value_groupsr>   matmulr?   r<   rN   Z
functionalZsoftmaxrU   rB   r9   ri   rm   
contiguous)rc   rd   re   rf   rg   rh   ri   rj   
key_statesvalue_statesattn_weightsZcausal_maskattn_outputr0   r0   r1   eager_attention_forwardl   s    
&ru   c                 C   sB   | dd d df }| ddd df }t j| |gddd}|S )N.r5   r   r2   r6   rk   )r>   stackflatten)rC   x1Zx2Zrot_xr0   r0   r1   rotate_half   s    ry   c           	      C   sj   | j }|  } | }||}||}| | t| |  }|| t||  }|j|d|j|dfS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    r8   )r9   r:   	unsqueezery   rB   )	qkr@   rA   rD   Zunsqueeze_dimr9   Zq_embedZk_embedr0   r0   r1   apply_rotary_pos_emb   s    

r}   c                       s   e Zd ZdZdeee d fddZedddd	de	j
ee	j
e	j
f ee	j
 ee ee	j ee ee	j
ee	j
 eee	j
  f d
ddZ  ZS )Cohere2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNr   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _|j| dkrh|jnd | _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )Nr`   g      Tsliding_attentionrT   )r%   r&   r   r   getattrrR   Znum_attention_headsr`   r^   rn   rh   attention_dropoutZ	is_causallayer_typessliding_windowrN   LinearZattention_biasq_projk_projv_projo_projr,   r   r   r.   r0   r1   r&      s*    
zCohere2Attention.__init__past_key_valuepast_key_values4.58new_nameversion)rX   position_embeddingsrg   r   cache_positionrj   r[   c                 K   s2  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}| jd urt|	|
||\}	}
|d ur|||d}|	|
|| j
|\}
}t}| jjdkrt| jj }|| |	|
||f| jsdn| j| j| jd|\}}|jg |dR   }| |}||fS )Nr2   r   r5   )rA   r@   r   eagerrb   )ri   rh   r   )r<   r`   r   viewr?   r   r   r   r}   updater   ru   r   Z_attn_implementationr   rm   r   rh   r\   rp   r   )r,   rX   r   rg   r   r   rj   Zinput_shapeZhidden_shapeZquery_statesrq   rr   r@   rA   Zcache_kwargsZattention_interfacert   rs   r0   r0   r1   rE      s<    

	

zCohere2Attention.forward)N)NN)rF   rG   rH   __doc__r   r   intr&   r   r>   rI   tupler   
LongTensorr   r   rE   rK   r0   r0   r.   r1   r~      s     r~   c                       s$   e Zd Z fddZdd Z  ZS )
Cohere2MLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFr   )r%   r&   r   rR   Zintermediate_sizerN   r   	gate_projup_proj	down_projr   Z
hidden_actact_fnr,   r   r.   r0   r1   r&      s    
zCohere2MLP.__init__c                 C   s$   |  | | || | }|S )N)r   r   r   r   )r,   rC   r   r0   r0   r1   rE      s     zCohere2MLP.forwardrY   r0   r0   r.   r1   r      s   
r   c                       s   e Zd Zeed fddZedddddeje	ejejf e
ej e
e e
e e
ej ee e	eje
e	ejejf  f d
ddZ  ZS )Cohere2DecoderLayerr   c                    sL   t    |j| _t||d| _t|| _t|j|jd| _	|j
| | _d S )Nr   rR   rS   )r%   r&   rR   r~   	self_attnr   mlprL   layer_norm_epsinput_layernormr   attention_typer   r.   r0   r1   r&     s    

zCohere2DecoderLayer.__init__r   r   r   r   NF)rX   r   rg   r   	use_cacher   rj   r[   c              	   K   sJ   |}|  |}| jf ||||||d|\}	}
| |}||	 | }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )rX   r   rg   r   r   r   )r   r   r   )r,   rX   r   rg   r   r   r   rj   ZresidualZhidden_states_attention_Zhidden_states_mlpr0   r0   r1   rE     s    



zCohere2DecoderLayer.forward)NNFN)rF   rG   rH   r   r   r&   r   r>   rI   r   r   r   boolr   r   r   FloatTensorrE   rK   r0   r0   r.   r1   r     s        r   c                   @   sH   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )Cohere2PreTrainedModelr   modelTr   r   )rX   
attentionsN)rF   rG   rH   r   rJ   Zbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnZ_can_compile_fullgraphZ_supports_attention_backendr   r~   Z_can_record_outputsr0   r0   r0   r1   r   =  s   
r   c                       st   e Zd Zed fddZeedeej	 eej
 eej	 ee eej ee eej	 ee ed	ddZ  ZS )	Cohere2Modelr   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r0   )r   ).0r   r   r0   r1   
<listcomp>Y      z)Cohere2Model.__init__.<locals>.<listcomp>r   r   F)r%   r&   Zpad_token_idZpadding_idx
vocab_sizerN   Z	EmbeddingrR   embed_tokensZ
ModuleListrangeZnum_hidden_layerslayersrL   r   normr   
rotary_embZgradient_checkpointing	post_initr   r.   r   r1   r&   R  s    zCohere2Model.__init__N)		input_idsrg   rD   r   inputs_embedsr   r   rj   r[   c              	   K   s&  |d u |d uA rt d|d u r*| |}|rH|d u rH| jsHt| jd}|d u r|d ur`| nd}	tj|	|	|jd  |j	d}|d u r|
d}t| }
ts| j|||||d}tf i |tf i |d}
|}| ||}| jD ]&}||f||
|j |||d|}q| |}t||d	S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r-   )r   Zinput_embedsrg   r   r   rD   )Zfull_attentionr   )r   rg   r   r   r   )last_hidden_stater   )
ValueErrorr   rm   r   r   Zget_seq_lengthr>   Zaranger<   r-   rz   r(   r)   r
   r   r   r   r   r   r   )r,   r   rg   rD   r   r   r   r   rj   Zpast_seen_tokensZcausal_mask_mappingZmask_kwargsrX   r   Zdecoder_layerr0   r0   r1   rE   b  sV    

	


zCohere2Model.forward)NNNNNNN)rF   rG   rH   r   r&   r   r   r   r>   r   rI   r   r   r   r   r   r   rE   rK   r0   r0   r.   r1   r   P  s*          r   c                       s   e Zd ZdgZddiZddgdgfiZ fddZeede	e
j e	e
j e	e
j e	eeee
j f  e	e
j e	e
j e	e e	e e	e e	e
j eee
jf ee ed
ddZ  ZS )Cohere2ForCausalLMzlm_head.weightlm_headZcolwise_reprX   logitsc                    sP   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _
|   d S r   )r%   r&   r   r   r   rN   r   rR   r   logit_scaleZtie_word_embeddingsr   r   r.   r0   r1   r&     s    
zCohere2ForCausalLM.__init__Nr   )r   rg   rD   r   r   labelsr   output_attentionsoutput_hidden_statesr   logits_to_keeprj   r[   c                 K   s   |dur|n| j j}|	dur |	n| j j}	| jf ||||||||	|
d	|}|j}t|trht| dn|}| |dd|ddf }|| j	 }d}|dur| j
f ||| j jd|}t|||j|j|jdS )a~  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, Cohere2ForCausalLM

        >> model = Cohere2ForCausalLM.from_pretrained("Cohere2ForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("Cohere2ForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r   rg   rD   r   r   r   r   r   r   )r   r   r   )lossr   r   rX   r   )r   r   r   r   r   r(   r   slicer   r   Zloss_functionr   r   r   rX   r   )r,   r   rg   rD   r   r   r   r   r   r   r   r   rj   outputsrX   Zslice_indicesr   r   r0   r0   r1   rE     s<    '

zCohere2ForCausalLM.forward)NNNNNNNNNNr   )rF   rG   rH   Z_tied_weights_keysZ_tp_planZ_pp_planr&   r   r   r   r>   r   rI   r   r   listr   r   r   r   r   r   rE   rK   r0   r0   r.   r1   r     s@              r   )r   r   r   )rb   )Nr   );typingr   r   r   r>   Ztorch.nnrN   Zactivationsr   Zcache_utilsr   r   Z
generationr	   Zmasking_utilsr
   r   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   Zutils.deprecationr   Zutils.genericr   Zconfiguration_cohere2r   Moduler   rL   rI   r   ra   r:   ru   ry   r}   r~   r   r   r   r   r   __all__r0   r0   r0   r1   <module>   sT   $ 
I8R]