"""PyTorch Cohere model."""

from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ...utils.deprecation import deprecate_kwarg
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaForCausalLM,
    LlamaMLP,
    LlamaModel,
    LlamaRotaryEmbedding,
    eager_attention_forward,
)
from .configuration_cohere import CohereConfig


logger = logging.get_logger(__name__)


class CohereLayerNorm(nn.Module):
    def __init__(self, hidden_size=None, eps=1e-5, bias=False):
        """The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim"""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        mean = hidden_states.mean(-1, keepdim=True)
        variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
        hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = self.weight.to(torch.float32) * hidden_states
        return hidden_states.to(input_dtype)


class CohereRotaryEmbedding(LlamaRotaryEmbedding):
    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            # Unlike Llama, Cohere interleaves the frequencies instead of concatenating them
            emb = torch.repeat_interleave(freqs, 2, dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    # Split and rotate interleaved pairs. Note that this differs from the Llama implementation.
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
    return rot_x


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    dtype = q.dtype
    q = q.float()
    k = k.float()
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)


class CohereMLP(LlamaMLP):
    def __init__(self, config):
        super().__init__(config)
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)


class CohereAttention(LlamaAttention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        self.use_qk_norm = config.use_qk_norm
        if self.use_qk_norm:
            # QK-norm normalizes over head_dim, hence the (num_heads, head_dim) tuple passed as hidden_size
            self.q_norm = CohereLayerNorm(
                hidden_size=(config.num_attention_heads, self.head_dim), eps=config.layer_norm_eps
            )
            self.k_norm = CohereLayerNorm(
                hidden_size=(config.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps
            )

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(hidden_shape)
        value_states = self.v_proj(hidden_states).view(hidden_shape)

        if self.use_qk_norm:  # main diff from Llama
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class CohereDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: CohereConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = CohereAttention(config=config, layer_idx=layer_idx)
        self.mlp = CohereMLP(config)
        self.input_layernorm = CohereLayerNorm(hidden_size=config.hidden_size, eps=config.layer_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states_attention, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )

        # Fully Connected (parallel branch: attention and MLP both read the same normed input)
        hidden_states_mlp = self.mlp(hidden_states)

        hidden_states = residual + hidden_states_attention + hidden_states_mlp

        return hidden_states


class CohereModel(LlamaModel):
    def __init__(self, config: CohereConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.rotary_emb = CohereRotaryEmbedding(config=config)
        self.norm = CohereLayerNorm(hidden_size=config.hidden_size, eps=config.layer_norm_eps)


class CohereForCausalLM(LlamaForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.model = CohereModel(config)
        self.logit_scale = config.logit_scale
        self.tie_word_embeddings = config.tie_word_embeddings

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CohereForCausalLM

        >>> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute the necessary logits
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        logits = logits * self.logit_scale  # main diff from Llama

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["CohereForCausalLM", "CohereModel", "CoherePreTrainedModel"]