"""PyTorch Persimmon model."""

from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import (
    GenericForSequenceClassification,
    GenericForTokenClassification,
    GradientCheckpointingLayer,
)
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_persimmon import PersimmonConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class PersimmonRotaryEmbedding(nn.Module):
    inv_freq: torch.Tensor  # fix linting for `register_buffer`

    def __init__(self, config: PersimmonConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class PersimmonMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size)
        self.dense_4h_to_h = nn.Linear(config.intermediate_size, config.hidden_size)
        self.act = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        hidden_states = self.dense_h_to_4h(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dense_4h_to_h(hidden_states)
        return hidden_states


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class PersimmonAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: PersimmonConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.rope_theta = config.rope_theta
        self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor)
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True)
        self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=True)
        self.qk_layernorm = config.qk_layernorm
        self.scaling = self.head_dim**-0.5

        if self.qk_layernorm:
            self.q_layernorm = nn.LayerNorm(
                config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
            )
            self.k_layernorm = nn.LayerNorm(
                config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
            )
        self.attention_dropout = nn.Dropout(config.attention_dropout)
        self.rotary_emb = PersimmonRotaryEmbedding(config=self.config)

    def _split_heads(self, fused_qkv: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
        storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
            value: [batch_size, seq_length, num_heads, head_dim]
        """
        batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
        fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim)
        return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :]

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        # [batch_size, seq_length, 3 x hidden_size]
        fused_qkv = self.query_key_value(hidden_states)

        # 3 x [batch_size, seq_length, num_heads, head_dim]
        (query_states, key_states, value_states) = self._split_heads(fused_qkv)

        if self.qk_layernorm:
            query_states = self.q_layernorm(query_states)
            key_states = self.k_layernorm(key_states)

        # [batch_size, seq_length, num_heads, head_dim] -> [batch_size, num_heads, seq_length, head_dim]
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        cos, sin = position_embeddings

        # Partial rotary embedding: only the first `rotary_ndims` channels of each head are rotated
        query_rot, query_pass = (
            query_states[..., : self.rotary_ndims],
            query_states[..., self.rotary_ndims :],
        )
        key_rot, key_pass = (
            key_states[..., : self.rotary_ndims],
            key_states[..., self.rotary_ndims :],
        )
        query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin)

        # [batch_size, num_heads, seq_length, head_dim]
        query_states = torch.cat((query_rot, query_pass), dim=-1)
        key_states = torch.cat((key_rot, key_pass), dim=-1)

        if past_key_values is not None:
            # Specific to RoPE models with partial rotation
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_ndims,
                "cache_position": cache_position,
            }
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout.p,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1)
        attn_output = self.dense(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class PersimmonDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: PersimmonConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = PersimmonAttention(config=config, layer_idx=layer_idx)
        self.mlp = PersimmonMLP(config)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`.
                [What are position IDs?](../glossary#position-ids)
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*):
                cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)

        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + residual

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class PersimmonPreTrainedModel(PreTrainedModel):
    config: PersimmonConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PersimmonDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _can_compile_fullgraph = True
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()


@auto_docstring
class PersimmonModel(PersimmonPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PersimmonDecoderLayer`]

    Args:
        config: PersimmonConfig
    """

    def __init__(self, config: PersimmonConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [PersimmonDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.rotary_emb = PersimmonRotaryEmbedding(config=config)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.final_layernorm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


class PersimmonForCausalLM(PersimmonPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = PersimmonModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PersimmonForCausalLM

        >>> model = PersimmonForCausalLM.from_pretrained("adept/persimmon-8b-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("adept/persimmon-8b-base")

        >>> prompt = "human: Hey, what should I eat for dinner?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'human: Hey, what should I eat for dinner?\n\ncat: 🐱\n\nhuman: 😐\n\n'
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class PersimmonForSequenceClassification(GenericForSequenceClassification, PersimmonPreTrainedModel):
    pass


class PersimmonForTokenClassification(GenericForTokenClassification, PersimmonPreTrainedModel):
    pass


__all__ = [
    "PersimmonForCausalLM",
    "PersimmonModel",
    "PersimmonPreTrainedModel",
    "PersimmonForSequenceClassification",
    "PersimmonForTokenClassification",
]
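

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative addition, not part of the upstream module):
# builds a tiny, randomly initialized Persimmon model and runs one forward pass.
# The config values below are made-up toy sizes, not the released
# "adept/persimmon-8b-base" hyperparameters; see the `PersimmonForCausalLM`
# docstring above for loading the pretrained checkpoint instead.
if __name__ == "__main__":
    toy_config = PersimmonConfig(
        vocab_size=128,
        hidden_size=64,
        intermediate_size=256,
        num_hidden_layers=2,
        num_attention_heads=4,
        max_position_embeddings=128,
    )
    toy_model = PersimmonForCausalLM(toy_config)
    toy_input_ids = torch.randint(0, toy_config.vocab_size, (1, 16))
    with torch.no_grad():
        toy_logits = toy_model(input_ids=toy_input_ids).logits
    print(toy_logits.shape)  # expected: torch.Size([1, 16, 128])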