from typing import Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_layers import (
    GenericForSequenceClassification,
    GenericForTokenClassification,
    GradientCheckpointingLayer,
)
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from .configuration_phi import PhiConfig


logger = logging.get_logger(__name__)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


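# Illustrative sketch (not part of the upstream file): a minimal example of how
# `apply_rotary_pos_emb` is typically called, assuming the [batch, heads, seq_len, head_dim]
# layout described in the docstring above (so the default `unsqueeze_dim=1` applies). The
# tensors below are random placeholders rather than real model activations.
#
#     batch, heads, seq_len, head_dim = 2, 4, 6, 8
#     q = torch.randn(batch, heads, seq_len, head_dim)
#     k = torch.randn(batch, heads, seq_len, head_dim)
#     cos = torch.randn(batch, seq_len, head_dim)  # normally produced by PhiRotaryEmbedding
#     sin = torch.randn(batch, seq_len, head_dim)
#     q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
#     assert q_embed.shape == q.shape and k_embed.shape == k.shape

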
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


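# Illustrative sketch (not part of the upstream file): the grouped-query pattern that
# `repeat_kv` and `eager_attention_forward` implement. With 8 query heads and 2 key/value
# heads, each KV head is tiled n_rep = 8 // 2 = 4 times before the usual scaled dot-product
# attention; the tensors below are random placeholders.
#
#     query = torch.randn(1, 8, 5, 64)      # [batch, num_attention_heads, seq_len, head_dim]
#     key = torch.randn(1, 2, 5, 64)        # [batch, num_key_value_heads, seq_len, head_dim]
#     key_full = repeat_kv(key, 4)          # -> [1, 8, 5, 64]
#     scores = torch.matmul(query, key_full.transpose(2, 3)) * 64**-0.5  # [1, 8, 5, 5]

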
&rR   c                       s~   e Zd ZdZeed fddZedddddej	e
ej	ej	f eej	 ee eej e
ej	eej	 f d
ddZ  ZS )PhiAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    s"  t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j| j |jdd| _t| j|j | _|j| _| jrtj|j|j |jdd| _tj|j|j |jdd| _d S )Nr8   g      TZbias)epsZelementwise_affine)super__init__rU   rV   getattrhidden_sizeZnum_attention_headsr8   r6   rG   r@   attention_dropoutZ	is_causalrJ   Linearq_projk_projv_projdenseintZpartial_rotary_factorrotary_ndimsqk_layernorm	LayerNormlayer_norm_epsq_layernormk_layernormselfrU   rV   	__class__r&   r'   rZ   m   s*    
zPhiAttention.__init__past_key_valuepast_key_values4.58new_nameversionN)r0   position_embeddingsr?   ro   cache_positionr2   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| jr| |	}	| 	|
}
|\}}|	dd | j
f |	d| j
d f  }}|
dd | j
f |
d| j
d f  }}t||||\}}tj||fdd}	tj||fdd}
|d ur:|||d}||
|| j|\}
}t}| jjdkrXt| jj }|| |	|
||f| jsrdn| j| jd	|\}}|jg |dR   }| |}||fS )
Nr   r   r   .r   )r-   r,   ru   eagerr:   )rA   r@   )r!   r8   r_   viewrI   r`   ra   re   rh   ri   rd   r/   r"   r#   updaterV   rR   rU   Z_attn_implementationr   rF   r]   r@   r4   rL   rb   )rk   r0   rt   r?   ro   ru   rB   Zinput_shapeZhidden_shapeZquery_statesrM   rN   r,   r-   Z	query_rotZ
query_passZkey_rotZkey_passZcache_kwargsZattention_interfacerQ   rO   r&   r&   r'   forward   sN    





zPhiAttention.forward)NN)__name__
__module____qualname____doc__r   rc   rZ   r   r"   Tensortupler   r   
LongTensorry   __classcell__r&   r&   rl   r'   rS   j   s     rS   c                       s0   e Zd Z fddZejejdddZ  ZS )PhiMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)rY   rZ   rU   r   Z
hidden_actactivation_fnrJ   r^   r\   Zintermediate_sizefc1fc2rk   rU   rl   r&   r'   rZ      s
    
zPhiMLP.__init__)r0   r2   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )rk   r0   r&   r&   r'   ry      s    


zPhiMLP.forward)rz   r{   r|   rZ   r"   r~   ry   r   r&   r&   rl   r'   r      s   r   c                       s   e Zd Zeed fddZedddddeje	ej e	ej
 e	eej  e	e e	e e	ej
 e	eejejf  eeje	eejejf  f d
	ddZ  ZS )PhiDecoderLayerrT   c                    sH   t    t||d| _t|| _tj|j|j	d| _
t|j| _d S )N)rV   rX   )rY   rZ   rS   	self_attnr   mlprJ   rf   r\   rg   input_layernormDropoutZresid_pdropresid_dropoutrj   rl   r&   r'   rZ      s
    

zPhiDecoderLayer.__init__rn   ro   rp   rq   NF)	r0   r?   r.   ro   output_attentions	use_cacheru   rt   r2   c	                 K   sr   |}
|  |}| jf ||||||||d|	\}}| |}| | |}|| |
 }|f}|rn||f7 }|S )N)r0   r?   r.   ro   r   r   ru   rt   )r   r   r   r   )rk   r0   r?   r.   ro   r   r   ru   rt   rB   ZresidualZattn_outputsZself_attn_weightsZfeed_forward_hidden_statesoutputsr&   r&   r'   ry      s*    
	


zPhiDecoderLayer.forward)NNNFFNN)rz   r{   r|   r   rc   rZ   r   r"   r~   r   r   r   boolFloatTensorry   r   r&   r&   rl   r'   r      s(          r   c                       sD   e Zd ZU ejed< ded fddZe e	dd Z
  ZS )	PhiRotaryEmbeddinginv_freqNrU   c                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)rY   rZ   hasattr
isinstancer   dictgetr   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenrU   r   Zrope_init_fnattention_scalingZregister_bufferr   Zoriginal_inv_freq)rk   rU   devicer   rl   r&   r'   rZ     s    
zPhiRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|ddV | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   r   r   ZmpscpuF)device_typeZenabledr   r   )rD   )r   floatr3   r!   rK   r   r   r   strr"   ZautocastrI   r#   r,   r   r-   rD   )
rk   r$   r.   Zinv_freq_expandedZposition_ids_expandedr   ZfreqsZembr,   r-   r&   r&   r'   ry     s    0&,zPhiRotaryEmbedding.forward)N)rz   r{   r|   r"   r~   __annotations__r   rZ   Zno_gradr   ry   r   r&   r&   rl   r'   r     s
   

r   c                   @   sH   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )PhiPreTrainedModelrU   modelTr   ro   )r0   
attentionsN)rz   r{   r|   r   r   Zbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnZ_can_compile_fullgraphZ_supports_attention_backendr   rS   Z_can_record_outputsr&   r&   r&   r'   r   '  s   
r   c                       s   e Zd Zed fddZeedeej	 eej
 eej	 ee eej ee ee ee eej	 ee edddZ  ZS )	PhiModelr   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t d| _d| _t j| _tj j jd| _|   d S )Nc                    s   g | ]}t  |qS r&   )r   ).0rV   r   r&   r'   
<listcomp>C      z%PhiModel.__init__.<locals>.<listcomp>r   Fr   )rY   rZ   Zpad_token_idZpadding_idx
vocab_sizerJ   Z	Embeddingr\   embed_tokensZ
ModuleListrangenum_hidden_layerslayersr   
rotary_embgradient_checkpointingr   Z
embd_pdropembed_dropoutrf   rg   final_layernorm	post_initr   rl   r   r'   rZ   <  s    zPhiModel.__init__N)	input_idsr?   r.   ro   inputs_embedsr   r   output_hidden_statesru   rB   r2   c
                 K   s  |d ur|n| j j}|d ur |n| j j}|d ur4|n| j j}|d u |d uA rTtd| jrr| jrr|rrtd d}|d u r| 	|}|r|d u rt
| j d}|	d u r|d ur| nd}tj|||jd  |jd}	|d u r|	d}t| j |||	||d}| |}|}| ||}|r d	nd }|r.d	nd }| jd | j j D ]R}|rX||f7 }||f||||||	|d
|
}|d }|rD||d f7 }qD| |}|r||f7 }t||r|nd ||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   r   )r   )rU   Zinput_embedsr?   ru   ro   r.   r&   )r?   r.   ro   r   r   ru   rt   )last_hidden_statero   r0   r   )rU   r   r   r   
ValueErrorr   rF   loggerZwarning_oncer   r   Zget_seq_lengthr"   Zaranger!   r   r)   r
   r   r   r   r   r   r   )rk   r   r?   r.   ro   r   r   r   r   ru   rB   Zpast_seen_tokensrP   r0   rt   Zall_hidden_statesZall_self_attnsZdecoder_layerZlayer_outputsr&   r&   r'   ry   M  s~    

	

	

zPhiModel.forward)	NNNNNNNNN)rz   r{   r|   r   rZ   r   r   r   r"   r   r~   r   r   r   r   r   r   ry   r   r&   r&   rl   r'   r   :  s2            r   c                       s   e Zd ZdgZddiZddgdgfiZ fddZeede	e
j e	e
j e	e
j e	e e	e
j e	e
j e	e e	e
j eee
jf ee ed
ddZ  ZS )PhiForCausalLMzlm_head.weightlm_headZcolwise_repr0   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S )NTrW   )
rY   rZ   r   r   r   rJ   r^   r\   r   r   r   rl   r&   r'   rZ     s
    
zPhiForCausalLM.__init__Nr   )r   r?   r.   ro   r   labelsr   ru   logits_to_keeprB   r2   c
              
   K   s   | j f |||||||d|
}|j}t|	tr<t|	 dn|	}| |dd|ddf }d}|dur| jf ||| jjd|
}t	|||j
|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, PhiForCausalLM

        >>> model = PhiForCausalLM.from_pretrained("meta-phi/Phi-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-phi/Phi-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class PhiForSequenceClassification(GenericForSequenceClassification, PhiPreTrainedModel):
    pass


class PhiForTokenClassification(GenericForTokenClassification, PhiPreTrainedModel):
    pass


__all__ = [
    "PhiPreTrainedModel",
    "PhiModel",
    "PhiForCausalLM",
    "PhiForSequenceClassification",
    "PhiForTokenClassification",
]