"""PyTorch Phi-3 model."""

from typing import Callable, Optional

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg
from ..mistral.modeling_mistral import (
    MistralDecoderLayer,
    MistralForCausalLM,
    MistralForSequenceClassification,
    MistralForTokenClassification,
    MistralPreTrainedModel,
    eager_attention_forward,
    rotate_half,
)
from .configuration_phi3 import Phi3Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
_CONFIG_FOR_DOC = "Phi3Config"


class Phi3MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        up_states = self.gate_up_proj(hidden_states)
        gate, up_states = up_states.chunk(2, dim=-1)
        up_states = up_states * self.activation_fn(gate)
        return self.down_proj(up_states)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # Phi-3 applies partial rotary embeddings: only the first `rotary_dim` channels are rotated.
    rotary_dim = cos.shape[-1]
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

    q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)
    k_embed = torch.cat([(k_rot * cos) + (rotate_half(k_rot) * sin), k_pass], dim=-1)
    return q_embed, k_embed


class Phi3Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        # Single fused projection for query, key, and value states.
        op_size = config.num_attention_heads * self.head_dim + 2 * (config.num_key_value_heads * self.head_dim)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.qkv_proj = nn.Linear(config.hidden_size, op_size, bias=False)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        # Split the fused QKV projection back into query, key, and value states.
        qkv = self.qkv_proj(hidden_states)
        query_pos = self.config.num_attention_heads * self.head_dim
        query_states = qkv[..., :query_pos]
        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]

        query_states = query_states.view(hidden_shape).transpose(1, 2)
        key_states = key_states.view(hidden_shape).transpose(1, 2)
        value_states = value_states.view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=getattr(self.config, "sliding_window", None),
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Phi3DecoderLayer(MistralDecoderLayer):
    def __init__(self, config: Phi3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        self.config = config
        self.self_attn = Phi3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Phi3MLP(config)
        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self attention with residual dropout.
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + self.resid_attn_dropout(hidden_states)

        # MLP with residual dropout.
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + self.resid_mlp_dropout(hidden_states)
        return hidden_states


class Phi3PreTrainedModel(MistralPreTrainedModel):
    _version = "0.0.5"


class Phi3ForCausalLM(MistralForCausalLM):
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        logits_to_keep=None,
        **kwargs,
    ):
        # When the input length first crosses `original_max_position_embeddings`, the rope scaling switches from the
        # short to the long factor, so the cache computed with the short factor must be discarded and recomputed.
        if (
            past_key_values
            and self.config.rope_scaling
            and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1
        ):
            past_length = cache_position[0]
            if past_length <= self.config.original_max_position_embeddings:
                past_key_values = None

        model_inputs = GenerationMixin.prepare_inputs_for_generation(
            self,
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return model_inputs


class Phi3ForSequenceClassification(MistralForSequenceClassification):
    pass


class Phi3ForTokenClassification(MistralForTokenClassification):
    pass


__all__ = [
    "Phi3PreTrainedModel",
    "Phi3Model",
    "Phi3ForCausalLM",
    "Phi3ForSequenceClassification",
    "Phi3ForTokenClassification",
]