# NOTE: human-readable reconstruction of the byte-compiled module
# `transformers/models/diffllama/modular_diffllama.py`. Names, docstrings, constants and the
# differential-attention logic come from the strings embedded in the bytecode; method bodies are a
# best-effort reconstruction and may differ in minor details from the shipped source.

import math
from typing import Optional

import torch
from torch import nn

from ...cache_utils import Cache, StaticCache
from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg
from ..gemma.modeling_gemma import GemmaForCausalLM
from ..llama.modeling_llama import (
    LlamaDecoderLayer,
    LlamaForQuestionAnswering,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaModel,
    LlamaPreTrainedModel,
    apply_rotary_pos_emb,
    repeat_kv,
)
from ..mistral.modeling_mistral import MistralMLP
from .configuration_diffllama import DiffLlamaConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "kajuma/DiffLlama-0.3B-handcut"
_CONFIG_FOR_DOC = "DiffLlamaConfig"


class DiffLlamaMLP(MistralMLP):
    pass


def lambda_init_fn(layer_idx):
    # Depth-dependent initial value of the differential-attention lambda.
    return 0.8 - 0.6 * math.exp(-0.3 * layer_idx)


class DiffLlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)

        # Differential attention: two learned lambda vector pairs plus a depth-dependent offset,
        # and an RMSNorm ("groupnorm") applied over each pair of concatenated value heads.
        self.lambda_init = lambda_init_fn(layer_idx)
        self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
        self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        bsz, target_len, _ = hidden_states.size()
        q_len = target_len

        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Pair the value heads: both halves of the attention heads attend over the same doubled
        # value vectors, so the two attention maps of each pair can be subtracted below.
        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn_output = torch.matmul(attn_weights, value_states)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, target_len, -1)

        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class DiffLlamaFlashAttention2(DiffLlamaAttention):
    """
    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        if isinstance(past_key_values, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at "
                "https://github.com/huggingface/transformers"
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Flash attention expects the layout [batch_size, seq_len, num_heads, head_dim]
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # If the inputs were silently upcast to float32 (e.g. by layer norms kept in fp32 for PEFT
        # training), cast them back to the working dtype before calling flash attention.
        input_dtype = query_states.dtype
        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = (
                    torch.get_autocast_dtype(device_type)
                    if hasattr(torch, "get_autocast_dtype")
                    else torch.get_autocast_gpu_dtype()
                )
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                "The input hidden states seems to be silently casted in float32, this might be related to the fact "
                "you have upcasted embedding or layer norm layers in float32. We will cast back the input in "
                f"{target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Run flash attention once per value-head half, then recombine with the differential lambdas.
        value_states1, value_states2 = torch.chunk(value_states, 2, dim=2)
        value_states1 = value_states1.repeat(1, 1, 2, 1)
        value_states2 = value_states2.repeat(1, 1, 2, 1)

        attn_output1 = _flash_attention_forward(
            query_states, key_states, value_states1, attention_mask, q_len,
            position_ids=position_ids, dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal,
        )
        attn_output2 = _flash_attention_forward(
            query_states, key_states, value_states2, attention_mask, q_len,
            position_ids=position_ids, dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal,
        )

        attn_output = torch.cat([attn_output1, attn_output2], dim=-1)
        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2)
        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, None


class DiffLlamaSdpaAttention(DiffLlamaAttention):
    """
    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
        value_states = value_states.repeat(1, 2, 1, 1)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with a custom attention mask on CUDA requires contiguous inputs.
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        is_causal = causal_mask is None and q_len > 1

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(query_states.dtype)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init

        attn_output = attn_output1 - lambda_full * attn_output2
        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)
        return attn_output, None


DIFFLLAMA_ATTENTION_CLASSES = {
    "eager": DiffLlamaAttention,
    "flash_attention_2": DiffLlamaFlashAttention2,
    "sdpa": DiffLlamaSdpaAttention,
}


class DiffLlamaDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: DiffLlamaConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.self_attn = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)


class DiffLlamaPreTrainedModel(LlamaPreTrainedModel):
    _supports_flex_attn = False
    _supports_attention_backend = False

    def _init_weights(self, module):
        PreTrainedModel._init_weights(self, module)
        if isinstance(module, DiffLlamaAttention):
            module.lambda_q1.data.normal_(0, self.config.lambda_std_dev)
            module.lambda_k1.data.normal_(0, self.config.lambda_std_dev)
            module.lambda_q2.data.normal_(0, self.config.lambda_std_dev)
            module.lambda_k2.data.normal_(0, self.config.lambda_std_dev)


class DiffLlamaModel(LlamaModel):
    pass


class DiffLlamaForCausalLM(GemmaForCausalLM):
    pass


class DiffLlamaForSequenceClassification(LlamaForSequenceClassification):
    pass


class DiffLlamaForQuestionAnswering(LlamaForQuestionAnswering):
    pass


class DiffLlamaForTokenClassification(LlamaForTokenClassification):
    pass


__all__ = [
    "DiffLlamaPreTrainedModel",
    "DiffLlamaModel",
    "DiffLlamaForCausalLM",
    "DiffLlamaForSequenceClassification",
    "DiffLlamaForQuestionAnswering",
    "DiffLlamaForTokenClassification",
]