a
    ½ÀhQ  ã                
   @   s<  d dl mZmZmZ d dlZd dlmZ d dlmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- G dd„ dej.ƒZ/dd„ Z0d8dd„Z1ej2e3ej2dœdd„Z4d9ej.ej2ej2ej2eej2 e5e5e%e' dœd d!„Z6G d"d#„ d#ej.ƒZ7ed$ƒG d%d&„ d&ej.ƒƒZ8G d'd(„ d(eƒZ9e(G d)d*„ d*e#ƒƒZ:G d+d,„ d,ej.ƒZ;e(G d-d.„ d.e:ƒƒZ<e(G d/d0„ d0e:eƒƒZ=G d1d2„ d2ee:ƒZ>G d3d4„ d4ee:ƒZ?G d5d6„ d6ee:ƒZ@g d7¢ZAdS ):é    )ÚCallableÚOptionalÚUnionN)Únn)Úcheck_model_inputsé   )ÚACT2FN)ÚCacheÚDynamicCache)ÚGenerationMixin)Úuse_kernel_forward_from_hub)Úcreate_causal_maskÚ!create_sliding_window_causal_mask)ÚFlashAttentionKwargs)ÚGenericForQuestionAnsweringÚ GenericForSequenceClassificationÚGenericForTokenClassificationÚGradientCheckpointingLayer)ÚBaseModelOutputWithPastÚCausalLMOutputWithPast)ÚROPE_INIT_FUNCTIONSÚdynamic_rope_update)ÚALL_ATTENTION_FUNCTIONSÚPreTrainedModel)ÚUnpack)ÚTransformersKwargsÚauto_docstringÚcan_return_tuple)Údeprecate_kwargé   )ÚMistralConfigc                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )Ú
MistralMLPc                    sr   t ƒ  ¡  || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S ©NF©Zbias)ÚsuperÚ__init__ÚconfigÚhidden_sizeZintermediate_sizer   ÚLinearÚ	gate_projÚup_projÚ	down_projr   Z
hidden_actÚact_fn©Úselfr&   ©Ú	__class__© úh/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/mistral/modeling_mistral.pyr%   $   s    
zMistralMLP.__init__c                 C   s$   |   |  |  |¡¡|  |¡ ¡}|S )N)r+   r,   r)   r*   )r.   Úxr+   r1   r1   r2   Úforward.   s     zMistralMLP.forward)Ú__name__Ú
__module__Ú__qualname__r%   r4   Ú__classcell__r1   r1   r/   r2   r!   #   s   
r!   c                 C   sH   | dd| j d d …f }| d| j d d d…f }tj| |fddS )z*Rotates half the hidden dims of the input..Néÿÿÿÿé   ©Údim)ÚshapeÚtorchÚcat)r3   Úx1Zx2r1   r1   r2   Úrotate_half3   s    rA   c                 C   sD   |  |¡}|  |¡}| | t| ƒ|  }|| t|ƒ|  }||fS )aÛ  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )Ú	unsqueezerA   )ÚqÚkÚcosÚsinÚposition_idsZunsqueeze_dimZq_embedZk_embedr1   r1   r2   Úapply_rotary_pos_emb:   s
    

rH   )Úhidden_statesÚn_repÚreturnc                 C   s^   | j \}}}}|dkr| S | dd…dd…ddd…dd…f  |||||¡} |  ||| ||¡S )zÔ
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r=   ÚexpandÚreshape)rI   rJ   ÚbatchÚnum_key_value_headsÚslenÚhead_dimr1   r1   r2   Ú	repeat_kvU   s
    0rR   ç        )ÚmoduleÚqueryÚkeyÚvalueÚattention_maskÚscalingÚdropoutÚkwargsc                 K   sº   t || jƒ}t || jƒ}	t || dd¡¡| }
|d urf|d d …d d …d d …d |jd …f }|
| }
tjj|
dtj	d 
|j¡}
tjj|
|| jd}
t |
|	¡}| dd¡ ¡ }||
fS )Nr:   r   éþÿÿÿr9   )r<   Údtype)ÚpÚtrainingr   )rR   Únum_key_value_groupsr>   ÚmatmulÚ	transposer=   r   Z
functionalZsoftmaxÚfloat32Útor]   rZ   r_   Ú
contiguous)rT   rU   rV   rW   rX   rY   rZ   r[   Ú
key_statesÚvalue_statesÚattn_weightsÚcausal_maskÚattn_outputr1   r1   r2   Úeager_attention_forwarda   s    
&rk   c                       s„   e Zd ZdZeedœ‡ fdd„Zedddddej	e
ej	ej	f eej	 ee eej ee e
ej	eej	 f d
œdd„ƒZ‡  ZS )ÚMistralAttentionz=Multi-headed attention from 'Attention Is All You Need' paper©r&   Ú	layer_idxc                    sÌ   t ƒ  ¡  || _|| _t|dd ƒp,|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j| j |jdd| _d S )NrQ   g      à¿TFr#   )r$   r%   r&   rn   Úgetattrr'   Znum_attention_headsrQ   rO   r`   rY   Úattention_dropoutZ	is_causalr   r(   Úq_projÚk_projÚv_projÚo_proj©r.   r&   rn   r/   r1   r2   r%   ~   s    
zMistralAttention.__init__Úpast_key_valueÚpast_key_valuesú4.58©Únew_nameÚversionN)rI   Úposition_embeddingsrX   rw   Úcache_positionr[   rK   c                 K   s0  |j d d… }g |¢d‘| j‘R }|  |¡ |¡ dd¡}	|  |¡ |¡ dd¡}
|  |¡ |¡ dd¡}|\}}t|	|
||ƒ\}	}
|d ur®|||dœ}| |
|| j	|¡\}
}t
}| jjdkrÊt| jj }|| |	|
||f| jsâdn| j| jt| jdd ƒdœ|¤Ž\}}|jg |¢d‘R Ž  ¡ }|  |¡}||fS )	Nr9   r   r:   )rF   rE   r}   ÚeagerrS   Úsliding_window)rZ   rY   r   )r=   rQ   rq   Úviewrb   rr   rs   rH   Úupdatern   rk   r&   Z_attn_implementationr   r_   rp   rY   ro   rM   re   rt   )r.   rI   r|   rX   rw   r}   r[   Zinput_shapeZhidden_shapeZquery_statesrf   rg   rE   rF   Zcache_kwargsZattention_interfacerj   rh   r1   r1   r2   r4   Œ   s:    
ûø	÷

zMistralAttention.forward)NN)r5   r6   r7   Ú__doc__r    Úintr%   r   r>   ÚTensorÚtupler   r	   Ú
LongTensorr   r   r4   r8   r1   r1   r/   r2   rl   {   s     úørl   ZRMSNormc                       s.   e Zd Zd‡ fdd„	Zdd„ Zdd„ Z‡  ZS )	ÚMistralRMSNormçíµ ÷Æ°>c                    s&   t ƒ  ¡  t t |¡¡| _|| _dS )z=
        MistralRMSNorm is equivalent to T5LayerNorm
        N)r$   r%   r   Ú	Parameterr>   ZonesÚweightÚvariance_epsilon)r.   r'   Úepsr/   r1   r2   r%   ¼   s    
zMistralRMSNorm.__init__c                 C   sJ   |j }| tj¡}| d¡jddd}|t || j ¡ }| j| |¡ S )Nr:   r9   T)Zkeepdim)	r]   rd   r>   rc   ÚpowÚmeanZrsqrtr‹   rŠ   )r.   rI   Zinput_dtypeZvariancer1   r1   r2   r4   Ä   s
    zMistralRMSNorm.forwardc                 C   s   t | jjƒ› d| j› S )Nz, eps=)r…   rŠ   r=   r‹   )r.   r1   r1   r2   Ú
extra_reprË   s    zMistralRMSNorm.extra_repr)rˆ   )r5   r6   r7   r%   r4   r   r8   r1   r1   r/   r2   r‡   º   s   r‡   c                       s„   e Zd Zeedœ‡ fdd„Zedddddeje	ej e	ej
 e	e e	e e	ej
 e	eejejf  ee ejd
œ	dd„ƒZ‡  ZS )ÚMistralDecoderLayerrm   c                    sR   t ƒ  ¡  |j| _t||d| _t|ƒ| _t|j|jd| _	t|j|jd| _
d S )Nrm   ©rŒ   )r$   r%   r'   rl   Ú	self_attnr!   Úmlpr‡   Úrms_norm_epsÚinput_layernormÚpost_attention_layernormru   r/   r1   r2   r%   Ð   s    

zMistralDecoderLayer.__init__rv   rw   rx   ry   NF)	rI   rX   rG   rw   Ú	use_cacher}   r|   r[   rK   c              
   K   s^   |}	|   |¡}| jf |||||||dœ|¤Ž\}}
|	| }|}	|  |¡}|  |¡}|	| }|S )N)rI   rX   rG   rw   r—   r}   r|   )r•   r’   r–   r“   )r.   rI   rX   rG   rw   r—   r}   r|   r[   ZresidualÚ_r1   r1   r2   r4   Ø   s&    
ùø



zMistralDecoderLayer.forward)NNNFNN)r5   r6   r7   r    rƒ   r%   r   r>   r„   r   r†   r	   Úboolr…   r   r   r4   r8   r1   r1   r/   r2   r   Ï   s&         øör   c                   @   sH   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedœZdS )ÚMistralPreTrainedModelr&   ÚmodelTr   rw   )rI   Ú
attentionsN)r5   r6   r7   r    Ú__annotations__Zbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnZ_can_compile_fullgraphZ_supports_attention_backendr   rl   Z_can_record_outputsr1   r1   r1   r2   rš   û   s   
þrš   c                       sD   e Zd ZU ejed< dedœ‡ fdd„Ze ¡ e	dd„ ƒƒZ
‡  ZS )	ÚMistralRotaryEmbeddingÚinv_freqN©r&   c                    s’   t ƒ  ¡  t|dƒr:t|jtƒr:|j d|j d¡¡| _nd| _|j| _	|j| _
|| _t| j | _|  | j|¡\}| _| jd|dd | j| _d S )NÚrope_scalingÚ	rope_typeÚtypeÚdefaultrŸ   F)Ú
persistent)r$   r%   ÚhasattrÚ
isinstancer¡   ÚdictÚgetr¢   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenr&   r   Zrope_init_fnÚattention_scalingZregister_bufferrŸ   Zoriginal_inv_freq)r.   r&   ÚdevicerŸ   r/   r1   r2   r%     s    
zMistralRotaryEmbedding.__init__c           
      C   s   | j d d d …d f  ¡  |jd dd¡ |j¡}|d d …d d d …f  ¡ }t|jjtƒrl|jjdkrl|jjnd}t	j
|ddV | ¡ | ¡   dd¡}t	j||fdd	}| ¡ | j }| ¡ | j }	W d   ƒ n1 sÚ0    Y  |j|jd
|	j|jd
fS )Nr   r9   r   ZmpsÚcpuF)Údevice_typeZenabledr:   r;   )r]   )rŸ   ÚfloatrL   r=   rd   r«   r§   r£   Ústrr>   Zautocastrb   r?   rE   rª   rF   r]   )
r.   r3   rG   Zinv_freq_expandedZposition_ids_expandedr­   ZfreqsZembrE   rF   r1   r1   r2   r4   "  s    0&,zMistralRotaryEmbedding.forward)N)r5   r6   r7   r>   r„   r   r    r%   Zno_gradr   r4   r8   r1   r1   r/   r2   rž     s
   

rž   c                       st   e Zd Zedœ‡ fdd„Zeedeej	 eej
 eej	 ee eej ee eej	 ee edœ	dd„ƒƒZ‡  ZS )	ÚMistralModelr    c                    s‚   t ƒ  ˆ ¡ ˆ j| _ˆ j| _t ˆ jˆ j| j¡| _t 	‡ fdd„t
ˆ jƒD ƒ¡| _tˆ jˆ jd| _tˆ d| _d| _|  ¡  d S )Nc                    s   g | ]}t ˆ |ƒ‘qS r1   )r   )Ú.0rn   r    r1   r2   Ú
<listcomp>;  ó    z)MistralModel.__init__.<locals>.<listcomp>r‘   r    F)r$   r%   Zpad_token_idZpadding_idxÚ
vocab_sizer   Z	Embeddingr'   Úembed_tokensZ
ModuleListÚrangeÚnum_hidden_layersÚlayersr‡   r”   Únormrž   Ú
rotary_embZgradient_checkpointingÚ	post_initr-   r/   r    r2   r%   4  s    ÿzMistralModel.__init__N)	Ú	input_idsrX   rG   rw   Úinputs_embedsr—   r}   r[   rK   c              
   K   s  |d u |d uA rt dƒ‚|d u r*|  |¡}|rB|d u rBt| jd}|d u rz|d urZ| ¡ nd}	tj|	|	|jd  |jd}|d u rŒ| 	d¡}| jj
d u rœtnt}
|
| j|||||d}|}|  ||¡}| jd | jj… D ]"}||f||||||dœ|¤Ž}qØ|  |¡}t||r|nd dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr    r   r   )r«   )r&   Zinput_embedsrX   r}   rw   rG   )rX   rG   rw   r—   r}   r|   )Úlast_hidden_staterw   )Ú
ValueErrorrµ   r
   r&   Zget_seq_lengthr>   Zaranger=   r«   rB   r   r   r   rº   r¸   r·   r¹   r   )r.   r¼   rX   rG   rw   r½   r—   r}   r[   Zpast_seen_tokensZmask_functionri   rI   r|   Zdecoder_layerr1   r1   r2   r4   D  sT    
ÿ
ú	ÿùø

þzMistralModel.forward)NNNNNNN)r5   r6   r7   r    r%   r   r   r   r>   r†   r„   r	   ÚFloatTensorr™   r   r   r   r4   r8   r1   r1   r/   r2   r°   2  s*          øör°   c                       s    e Zd ZdgZddiZddgdgfiZ‡ fdd„Zeede	e
j e	e
j e	e
j e	e e	e
j e	e
j e	e e	e
j eee
jf ee ed
œdd„ƒƒZ‡  ZS )ÚMistralForCausalLMzlm_head.weightÚlm_headZcolwise_reprI   Úlogitsc                    s@   t ƒ  |¡ t|ƒ| _|j| _tj|j|jdd| _|  	¡  d S r"   )
r$   r%   r°   r›   r´   r   r(   r'   rÂ   r»   r-   r/   r1   r2   r%   ˆ  s
    
zMistralForCausalLM.__init__Nr   )r¼   rX   rG   rw   r½   Úlabelsr—   r}   Úlogits_to_keepr[   rK   c
              
   K   sœ   | j f |||||||dœ|
¤Ž}|j}t|	tƒr<t|	 dƒn|	}|  |dd…|dd…f ¡}d}|dur„| jf ||| jjdœ|
¤Ž}t	|||j
|j|jdS )aÛ  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, MistralForCausalLM

        >>> model = MistralForCausalLM.from_pretrained("meta-mistral/Mistral-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-mistral/Mistral-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r¼   rX   rG   rw   r½   r—   r}   N)rÃ   rÄ   r´   )ÚlossrÃ   rw   rI   rœ   )r›   r¾   r§   rƒ   ÚslicerÂ   Zloss_functionr&   r´   r   rw   rI   rœ   )r.   r¼   rX   rG   rw   r½   rÄ   r—   r}   rÅ   r[   ÚoutputsrI   Zslice_indicesrÃ   rÆ   r1   r1   r2   r4   ‘  s0     ùøûzMistralForCausalLM.forward)	NNNNNNNNr   )r5   r6   r7   Z_tied_weights_keysZ_tp_planZ_pp_planr%   r   r   r   r>   r†   r„   r	   rÀ   r™   r   rƒ   r   r   r   r4   r8   r1   r1   r/   r2   rÁ   ‚  s8   	         öôrÁ   c                   @   s   e Zd ZdS )ÚMistralForTokenClassificationN©r5   r6   r7   r1   r1   r1   r2   rÉ   Î  s   rÉ   c                   @   s   e Zd ZdS )Ú MistralForSequenceClassificationNrÊ   r1   r1   r1   r2   rË   Ò  s   rË   c                   @   s   e Zd ZdS )ÚMistralForQuestionAnsweringNrÊ   r1   r1   r1   r2   rÌ   Ö  r³   rÌ   )rÁ   rÌ   r°   rš   rË   rÉ   )Nr   )rS   )BÚtypingr   r   r   r>   r   Ztransformers.utils.genericr   Zactivationsr   Zcache_utilsr	   r
   Z
generationr   Zintegrationsr   Zmasking_utilsr   r   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   r   r   r   Zmodeling_outputsr   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   Úutilsr   r   r   Zutils.deprecationr   Zconfiguration_mistralr    ÚModuler!   rA   rH   r„   rƒ   rR   r®   rk   rl   r‡   r   rš   rž   r°   rÁ   rÉ   rË   rÌ   Ú__all__r1   r1   r1   r2   Ú<module>   s^   
 ùø?,$OK