   @   s0  d dl mZmZmZ d dlZd dlm  mZ d dlmZ ddl	m
Z
 ddlmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- edG dd dej.Z/G dd dej.Z0dd Z1d6ddZ2ej3e4ej3ddd Z5d7ej.ej3ej3ej3eej3 e6e6e#e% d"d#d$Z7G d%d& d&ej.Z8G d'd( d(ej.Z9G d)d* d*ej.Z:G d+d, d,ej.Z;G d-d. d.eZ<e&G d/d0 d0e!Z=e&G d1d2 d2e=Z>e&G d3d4 d4e=eZ?g d5Z@dS )8    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )Dots1ConfigZRMSNormc                       sB   e Zd Zdedd fddZejejdddZd	d
@use_kernel_forward_from_hub("RMSNorm")
class Dots1RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
        """
        Dots1RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
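
# Usage sketch (illustrative, not part of the original file): RMSNorm rescales by the
# root-mean-square of the features and has no mean subtraction or bias, unlike LayerNorm.
#
#   >>> norm = Dots1RMSNorm(hidden_size=8, eps=1e-6)
#   >>> y = norm(torch.randn(2, 4, 8))  # same shape; computed in float32, cast back to input dtype
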
class Dots1RotaryEmbedding(nn.Module):
    inv_freq: torch.Tensor

    def __init__(self, config: Dots1Config, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
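
# Shape sketch (illustrative): given x of shape (batch, seq, ...) and position_ids of
# shape (batch, seq), forward returns cos and sin of shape (batch, seq, head_dim),
# which apply_rotary_pos_emb below broadcasts against (batch, heads, seq, head_dim) q/k.
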
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

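# Worked example (illustrative; `rope` and `cfg` are hypothetical locals, not part of this file):
#
#   >>> b, h, s, d = 1, 2, 5, 16
#   >>> q, k = torch.randn(b, h, s, d), torch.randn(b, h, s, d)
#   >>> rope = Dots1RotaryEmbedding(cfg)                      # cfg: a Dots1Config with head_dim 16
#   >>> cos, sin = rope(q, torch.arange(s)[None])             # each of shape (1, 5, 16)
#   >>> q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)   # unsqueeze_dim=1 broadcasts over heads
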
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

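# Worked example (illustrative): with 32 query heads and 8 KV heads, n_rep = 4,
# so the KV tensor is expanded head-wise before the matmul in eager attention:
#
#   >>> repeat_kv(torch.randn(1, 8, 10, 64), 4).shape
#   torch.Size([1, 32, 10, 64])
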
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        # `attention_mask` is an additive float mask already broadcast to
        # (batch, 1, q_len, kv_len); trim it to the cached KV length.
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
class Dots1Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Dots1Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.q_norm = Dots1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Dots1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Dots1MLP(nn.Module):
    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
        self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class Dots1MoE(nn.Module):
    """
    A mixed expert module containing shared experts.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.experts = nn.ModuleList(
            [Dots1MLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(config.n_routed_experts)]
        )
        self.gate = Dots1TopkRouter(config)
        self.shared_experts = Dots1MLP(
            config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts
        )

    def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
        r"""
        CALL FOR CONTRIBUTION! I don't have time to optimise this right now, but expert weights need to be fused
        to not have to do a loop here (deepseek has 256 experts soooo yeah).
        """
        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
        expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts))
        expert_mask = expert_mask.permute(2, 0, 1)

        for expert_idx in range(len(self.experts)):
            expert = self.experts[expert_idx]
            mask = expert_mask[expert_idx]
            token_indices, weight_indices = torch.where(mask)

            if token_indices.numel() > 0:
                expert_weights = topk_weights[token_indices, weight_indices]
                expert_input = hidden_states[token_indices]
                expert_output = expert(expert_input)
                weighted_output = expert_output * expert_weights.unsqueeze(-1)
                final_hidden_states.index_add_(0, token_indices, weighted_output)

        return final_hidden_states.type(hidden_states.dtype)

    def forward(self, hidden_states):
        residuals = hidden_states
        orig_shape = hidden_states.shape
        topk_indices, topk_weights = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
        hidden_states = hidden_states + self.shared_experts(residuals)
        return hidden_states
class Dots1TopkRouter(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.n_group = config.n_group
        self.topk_group = config.topk_group
        self.norm_topk_prob = config.norm_topk_prob

        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size)))
        self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts))

    @torch.no_grad()
    def get_topk_indices(self, scores):
        scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)
        group_scores = (
            scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group)
            .topk(2, dim=-1)[0]
            .sum(dim=-1)
        )
        group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, group_idx, 1)
        score_mask = (
            group_mask.unsqueeze(-1)
            .expand(-1, self.n_group, self.n_routed_experts // self.n_group)
            .reshape(-1, self.n_routed_experts)
        )
        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
        topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1]
        return topk_indices

    def forward(self, hidden_states):
        hidden_states = hidden_states.view(-1, self.config.hidden_size)
        router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))
        scores = router_logits.sigmoid()
        topk_indices = self.get_topk_indices(scores)
        topk_weights = scores.gather(1, topk_indices)
        if self.norm_topk_prob:
            denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20
            topk_weights /= denominator
        topk_weights = topk_weights * self.routed_scaling_factor
        return topk_indices, topk_weights
class Dots1DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Dots1Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Dots1Attention(config=config, layer_idx=layer_idx)

        if layer_idx >= config.first_k_dense_replace:
            self.mlp = Dots1MoE(config)
        else:
            self.mlp = Dots1MLP(config)

        self.input_layernorm = Dots1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Dots1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.attention_type = config.layer_types[layer_idx]

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states
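
# Block structure note (sketch): a standard pre-norm transformer block, except that
# layers with index >= config.first_k_dense_replace swap the dense Dots1MLP for
# Dots1MoE, and each layer's `attention_type` selects a full or sliding-window mask.
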
zDots1DecoderLayer.forward)NNNFNN)r;   r<   r=   r   r   r#   r   r%   r?   r   r   r   r   r8   r   r   r7   r@   r,   r,   r*   r-   r   l  s&         r   c                       sX   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZ fdd	Z  ZS )
Dots1PreTrainedModelrD   modelTr   r   F)r.   
attentionsc                    s0   t  | t|tr,|jjjd| jjd d S )Nrm   )r6   Zstd)	r"   _init_weightsrK   r   r&   dataZnormal_rD   Zinitializer_range)r(   rn   r*   r,   r-   r     s    
z"Dots1PreTrainedModel._init_weights)r;   r<   r=   r   r^   Zbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnZ_can_compile_fullgraphZ_supports_attention_backendr   r   Z_can_record_outputsr   r@   r,   r,   r*   r-   r     s   
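
# Note (an inference, not stated in the file): `_can_compile_fullgraph = False` is
# consistent with the MoE dispatch above, whose torch.where/index_add_ loop produces
# data-dependent shapes that full-graph compilation cannot trace.
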
@auto_docstring
class Dots1Model(Dots1PreTrainedModel):
    def __init__(self, config: Dots1Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Dots1DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Dots1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Dots1RotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        self.has_sliding_layers = "sliding_attention" in self.config.layer_types

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # It may already have been prepared by e.g. `generate`
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            # The sliding-window layers are only present depending on the config
            if self.has_sliding_layers:
                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )
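
# Minimal usage sketch (illustrative; the checkpoint name is the one used in the
# generation example below):
#
#   >>> from transformers import AutoTokenizer, Dots1Model
#   >>> model = Dots1Model.from_pretrained("rednote-hilab/dots1.llm1.inst")
#   >>> tok = AutoTokenizer.from_pretrained("rednote-hilab/dots1.llm1.inst")
#   >>> out = model(**tok("Hello", return_tensors="pt"))
#   >>> out.last_hidden_state.shape  # (batch, seq_len, config.hidden_size)
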
zDots1Model.forward)NNNNNNN)r;   r<   r=   r   r#   r   r   r   r%   r   r?   r   FloatTensorr   r   r   r   r7   r@   r,   r,   r*   r-   r     s*          r   c                       s   e Zd ZdgZddiZddgdgfiZ fddZeede	e
j e	e
j e	e
j e	e e	e
j e	e
j e	e e	e
j eee
jf ee ed
ddZ  ZS )Dots1ForCausalLMzlm_head.weightlm_headZcolwise_repr.   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S r   )
r"   r#   r   r   r   r   r   r)   r   r   r   r*   r,   r-   r#     s
    
zDots1ForCausalLM.__init__Nr   )r   rr   r]   r   r   labelsr   r   logits_to_keepru   r!   c
              
   K   s   | j f |||||||d|
}|j}t|	tr<t|	 dn|	}| |dd|ddf }d}|dur| jf ||| jjd|
}t	|||j
|j|jdS )a~  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Dots1ForCausalLM

        >>> model = Dots1ForCausalLM.from_pretrained("rednote-hilab/dots1.llm1.inst")
        >>> tokenizer = AutoTokenizer.from_pretrained("rednote-hilab/dots1.llm1.inst")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["Dots1PreTrainedModel", "Dots1Model", "Dots1ForCausalLM"]