"""PyTorch CodeGen model."""

from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available, logging
from .configuration_codegen import CodeGenConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
    return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)


def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    # Interleave (-x2, x1) pairs along the last dimension.
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)


def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
    return (tensor * cos) + (rotate_every_two(tensor) * sin)


class CodeGenAttention(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        max_positions = config.max_position_embeddings

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.embed_dim = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_attention_heads
        if self.head_dim * self.num_attention_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
                f" `num_attention_heads`: {self.num_attention_heads})."
            )
        self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
        self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)

        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.rotary_dim = config.rotary_dim
        pos_embd_dim = self.rotary_dim or self.embed_dim
        self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)

    def _split_heads(self, x, n_head, dim_head, mp_num):
        reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
        reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
        return reshaped

    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
        """
        if len(tensor.shape) == 5:
            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
        elif len(tensor.shape) == 4:
            tensor = tensor.permute(0, 2, 1, 3).contiguous()
        else:
            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
        new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
        return tensor.view(new_shape)

    def _attn(
        self,
        query,
        key,
        value,
        attention_mask=None,
        head_mask=None,
    ):
        # Compute attention weights in fp32 to avoid overflow issues.
        query = query.to(torch.float32)
        key = key.to(torch.float32)

        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key.shape[-2]]
            attn_weights += causal_mask

        attn_weights = attn_weights / self.scale_attn
        attn_weights = nn.Softmax(dim=-1)(attn_weights)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if requested.
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[
        tuple[torch.Tensor, tuple[torch.Tensor]],
        Optional[tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]]],
    ]:
        qkv = self.qkv_proj(hidden_states)
        mp_num = 4
        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))

        local_dim = self.head_dim * self.num_attention_heads // mp_num
        query, value, key = torch.split(qkv_split, local_dim, dim=-1)
        query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num)

        value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        value = value.permute(0, 2, 1, 3)

        embed_positions = self.embed_positions
        if embed_positions.device != position_ids.device:
            embed_positions = embed_positions.to(position_ids.device)
            self.embed_positions = embed_positions

        sincos = embed_positions[position_ids]
        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)

        if self.rotary_dim is not None:
            # Apply rotary embeddings only to the first `rotary_dim` channels.
            k_rot = key[:, :, :, : self.rotary_dim]
            k_pass = key[:, :, :, self.rotary_dim :]

            q_rot = query[:, :, :, : self.rotary_dim]
            q_pass = query[:, :, :, self.rotary_dim :]

            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

            key = torch.cat([k_rot, k_pass], dim=-1)
            query = torch.cat([q_rot, q_pass], dim=-1)
        else:
            key = apply_rotary_pos_emb(key, sin, cos)
            query = apply_rotary_pos_emb(query, sin, cos)

        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        if layer_past is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_dim,
                "cache_position": cache_position,
            }
            key, value = layer_past.update(key, value, self.layer_idx, cache_kwargs)

        # Self-attention: softmax(QK^T) V
        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        return attn_output, attn_weights


class CodeGenMLP(nn.Module):
    def __init__(self, intermediate_size, config):
        super().__init__()
        embed_dim = config.n_embd

        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class CodeGenBlock(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CodeGenAttention(config, layer_idx)
        self.mlp = CodeGenMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs, attn_weights = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )

        feed_forward_hidden_states = self.mlp(hidden_states)
        hidden_states = attn_outputs + feed_forward_hidden_states + residual

        return hidden_states, attn_weights


@auto_docstring
class CodeGenPreTrainedModel(PreTrainedModel):
    config: CodeGenConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CodeGenBlock"]
    _skip_keys_device_placement = "past_key_values"
    _can_compile_fullgraph = True

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring
class CodeGenModel(CodeGenPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.n_embd
        self.vocab_size = config.vocab_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([CodeGenBlock(config, layer_idx=i) for i in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPast]:
        r"""
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        seq_length = inputs_embeds.shape[1]
        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # Prepare head mask if needed: 1.0 in head_mask means the head is kept.
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        hidden_states = inputs_embeds

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, seq_length)
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)
        output_shape = (-1, seq_length, hidden_states.size(-1))

        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=past_key_values,
                attention_mask=causal_mask,
                position_ids=position_ids,
                head_mask=head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)

        hidden_states = self.ln_f(hidden_states)
        hidden_states = hidden_states.view(output_shape)
        # Add the last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # For SDPA, rely on its `is_causal` argument when possible instead of building an explicit mask.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # If the provided attention mask is 2D, build the 4D causal mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows (e.g. relevant first rows with left padding), as required
            # by the memory-efficient path of F.scaled_dot_product_attention.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume the mask already comes in inverted form and needs no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@auto_docstring(
    custom_intro="""
    The CodeGen Model transformer with a language modeling head on top.
    """
)
class CodeGenForCausalLM(CodeGenPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = CodeGenModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        # Compute logits in fp32 so that sampling in fp16 works correctly.
        lm_logits = self.lm_head(hidden_states).to(torch.float32)

        loss = None
        if labels is not None:
            # Move labels to the logits device to enable model parallelism.
            labels = labels.to(lm_logits.device)
            loss = self.loss_function(lm_logits, labels, vocab_size=self.config.vocab_size, **kwargs)
            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = ["CodeGenForCausalLM", "CodeGenModel", "CodeGenPreTrainedModel"]