"""PyTorch PLBART model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_attention_mask_for_sdpa,
)
from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput, Seq2SeqModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, is_torch_flex_attn_available
from ..bart.modeling_bart import (
    BartClassificationHead,
    BartDecoder,
    BartEncoder,
    BartForCausalLM,
    BartScaledWordEmbedding,
)
from ..bigbird_pegasus.modeling_bigbird_pegasus import BigBirdPegasusForSequenceClassification
from ..mbart.modeling_mbart import shift_tokens_right
from .configuration_plbart import PLBartConfig


if is_torch_flex_attn_available():
    from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask


class PLBartScaledWordEmbedding(BartScaledWordEmbedding):
    pass


@auto_docstring
class PLBartPreTrainedModel(PreTrainedModel):
    config: PLBartConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PLBartDecoderLayer", "PLBartEncoderLayer"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    def _update_full_mask(
        self,
        attention_mask: Union[torch.Tensor, None],
        inputs_embeds: torch.Tensor,
    ):
        if attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if 0 in attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(attention_mask, torch.Tensor):
                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

        return attention_mask

    def _update_causal_mask(
        self,
        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
    ):
        ...

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # The mask already comes in inverted 4D form and requires no further expansion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

    def _update_cross_attn_mask(
        self,
        encoder_hidden_states: Union[torch.Tensor, None],
        encoder_attention_mask: Union[torch.Tensor, None],
        input_shape: torch.Size,
        inputs_embeds: torch.Tensor,
    ):
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                )
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(encoder_attention_mask, torch.Tensor):
                    encoder_attention_mask = make_flex_block_causal_mask(
                        encoder_attention_mask, query_length=input_shape[-1], is_causal=False
                    )
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                )

        return encoder_attention_mask
L9r%   c                   @   s   e Zd ZdS )PLBartEncoderNr   r#   r#   r#   r$   rZ     s   rZ   c                   @   s   e Zd ZdS )PLBartDecoderNr   r#   r#   r#   r$   r[   
  s   r[   c                       s   e Zd ZddgZed fddZdd Zdd	 Zd
d Zdd Z	e
deej eej eej eej eej eej eej eeej  ee eej eej ee ee ee ee eej eeej ef dddZ  ZS )PLBartModelencoder.embed_tokens.weightdecoder.embed_tokens.weightr&   c                    sl   t  | |j|j }}|jr,t|jnd}t||j||d| _	t
|| j	| _t|| j	| _|   d S )Ng      ?)embed_scale)super__init__pad_token_id
vocab_sizeZscale_embeddingmathsqrtd_modelr   sharedrZ   encoderr[   decoderinit_weights)r4   r&   Zpadding_idxrd   r`   	__class__r#   r$   rb     s    zPLBartModel.__init__c                 C   s   | j S N)rh   r4   r#   r#   r$   get_input_embeddings  s    z PLBartModel.get_input_embeddingsc                 C   s   || _ | j | j_| j | j_d S rn   )rh   ri   embed_tokensrj   )r4   valuer#   r#   r$   set_input_embeddings!  s    
z PLBartModel.set_input_embeddingsc                 C   s0   | j jr,| | jj| j | | jj| j d S rn   )r&   Ztie_word_embeddingsZ_tie_or_clone_weightsri   rq   rh   rj   ro   r#   r#   r$   _tie_weights&  s    zPLBartModel._tie_weightsc                 C   s   | j S rn   )ri   ro   r#   r#   r$   get_encoder+  s    zPLBartModel.get_encoderN)	input_idsr(   decoder_input_idsdecoder_attention_mask	head_maskdecoder_head_maskcross_attn_head_maskencoder_outputsr8   r)   decoder_inputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictr7   returnc                 C   s6  |dur|n| j j}|dur |n| j j}|dur4|n| j j}|durH|n| j j}|du rn|du rnt|| j j}|du r| j||||
|||d}nH|rt|t	st	|d t
|dkr|d ndt
|dkr|d ndd}| j|||d ||||	||||||d}|s|| S t|j|j|j|j|j|j|j|jdS )	a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
            also be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        N)rv   r(   ry   r)   r   r   r   r   r   r   )last_hidden_statehidden_states
attentions)rv   r(   rQ   rR   ry   r{   r8   r)   r~   r   r   r   r7   )r   r8   decoder_hidden_statesdecoder_attentionscross_attentionsencoder_last_hidden_staterQ   encoder_attentions)r&   r   r   r~   use_return_dictr   rc   ri   r1   r   lenrj   r   r   r8   r   r   r   )r4   rv   r(   rw   rx   ry   rz   r{   r|   r8   r)   r}   r~   r   r   r   r7   Zdecoder_outputsr#   r#   r$   forward.  sd    1
zPLBartModel.forward)NNNNNNNNNNNNNNNN)r    r!   r"   _tied_weights_keysr   rb   rp   rs   rt   ru   r   r   r2   
LongTensorr3   listFloatTensorr   boolr   tupler   r   __classcell__r#   r#   rl   r$   r\     sT                   r\   zv
    The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.
    )Zcustom_introc                       s(  e Zd ZdZdgZg dZed fddZdd Zd	d
 Z	de
ee
 eejd fddZe
ddddZedeej eej eej eej eej eej eej eeej  ee eej eej eej ee ee ee ee eej eeej ef dddZejdddZ  ZS )PLBartForConditionalGenerationr'   final_logits_bias)r]   r^   zlm_head.weightr_   c                    sX   t  | t|| _| dtd| jjjf t	j
|j| jjjdd| _|   d S )Nr   r   F)Zbias)ra   rb   r\   r'   register_bufferr2   zerosrh   Znum_embeddingsr   ZLinearrg   lm_headrk   )r4   r&   rl   r#   r$   rb     s
    
z'PLBartForConditionalGeneration.__init__c                 C   s
   | j  S rn   )r'   ru   ro   r#   r#   r$   ru     s    z*PLBartForConditionalGeneration.get_encoderc                 C   s
   | j  S rn   )r'   get_decoderro   r#   r#   r$   r     s    z*PLBartForConditionalGeneration.get_decoderNT)new_num_tokenspad_to_multiple_ofmean_resizingr   c                    s&   t  |||}| |jjd  |S )Nr   )ra   resize_token_embeddings_resize_final_logits_biasweightr@   )r4   r   r   r   Znew_embeddingsrl   r#   r$   r     s    z6PLBartForConditionalGeneration.resize_token_embeddings)r   r   c                 C   sj   | j jd }||kr,| j d d d |f }n.tjd|| f| j jd}tj| j |gdd}| d| d S )Nr;   r   rJ   )rK   r   )r   r@   r2   r   r:   catr   )r4   r   Zold_num_tokensZnew_biasZ
extra_biasr#   r#   r$   r     s    z8PLBartForConditionalGeneration._resize_final_logits_bias)rv   r(   rw   rx   ry   rz   r{   r|   r8   r)   r}   labelsr~   r   r   r   r7   r   c                 C   s  |dur|n| j j}|dur:|du r:|du r:t|| j j}| j|||||||||	|
||||||d}| |d }|| j|j }d}|durt	 }||
d| j j|
d}|s|f|dd  }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
            also be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Mask-filling example:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForConditionalGeneration

        >>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")

        >>> # en_XX is the language symbol id <LID> for English
        >>> TXT = "<s> Is 0 the <mask> Fibonacci number ? </s> en_XX"
        >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt").input_ids

        >>> logits = model(input_ids).logits
        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
        >>> probs = logits[0, masked_index].softmax(dim=0)
        >>> values, predictions = probs.topk(5)

        >>> tokenizer.decode(predictions).split()
        ['first', 'same', 'highest', 'result', 'number']
        ```
        N)r(   rw   r|   rx   ry   rz   r{   r8   r)   r}   r~   r   r   r   r7   r   r;   r   )	ZlossZlogitsr8   r   r   r   r   rQ   r   )r&   r   r   rc   r'   r   r   rO   r:   r   viewrd   r   r8   r   r   r   r   rQ   r   )r4   rv   r(   rw   rx   ry   rz   r{   r|   r8   r)   r}   r   r~   r   r   r   r7   outputsZ	lm_logitsZmasked_lm_lossZloss_fctoutputr#   r#   r$   r     sT    Kz&PLBartForConditionalGeneration.forward)r   c                 C   s   t || jjS rn   )r   r&   rc   )r4   r   r#   r#   r$   %prepare_decoder_input_ids_from_labels@  s    zDPLBartForConditionalGeneration.prepare_decoder_input_ids_from_labels)NT)NNNNNNNNNNNNNNNNN)r    r!   r"   rV   Z_keys_to_ignore_on_load_missingr   r   rb   ru   r   rX   r   r   r   Z	Embeddingr   r   r   r2   r   r3   r   r   r   r   r   r   r   r   r   r#   r#   rl   r$   r     sf    


class PLBartClassificationHead(BartClassificationHead):
    pass


class PLBartForSequenceClassification(BigBirdPegasusForSequenceClassification):
    def forward(self, **super_kwargs):
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
            also be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nra   r   Zsuper_kwargsrl   r#   r$   r   I  s    !z'PLBartForSequenceClassification.forward)r    r!   r"   r   r   r#   r#   rl   r$   r   H  s   r   c                       s    e Zd Ze fddZ  ZS )PLBartForCausalLMc                     s   t  jf i |  dS )a  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
        >>> model = PLBartForCausalLM.from_pretrained("uclanlp/plbart-base", add_cross_attention=False)
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```
        """
        return super().forward(**super_kwargs)


__all__ = [
    "PLBartForCausalLM",
    "PLBartForConditionalGeneration",
    "PLBartForSequenceClassification",
    "PLBartModel",
    "PLBartPreTrainedModel",
]
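

# Illustrative sketch (not part of the model and never called by it): a tiny end-to-end pass through the
# documented API using a small randomly initialized configuration instead of a pretrained checkpoint.
# The config sizes and token ids below are assumptions chosen only to keep the example small.
def _example_tiny_forward_pass():
    config = PLBartConfig(
        vocab_size=128,
        d_model=16,
        encoder_layers=1,
        decoder_layers=1,
        encoder_attention_heads=2,
        decoder_attention_heads=2,
        encoder_ffn_dim=32,
        decoder_ffn_dim=32,
        max_position_embeddings=64,
    )
    model = PLBartForConditionalGeneration(config)
    input_ids = torch.randint(4, config.vocab_size, (1, 8))
    labels = torch.randint(4, config.vocab_size, (1, 8))
    # As documented in `forward` above, passing `labels` lets the model build `decoder_input_ids` by
    # shifting and return a language-modeling loss alongside the logits.
    outputs = model(input_ids=input_ids, labels=labels)
    return outputs.loss, outputs.logits.shape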