"""PyTorch OPT model."""

from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    is_torch_flex_attn_available,
    logging,
)
from ...utils.deprecation import deprecate_kwarg
from .configuration_opt import OPTConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class OPTLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # OPT offsets the embedding ids by 2 to account for the padding index,
        # so the embedding table is created with `num_embeddings + offset` rows.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        if position_ids is None:
            # Derive positions from the attention mask so that padding tokens do not advance the position counter.
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            # Cut positions if `past_key_values_length` is > 0 (incremental decoding).
            position_ids = position_ids[:, past_key_values_length:]

        return super().forward(position_ids + self.offset)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class OPTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: OPTConfig, layer_idx: Optional[int] = None, **kwargs):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.dropout = config.attention_dropout
        self.enable_bias = config.enable_bias
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.head_dim = self.embed_dim // self.num_heads
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scaling = self.head_dim**-0.5

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        past_key_values: Optional[tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        bsz, tgt_len, _ = hidden_states.size()

        # The scaling is applied to the query projection here; the attention interface is then called with
        # `scaling=1.0` so it is not applied twice.
        query_states = self.q_proj(hidden_states) * self.scaling
        query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)
        key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

        if past_key_values is not None:
            # Save the key/value states to the cache so they can be re-used for fast auto-regressive generation.
            key_states, value_states = past_key_values.update(
                key_states, value_states, self.layer_idx, {"cache_position": cache_position}
            )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=1.0,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class OPTDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: OPTConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = OPTAttention(config=config, layer_idx=layer_idx)

        self.do_layer_norm_before = config.do_layer_norm_before
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]

        self.self_attn_layer_norm = nn.LayerNorm(
            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
        )
        self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.enable_bias)
        self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.enable_bias)
        self.final_layer_norm = nn.LayerNorm(
            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
        )

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up
                decoding (see `past_key_values`).
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        residual = hidden_states

        # Most OPT sizes apply layer norm BEFORE attention; opt-350m applies it AFTER.
        if self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_values=past_key_values,
            position_ids=position_ids,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        if not self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Fully connected block
        hidden_states_shape = hidden_states.shape
        hidden_states = hidden_states.reshape(-1, hidden_states.size(-1))
        residual = hidden_states

        if self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)

        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)

        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        hidden_states = (residual + hidden_states).view(hidden_states_shape)

        if not self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class OPTPreTrainedModel(PreTrainedModel):
    config: OPTConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["OPTDecoderLayer"]
    _supports_attention_backend = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()


class OPTDecoder(OPTPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    """

    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
        self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)

        if config.word_embed_proj_dim != config.hidden_size:
            self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
        else:
            self.project_out = None

        if config.word_embed_proj_dim != config.hidden_size:
            self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
        else:
            self.project_in = None

        # `config._remove_final_layer_norm` only exists to keep backward compatibility with checkpoints
        # fine-tuned before the final layer norm was (re-)introduced.
        if config.do_layer_norm_before and not config._remove_final_layer_norm:
            self.final_layer_norm = nn.LayerNorm(
                config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine
            )
        else:
            self.final_layer_norm = None

        self.layers = nn.ModuleList(
            [OPTDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # For SDPA we can often rely on its `is_causal` argument instead of materializing a 4D mask.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # If the provided attention mask is 2D, build the 4D causal mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows (e.g. leading rows with left padding); required by the
            # memory-efficient SDPA path.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # The mask already comes in the expected (inverted) 4D form; nothing to do.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPast]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when
                `config.use_cache=True`):
                Pre-computed hidden states (keys and values in the self-attention blocks) that can be used to speed
                up sequential decoding. If `past_key_values` are used, the user can optionally input only the last
                `decoder_input_ids` (those that don't have their past key value states given to this model) of shape
                `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
                representation. This is useful if you want more control over how to convert `input_ids` indices into
                associated vectors than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`; for padding use -1. [What are position IDs?](../glossary#position-ids)
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to
                `position_ids`, this tensor is not affected by padding. It is used to update the cache in the correct
                position and to infer the complete sequence length.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if input_ids is not None:
            input_ids = input_ids.view(-1, input_ids.shape[-1])

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if attention_mask is None:
            seq_length = past_seen_tokens + inputs_embeds.shape[1]
            attention_mask = torch.ones(inputs_embeds.shape[0], seq_length, device=inputs_embeds.device)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        if position_ids is None:
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            # Cut positions if `past_seen_tokens` is > 0 (incremental decoding).
            position_ids = position_ids[:, past_seen_tokens:]

        pos_embeds = self.embed_positions(attention_mask, past_seen_tokens, position_ids=position_ids)

        if self.project_in is not None:
            inputs_embeds = self.project_in(inputs_embeds)

        hidden_states = inputs_embeds + pos_embeds.to(inputs_embeds.device)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask], ["head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            # LayerDrop: randomly skip layers during training with probability `self.layerdrop`.
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if self.final_layer_norm is not None:
            hidden_states = self.final_layer_norm(hidden_states)

        if self.project_out is not None:
            hidden_states = self.project_out(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


@auto_docstring
class OPTModel(OPTPreTrainedModel):
    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.decoder = OPTDecoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.decoder.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consist of (dec_features, past_key_values, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return BaseModelOutputWithPast(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            hidden_states=decoder_outputs.hidden_states,
            attentions=decoder_outputs.attentions,
        )


class OPTForCausalLM(OPTPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = OPTModel(config)

        # The lm_head weight is automatically tied to the embedding weights.
        self.lm_head = nn.Linear(config.word_embed_proj_dim, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value

    def set_decoder(self, decoder):
        self.model.decoder = decoder

    def get_decoder(self):
        return self.model.decoder

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        logits = self.lm_head(outputs[0]).contiguous()

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            loss = self.loss_function(logits, labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row.
    If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess
    the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value
    in each row of the batch).
    """
)
class OPTForSequenceClassification(OPTPreTrainedModel):
    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = OPTModel(config)
        self.score = nn.Linear(config.word_embed_proj_dim, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left and right padding, take the rightmost token that is not a padding token.
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value


@auto_docstring
class OPTForQuestionAnswering(OPTPreTrainedModel):
    def __init__(self, config: OPTConfig):
        super().__init__(config)
        self.model = OPTModel(config)
        self.qa_outputs = nn.Linear(config.word_embed_proj_dim, 2)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, QuestionAnsweringModelOutput]:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
        >>> import torch

        >>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

        >>> inputs = tokenizer(question, text, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> answer_offset = len(tokenizer(question)[0])

        >>> predict_answer_tokens = inputs.input_ids[
        ...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
        ... ]
        >>> predicted = tokenizer.decode(predict_answer_tokens)
        >>> predicted
        ' a nice puppet'
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, remove the extra dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; clamp and ignore those terms.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index).to(logits.device)
            end_positions = end_positions.clamp(0, ignored_index).to(logits.device)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + transformer_outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.model.decoder.embed_tokens

    def set_input_embeddings(self, value):
        self.model.decoder.embed_tokens = value


__all__ = [
    "OPTForCausalLM",
    "OPTModel",
    "OPTPreTrainedModel",
    "OPTForSequenceClassification",
    "OPTForQuestionAnswering",
]