a
    h}                     @   sf  d Z ddlmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ee Z!dd Z"dd Z#d'ddZ$G dd dej%Z&dd Z'G dd dej%Z(eG dd deZ)eG dd de)Z*edd G d!d" d"e)eZ+ed#d G d$d% d%e)Z,g d&Z-dS )(zPyTorch CTRL model.    )OptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )DynamicCache)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPastSequenceClassifierOutput)PreTrainedModel)Conv1D find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )
CTRLConfigc                 C   s$   dt dd|d  |  }| | S )Nr   i'     )torchpow)posid_model_sizeZangle_rates r   b/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/ctrl/modeling_ctrl.py
angle_defn(   s    r   c                 C   s   t tj| tjd|dtj|tjd|d|}t|d d dd df }t|d d dd df }tj||gdd}|S )Ndtyper   r   r   dim)	r   r   arangeZint64to	unsqueezesincoscat)positionr   r    Z
angle_radsZsinesZcosinespos_encodingr   r   r   positional_encoding-   s    r,   c              	   C   s   t | |dddd}|jd }|t| }|d urn|d|d }	}
|||
|	 |
d |
f d 7 }|d ur~|| }t j|dd}|d ur|| }t ||}||fS )	Nr   r   r   r   r!   g     r"   )r   matmulpermuteshapenpsqrtsizeZsoftmax)qkvmaskattention_mask	head_maskZ	matmul_qkZdkZscaled_attention_logitsndnsZattention_weightsoutputr   r   r   scaled_dot_product_attention<   s    
 r=   c                       s8   e Zd Zd fdd	Zdd Zdd Zdd	d
Z  ZS )MultiHeadAttentionNc                    sp   t    || _|| _|| _t|| j | _t||| _	t||| _
t||| _t||| _t | _d S N)super__init__	num_headsr   	layer_idxintdepthr   LinearWqWkWvdensesetpruned_heads)selfr   rB   rC   	__class__r   r   rA   W   s    
zMultiHeadAttention.__init__c                 C   s   | j | j }t|dkrd S t|| j|| j\}}t| j|| _t| j|| _t| j|| _t| j	|dd| _	| jt| | _|| j | _ | j
|| _d S )Nr   r   r"   )r   rB   lenr   rL   r   rG   rH   rI   rJ   union)rM   headsZattention_head_sizeindexr   r   r   prune_headsf   s    zMultiHeadAttention.prune_headsc                 C   s"   | |d| j| j}|g dS )Nr!   r   r   r   r   )reshaperB   rE   r/   )rM   x
batch_sizer   r   r   split_into_headsw   s    z#MultiHeadAttention.split_into_headsFc                 C   s   |j d }| |}| |}| |}| ||}| ||}| ||}|d urn|||| jd|
i\}}t||||||}|d g d}|d }|	|d| j
}| |}||fS )Nr   cache_positionrU   r   r!   )r0   rG   rH   rI   rY   updaterC   r=   r/   rV   r   rJ   )rM   r6   r5   r4   r7   
layer_pastr8   r9   	use_cacheoutput_attentionsrZ   rX   r<   Zscaled_attentionZattnZoriginal_size_attentionr   r   r   forward{   s    




zMultiHeadAttention.forward)N)NNNFFN)__name__
__module____qualname__rA   rT   rY   r_   __classcell__r   r   rN   r   r>   V   s   
      r>   c                 C   s"   t t | |t  t || S r?   )r   Z
SequentialrF   ZReLU)r   dffr   r   r   point_wise_feed_forward_network   s    re   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
EncoderLayer皙?Nc                    sb   t    t|||d| _t||| _tj|dd| _tj|dd| _	t
|| _t
|| _d S )NrC   gư>eps)r@   rA   r>   multi_head_attentionre   ffnr   	LayerNorm
layernorm1
layernorm2Dropoutdropout1dropout2)rM   r   rB   rd   ZraterC   rN   r   r   rA      s    
zEncoderLayer.__init__Fc	                 C   s~   |  |}	| j|	|	|	|||||||d
}
|
d }| |}|| }| |}| |}| |}|| }|f|
dd   }|S )Nr\   r8   r9   r]   r^   rZ   r   r   )rn   rk   rq   ro   rl   rr   )rM   rW   r7   r\   r8   r9   r]   r^   rZ   normedZattn_outputsZattn_outputZout1Zout2Z
ffn_outputoutputsr   r   r   r_      s,    




zEncoderLayer.forward)rg   N)NNNFFN)r`   ra   rb   rA   r_   rc   r   r   rN   r   rf      s         rf   c                   @   s"   e Zd ZU eed< dZdd ZdS )CTRLPreTrainedModelconfigtransformerc                 C   s   t |tjtfr>|jjjd| jjd |j	dur|j	j
  nft |tjr~|jjjd| jjd |jdur|jj|j 
  n&t |tjr|j	j
  |jjd dS )zInitialize the weights.g        )meanZstdN      ?)
isinstancer   rF   r   weightdataZnormal_rw   Zinitializer_rangebiasZzero_	EmbeddingZpadding_idxrm   Zfill_)rM   moduler   r   r   _init_weights   s    

z!CTRLPreTrainedModel._init_weightsN)r`   ra   rb   r   __annotations__Zbase_model_prefixr   r   r   r   r   rv      s   
rv   c                       s   e Zd Z fddZdd Zdd Zdd Zedee	j
 eeee	j   ee	j ee	j
 ee	j
 ee	j ee	j ee ee ee ee ee	j eee	j ef d
ddZ  ZS )	CTRLModelc                    s   t     j| _ j| _t j| jtj	| _
t j j| _t j| _t fddt jD | _tj j jd| _|   d S )Nc              	      s&   g | ]}t  j j j j|d qS )rh   )rf   n_embdZn_headrd   Zresid_pdrop).0r   rw   r   r   
<listcomp>   s   z&CTRLModel.__init__.<locals>.<listcomp>ri   )r@   rA   r   r   n_layerZ
num_layersr,   Zn_positionsr   floatr+   r   r   
vocab_sizewrp   Z
embd_pdropdropoutZ
ModuleListrangehrm   Zlayer_norm_epsilon	layernorm	post_initrM   rw   rN   r   r   rA      s    
zCTRLModel.__init__c                 C   s   | j S r?   r   )rM   r   r   r   get_input_embeddings  s    zCTRLModel.get_input_embeddingsc                 C   s
   || _ d S r?   r   )rM   Znew_embeddingsr   r   r   set_input_embeddings  s    zCTRLModel.set_input_embeddingsc                 C   s(   |  D ]\}}| j| j| qdS )zv
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        N)itemsr   rk   rT   )rM   Zheads_to_prunelayerrR   r   r   r   _prune_heads  s    zCTRLModel._prune_headsN)	input_idspast_key_valuesr8   token_type_idsposition_idsr9   inputs_embedsr]   r^   output_hidden_statesreturn_dictrZ   returnc                 K   sT  |	dur|	n| j j}	|dur |n| j j}|
dur4|
n| j j}
|durH|n| j j}|durj|durjtdnd|dur| || | }|d|d }|j	d }n,|dur| dd }|j	d }ntd|dur|j
n|j
}|r|du rt| j d}|r t|tr td t|}|dur2| nd}|du rftj||d | tj|d}|d}|dur|dkrtd	||d}|d
d}|j| jd}d| t| jj }| || j j}|dur|d|d }| |}|t| j9 }nd}|du r | |}|d }t t!|| || d
|}|t| j9 }| j"|| _"| j"|ddf }|| | }| #|}|
rdnd}|	rdnd}t$| j%D ]R\}}|
r||f }||||||| ||	|d}|d }|	r||d
 f7 }q| &|}|
r$||f }|sDtdd ||||fD S t'||||dS )aE  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CTRLModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 5, 1280]
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer!   r   z5You have to specify either input_ids or inputs_embedsr   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.)r    devicez$batch_size has to be defined and > 0r   r   r   rz   r   rs   c                 s   s   | ]}|d ur|V  qd S r?   r   )r   r6   r   r   r   	<genexpr>  s   z$CTRLModel.forward.<locals>.<genexpr>)Zlast_hidden_stater   hidden_states
attentions)(rw   r^   r]   r   use_return_dict
ValueErrorZ%warn_if_padding_and_no_attention_maskr3   viewr0   r   r	   r{   tupleloggerwarning_onceZfrom_legacy_cacheget_seq_lengthr   r$   longr&   r%   r    ZfinfominZget_head_maskr   r   r1   r2   r   ZtriuZonesr+   r   	enumerater   r   r   )rM   r   r   r8   r   r   r9   r   r]   r^   r   r   rZ   kwargsZinput_shaperX   r   past_lengthZtoken_type_embedsZseq_lenr7   Z
pos_embedsr   Zall_hidden_statesZall_attentionsr   r   ru   r   r   r   r_     s    1









"





zCTRLModel.forward)NNNNNNNNNNNN)r`   ra   rb   rA   r   r   r   r   r   r   
LongTensorr   FloatTensorboolTensorr   r   r_   rc   r   r   rN   r   r      s@               r   z
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )Zcustom_introc                       s   e Zd ZdgZ fddZed
eej ee	e	ej
   eej
 eej eej eej
 eej
 eej ee ee ee ee eej ee	ej ef dddZddd	Z  ZS )CTRLLMHeadModelzlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NTr~   )
r@   rA   r   rx   r   rF   r   r   lm_headr   r   rN   r   r   rA     s    
zCTRLLMHeadModel.__init__N)r   r   r8   r   r   r9   r   labelsr]   r^   r   r   rZ   r   c                 K   s   |dur|n| j j}| j||||||||	|
|||d}|d }| |}d}|durp| j||fd| j ji|}|s|f|dd  }|dur|f| S |S t|||j|j|j	dS )a
  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLLMHeadModel

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> sequence_ids = model.generate(inputs["input_ids"])
        >>> sequences = tokenizer.batch_decode(sequence_ids)
        >>> sequences
        ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']

        >>> outputs = model(**inputs, labels=inputs["input_ids"])
        >>> round(outputs.loss.item(), 2)
        9.21

        >>> list(outputs.logits.shape)
        [1, 5, 246534]
        ```N)r   r8   r   r   r9   r   r]   r^   r   r   rZ   r   r   r   )losslogitsr   r   r   )
rw   r   rx   r   Zloss_functionr   r   r   r   r   )rM   r   r   r8   r   r   r9   r   r   r]   r^   r   r   rZ   r   transformer_outputsr   Z	lm_logitsr   r<   r   r   r   r_     sJ    <
zCTRLLMHeadModel.forwardc                 K   sR   |d urF|  }|jd |kr$|}n|jd d }|d d |d f }|||dS )Nr   )r   r   r]   )r   r0   )rM   r   r   r]   r   r   Zremove_prefix_lengthr   r   r   prepare_inputs_for_generation-  s    z-CTRLLMHeadModel.prepare_inputs_for_generation)NNNNNNNNNNNNN)NN)r`   ra   rb   Z_tied_weights_keysrA   r   r   r   r   r   r   r   r   r   r   r_   r   rc   r   r   rN   r   r     sB                er   a  
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    c                       s   e Zd Z fddZedeej eeeej	   eej	 eej eej eej	 eej	 eej ee
 ee
 ee
 ee
 eeej ef dddZ  ZS )CTRLForSequenceClassificationc                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S )NFr   )
r@   rA   
num_labelsr   rx   r   rF   r   
classifierr   r   rN   r   r   rA   L  s
    
z&CTRLForSequenceClassification.__init__N)r   r   r8   r   r   r9   r   r   r]   r^   r   r   r   c                 C   sZ  |dur|n| j j}| j||||||||	|
||d}|d }| |}|durb|jdd \}}n|jdd \}}| j jdu r|dkrtd| j jdu rd}nb|dur|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d	 |t	j||jd
|f }d}|dur| j jdu r| jdkrNd| j _n:| jdkr|jt	jksv|jt	jkrd| j _nd| j _| j jdkrt }| jdkr|| | }n
|||}nN| j jdkrt }||d| j|d}n| j jdkrt }|||}|sF|f|dd  }|durB|f| S |S t|||j|jdS )a2  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> import torch

        >>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> labels = torch.tensor(1)
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)
        0.93
        ```

        Example of multi-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained(
        ...     "Salesforce/ctrl", problem_type="multi_label_classification"
        ... )

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> num_labels = len(model.config.id2label)
        >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
        ...     torch.float
        ... )
        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()  # doctest: +IGNORE_RESULT
        ```N)
r   r8   r   r   r9   r   r]   r^   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r!   )r   r    z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`)r   Z
regressionZsingle_label_classificationZmulti_label_classification)r   r   r   r   )rw   r   rx   r   r0   Zpad_token_idr   r%   r   r   Zint32r$   Zargmaxr   r   rO   r`   Zproblem_typer   r    r   rD   r   Zsqueezer   r   r   r   r   r   )rM   r   r   r8   r   r   r9   r   r   r]   r^   r   r   r   r   r   rX   Zsequence_lengthZlast_non_pad_tokenZnon_pad_maskZtoken_indicesZpooled_logitsr   Zloss_fctr<   r   r   r   r_   U  sx    h


(

z%CTRLForSequenceClassification.forward)NNNNNNNNNNNN)r`   ra   rb   rA   r   r   r   r   r   r   r   r   r   r   r_   rc   r   r   rN   r   r   @  s:   	            r   )r   r   r   rv   )NN).__doc__typingr   r   numpyr1   r   r   Ztorch.nnr   r   r   Zcache_utilsr	   Z
generationr
   Zmodeling_outputsr   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   r   utilsr   r   Zconfiguration_ctrlr   Z
get_loggerr`   r   r   r,   r=   Moduler>   re   rf   rv   r   r   r   __all__r   r   r   r   <module>   sF   

G2 M  ?