"""PyTorch CamemBERT model."""
# NOTE: this module was recovered from a compiled (.pyc) artifact. The class structure, signatures,
# docstrings and string constants below come from the artifact; method bodies that could not be
# recovered verbatim are elided with `...`.

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, gelu
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    _prepare_4d_attention_mask_for_sdpa,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_camembert import CamembertConfig


logger = logging.get_logger(__name__)

class CamembertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        # The padding index shifts position ids: padded tokens keep `padding_idx` as their position.
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]
        seq_length = input_shape[1]

        # Fall back to the buffered, all-zero token_type_ids when none are passed (helps tracing).
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            embeddings += self.position_embeddings(position_ids)
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
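
# Illustrative sketch (added here, not part of the original checkpoint code): how the padding-aware
# position ids used by CamembertEmbeddings behave. Real tokens are numbered from `padding_idx + 1`
# while padded slots keep `padding_idx`, mirroring `create_position_ids_from_input_ids` defined at
# the end of this module. The helper name `_example_position_ids` and the sample ids are ours.
def _example_position_ids():
    padding_idx = 1  # CamemBERT checkpoints typically use pad_token_id = 1
    input_ids = torch.tensor([[5, 97, 43, padding_idx, padding_idx]])
    mask = input_ids.ne(padding_idx).int()
    position_ids = torch.cumsum(mask, dim=1).type_as(mask) * mask + padding_idx
    # -> tensor([[2, 3, 4, 1, 1]]): padded positions stay at `padding_idx`
    return position_ids
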
    
class CamembertSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder
        self.layer_idx = layer_idx

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        # Eager attention: projects hidden states to query/key/value heads (reading cached keys/values from
        # an `EncoderDecoderCache` for cross-attention), scores them with a scaled dot product (plus
        # relative-position terms via einsum "bhld,lrd->bhlr" / "bhrd,lrd->bhlr" for "relative_key" /
        # "relative_key_query"), applies the additive attention mask, softmax, dropout and the optional
        # head mask, then recombines the heads into `all_head_size`.
        ...
class CamembertSdpaSelfAttention(CamembertSelfAttention):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx)
        self.dropout_prob = config.attention_probs_dropout_prob

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        past_key_values=None,
        output_attentions=False,
        cache_position=None,
    ):
        if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
            logger.warning_once(
                "CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not "
                "support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling "
                "back to the manual attention implementation, but specifying the manual implementation will be "
                "required from Transformers version v5.0.0 onwards. This warning can be removed using the argument "
                '`attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                past_key_values,
                output_attentions,
                cache_position,
            )
        # Otherwise dispatch to torch.nn.functional.scaled_dot_product_attention with the cached
        # key/value states and an `is_causal` flag derived from the decoder configuration.
        ...


class CamembertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


CAMEMBERT_SELF_ATTENTION_CLASSES = {
    "eager": CamembertSelfAttention,
    "sdpa": CamembertSdpaSelfAttention,
}


class CamembertAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        self.self = CAMEMBERT_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type, layer_idx=layer_idx
        )
        self.output = CamembertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )
        # Prune the query/key/value/output projections and update the bookkeeping attributes.
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        past_key_values=None,
        output_attentions=False,
        cache_position=None,
    ):
        self_outputs = self.self(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        return (attention_output,) + self_outputs[1:]


class CamembertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class CamembertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class CamembertLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = CamembertAttention(config, layer_idx=layer_idx)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = CamembertAttention(config, position_embedding_type="absolute", layer_idx=layer_idx)
        self.intermediate = CamembertIntermediate(config)
        self.output = CamembertOutput(config)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        output_attentions=False,
        cache_position=None,
    ):
        # Self-attention, then optional cross-attention (raising "If `encoder_hidden_states` are passed,
        # {self} has to be instantiated with cross-attention layers by setting
        # `config.add_cross_attention=True`" when misconfigured), then the chunked feed-forward below.
        ...

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(attention_output, intermediate_output)
        return layer_output
class CamembertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([CamembertLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        cache_position=None,
    ):
        # Runs every CamembertLayer in sequence, optionally collecting hidden states, attentions and
        # cross-attentions, converting legacy tuple caches to `EncoderDecoderCache` (with a deprecation
        # warning) and forcing `use_cache=False` under gradient checkpointing. Returns a
        # BaseModelOutputWithPastAndCrossAttentions (or a plain tuple when `return_dict=False`).
        ...


class CamembertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # "Pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class CamembertPreTrainedModel(PreTrainedModel):
    config: CamembertConfig
    base_model_prefix = "roberta"
    supports_gradient_checkpointing = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, CamembertLMHead):
            module.bias.data.zero_()


class CamembertClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take the first (<s>) token, equivalent to [CLS]
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class CamembertLMHead(nn.Module):
    """Camembert Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to the size of the vocabulary, with the bias
        x = self.decoder(x)
        return x

    def _tie_weights(self):
        # Re-tie the bias if it got disconnected (e.g. when the decoder lives on the "meta" device).
        if self.decoder.bias.device.type == "meta":
            self.decoder.bias = self.bias
        else:
            self.bias = self.decoder.bias
@auto_docstring
class CamembertModel(CamembertPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
    `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762

    """

    _no_split_modules = []

    def __init__(self, config, add_pooling_layer=True):
        """
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = CamembertEmbeddings(config)
        self.encoder = CamembertEncoder(config)
        self.pooler = CamembertPooler(config) if add_pooling_layer else None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        # Embeds the inputs, builds the (optionally SDPA-specific 4D) attention masks, runs the encoder
        # and the optional pooler, and returns a BaseModelOutputWithPoolingAndCrossAttentions (or a plain
        # tuple when `return_dict=False`). Raises
        # "You cannot specify both input_ids and inputs_embeds at the same time" /
        # "You have to specify either input_ids or inputs_embeds" on invalid input combinations.
        ...
@auto_docstring
class CamembertForMaskedLM(CamembertPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.lm_head = CamembertLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        # Runs the base model, applies the LM head and, when `labels` are given, a CrossEntropyLoss over
        # the vocabulary; returns a MaskedLMOutput (or a plain tuple when `return_dict=False`).
        ...
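
# Illustrative usage sketch (ours): filling in a masked token with the MLM head above. Assumes the
# "almanach/camembert-base" checkpoint referenced in this module; the helper name is ours.
def _example_fill_mask():
    from transformers import AutoTokenizer, CamembertForMaskedLM

    tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
    model = CamembertForMaskedLM.from_pretrained("almanach/camembert-base")
    inputs = tokenizer(f"Le camembert est {tokenizer.mask_token} !", return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    mask_positions = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    predicted_ids = logits[0, mask_positions].argmax(-1)
    return tokenizer.decode(predicted_ids)
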
@auto_docstring(
    custom_intro="""
    CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """
)
class CamembertForSequenceClassification(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.classifier = CamembertClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # Runs the base model and the classification head; infers `config.problem_type`
        # ("regression", "single_label_classification" or "multi_label_classification") from `num_labels`
        # and the label dtype, then applies MSELoss, CrossEntropyLoss or BCEWithLogitsLoss accordingly.
        # Returns a SequenceClassifierOutput (or a plain tuple when `return_dict=False`).
        ...
@auto_docstring
class CamembertForMultipleChoice(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.roberta = CamembertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        # Flattens the (batch, num_choices, seq_len) inputs, runs the base model, scores each choice with
        # the classifier on the pooled output and reshapes back to (batch, num_choices); a CrossEntropyLoss
        # is applied when `labels` are given. Returns a MultipleChoiceModelOutput (or a plain tuple).
        ...
@auto_docstring
class CamembertForTokenClassification(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # Runs the base model, applies dropout and the per-token classifier, and computes a
        # CrossEntropyLoss over all tokens when `labels` are given. Returns a TokenClassifierOutput
        # (or a plain tuple when `return_dict=False`).
        ...
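
# Illustrative sketch (ours): token-level tagging (e.g. NER-style) with the head above. As with the
# sequence-classification example, the classifier on top of "almanach/camembert-base" is freshly
# initialized here and `num_labels=5` is an arbitrary example value.
def _example_token_classification():
    from transformers import AutoTokenizer, CamembertForTokenClassification

    tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
    model = CamembertForTokenClassification.from_pretrained("almanach/camembert-base", num_labels=5)
    inputs = tokenizer("Camembert vient de Normandie.", return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits  # (batch, seq_len, num_labels)
    return logits.argmax(-1)  # one predicted label id per token
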
@auto_docstring
class CamembertForQuestionAnswering(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        """
        # Runs the base model, splits the `qa_outputs` projection into start/end logits and, when
        # `start_positions`/`end_positions` are given, averages the two clamped CrossEntropyLosses.
        # Returns a QuestionAnsweringModelOutput (or a plain tuple when `return_dict=False`).
        ...
@auto_docstring(
    custom_intro="""
    CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class CamembertForCausalLM(CamembertPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`")

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.lm_head = CamembertLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> config = AutoConfig.from_pretrained("almanach/camembert-base")
        >>> config.is_decoder = True
        >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```"""
        # Runs the base model as a decoder (forcing `use_cache=False` when `labels` are given), applies the
        # LM head and the shifted next-token loss via `self.loss_function`, and returns a
        # CausalLMOutputWithCrossAttentions (or a plain tuple when `return_dict=False`).
        ...
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    """
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


__all__ = [
    "CamembertForCausalLM",
    "CamembertForMaskedLM",
    "CamembertForMultipleChoice",
    "CamembertForQuestionAnswering",
    "CamembertForSequenceClassification",
    "CamembertForTokenClassification",
    "CamembertModel",
    "CamembertPreTrainedModel",
]