"""PyTorch MobileBERT model (transformers/models/mobilebert/modeling_mobilebert.py)."""

import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_mobilebert import MobileBertConfig


logger = logging.get_logger(__name__)


def load_tf_weights_in_mobilebert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from the TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.replace("ffn_layer", "ffn")
        name = name.replace("FakeLayerNorm", "LayerNorm")
        name = name.replace("extra_output_weights", "dense/kernel")
        name = name.replace("bert", "mobilebert")
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using a pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            assert (
                pointer.shape == array.shape
            ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)
    return model


class NoNorm(nn.Module):
    def __init__(self, feat_size, eps=None):
        super().__init__()
        self.bias = nn.Parameter(torch.zeros(feat_size))
        self.weight = nn.Parameter(torch.ones(feat_size))

    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
        return input_tensor * self.weight + self.bias


NORM2FN = {"layer_norm": nn.LayerNorm, "no_norm": NoNorm}


class MobileBertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.trigram_input = config.trigram_input
        self.embedding_size = config.embedding_size
        self.hidden_size = config.hidden_size

        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        embed_dim_multiplier = 3 if self.trigram_input else 1
        embedded_input_size = self.embedding_size * embed_dim_multiplier
        self.embedding_transformation = nn.Linear(embedded_input_size, config.hidden_size)

        self.LayerNorm = NORM2FN[config.normalization_type](config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        if self.trigram_input:
            # Concatenate each token embedding with its left and right neighbors
            # (zero-padded at the sequence boundaries) before projecting to hidden_size.
            inputs_embeds = torch.cat(
                [
                    nn.functional.pad(inputs_embeds[:, 1:], [0, 0, 0, 1, 0, 0], value=0.0),
                    inputs_embeds,
                    nn.functional.pad(inputs_embeds[:, :-1], [0, 0, 1, 0, 0, 0], value=0.0),
                ],
                dim=2,
            )
        if self.trigram_input or self.embedding_size != self.hidden_size:
            inputs_embeds = self.embedding_transformation(inputs_embeds)

        # Add positional and token type embeddings, then layer-normalize and apply dropout.
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class MobileBertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.true_hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.true_hidden_size, self.all_head_size)
        self.key = nn.Linear(config.true_hidden_size, self.all_head_size)
        self.value = nn.Linear(
            config.true_hidden_size if config.use_bottleneck_attention else config.hidden_size, self.all_head_size
        )
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(
        self,
        query_tensor: torch.Tensor,
        key_tensor: torch.Tensor,
        value_tensor: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
    ) -> tuple[torch.Tensor]:
        batch_size, seq_length, _ = query_tensor.shape
        query_layer = (
            self.query(query_tensor).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        )
        key_layer = (
            self.key(key_tensor).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        )
        value_layer = (
            self.value(value_tensor).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        )

        # Dot product of "query" and "key" gives the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # The mask is precomputed for all layers in MobileBertModel.forward()
            attention_scores = attention_scores + attention_mask
        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)
        if head_mask is not None:
            attention_probs = attention_probs * head_mask
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        return outputs


class MobileBertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.use_bottleneck = config.use_bottleneck
        self.dense = nn.Linear(config.true_hidden_size, config.true_hidden_size)
        self.LayerNorm = NORM2FN[config.normalization_type](config.true_hidden_size, eps=config.layer_norm_eps)
        if not self.use_bottleneck:
            self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, residual_tensor: torch.Tensor) -> torch.Tensor:
        layer_outputs = self.dense(hidden_states)
        if not self.use_bottleneck:
            layer_outputs = self.dropout(layer_outputs)
        layer_outputs = self.LayerNorm(layer_outputs + residual_tensor)
        return layer_outputs


class MobileBertAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = MobileBertSelfAttention(config)
        self.output = MobileBertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        query_tensor: torch.Tensor,
        key_tensor: torch.Tensor,
        value_tensor: torch.Tensor,
        layer_input: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            query_tensor,
            key_tensor,
            value_tensor,
            attention_mask,
            head_mask,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], layer_input)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class MobileBertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.true_hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class OutputBottleneck(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.true_hidden_size, config.hidden_size)
        self.LayerNorm = NORM2FN[config.normalization_type](config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, residual_tensor: torch.Tensor) -> torch.Tensor:
        layer_outputs = self.dense(hidden_states)
        layer_outputs = self.dropout(layer_outputs)
        layer_outputs = self.LayerNorm(layer_outputs + residual_tensor)
        return layer_outputs


class MobileBertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.use_bottleneck = config.use_bottleneck
        self.dense = nn.Linear(config.intermediate_size, config.true_hidden_size)
        self.LayerNorm = NORM2FN[config.normalization_type](config.true_hidden_size)
        if not self.use_bottleneck:
            self.dropout = nn.Dropout(config.hidden_dropout_prob)
        else:
            self.bottleneck = OutputBottleneck(config)

    def forward(
        self, intermediate_states: torch.Tensor, residual_tensor_1: torch.Tensor, residual_tensor_2: torch.Tensor
    ) -> torch.Tensor:
        layer_output = self.dense(intermediate_states)
        if not self.use_bottleneck:
            layer_output = self.dropout(layer_output)
            layer_output = self.LayerNorm(layer_output + residual_tensor_1)
        else:
            layer_output = self.LayerNorm(layer_output + residual_tensor_1)
            layer_output = self.bottleneck(layer_output, residual_tensor_2)
        return layer_output


class BottleneckLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intra_bottleneck_size)
        self.LayerNorm = NORM2FN[config.normalization_type](config.intra_bottleneck_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        layer_input = self.dense(hidden_states)
        layer_input = self.LayerNorm(layer_input)
        return layer_input


class Bottleneck(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.key_query_shared_bottleneck = config.key_query_shared_bottleneck
        self.use_bottleneck_attention = config.use_bottleneck_attention
        self.input = BottleneckLayer(config)
        if self.key_query_shared_bottleneck:
            self.attention = BottleneckLayer(config)

    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
        # Returns (query, key, value, layer_input). Depending on the configuration, the
        # attention layer consumes either the bottlenecked states or the original hidden
        # states; the bottleneck output is always used as the residual layer input.
        bottlenecked_hidden_states = self.input(hidden_states)
        if self.use_bottleneck_attention:
            return (bottlenecked_hidden_states,) * 4
        elif self.key_query_shared_bottleneck:
            shared_attention_input = self.attention(hidden_states)
            return (shared_attention_input, shared_attention_input, hidden_states, bottlenecked_hidden_states)
        else:
            return (hidden_states, hidden_states, hidden_states, bottlenecked_hidden_states)


class FFNOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.true_hidden_size)
        self.LayerNorm = NORM2FN[config.normalization_type](config.true_hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor, residual_tensor: torch.Tensor) -> torch.Tensor:
        layer_outputs = self.dense(hidden_states)
        layer_outputs = self.LayerNorm(layer_outputs + residual_tensor)
        return layer_outputs


class FFNLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.intermediate = MobileBertIntermediate(config)
        self.output = FFNOutput(config)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        intermediate_output = self.intermediate(hidden_states)
        layer_outputs = self.output(intermediate_output, hidden_states)
        return layer_outputs


class MobileBertLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.use_bottleneck = config.use_bottleneck
        self.num_feedforward_networks = config.num_feedforward_networks

        self.attention = MobileBertAttention(config)
        self.intermediate = MobileBertIntermediate(config)
        self.output = MobileBertOutput(config)
        if self.use_bottleneck:
            self.bottleneck = Bottleneck(config)
        if config.num_feedforward_networks > 1:
            self.ffn = nn.ModuleList([FFNLayer(config) for _ in range(config.num_feedforward_networks - 1)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
    ) -> tuple[torch.Tensor]:
        if self.use_bottleneck:
            query_tensor, key_tensor, value_tensor, layer_input = self.bottleneck(hidden_states)
        else:
            query_tensor, key_tensor, value_tensor, layer_input = [hidden_states] * 4

        self_attention_outputs = self.attention(
            query_tensor,
            key_tensor,
            value_tensor,
            layer_input,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        s = (attention_output,)
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        if self.num_feedforward_networks != 1:
            for i, ffn_module in enumerate(self.ffn):
                attention_output = ffn_module(attention_output)
                s += (attention_output,)

        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output, hidden_states)
        outputs = (
            (layer_output,)
            + outputs
            + (
                torch.tensor(1000),
                query_tensor,
                key_tensor,
                value_tensor,
                layer_input,
                attention_output,
                intermediate_output,
            )
            + s
        )
        return outputs


class MobileBertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer = nn.ModuleList([MobileBertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], output_attentions)
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # Add the last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )


class MobileBertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.do_activate = config.classifier_activation
        if self.do_activate:
            self.dense = nn.Linear(config.hidden_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # "Pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        if not self.do_activate:
            return first_token_tensor
        else:
            pooled_output = self.dense(first_token_tensor)
            pooled_output = torch.tanh(pooled_output)
            return pooled_output


class MobileBertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = NORM2FN["layer_norm"](config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class MobileBertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = MobileBertPredictionHeadTransform(config)
        self.dense = nn.Linear(config.vocab_size, config.hidden_size - config.embedding_size, bias=False)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        # Link the two variables so the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self) -> None:
        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.transform(hidden_states)
        hidden_states = hidden_states.matmul(torch.cat([self.decoder.weight.t(), self.dense.weight], dim=0))
        hidden_states += self.decoder.bias
        return hidden_states


class MobileBertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = MobileBertLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class MobileBertPreTrainingHeads(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = MobileBertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output: torch.Tensor, pooled_output: torch.Tensor) -> tuple[torch.Tensor]:
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score


@auto_docstring
class MobileBertPreTrainedModel(PreTrainedModel):
    config: MobileBertConfig
    load_tf_weights = load_tf_weights_in_mobilebert
    base_model_prefix = "mobilebert"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, (nn.LayerNorm, NoNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, MobileBertLMPredictionHead):
            module.bias.data.zero_()


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`MobileBertForPreTraining`].
    """
)
class MobileBertForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    seq_relationship_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None


@auto_docstring
class MobileBertModel(MobileBertPreTrainedModel):
    """
    https://huggingface.co/papers/2004.02984
    """

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config
        self.embeddings = MobileBertEmbeddings(config)
        self.encoder = MobileBertEncoder(config)
        self.pooler = MobileBertPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # Make the 2D mask broadcastable to all attention heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `next sentence prediction (classification)` head.
    """
)
class MobileBertForPreTraining(MobileBertPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)
        self.mobilebert = MobileBertModel(config)
        self.cls = MobileBertPreTrainingHeads(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
        # resize the dense output embeddings first
        self.cls.predictions.dense = self._get_resized_lm_head(
            self.cls.predictions.dense, new_num_tokens=new_num_tokens, transposed=True
        )
        return super().resize_token_embeddings(new_num_tokens=new_num_tokens)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        next_sentence_label: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MobileBertForPreTrainingOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return MobileBertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class MobileBertForMaskedLM(MobileBertPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)
        self.mobilebert = MobileBertModel(config, add_pooling_layer=False)
        self.cls = MobileBertOnlyMLMHead(config)
        self.config = config

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
        # resize the dense output embeddings first
        self.cls.predictions.dense = self._get_resized_lm_head(
            self.cls.predictions.dense, new_num_tokens=new_num_tokens, transposed=True
        )
        return super().resize_token_embeddings(new_num_tokens=new_num_tokens)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class MobileBertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


@auto_docstring(
    custom_intro="""
    MobileBert Model with a `next sentence prediction (classification)` head on top.
    """
)
class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.mobilebert = MobileBertModel(config)
        self.cls = MobileBertOnlyNSPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple, NextSentencePredictorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`.

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForNextSentencePrediction.from_pretrained("google/mobilebert-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```"""
        if "next_sentence_label" in kwargs:
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
                " `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]
        seq_relationship_score = self.cls(pooled_output)

        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), labels.view(-1))

        if not return_dict:
            output = (seq_relationship_score,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """
)
class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.mobilebert = MobileBertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.mobilebert = MobileBertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.mobilebert = MobileBertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class MobileBertForTokenClassification(MobileBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.mobilebert = MobileBertModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "MobileBertForMaskedLM",
    "MobileBertForMultipleChoice",
    "MobileBertForNextSentencePrediction",
    "MobileBertForPreTraining",
    "MobileBertForQuestionAnswering",
    "MobileBertForSequenceClassification",
    "MobileBertForTokenClassification",
    "MobileBertLayer",
    "MobileBertModel",
    "MobileBertPreTrainedModel",
    "load_tf_weights_in_mobilebert",
]