"""PyTorch ALBERT model."""

import math
import os
from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_albert import AlbertConfig


logger = logging.get_logger(__name__)


def load_tf_weights_in_albert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        print(name)

    for name, array in zip(names, arrays):
        original_name = name

        # If saved from the TF HUB module
        name = name.replace("module/", "")

        # Renaming and simplifying
        name = name.replace("ffn_1", "ffn")
        name = name.replace("bert/", "albert/")
        name = name.replace("attention_1", "attention")
        name = name.replace("transform/", "")
        name = name.replace("LayerNorm_1", "full_layer_layer_norm")
        name = name.replace("LayerNorm", "attention/LayerNorm")
        name = name.replace("transformer/", "")

        # The feed forward layer had an 'intermediate' step which has been abstracted away
        name = name.replace("intermediate/dense/", "")
        name = name.replace("ffn/intermediate/output/dense/", "ffn_output/")

        # ALBERT attention was split between self and output which have been abstracted away
        name = name.replace("/output/", "/")
        name = name.replace("/self/", "/")

        # The pooler is a linear layer
        name = name.replace("pooler/dense", "pooler")

        # The classifier was simplified to predictions from cls/predictions
        name = name.replace("cls/predictions", "predictions")
        name = name.replace("predictions/attention", "predictions")

        # Naming was changed to be more explicit
        name = name.replace("embeddings/attention", "embeddings")
        name = name.replace("inner_group_", "albert_layers/")
        name = name.replace("group_", "albert_layer_groups/")

        # Classifier
        if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name):
            name = "classifier/" + name

        # No ALBERT model currently handles the next sentence prediction task
        if "seq_relationship" in name:
            name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
            name = name.replace("weights", "weight")

        name = name.split("/")

        # Ignore the gradients applied by the LAMB/ADAM optimizers.
        if (
            "adam_m" in name
            or "adam_v" in name
            or "AdamWeightDecayOptimizer" in name
            or "AdamWeightDecayOptimizer_1" in name
            or "global_step" in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue

        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except ValueError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print(f"Initialize PyTorch weight {name} from {original_name}")
        pointer.data = torch.from_numpy(array)

    return model
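# Illustrative sketch (not part of the original module) of how the converter above is
# typically driven. The checkpoint prefix "albert_model.ckpt" and the output directory
# are hypothetical placeholders; `AlbertConfig`, `AlbertForPreTraining`, and
# `load_tf_weights_in_albert` are the real entry points defined in this file.
def _demo_convert_tf_checkpoint():
    config = AlbertConfig()  # or AlbertConfig.from_json_file(".../albert_config.json")
    model = AlbertForPreTraining(config)
    # Requires a real TensorFlow checkpoint at this (placeholder) path.
    load_tf_weights_in_albert(model, config, "albert_model.ckpt")
    model.save_pretrained("./albert-converted")  # hypothetical output directory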
class AlbertEmbeddings(nn.Module):
    """
    Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config: AlbertConfig):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # When token_type_ids was not passed, use the all-zeros buffer registered in the
        # constructor, expanded to the current batch; this helps trace the model without
        # forcing callers to build the tensor themselves.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
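# Minimal shape check for AlbertEmbeddings (illustrative, not part of the original
# module). ALBERT factorizes the embedding matrix: the output dimension is
# `embedding_size` (128 by default), not `hidden_size`; `AlbertTransformer` projects
# it up to `hidden_size` later. The config values below are explicit test sizes.
def _demo_embedding_shapes():
    config = AlbertConfig(vocab_size=30000, embedding_size=128, hidden_size=768)
    embeddings = AlbertEmbeddings(config)
    input_ids = torch.randint(0, config.vocab_size, (2, 9))
    out = embeddings(input_ids=input_ids)
    assert out.shape == (2, 9, config.embedding_size)  # (batch, seq_len, embedding_size)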
class AlbertAttention(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.output_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pruned_heads = set()

        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def prune_heads(self, heads: list[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = self.query(hidden_states)
        key_layer = self.key(hidden_states)
        value_layer = self.value(hidden_states)

        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        value_layer = value_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in AlbertModel forward)
            attention_scores = attention_scores + attention_mask

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.attention_dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.transpose(2, 1).flatten(2)

        projected_context_layer = self.dense(context_layer)
        projected_context_layer_dropout = self.output_dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
        return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,)
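# Illustrative sketch (not part of the original module): unlike BERT, the residual
# connection and LayerNorm live inside this attention module, so the output tensor
# has exactly the same shape as the input. The config sizes below are arbitrary
# small test values.
def _demo_attention_roundtrip():
    config = AlbertConfig(hidden_size=64, num_attention_heads=4)
    attn = AlbertAttention(config).eval()
    hidden = torch.randn(2, 5, 64)
    (out,) = attn(hidden)  # output_attentions=False -> 1-tuple
    assert out.shape == hidden.shape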
class AlbertSdpaAttention(AlbertAttention):
    def __init__(self, config):
        super().__init__(config)
        self.dropout_prob = config.attention_probs_dropout_prob

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        if self.position_embedding_type != "absolute" or output_attentions:
            logger.warning(
                "AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
                "non-absolute `position_embedding_type` or `output_attentions=True`. Falling back to the eager "
                "attention implementation, but specifying the eager implementation will be required from Transformers "
                "version v5.0.0 onwards. This warning can be removed using the argument "
                '`attn_implementation="eager"` when loading the model.'
            )
            return super().forward(hidden_states, attention_mask, head_mask, output_attentions=output_attentions)

        batch_size, seq_len, _ = hidden_states.size()
        query_layer = (
            self.query(hidden_states).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        )
        key_layer = (
            self.key(hidden_states).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        )

        attention_output = torch.nn.functional.scaled_dot_product_attention(
            query=query_layer,
            key=key_layer,
            value=value_layer,
            attn_mask=attention_mask,
            dropout_p=self.dropout_prob if self.training else 0.0,
            is_causal=False,
        )

        attention_output = attention_output.transpose(1, 2)
        attention_output = attention_output.reshape(batch_size, seq_len, self.all_head_size)

        projected_context_layer = self.dense(attention_output)
        projected_context_layer_dropout = self.output_dropout(projected_context_layer)
        layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout)
        return (layernormed_context_layer,)


ALBERT_ATTENTION_CLASSES = {
    "eager": AlbertAttention,
    "sdpa": AlbertSdpaAttention,
}
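# Illustrative sketch (not part of the original module): which attention class gets
# instantiated is driven by `config._attn_implementation`, normally selected via
# `from_pretrained(..., attn_implementation="eager"|"sdpa")`. Setting the attribute
# by hand, as below, is only for demonstration.
def _demo_attention_dispatch():
    config = AlbertConfig(hidden_size=64, num_attention_heads=4)
    config._attn_implementation = "sdpa"  # normally set by from_pretrained
    attn = ALBERT_ATTENTION_CLASSES[config._attn_implementation](config)
    assert isinstance(attn, AlbertSdpaAttention)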
ejejf dœdd„Zejejd	œd
d„Z‡  ZS )ÚAlbertLayerrU   c                    s„   t ƒ  ¡  || _|j| _d| _tj|j|jd| _	t
|j |ƒ| _t |j|j¡| _t |j|j¡| _t|j | _t |j¡| _d S )Nr   rW   )ra   rb   rH   Úchunk_size_feed_forwardÚseq_len_dimr   r    rŽ   rj   r   ÚALBERT_ATTENTION_CLASSESÚ_attn_implementationr   r’   Zintermediate_sizer   Ú
ffn_outputr
   Ú
hidden_actÚ
activationrk   rl   rm   rs   ru   rQ   rR   rb   ¢  s    
zAlbertLayer.__init__NF©r¤   r¥   r¦   r§   Úoutput_hidden_statesrz   c                 C   sL   |   ||||¡}t| j| j| j|d ƒ}|  ||d  ¡}|f|dd …  S )Nr   r   )r   r   Úff_chunkrÄ   rÅ   r   )rt   r¤   r¥   r¦   r§   rÌ   rÀ   rÈ   rQ   rQ   rR   r‚   ¯  s    üzAlbertLayer.forward)rÀ   rz   c                 C   s"   |   |¡}|  |¡}|  |¡}|S r¼   )r   rÊ   rÈ   )rt   rÀ   rÈ   rQ   rQ   rR   rÍ   Ã  s    


zAlbertLayer.ff_chunk)NNFF)rƒ   r„   r…   r   rb   rE   r‰   r   rˆ   r¹   rº   r‚   rÍ   rŠ   rQ   rQ   ru   rR   rÃ   ¡  s       úùrÃ   c                       sb   e Zd Zedœ‡ fdd„Zd
ejeej eej e	e	e
eeje
ej f df dœdd	„Z‡  ZS )ÚAlbertLayerGrouprU   c                    s.   t ƒ  ¡  t ‡ fdd„tˆ jƒD ƒ¡| _d S )Nc                    s   g | ]}t ˆ ƒ‘qS rQ   )rÃ   ©Ú.0r±   rU   rQ   rR   Ú
<listcomp>Î  ó    z-AlbertLayerGroup.__init__.<locals>.<listcomp>)ra   rb   r   Ú
ModuleListÚrangeÚinner_group_numÚalbert_layersrs   ru   rU   rR   rb   Ë  s    
zAlbertLayerGroup.__init__NF.rË   c                 C   s|   d}d}t | jƒD ]B\}}	|	|||| |ƒ}
|
d }|rF||
d f }|r||f }q|f}|rj||f }|rx||f }|S )NrQ   r   r   )Ú	enumeraterÖ   )rt   r¤   r¥   r¦   r§   rÌ   Zlayer_hidden_statesZlayer_attentionsZlayer_indexZalbert_layerZlayer_outputÚoutputsrQ   rQ   rR   r‚   Ð  s    

zAlbertLayerGroup.forward)NNFF)rƒ   r„   r…   r   rb   rE   r‰   r   rˆ   r¹   rº   r   r‚   rŠ   rQ   rQ   ru   rR   rÎ   Ê  s       úùrÎ   c                
class AlbertTransformer(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()
        self.config = config
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[BaseModelOutput, tuple]:
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)

        all_hidden_states = (hidden_states,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        head_mask = [None] * self.config.num_hidden_layers if head_mask is None else head_mask

        for i in range(self.config.num_hidden_layers):
            # Number of layers in a hidden group
            layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)

            # Index of the hidden group
            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))

            layer_group_output = self.albert_layer_groups[group_idx](
                hidden_states,
                attention_mask,
                head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
                output_attentions,
                output_hidden_states,
            )
            hidden_states = layer_group_output[0]

            if output_attentions:
                all_attentions = all_attentions + layer_group_output[-1]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
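# Worked example (illustrative, not part of the original module) of the
# layer-to-group mapping used in AlbertTransformer.forward: with 12 hidden layers
# and 2 hidden groups, layers 0-5 all run group 0 and layers 6-11 all run group 1,
# so the same parameters are applied repeatedly -- ALBERT's cross-layer sharing.
def _demo_group_index_mapping():
    num_hidden_layers, num_hidden_groups = 12, 2
    layers_per_group = num_hidden_layers // num_hidden_groups
    group_for_layer = [int(i / (num_hidden_layers / num_hidden_groups)) for i in range(num_hidden_layers)]
    assert layers_per_group == 6
    assert group_for_layer == [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]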
@auto_docstring
class AlbertPreTrainedModel(PreTrainedModel):
    config: AlbertConfig
    load_tf_weights = load_tf_weights_in_albert
    base_model_prefix = "albert"
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, AlbertMLMHead):
            module.bias.data.zero_()
@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`AlbertForPreTraining`].
    """
)
class AlbertForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    """

    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    sop_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
@auto_docstring
class AlbertModel(AlbertPreTrainedModel):
    config: AlbertConfig
    base_model_prefix = "albert"

    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        if add_pooling_layer:
            self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
            self.pooler_activation = nn.Tanh()
        else:
            self.pooler = None
            self.pooler_activation = None

        self.attn_implementation = config._attn_implementation
        self.position_embedding_type = config.position_embedding_type

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has
        a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT
        model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.

        These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
        while [2,3] correspond to the two inner groups of the second hidden layer.

        Any layer with in index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more
        information about head pruning
        """
        for layer, heads in heads_to_prune.items():
            group_idx = int(layer / self.config.inner_group_num)
            inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
            self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPooling, tuple]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        embedding_output = self.embeddings(
            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        use_sdpa_attention_mask = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )

        if use_sdpa_attention_mask:
            extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                attention_mask, embedding_output.dtype, tgt_len=seq_length
            )
        else:
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]

        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
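# Usage sketch for the bare encoder (illustrative, not part of the original module).
# "albert/albert-base-v2" is the checkpoint already referenced by the docstring
# examples in this file; downloading it requires network access.
def _demo_albert_model():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
    model = AlbertModel.from_pretrained("albert/albert-base-v2")
    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # last_hidden_state: (batch, seq_len, hidden_size); pooler_output: (batch, hidden_size)
    print(outputs.last_hidden_state.shape, outputs.pooler_output.shape)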
@auto_docstring(
    custom_intro="""
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    """
)
class AlbertForPreTraining(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.predictions = AlbertMLMHead(config)
        self.sop_classifier = AlbertSOPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        sentence_order_label: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[AlbertForPreTrainingOutput, tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
            sequence B), `1` indicates switched order (sequence B, then sequence A).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, AlbertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]

        prediction_scores = self.predictions(sequence_output)
        sop_scores = self.sop_classifier(pooled_output)

        total_loss = None
        if labels is not None and sentence_order_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
            total_loss = masked_lm_loss + sentence_order_loss

        if not return_dict:
            output = (prediction_scores, sop_scores) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return AlbertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            sop_logits=sop_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
class AlbertMLMHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]
        self.decoder.bias = self.bias

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.decoder(hidden_states)

        prediction_scores = hidden_states

        return prediction_scores

    def _tie_weights(self) -> None:
        # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
        if self.decoder.bias.device.type == "meta":
            self.decoder.bias = self.bias
        else:
            self.bias = self.decoder.bias
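# Shape sketch for AlbertMLMHead (illustrative, not part of the original module):
# the head first maps hidden states back down to `embedding_size` before projecting
# onto the vocabulary, mirroring ALBERT's factorized input embedding.
def _demo_mlm_head_shapes():
    config = AlbertConfig(vocab_size=100, embedding_size=16, hidden_size=32)
    head = AlbertMLMHead(config)
    scores = head(torch.randn(2, 5, config.hidden_size))
    assert scores.shape == (2, 5, config.vocab_size)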
class AlbertSOPHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        dropout_pooled_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_pooled_output)
        return logits
@auto_docstring
class AlbertForMaskedLM(AlbertPreTrainedModel):
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.predictions = AlbertMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings
        self.predictions.bias = new_embeddings.bias

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlbertForMaskedLM

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_outputs = outputs[0]

        prediction_scores = self.predictions(sequence_outputs)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
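# Illustrative sketch of the `problem_type` dispatch above (not part of the original
# module): with integer labels and `num_labels > 1`, the head selects single-label
# classification and uses CrossEntropyLoss. The tiny config values are arbitrary.
def _demo_sequence_classification_loss():
    config = AlbertConfig(
        hidden_size=32, num_attention_heads=4, intermediate_size=64, num_hidden_layers=1, num_labels=3
    )
    model = AlbertForSequenceClassification(config).eval()
    input_ids = torch.randint(0, config.vocab_size, (2, 7))
    labels = torch.tensor([0, 2])  # long dtype -> "single_label_classification"
    out = model(input_ids=input_ids, labels=labels)
    assert out.logits.shape == (2, 3) and out.loss is not None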
@auto_docstring
class AlbertForTokenClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config, add_pooling_layer=False)
        classifier_dropout_prob = (
            config.classifier_dropout_prob
            if config.classifier_dropout_prob is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[QuestionAnsweringModelOutput, tuple]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
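# Illustrative sketch (not part of the original module): the clamp above keeps
# out-of-span gold positions from corrupting the loss -- positions beyond the
# sequence length are clamped to `ignored_index` and then ignored by
# CrossEntropyLoss(ignore_index=...). Tiny config values for a quick shape check.
def _demo_qa_positions():
    config = AlbertConfig(hidden_size=32, num_attention_heads=4, intermediate_size=64, num_hidden_layers=1)
    model = AlbertForQuestionAnswering(config).eval()
    input_ids = torch.randint(0, config.vocab_size, (1, 8))
    out = model(input_ids=input_ids, start_positions=torch.tensor([2]), end_positions=torch.tensor([5]))
    assert out.start_logits.shape == (1, 8) and out.end_logits.shape == (1, 8)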
@auto_docstring
class AlbertForMultipleChoice(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MultipleChoiceModelOutput, tuple]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
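# Illustrative sketch (not part of the original module) of the flattening performed
# in AlbertForMultipleChoice.forward: inputs of shape (batch, num_choices, seq_len)
# are folded into the batch dimension, each choice is scored independently, and the
# scores are reshaped back to one logit per choice.
def _demo_multiple_choice_shapes():
    config = AlbertConfig(hidden_size=32, num_attention_heads=4, intermediate_size=64, num_hidden_layers=1)
    model = AlbertForMultipleChoice(config).eval()
    input_ids = torch.randint(0, config.vocab_size, (2, 4, 6))  # batch=2, choices=4, seq_len=6
    out = model(input_ids=input_ids)
    assert out.logits.shape == (2, 4)  # one score per choice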
__all__ = [
    "load_tf_weights_in_albert",
    "AlbertPreTrainedModel",
    "AlbertModel",
    "AlbertForPreTraining",
    "AlbertForMaskedLM",
    "AlbertForSequenceClassification",
    "AlbertForTokenClassification",
    "AlbertForQuestionAnswering",
    "AlbertForMultipleChoice",
]