# coding=utf-8
"""PyTorch DeiT model."""

import collections.abc
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    MaskedImageModelingOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging, torch_int
from ...utils.generic import can_return_tuple, check_model_inputs
from .configuration_deit import DeiTConfig


logger = logging.get_logger(__name__)


class DeiTEmbeddings(nn.Module):
    """
    Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.
    """
    FN)configuse_mask_tokenreturnc                    s   t    ttdd|j| _ttdd|j| _|rTttdd|jnd | _	t
|| _| jj}ttd|d |j| _t|j| _|j| _d S )Nr      )super__init__r   	ParametertorchZzeroshidden_size	cls_tokendistillation_token
mask_tokenDeiTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_size)selfr   r   r+   	__class__ b/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/deit/modeling_deit.pyr"   1   s    
 
zDeiTEmbeddings.__init__)
embeddingsheightwidthr   c                 C   s   |j d d }| jj d d }tj s>||kr>||kr>| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and 2 class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r    N      ?r   r   ZbicubicF)sizemodeZalign_cornersdim)shaper,   r$   Zjit
is_tracingr0   r   reshapepermuter   
functionalZinterpolateviewcat)r1   r6   r7   r8   r+   Znum_positionsZclass_and_dist_pos_embedZpatch_pos_embedr>   Z
new_heightZ	new_widthZsqrt_num_positionsr4   r4   r5   interpolate_pos_encoding=   s(    



z'DeiTEmbeddings.interpolate_pos_encoding)pixel_valuesbool_masked_posrF   r   c                 C   s   |j \}}}}| |}| \}}	}|d urb| j||	d}
|d|
}|d|  |
|  }| j|dd}| j|dd}t	j
|||fdd}| j}|r| |||}|| }| |}|S )Nr9         ?r   r=   )r?   r*   r;   r(   expand	unsqueezeZtype_asr&   r'   r$   rE   r,   rF   r/   )r1   rG   rH   rF   _r7   r8   r6   
batch_sizeZ
seq_lengthZmask_tokensmaskZ
cls_tokensZdistillation_tokensZposition_embeddingr4   r4   r5   forwarde   s     

zDeiTEmbeddings.forward)F)NF)__name__
__module____qualname____doc__r   boolr"   r$   TensorintrF   r   
BoolTensorrO   __classcell__r4   r4   r2   r5   r   ,   s   +  r   c                       s4   e Zd ZdZ fddZejejdddZ  ZS )r)   z
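# A minimal usage sketch for position-embedding interpolation (assuming the public
# "facebook/deit-base-distilled-patch16-224" checkpoint, whose pretrained table holds
# 196 patch positions plus the two special tokens):
#
#     >>> import torch
#     >>> from transformers import DeiTModel
#     >>> model = DeiTModel.from_pretrained("facebook/deit-base-distilled-patch16-224")
#     >>> pixel_values = torch.randn(1, 3, 384, 384)  # larger than the 224x224 pretraining size
#     >>> outputs = model(pixel_values, interpolate_pos_encoding=True)
#     >>> outputs.last_hidden_state.shape  # (384 // 16) ** 2 = 576 patches + CLS + distillation
#     torch.Size([1, 578, 768])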
class DeiTPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        _, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        x = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return x
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Scaled dot-product attention, computed eagerly
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
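# A hedged summary of the contract this function shares with the pluggable backends
# in ALL_ATTENTION_FUNCTIONS (an informal reading, not an official spec): query/key/
# value arrive as (batch, num_heads, seq_len, head_dim); the softmax runs in float32
# for numerical stability before being cast back to the input dtype; and the output
# is returned as (batch, seq_len, num_heads, head_dim) so the caller can flatten the
# last two axes back to all_head_size.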
S )	DeiTSelfAttentionr   c                    s   t    |j|j dkr>t|ds>td|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   Zembedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r!   r"   r%   num_attention_headshasattrrb   r   rV   attention_head_sizeall_head_sizeZattention_probs_dropout_probdropout_probrl   	is_causalr   LinearZqkv_biasrh   ri   rj   r1   r   r2   r4   r5   r"      s"    

zDeiTSelfAttention.__init__Nhidden_states	head_maskr   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t}| j	j
dkr~t| j	j
 }|| ||||| j| j| jsdn| jd\}	}
|	 d d | jf }|	|}	|	|
fS )	Nr   r9   r   r    eagerrf   )r   rl   r/   rm   )r?   r{   r}   ri   rD   rd   rj   rh   rv   r   Z_attn_implementationr   r   rl   rp   r   r;   r~   rA   )r1   r   r   rM   Z	new_shapeZ	key_layerZvalue_layerZquery_layerZattention_interfaceZcontext_layerZattention_probsZnew_context_layer_shaper4   r4   r5   rO      s*    


zDeiTSelfAttention.forward)N)rP   rQ   rR   r   r"   r$   rU   r   tuplerO   rX   r4   r4   r2   r5   rw      s    rw   c                       s>   e Zd ZdZed fddZejejejdddZ  Z	S )DeiTSelfOutputz
class DeiTSelfOutput(nn.Module):
    """
    The residual connection is defined in DeiTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: DeiTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
ej	 ej	dd	d
Z  ZS )DeiTAttentionrx   c                    s*   t    t|| _t|| _t | _d S r   )r!   r"   rw   	attentionr   outputsetpruned_headsr   r2   r4   r5   r"     s    


zDeiTAttention.__init__)headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r=   )lenr   r   r{   r}   r   r   rh   ri   rj   r   r   r~   union)r1   r   indexr4   r4   r5   prune_heads  s    zDeiTAttention.prune_headsNr   c                 C   s    |  ||\}}| ||}|S r   )r   r   )r1   r   r   Zself_attn_outputrL   r   r4   r4   r5   rO   %  s    zDeiTAttention.forward)N)rP   rQ   rR   r   r"   r   rV   r   r$   rU   r   rO   rX   r4   r4   r2   r5   r     s   r   c                       s6   e Zd Zed fddZejejdddZ  ZS )DeiTIntermediaterx   c                    sB   t    t|j|j| _t|jt	r6t
class DeiTIntermediate(nn.Module):
    def __init__(self, config: DeiTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states
class DeiTOutput(nn.Module):
    def __init__(self, config: DeiTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states
S )
	DeiTLayerz?This corresponds to the Block class in the timm implementation.rx   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r!   r"   Zchunk_size_feed_forwardZseq_len_dimr   r   r   intermediater   r   r   	LayerNormr%   layer_norm_epslayernorm_beforelayernorm_afterr   r2   r4   r5   r"   M  s    



zDeiTLayer.__init__Nr   c                 C   sB   |  |}| ||}|| }| |}| |}| ||}|S r   )r   r   r   r   r   )r1   r   r   Zhidden_states_normZattention_outputZlayer_outputr4   r4   r5   rO   W  s    


zDeiTLayer.forward)N)rP   rQ   rR   rS   r   r"   r$   rU   r   rO   rX   r4   r4   r2   r5   r   J  s   
r   c                       s>   e Zd Zed fddZdejeej edddZ	  Z
S )	DeiTEncoderrx   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r4   )r   ).0rL   rx   r4   r5   
<listcomp>m      z(DeiTEncoder.__init__.<locals>.<listcomp>F)	r!   r"   r   r   Z
ModuleListrangenum_hidden_layerslayerZgradient_checkpointingr   r2   rx   r5   r"   j  s    
 zDeiTEncoder.__init__Nr   c                 C   s<   t | jD ]&\}}|d ur"|| nd }|||}q
t|dS )N)last_hidden_state)	enumerater   r
   )r1   r   r   iZlayer_moduleZlayer_head_maskr4   r4   r5   rO   p  s    zDeiTEncoder.forward)N)rP   rQ   rR   r   r"   r$   rU   r   r
   rO   rX   r4   r4   r2   r5   r   i  s   r   c                   @   sb   e Zd ZU eed< dZdZdZdgZdZ	dZ
dZdZeedZeejejejf ddd	d
ZdS )DeiTPreTrainedModelr   deitrG   Tr   )r   
attentionsN)rg   r   c                 C   s   t |tjtjfrZtjj|jjt	j
d| jjd|jj|j_|jdur|jj  nlt |tjr|jj  |jjd nDt |tr|jj  |jj  |jj  |jdur|jj  dS )zInitialize the weightsrf   )meanZstdNrI   )r\   r   r   r`   initZtrunc_normal_weightdatars   r$   rr   r   Zinitializer_rangern   rz   Zzero_r   Zfill_r   r&   r,   r'   r(   )r1   rg   r4   r4   r5   _init_weights  s"    


z!DeiTPreTrainedModel._init_weights)rP   rQ   rR   r   __annotations__Zbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_no_split_modulesZ_supports_sdpaZ_supports_flash_attnZ_supports_flex_attnZ_supports_attention_backendr   rw   Z_can_record_outputsr   r   r   r`   r   r   r4   r4   r4   r5   r   x  s   
r   c                
@auto_docstring
class DeiTModel(DeiTPreTrainedModel):
    def __init__(self, config: DeiTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False) -> None:
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        """
        super().__init__(config)
        self.config = config

        self.embeddings = DeiTEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = DeiTEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = DeiTPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> DeiTPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # cast pixel_values to the dtype expected by the patch projection
        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
        if pixel_values.dtype != expected_dtype:
            pixel_values = pixel_values.to(expected_dtype)

        embedding_output = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoder_outputs = self.encoder(embedding_output, head_mask=head_mask)
        sequence_output = encoder_outputs.last_hidden_state
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
        )
class DeiTPooler(nn.Module):
    def __init__(self, config: DeiTConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
        self.activation = ACT2FN[config.pooler_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
@auto_docstring(
    custom_intro="""
    DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """
)
class DeiTForMaskedImageModeling(DeiTPreTrainedModel):
    def __init__(self, config: DeiTConfig) -> None:
        super().__init__(config)

        self.deit = DeiTModel(config, add_pooling_layer=False, use_mask_token=True)

        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=config.hidden_size,
                out_channels=config.encoder_stride**2 * config.num_channels,
                kernel_size=1,
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MaskedImageModelingOutput:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DeiTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```"""
        outputs = self.deit(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state

        # Reshape to (batch_size, num_channels, height, width)
        sequence_output = sequence_output[:, 1:-1]
        batch_size, sequence_length, num_channels = sequence_output.shape
        height = width = int(sequence_length**0.5)
        sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        return MaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    """
)
class DeiTForImageClassification(DeiTPreTrainedModel):
    def __init__(self, config: DeiTConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.deit = DeiTModel(config, add_pooling_layer=False)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DeiTForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: Polaroid camera, Polaroid Land camera
        ```"""
        outputs = self.deit(
            pixel_values,
            head_mask=head_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state

        # we don't use the distillation token
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`DeiTForImageClassificationWithTeacher`].
    """
)
class DeiTForImageClassificationWithTeacherOutput(ModelOutput):
    r"""
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores as the average of the cls_logits and distillation logits.
    cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
        class token).
    distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
        distillation token).
    """

    logits: Optional[torch.FloatTensor] = None
    cls_logits: Optional[torch.FloatTensor] = None
    distillation_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
r   a  
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::

           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                	       sT   e Zd Zedd fddZeed	eej	 eej	 e
ee edddZ  ZS )
%DeiTForImageClassificationWithTeacherNr   c                    sv   t  | |j| _t|dd| _|jdkr<t|j|jnt | _	|jdkr`t|j|jnt | _
|   d S r   )r!   r"   r   r   r   r   r   r%   r   cls_classifierdistillation_classifierr   r   r2   r4   r5   r"     s      z.DeiTForImageClassificationWithTeacher.__init__F)rG   r   rF   ru   r   c           
      K   sx   | j |f||d|}|j}| |d d dd d f }| |d d dd d f }|| d }	t|	|||j|jdS )Nr   r   r   r    )r   r   r   r   r   )r   r   r   r   r   r   r   )
r1   rG   r   rF   ru   r   r   r   r   r   r4   r4   r5   rO     s&    	z-DeiTForImageClassificationWithTeacher.forward)NNF)rP   rQ   rR   r   r"   r   r   r   r$   rU   rT   r   r   r   rO   rX   r4   r4   r2   r5   r     s      r   )r   r   r   r   r   )rf   )@rS   collections.abcr]   dataclassesr   typingr   r   r   r$   Ztorch.utils.checkpointr   Zactivationsr   Zmodeling_layersr	   Zmodeling_outputsr
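# At inference time the class and distillation heads are simply averaged; a short
# sketch (reusing `inputs` from the DeiTForImageClassification doctest above):
#
#     >>> from transformers import DeiTForImageClassificationWithTeacher
#     >>> model = DeiTForImageClassificationWithTeacher.from_pretrained(
#     ...     "facebook/deit-base-distilled-patch16-224"
#     ... )
#     >>> outputs = model(**inputs)
#     >>> torch.allclose(outputs.logits, (outputs.cls_logits + outputs.distillation_logits) / 2)
#     True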
__all__ = [
    "DeiTForImageClassification",
    "DeiTForImageClassificationWithTeacher",
    "DeiTForMaskedImageModeling",
    "DeiTModel",
    "DeiTPreTrainedModel",
]