a
    h                     @   s   d dl mZmZ d dlZd dlmZ d dlmZ ddlm	Z	m
Z
 ddlmZ ddlmZmZmZ dd	lmZmZmZmZ G d
d deZeG dd deZG dd deeZeddG dd deeZg dZdS )    )OptionalUnionN)IJepaConfig   )BaseModelOutputWithPoolingImageClassifierOutput)Unpack)TransformersKwargsauto_docstring	torch_int   )ViTEmbeddingsViTForImageClassificationViTModelViTPreTrainedModelc                       s`   e Zd Zdeedd fddZejeeejdddZ	deje
ej eejd	d
dZ  ZS )IJepaEmbeddingsFN)configuse_mask_tokenreturnc                    s6   t  || | `| jj}ttd||j	| _
d S )N   )super__init__Z	cls_tokenpatch_embeddingsnum_patchesnn	ParametertorchZrandnZhidden_sizeposition_embeddings)selfr   r   r   	__class__ c/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/ijepa/modular_ijepa.pyr      s    zIJepaEmbeddings.__init__)
embeddingsheightwidthr   c                 C   s   |j d }| jj d }tj s6||kr6||kr6| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r   r   ZbicubicF)sizemodeZalign_corners)shaper   r   Zjit
is_tracingZ
patch_sizer   ZreshapeZpermuter   Z
functionalZinterpolateview)r   r#   r$   r%   r   Znum_positionsZpatch_pos_embeddimZ
new_heightZ	new_widthZsqrt_num_positionsr!   r!   r"   interpolate_pos_encoding   s&    




z(IJepaEmbeddings.interpolate_pos_encoding)pixel_valuesbool_masked_posr-   r   c                 C   s   |j \}}}}| j||d}|d urb|j d }	| j||	d}
|d|
}|d|  |
|  }|rz|| ||| }n
|| j }| |}|S )N)r-   r   r&         ?)	r)   r   
mask_tokenexpandZ	unsqueezeZtype_asr-   r   Zdropout)r   r.   r/   r-   Z
batch_size_r$   r%   r#   Z
seq_lengthZmask_tokensmaskr!   r!   r"   forward=   s    


zIJepaEmbeddings.forward)F)NF)__name__
__module____qualname__r   boolr   r   Tensorintr-   r   Z
BoolTensorr5   __classcell__r!   r!   r   r"   r      s   *  r   c                   @   s,   e Zd Zeejejejf ddddZdS )IJepaPreTrainedModelN)moduler   c                 C   s   t |tjtjfrZtjj|jjt	j
d| jjd|jj|j_|jdur|jj  nxt |tjr|jj  |jjd nPt |trtjj|jjt	j
d| jjd|jj|j_|jdur|jj  dS )zInitialize the weightsg        )meanZstdNr0   )
isinstancer   LinearConv2dinitZtrunc_normal_weightdatator   Zfloat32r   Zinitializer_rangeZdtypeZbiasZzero_	LayerNormZfill_r   r   r1   )r   r>   r!   r!   r"   _init_weightsZ   s*    


z"IJepaPreTrainedModel._init_weights)	r6   r7   r8   r   r   rA   rB   rG   rH   r!   r!   r!   r"   r=   X   s   r=   c                       s(   e Zd Zdeeed fddZ  ZS )
IJepaModelF)r   add_pooling_layerr   c                    s$   t  | || _t||d| _dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   N)r   r   r   r   r#   )r   r   rJ   r   r   r!   r"   r   r   s    zIJepaModel.__init__)FF)r6   r7   r8   r   r9   r   r<   r!   r!   r   r"   rI   q   s   rI   a  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )Zcustom_introc                       sV   e Zd Zed fddZdeej eej eej ee e	e
 edddZ  ZS )	IJepaForImageClassification)r   c                    s&   t  | t|dd| _|   d S )NF)rJ   )r   r   rI   ijepaZ	post_init)r   r   r   r!   r"   r      s    z$IJepaForImageClassification.__init__N)r.   	head_masklabelsr-   kwargsr   c           
      K   sh   | j |f||d|}|j}| |jdd}d}	|durT| j||| jfi |}	t|	||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )rM   r-   r   )r,   N)losslogitshidden_states
attentions)	rL   Zlast_hidden_state
classifierr?   Zloss_functionr   r   rR   rS   )
r   r.   rM   rN   r-   rO   outputsZsequence_outputrQ   rP   r!   r!   r"   r5      s&    z#IJepaForImageClassification.forward)NNNN)r6   r7   r8   r   r   r   r   r:   r9   r   r	   r   r5   r<   r!   r!   r   r"   rK   ~   s       rK   )r=   rI   rK   )typingr   r   r   Ztorch.nnr   Z-transformers.models.ijepa.configuration_ijepar   Zmodeling_outputsr   r   Zprocessing_utilsr   utilsr	   r
   r   Zvit.modeling_vitr   r   r   r   r   r=   rI   rK   __all__r!   r!   r!   r"   <module>   s    J*