"""PyTorch DINOv2 model."""

import collections.abc
from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import TransformersKwargs, auto_docstring, logging, torch_int
from ...utils.backbone_utils import BackboneMixin
from ...utils.generic import can_return_tuple, check_model_inputs
from .configuration_dinov2 import Dinov2Config


logger = logging.get_logger(__name__)


class Dinov2Embeddings(nn.Module):
    """
    Construct the CLS token, mask token, position and patch embeddings.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
        self.patch_embeddings = Dinov2PatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        target_dtype = patch_pos_embed.dtype
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.to(dtype=torch.float32),
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        ).to(dtype=target_dtype)
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        target_dtype = self.patch_embeddings.projection.weight.dtype
        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))

        if bool_masked_pos is not None and self.config.use_mask_token:
            embeddings = torch.where(
                bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
            )

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings


class Dinov2PatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the"
                f" configuration. Expected {self.num_channels} but got {num_channels}."
            )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Dinov2SelfAttention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def forward(
        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = hidden_states.shape[0]
        new_shape = (batch_size, -1, self.num_attention_heads, self.attention_head_size)

        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        return context_layer, attention_probs


class Dinov2SelfOutput(nn.Module):
    """
    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class Dinov2Attention(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.attention = Dinov2SelfAttention(config)
        self.output = Dinov2SelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        self_attn_output, _ = self.attention(hidden_states, head_mask)
        attention_output = self.output(self_attn_output, hidden_states)
        return attention_output


class Dinov2LayerScale(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        return hidden_state * self.lambda1


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class Dinov2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"


class Dinov2MLP(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        in_features = out_features = config.hidden_size
        hidden_features = int(config.hidden_size * config.mlp_ratio)
        self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
        if isinstance(config.hidden_act, str):
            self.activation = ACT2FN[config.hidden_act]
        else:
            self.activation = config.hidden_act
        self.fc2 = nn.Linear(hidden_features, out_features, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.fc1(hidden_state)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.fc2(hidden_state)
        return hidden_state


class Dinov2SwiGLUFFN(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        in_features = out_features = config.hidden_size
        hidden_features = int(config.hidden_size * config.mlp_ratio)
        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8

        self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
        self.weights_out = nn.Linear(hidden_features, out_features, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = self.weights_in(hidden_state)
        x1, x2 = hidden_state.chunk(2, dim=-1)
        hidden = nn.functional.silu(x1) * x2
        return self.weights_out(hidden)


class Dinov2Layer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the original implementation."""

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()

        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = Dinov2Attention(config)
        self.layer_scale1 = Dinov2LayerScale(config)
        self.drop_path = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()

        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        if config.use_swiglu_ffn:
            self.mlp = Dinov2SwiGLUFFN(config)
        else:
            self.mlp = Dinov2MLP(config)
        self.layer_scale2 = Dinov2LayerScale(config)

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        hidden_states_norm = self.norm1(hidden_states)
        self_attention_output = self.attention(hidden_states_norm, head_mask)
        self_attention_output = self.layer_scale1(self_attention_output)

        # first residual connection
        hidden_states = self.drop_path(self_attention_output) + hidden_states

        # in Dinov2, layernorm is also applied after self-attention
        layer_output = self.norm2(hidden_states)
        layer_output = self.mlp(layer_output)
        layer_output = self.layer_scale2(layer_output)

        # second residual connection
        layer_output = self.drop_path(layer_output) + hidden_states

        return layer_output


class Dinov2Encoder(nn.Module):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([Dinov2Layer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_hidden_states: bool = False,
    ) -> BaseModelOutput:
        all_hidden_states = [hidden_states] if output_hidden_states else None

        for i, layer_module in enumerate(self.layer):
            layer_head_mask = head_mask[i] if head_mask is not None else None
            hidden_states = layer_module(hidden_states, layer_head_mask)
            if output_hidden_states:
                all_hidden_states.append(hidden_states)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=tuple(all_hidden_states) if output_hidden_states else None,
        )


@auto_docstring
class Dinov2PreTrainedModel(PreTrainedModel):
    config: Dinov2Config
    base_model_prefix = "dinov2"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Dinov2SwiGLUFFN"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {"attentions": Dinov2SelfAttention}

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to fp32 for `trunc_normal_`, then cast back to the original dtype
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, Dinov2Embeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.position_embeddings.dtype)
            module.cls_token.data = nn.init.trunc_normal_(
                module.cls_token.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.cls_token.dtype)
            if self.config.use_mask_token:
                module.mask_token.data.zero_()
        elif isinstance(module, Dinov2LayerScale):
            module.lambda1.data.fill_(self.config.layerscale_value)


@auto_docstring
class Dinov2Model(Dinov2PreTrainedModel):
    def __init__(self, config: Dinov2Config):
        super().__init__(config)
        self.config = config

        self.embeddings = Dinov2Embeddings(config)
        self.encoder = Dinov2Encoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
            pre-training.
        """
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs: BaseModelOutput = self.encoder(
            embedding_output, head_mask=head_mask, output_hidden_states=output_hidden_states
        )
        sequence_output = encoder_outputs.last_hidden_state
        sequence_output = self.layernorm(sequence_output)
        pooled_output = sequence_output[:, 0, :]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )


@auto_docstring(
    custom_intro="""
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    """
)
class Dinov2ForImageClassification(Dinov2PreTrainedModel):
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.dinov2 = Dinov2Model(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs: BaseModelOutputWithPooling = self.dinov2(pixel_values, head_mask=head_mask, **kwargs)

        sequence_output = outputs.last_hidden_state  # batch_size, sequence_length, hidden_size

        cls_token = sequence_output[:, 0]
        patch_tokens = sequence_output[:, 1:]

        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)

        logits = self.classifier(linear_input)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    """
)
class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
        self.embeddings = Dinov2Embeddings(config)
        self.encoder = Dinov2Encoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 16, 16]
        ```"""
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        embedding_output = self.embeddings(pixel_values)

        outputs: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True)
        hidden_states = outputs.hidden_states

        feature_maps = []
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                if self.config.apply_layernorm:
                    hidden_state = self.layernorm(hidden_state)
                if self.config.reshape_hidden_states:
                    hidden_state = hidden_state[:, 1:]
                    batch_size, _, height, width = pixel_values.shape
                    patch_size = self.config.patch_size
                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
                feature_maps.append(hidden_state)

        return BackboneOutput(
            feature_maps=tuple(feature_maps),
            hidden_states=hidden_states if output_hidden_states else None,
        )


__all__ = ["Dinov2ForImageClassification", "Dinov2Model", "Dinov2PreTrainedModel", "Dinov2Backbone"]