"""PyTorch ViT model."""

import collections.abc
import math
from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    MaskedImageModelingOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import TransformersKwargs, auto_docstring, logging, torch_int
from ...utils.generic import can_return_tuple, check_model_inputs
from .configuration_vit import ViTConfig


logger = logging.get_logger(__name__)


class ViTEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: ViTConfig, use_mask_token: bool = False) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
        self.patch_embeddings = ViTPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]
        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_height, new_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        if bool_masked_pos is not None:
            seq_length = embeddings.shape[1]
            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)
        return embeddings


class ViTPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the"
                f" configuration. Expected {self.num_channels} but got {num_channels}."
            )
        if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model"
                f" ({self.image_size[0]}*{self.image_size[1]})."
            )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities (softmax in float32 for stability).
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


class ViTSelfAttention(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def forward(
        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = hidden_states.shape[0]
        new_shape = (batch_size, -1, self.num_attention_heads, self.attention_head_size)
        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)
        return context_layer, attention_probs


class ViTSelfOutput(nn.Module):
    """
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class ViTAttention(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.attention = ViTSelfAttention(config)
        self.output = ViTSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        self_attn_output, _ = self.attention(hidden_states, head_mask)
        attention_output = self.output(self_attn_output, hidden_states)
        return attention_output


class ViTIntermediate(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class ViTOutput(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class ViTLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ViTAttention(config)
        self.intermediate = ViTIntermediate(config)
        self.output = ViTOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # in ViT, layernorm is applied before self-attention
        hidden_states_norm = self.layernorm_before(hidden_states)
        attention_output = self.attention(hidden_states_norm, head_mask)

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in ViT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection
        layer_output = self.output(layer_output, hidden_states)
        return layer_output


class ViTEncoder(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ViTLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> BaseModelOutput:
        for i, layer_module in enumerate(self.layer):
            layer_head_mask = head_mask[i] if head_mask is not None else None
            hidden_states = layer_module(hidden_states, layer_head_mask)
        return BaseModelOutput(last_hidden_state=hidden_states)


@auto_docstring
class ViTPreTrainedModel(PreTrainedModel):
    config: ViTConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ViTEmbeddings", "ViTLayer"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": ViTLayer,
        "attentions": ViTSelfAttention,
    }

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to `fp32` and cast back to the desired `dtype` to avoid `trunc_normal_cpu`
            # not being implemented for `half`.
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, ViTEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.position_embeddings.dtype)
            module.cls_token.data = nn.init.trunc_normal_(
                module.cls_token.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.cls_token.dtype)
            if module.mask_token is not None:
                module.mask_token.data.zero_()


@auto_docstring
class ViTModel(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False):
        r"""
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        """
        super().__init__(config)
        self.config = config

        self.embeddings = ViTEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = ViTEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = ViTPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> ViTPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # Cast the input to the dtype of the patch projection, if needed.
        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
        if pixel_values.dtype != expected_dtype:
            pixel_values = pixel_values.to(expected_dtype)

        embedding_output = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoder_outputs: BaseModelOutput = self.encoder(embedding_output, head_mask=head_mask)
        sequence_output = encoder_outputs.last_hidden_state
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)


class ViTPooler(nn.Module):
    def __init__(self, config: ViTConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
        self.activation = ACT2FN[config.pooler_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # "Pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring(
    custom_intro="""
    ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """
)
class ViTForMaskedImageModeling(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__(config)

        self.vit = ViTModel(config, add_pooling_layer=False, use_mask_token=True)

        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=config.hidden_size,
                out_channels=config.encoder_stride**2 * config.num_channels,
                kernel_size=1,
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MaskedImageModelingOutput:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```"""
        if bool_masked_pos is not None and (self.config.patch_size != self.config.encoder_stride):
            raise ValueError(
                "When `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that "
                "the reconstructed image has the same dimensions as the input. "
                f"Got `patch_size` = {self.config.patch_size} and `encoder_stride` = {self.config.encoder_stride}."
            )

        outputs: BaseModelOutputWithPooling = self.vit(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state

        # Reshape to (batch_size, num_channels, height, width), dropping the [CLS] token
        sequence_output = sequence_output[:, 1:]
        batch_size, sequence_length, num_channels = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        return MaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    """
)
class ViTForImageClassification(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vit = ViTModel(config, add_pooling_layer=False)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs: BaseModelOutputWithPooling = self.vit(
            pixel_values, head_mask=head_mask, interpolate_pos_encoding=interpolate_pos_encoding, **kwargs
        )

        sequence_output = outputs.last_hidden_state

        # Classify from the final hidden state of the [CLS] token
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["ViTForImageClassification", "ViTForMaskedImageModeling", "ViTModel", "ViTPreTrainedModel"]