# transformers/models/ijepa/modeling_ijepa.py

import collections.abc
from typing import Callable, Optional, Union

import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import TransformersKwargs, auto_docstring, torch_int
from ...utils.generic import can_return_tuple, check_model_inputs
from .configuration_ijepa import IJepaConfig


class IJepaPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config: IJepaConfig):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the"
                f" configuration. Expected {self.num_channels} but got {num_channels}."
            )
        if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        # (batch_size, hidden_size, h/p, w/p) -> (batch_size, num_patches, hidden_size)
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


class IJepaEmbeddings(nn.Module):
    """
    Construct the position and patch embeddings. Optionally, also the mask token. Note that, unlike ViT, I-JEPA
    uses no CLS token.
    """

    def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None:
        super().__init__()
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
        self.patch_embeddings = IJepaPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        Interpolates the pre-trained position encodings so the model can be used on higher-resolution images.
        Also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1]
        num_positions = self.position_embeddings.shape[1]

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        patch_pos_embed = self.position_embeddings
        dim = embeddings.shape[-1]
        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_height, new_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return patch_pos_embed

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        if bool_masked_pos is not None:
            # replace the masked visual tokens by mask tokens
            seq_length = embeddings.shape[1]
            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # raw attention scores, softmaxed in fp32 and cast back to the input dtype
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


class IJepaSelfAttention(nn.Module):
    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads"
                f" {config.num_attention_heads}."
            )
        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False
        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def forward(
        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = hidden_states.shape[0]
        new_shape = (batch_size, -1, self.num_attention_heads, self.attention_head_size)
        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)

        # dispatch to the configured attention backend (sdpa, flash, flex, ...), falling back to eager
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)
        return context_layer, attention_probs


class IJepaSelfOutput(nn.Module):
    """
    The residual connection is defined in IJepaLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class IJepaAttention(nn.Module):
    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.attention = IJepaSelfAttention(config)
        self.output = IJepaSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )
        # prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
        # update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        self_attn_output, _ = self.attention(hidden_states, head_mask)
        attention_output = self.output(self_attn_output, hidden_states)
        return attention_output


class IJepaIntermediate(nn.Module):
    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class IJepaOutput(nn.Module):
    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class IJepaLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = IJepaAttention(config)
        self.intermediate = IJepaIntermediate(config)
        self.output = IJepaOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # pre-norm: layernorm is applied before self-attention
        hidden_states_norm = self.layernorm_before(hidden_states)
        attention_output = self.attention(hidden_states_norm, head_mask)

        # first residual connection
        hidden_states = attention_output + hidden_states

        # layernorm is also applied before the MLP; the second residual is inside `self.output`
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)
        layer_output = self.output(layer_output, hidden_states)
        return layer_output


@auto_docstring
class IJepaPreTrainedModel(PreTrainedModel):
    config: IJepaConfig
    base_model_prefix = "ijepa"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["IJepaEmbeddings", "IJepaLayer"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": IJepaLayer,
        "attentions": IJepaSelfAttention,
    }

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # upcast to fp32 then cast back, since `trunc_normal_` is not implemented for half precision
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, IJepaEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.position_embeddings.dtype)
            if module.mask_token is not None:
                module.mask_token.data.zero_()


class IJepaEncoder(nn.Module):
    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([IJepaLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> BaseModelOutput:
        for i, layer_module in enumerate(self.layer):
            layer_head_mask = head_mask[i] if head_mask is not None else None
            hidden_states = layer_module(hidden_states, layer_head_mask)
        return BaseModelOutput(last_hidden_state=hidden_states)


class IJepaPooler(nn.Module):
    def __init__(self, config: IJepaConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
        self.activation = ACT2FN[config.pooler_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # "pool" the model by simply taking the hidden state of the first token
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class IJepaModel(IJepaPreTrainedModel):
    def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False):
        r"""
        add_pooling_layer (`bool`, *optional*, defaults to `False`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        """
        super().__init__(config)
        self.config = config
        self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = IJepaEncoder(config)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = IJepaPooler(config) if add_pooling_layer else None
        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> IJepaPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See
        base class PreTrainedModel.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # cast the pixel values to the dtype of the patch projection if needed
        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
        if pixel_values.dtype != expected_dtype:
            pixel_values = pixel_values.to(expected_dtype)

        embedding_output = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )
        encoder_outputs = self.encoder(embedding_output, head_mask=head_mask)
        sequence_output = self.layernorm(encoder_outputs.last_hidden_state)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)


@auto_docstring(
    custom_intro="""
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden
    states) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on,
        by setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the
        pre-trained position embeddings to the higher resolution.

    </Tip>
    """
)
class IJepaForImageClassification(IJepaPreTrainedModel):
    def __init__(self, config: IJepaConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.ijepa = IJepaModel(config, add_pooling_layer=False)
        # classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        # initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.ijepa(
            pixel_values, head_mask=head_mask, interpolate_pos_encoding=interpolate_pos_encoding, **kwargs
        )
        sequence_output = outputs.last_hidden_state
        # I-JEPA has no CLS token, so the classifier runs on the mean of the patch embeddings
        logits = self.classifier(sequence_output.mean(dim=1))

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["IJepaPreTrainedModel", "IJepaModel", "IJepaForImageClassification"]