from typing import Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, torch_int
from .configuration_mlcd import MLCDVisionConfig


class MLCDMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MLCDRotaryEmbedding(nn.Module):
    inv_freq: torch.Tensor

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        """
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        """
        # (height, width) coordinate for every patch position in the grid.
        hpos_ids = torch.arange(num_patches_height, device=self.inv_freq.device).unsqueeze(1)
        hpos_ids = hpos_ids.expand(-1, num_patches_width)
        wpos_ids = torch.arange(num_patches_width, device=self.inv_freq.device).unsqueeze(0)
        wpos_ids = wpos_ids.expand(num_patches_height, -1)
        pos_ids = torch.stack([hpos_ids.flatten(), wpos_ids.flatten()], dim=-1)

        # Frequency table for the larger grid side, indexed by the per-patch coordinates.
        max_grid_size = max(num_patches_height, num_patches_width)
        seq = torch.arange(max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb


class MLCDVisionEmbeddings(nn.Module):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        # Note: this relies on a learned `self.position_embedding` table and is not used by
        # `forward`, which applies rotary position embeddings instead.
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # Always interpolate when tracing so the exported model works for dynamic input shapes.
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    # [batch, num_heads, seq_len, head_dim] -> [batch, seq_len, num_heads, head_dim]
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def apply_rotary_pos_emb_vision(
    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
    orig_q_dtype = q.dtype
    orig_k_dtype = k.dtype
    q, k = q.float(), k.float()
    cos, sin = cos.unsqueeze(-2), sin.unsqueeze(-2)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    q_embed = q_embed.to(orig_q_dtype)
    k_embed = k_embed.to(orig_k_dtype)
    return q_embed, k_embed


class MLCDAttention(nn.Module):
    """Multi-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    """

    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.num_key_value_groups = 1

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length = hidden_states.shape[:-1]

        # Each of shape: [batch_size, seq_length, num_heads, head_dim]
        query_states = self.q_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)

        # Apply rotary position embeddings to queries and keys.
        cos = position_embeddings[0]
        sin = position_embeddings[1]
        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

        # Each of shape: [batch_size, num_heads, seq_length, head_dim]
        query_states = query_states.permute(0, 2, 1, 3).contiguous()
        key_states = key_states.permute(0, 2, 1, 3).contiguous()
        value_states = value_states.permute(0, 2, 1, 3).contiguous()

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scale,
            is_causal=self.is_causal,
            **kwargs,
        )

        # Back to [batch_size, seq_length, embed_dim] before the output projection.
        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights


class MLCDEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = MLCDAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = MLCDMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class MLCDEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    """

    def __init__(self, config: MLCDVisionConfig):
        """Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`."""
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([MLCDEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states=hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class MLCDVisionTransformer(nn.Module):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = MLCDVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = MLCDEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # Rotary embedding operates on half of each attention head's dimension.
        self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
        self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Build rotary position embeddings for the patch grid plus the class token.
        num_patches_height = pixel_values.shape[-2] // self.config.patch_size
        num_patches_width = pixel_values.shape[-1] // self.config.patch_size
        rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
        rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
        rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        # Pool by taking the class-token representation and normalizing it.
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class MLCDPreTrainedModel(PreTrainedModel):
    config: MLCDVisionConfig
    base_model_prefix = "mlcd"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, MLCDVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, MLCDAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, MLCDMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, MLCDVisionTransformer):
            factor = self.config.initializer_factor
            pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor
            nn.init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


@auto_docstring(
    custom_intro="""
    The vision model from M_L_C_D without any head or projection on top.
    """
)
class MLCDVisionModel(MLCDPreTrainedModel):
    config: MLCDVisionConfig
    main_input_name = "pixel_values"
    _no_split_modules = ["MLCDEncoderLayer"]

    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.vision_model = MLCDVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


__all__ = ["MLCDPreTrainedModel", "MLCDVisionModel"]
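

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (added for illustration; not part of the shipped
# module). It assumes `MLCDVisionConfig` accepts the size-related kwargs that
# the classes above read (hidden_size, intermediate_size, num_hidden_layers,
# num_attention_heads, num_channels, image_size, patch_size) and pushes one
# random image through a tiny randomly initialized model to show the expected
# input/output shapes. Guarded by `__main__` so importing the module is
# unaffected.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    tiny_config = MLCDVisionConfig(
        hidden_size=64,
        intermediate_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        num_channels=3,
        image_size=56,
        patch_size=14,
    )
    model = MLCDVisionModel(tiny_config).eval()

    # One RGB image at the configured resolution: (batch, channels, height, width).
    pixel_values = torch.randn(1, 3, tiny_config.image_size, tiny_config.image_size)

    with torch.no_grad():
        outputs = model(pixel_values=pixel_values, output_hidden_states=True)

    # last_hidden_state: (batch, 1 + num_patches, hidden_size) -> (1, 17, 64)
    # pooler_output:     (batch, hidden_size)                  -> (1, 64)
    print(outputs.last_hidden_state.shape)
    print(outputs.pooler_output.shape)
    print(len(outputs.hidden_states))  # num_hidden_layers + 1 = 3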
   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   Zconfiguration_mlcdr   r   r   r-   rJ   r+   r9   r~   r   rI   rv   r   r   r   r   r   r   r   r   __all__r"   r"   r"   r#   <module>   sP   )S M6V@':