from typing import Callable, Optional, Union

import torch
import torch.nn as nn

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, logging
from ..clip.modeling_clip import (
    CLIPMLP,
    CLIPAttention,
    CLIPEncoder,
    CLIPEncoderLayer,
    CLIPVisionEmbeddings,
    CLIPVisionModel,
    CLIPVisionTransformer,
)
from ..llama.modeling_llama import eager_attention_forward
from ..qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding, apply_rotary_pos_emb_vision


logger = logging.get_logger(__name__)


class MLCDVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MLCDVisionModel`]. It is used to instantiate a MLCD
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
    [DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1664):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 336):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import MLCDVisionConfig, MLCDVisionModel

    >>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> configuration = MLCDVisionConfig()

    >>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> model = MLCDVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```Zmlcd_vision_modelZvision_config      0         r   P     geluh㈵>        {Gz?      ?c                    sd   t  jf i | || _|| _|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|
| _|	| _d S N)super__init__hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_groupsnum_channels
patch_size
image_sizeinitializer_rangeinitializer_factorattention_dropoutlayer_norm_eps
hidden_act)selfr*   r+   r,   r-   r.   r/   r1   r0   r6   r5   r4   r2   r3   kwargs	__class__ a/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/mlcd/modular_mlcd.pyr)   d   s    zMLCDVisionConfig.__init__)r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   )__name__
__module____qualname____doc__Z
model_typeZbase_config_keyr)   __classcell__r;   r;   r9   r<   r   *   s"   6             r   c                   @   s   e Zd ZdS )MLCDMLPN)r=   r>   r?   r;   r;   r;   r<   rB      s   rB   c                   @   s    e Zd ZeeejdddZdS )MLCDRotaryEmbedding)num_patches_heightnum_patches_widthreturnc           
      C   s   t j|| jjddd|}t j|| jjdd|d}t j| | gdd}t||}t j|| jj| jj	d}t 
|| j}|| d}	|	S )a}  
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
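
        Example (illustrative only; `dim=52` corresponds to `hidden_size // num_attention_heads // 2` for the
        default configuration, and a 336x336 image with patch size 14 gives a 24x24 patch grid):

        ```python
        >>> rope = MLCDRotaryEmbedding(dim=52)
        >>> rope(num_patches_height=24, num_patches_width=24).shape
        torch.Size([576, 52])
        ```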
        """
        # Position ids for every patch on the (height, width) grid
        hpos_ids = torch.arange(num_patches_height, device=self.inv_freq.device).unsqueeze(1).expand(-1, num_patches_width)
        wpos_ids = torch.arange(num_patches_width, device=self.inv_freq.device).unsqueeze(0).expand(num_patches_height, -1)
        pos_ids = torch.stack([hpos_ids.flatten(), wpos_ids.flatten()], dim=-1)

        # Frequency table for the largest grid side, then gather the per-position frequencies
        max_grid_size = max(num_patches_height, num_patches_width)
        seq = torch.arange(max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb


class MLCDVisionEmbeddings(CLIPVisionEmbeddings):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        # MLCD uses rotary position embeddings instead of a learned absolute position embedding
        del self.position_embedding

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        return embeddings


class MLCDAttention(CLIPAttention):
    """Multi-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
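
    A minimal shape sketch (illustrative only, not part of the original docstring; the small configuration below is
    hypothetical and chosen just to keep the tensors tiny):

    ```python
    >>> import torch
    >>> from transformers import MLCDVisionConfig
    >>> from transformers.models.mlcd.modeling_mlcd import MLCDAttention
    >>> config = MLCDVisionConfig(hidden_size=64, intermediate_size=128, num_hidden_layers=2, num_attention_heads=4)
    >>> attn = MLCDAttention(config)
    >>> hidden_states = torch.randn(2, 10, 64)
    >>> emb = torch.randn(10, 16)  # one row per position, head_dim columns
    >>> attn_output, attn_weights = attn(hidden_states, (emb.cos(), emb.sin()))
    >>> attn_output.shape
    torch.Size([2, 10, 64])
    ```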
    """

    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.num_key_value_groups = config.num_key_value_groups
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        batch_size, seq_length = hidden_states.shape[:-1]

        # Each of shape: (batch_size, seq_length, num_heads, head_dim)
        query_states = self.q_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)

        # Apply the 2D rotary position embeddings to queries and keys
        cos = position_embeddings[0].unsqueeze(0).float()
        sin = position_embeddings[1].unsqueeze(0).float()
        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

        # Each of shape: (batch_size, num_heads, seq_length, head_dim)
        query_states = query_states.permute(0, 2, 1, 3).contiguous()
        key_states = key_states.permute(0, 2, 1, 3).contiguous()
        value_states = value_states.permute(0, 2, 1, 3).contiguous()

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scale,
            is_causal=self.is_causal,
            **kwargs,
        )

        # Collapse the heads back into the embedding dimension: (batch_size, seq_length, num_heads * head_dim)
        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights


class MLCDEncoderLayer(CLIPEncoderLayer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.self_attn = MLCDAttention(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        rh   ri   rj   r{   )Zlayer_norm1rz   Zlayer_norm2Zmlp)r7   rh   ri   rj   r{   Zresidualrw   outputsr;   r;   r<   rU      s"    




zMLCDEncoderLayer.forward)NF)r=   r>   r?   r   r)   rL   rW   rx   r   boolre   rU   rA   r;   r;   r9   r<   ry      s     ry   c                
       sj   e Zd ZdZed fddZd	ejeej	ej	f e
ej	 e
e e
e e
e eeef dddZ  ZS )
MLCDEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    """

    def __init__(self, config: MLCDVisionConfig):
        """Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`."""
        super().__init__(config)

    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states=hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class MLCDVisionTransformer(CLIPVisionTransformer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
        self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Build the (cos, sin) rotary tables for the patch grid, with a learned extra entry for the class token
        num_patches_height = pixel_values.shape[-2] // self.config.patch_size
        num_patches_width = pixel_values.shape[-1] // self.config.patch_size
        rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
        rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
        rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
@auto_docstring
class MLCDPreTrainedModel(PreTrainedModel):
    config: MLCDVisionConfig
    base_model_prefix = "mlcd"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, MLCDVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, MLCDAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, MLCDMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, MLCDVisionTransformer):
            factor = self.config.initializer_factor
            pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor
            nn.init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class MLCDVisionModel(CLIPVisionModel):
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Example:

        ```python
        >>> import torch
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


__all__ = ["MLCDVisionConfig", "MLCDPreTrainedModel", "MLCDVisionModel"]