"""Pytorch implementation of AIMv2 Model"""

import math
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, can_return_tuple
from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm
from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm
from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder, SiglipOutput


class Aimv2VisionConfig(SiglipVisionConfig):
    r"""
    This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
    AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2816):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the Linear layers or not.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_head (`bool`, *optional*, defaults to `True`):
            Whether to use an attention pooling head on top of the encoder or not.
        is_native (`bool`, *optional*, defaults to `False`):
            Whether to use a checkpoint trained for native image resolution or not.
    Example:

    ```python
    >>> from transformers import Aimv2VisionConfig, Aimv2VisionModel

    >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
    >>> configuration = Aimv2VisionConfig()

    >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
    >>> model = Aimv2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 2816,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 8,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 14,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        qkv_bias: bool = False,
        mlp_bias: bool = False,
        hidden_act: str = "silu",
        initializer_range: float = 0.02,
        use_head: bool = True,
        is_native: bool = False,
        **kwargs,
    ):
        super().__init__(
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            hidden_act=hidden_act,
            num_channels=num_channels,
            image_size=image_size,
            patch_size=patch_size,
            attention_dropout=attention_dropout,
            **kwargs,
        )

        self.use_head = use_head
        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.qkv_bias = qkv_bias
        self.rms_norm_eps = rms_norm_eps
        self.is_native = is_native

        del self.layer_norm_eps


class Aimv2TextConfig(SiglipTextConfig):
    r"""
    This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
    AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`Aimv2Model`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the Linear layers or not.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        pad_token_id (`int`, *optional*, defaults to 1):
            The id of the padding token in the vocabulary.
        bos_token_id (`int`, *optional*, defaults to 49406):
            The id of the beginning-of-sequence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 49407):
            The id of the end-of-sequence token in the vocabulary.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    """

    def __init__(
        self,
        vocab_size: int = 49408,
        hidden_size: int = 768,
        intermediate_size: int = 2048,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 6,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        qkv_bias: bool = False,
        mlp_bias: bool = False,
        hidden_act: str = "silu",
        pad_token_id: Optional[int] = None,
        bos_token_id: Optional[int] = None,
        eos_token_id: int = 49407,
        max_position_embeddings: int = 77,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.qkv_bias = qkv_bias
        self.rms_norm_eps = rms_norm_eps

        del self.bos_token_id
        del self.pad_token_id
        del self.projection_size
        del self.layer_norm_eps


class Aimv2Config(SiglipConfig):
    r"""
    [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
    instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import Aimv2Config, Aimv2Model

    >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
    >>> configuration = Aimv2Config()

    >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
    >>> model = Aimv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
    >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

    >>> # Initializing a AIMv2Text and AIMv2Vision configuration
    >>> config_text = Aimv2TextConfig()
    >>> config_vision = Aimv2VisionConfig()

    >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
    ```"""

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        projection_dim=512,
        logit_scale_init_value=2.6592,
        **kwargs,
    ):
        super().__init__(text_config, vision_config, **kwargs)
        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.max_logit_scale = 100.0

        del self.initializer_factor


class Aimv2Output(SiglipOutput):
    pass


class Aimv2RMSNorm(LlamaRMSNorm):
    pass


class Aimv2MLP(LlamaMLP):
    pass


class Aimv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.config = config
        self.patch_size = config.patch_size
        self.patch_embed = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        num_patches = (config.image_size // config.patch_size) ** 2
        if not self.config.is_native:
            self.position_embedding = nn.Embedding(num_patches, config.hidden_size)
        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)

    @staticmethod
    def build_2d_sincos_position_embedding(
        height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
    ):
        grid_w = torch.arange(int(width), dtype=dtype, device=device)
        grid_h = torch.arange(int(height), dtype=dtype, device=device)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="xy")

        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
        omega = 1.0 / (temperature**omega)

        out_h = grid_h.flatten()[..., None] * omega[None, :]
        out_w = grid_w.flatten()[..., None] * omega[None, :]

        return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        _, _, height, width = pixel_values.size()
        hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)
        hidden_states = self.rms_norm(hidden_states)

        if self.config.is_native:
            # Native-resolution checkpoints use an on-the-fly sin-cos table instead of learned embeddings.
            pos_embed = self.build_2d_sincos_position_embedding(
                height // self.patch_size,
                width // self.patch_size,
                embed_dim=self.config.hidden_size,
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
        else:
            pos_embed = self.position_embedding(self.position_ids)

        hidden_states = hidden_states + pos_embed
        return hidden_states


class Aimv2TextEmbeddings(CLIPTextEmbeddings):
    pass


class Aimv2Attention(SiglipAttention):
    def __init__(self, config):
        super().__init__(config)
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)


class Aimv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.attention = Aimv2Attention(config)
        self.ffn = Aimv2MLP(config)
        self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Pre-norm attention block followed by a pre-norm MLP block, each with a residual connection.
        norm_hidden_states = self.rms_norm1(hidden_states)
        attn_output, attn_weights = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask)

        hidden_states = hidden_states + attn_output
        norm_hidden_states = self.rms_norm2(hidden_states)
        mlp_output = self.ffn(norm_hidden_states)

        hidden_states = hidden_states + mlp_output
        return (hidden_states, attn_weights) if output_attentions else (hidden_states, None)


class Aimv2Encoder(SiglipEncoder):
    pass


class Aimv2AttentionPoolingHead(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
        self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape

        # A learnable [CLS] query attends over all patch tokens to produce a single pooled vector.
        cls_token = self.cls_token.expand(batch_size, -1, -1)

        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads)

        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        attn_output = F.scaled_dot_product_attention(query, key, value)
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim)
        attn_output = attn_output.mean(dim=1)

        output = self.output_proj(attn_output)
        return output


@auto_docstring
class Aimv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    """

    config: Aimv2Config
    base_model_prefix = "aimv2"
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Aimv2EncoderLayer",
        "Aimv2AttentionPoolingHead",
        "Aimv2VisionEmbeddings",
        "Aimv2TextEmbeddings",
    ]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)
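
# Editorial note (not part of the original module): the fill value used by `_init_weights`
# above is math.log(1 / 0.07) ~= 2.6592, the usual CLIP-style logit-scale initialization,
# which also matches the default `logit_scale_init_value` in `Aimv2Config`. A minimal check:
#
#     import math
#     math.log(1 / 0.07)  # 2.6592600...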


@auto_docstring(
    custom_intro="""
    The Vision model from AIMv2 without any head or projection on top.
    """
)
class Aimv2VisionModel(Aimv2PreTrainedModel):
    config: Aimv2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Aimv2VisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2VisionEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        # The final RMS norm is applied to the encoder output before the optional pooling head.
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.use_head = config.use_head
        if self.use_head:
            self.head = Aimv2AttentionPoolingHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embed

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
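
# Illustrative sketch (editorial addition, not part of the original module): the attention
# pooling head is optional, so `pooler_output` is only populated when the config asks for it.
# The tiny hyperparameters below are arbitrary assumptions chosen to keep the example cheap.
#
#     config = Aimv2VisionConfig(
#         hidden_size=64, intermediate_size=128, num_hidden_layers=2, num_attention_heads=2, use_head=False
#     )
#     model = Aimv2VisionModel(config)
#     outputs = model(pixel_values=torch.randn(1, 3, 224, 224))
#     assert outputs.pooler_output is None
#     assert outputs.last_hidden_state.shape == (1, 256, 64)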


@auto_docstring(
    custom_intro="""
    The text model from AIMv2 without any head or projection on top.
    """
)
class Aimv2TextModel(Aimv2PreTrainedModel):
    main_input_name = "input_ids"

    def __init__(self, config: Aimv2TextConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2TextEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.eos_token_id = config.eos_token_id

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(input_ids)
        batch_size, seq_len, _ = hidden_states.shape

        cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device)
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)

        if attention_mask is not None:
            attention_mask = create_causal_mask(
                config=self.config,
                input_embeds=hidden_states,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=None,
                position_ids=position_ids,
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        # Pool the hidden state at the position of the first EOS token in each sequence.
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
            .int()
            .argmax(dim=-1),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class Aimv2Model(CLIPModel):
    def __init__(self, config: Aimv2Config):
        # Build the AIMv2 towers directly on top of PreTrainedModel instead of running CLIPModel.__init__.
        PreTrainedModel.__init__(self, config)

        self.projection_dim = config.projection_dim
        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size

        self.vision_model = Aimv2VisionModel._from_config(config.vision_config)
        self.text_model = Aimv2TextModel._from_config(config.text_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        self.max_log_logit_scale = math.log(config.max_logit_scale)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Aimv2Output:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device)
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) * logit_scale
        logits_per_image = logits_per_text.t()

        return Aimv2Output(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = [
    "Aimv2Config",
    "Aimv2VisionConfig",
    "Aimv2TextConfig",
    "Aimv2VisionModel",
    "Aimv2Model",
    "Aimv2TextModel",
    "Aimv2PreTrainedModel",
]