"""SAM2 model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class Sam2HieraDetConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Sam2HieraDetModel`]. It is used to instantiate
    a HieraDet model, as defined in the original SAM2 repository, according to the specified arguments defining the
    model architecture. Instantiating a configuration with the defaults will yield a configuration similar to that of
    the SAM 2.1 Hiera-tiny
    [facebook/sam2.1-hiera-tiny](https://huggingface.co/facebook/sam2.1-hiera-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 96):
            The hidden dimension of the image encoder.
        num_attention_heads (`int`, *optional*, defaults to 1):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of channels in the image.
        image_size (`list[int]`, *optional*, defaults to `[1024, 1024]`):
            The size of the image.
        patch_kernel_size (`list[int]`, *optional*, defaults to `[7, 7]`):
            The kernel size of the patch.
        patch_stride (`list[int]`, *optional*, defaults to `[4, 4]`):
            The stride of the patch.
        patch_padding (`list[int]`, *optional*, defaults to `[3, 3]`):
            The padding of the patch.
        query_stride (`list[int]`, *optional*, defaults to `[2, 2]`):
            The downsample stride between stages.
        window_positional_embedding_background_size (`list[int]`, *optional*, defaults to `[7, 7]`):
            The window size per stage when not using global attention.
        num_query_pool_stages (`int`, *optional*, defaults to 3):
            The number of query pool stages.
        blocks_per_stage (`list[int]`, *optional*, defaults to `[1, 2, 7, 2]`):
            The number of blocks per stage.
        embed_dim_per_stage (`list[int]`, *optional*, defaults to `[96, 192, 384, 768]`):
            The embedding dimension per stage.
        num_attention_heads_per_stage (`list[int]`, *optional*, defaults to `[1, 2, 4, 8]`):
            The number of attention heads per stage.
        window_size_per_stage (`list[int]`, *optional*, defaults to `[8, 4, 14, 7]`):
            The window size per stage.
        global_attention_blocks (`list[int]`, *optional*, defaults to `[5, 7, 9]`):
            The blocks where global attention is used.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            The ratio of the MLP hidden dimension to the embedding dimension.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
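
    Example:

    ```python
    >>> from transformers import Sam2HieraDetConfig

    >>> # A minimal sketch: instantiate the backbone config with its documented defaults
    >>> # (SAM 2.1 Hiera-tiny style) and inspect two of the stage-wise fields.
    >>> configuration = Sam2HieraDetConfig()
    >>> configuration.blocks_per_stage
    [1, 2, 7, 2]
    >>> configuration.embed_dim_per_stage
    [96, 192, 384, 768]
    ```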

    """

    base_config_key = "backbone_config"
    model_type = "sam2_hiera_det_model"

    def __init__(
        self,
        hidden_size=96,
        num_attention_heads=1,
        num_channels=3,
        image_size=None,
        patch_kernel_size=None,
        patch_stride=None,
        patch_padding=None,
        query_stride=None,
        window_positional_embedding_background_size=None,
        num_query_pool_stages=3,
        blocks_per_stage=None,
        embed_dim_per_stage=None,
        num_attention_heads_per_stage=None,
        window_size_per_stage=None,
        global_attention_blocks=None,
        mlp_ratio=4.0,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Fall back to the SAM 2.1 Hiera-tiny defaults for the list-valued arguments.
        image_size = image_size if image_size is not None else [1024, 1024]
        patch_kernel_size = patch_kernel_size if patch_kernel_size is not None else [7, 7]
        patch_stride = patch_stride if patch_stride is not None else [4, 4]
        patch_padding = patch_padding if patch_padding is not None else [3, 3]
        query_stride = query_stride if query_stride is not None else [2, 2]
        window_positional_embedding_background_size = (
            window_positional_embedding_background_size
            if window_positional_embedding_background_size is not None
            else [7, 7]
        )
        blocks_per_stage = blocks_per_stage if blocks_per_stage is not None else [1, 2, 7, 2]
        embed_dim_per_stage = embed_dim_per_stage if embed_dim_per_stage is not None else [96, 192, 384, 768]
        num_attention_heads_per_stage = (
            num_attention_heads_per_stage if num_attention_heads_per_stage is not None else [1, 2, 4, 8]
        )
        window_size_per_stage = window_size_per_stage if window_size_per_stage is not None else [8, 4, 14, 7]
        global_attention_blocks = global_attention_blocks if global_attention_blocks is not None else [5, 7, 9]

        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_kernel_size = patch_kernel_size
        self.patch_stride = patch_stride
        self.patch_padding = patch_padding
        self.query_stride = query_stride
        self.window_positional_embedding_background_size = window_positional_embedding_background_size
        self.num_query_pool_stages = num_query_pool_stages
        self.blocks_per_stage = blocks_per_stage
        self.embed_dim_per_stage = embed_dim_per_stage
        self.num_attention_heads_per_stage = num_attention_heads_per_stage
        self.window_size_per_stage = window_size_per_stage
        self.global_attention_blocks = global_attention_blocks
        self.mlp_ratio = mlp_ratio
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range


class Sam2VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Sam2VisionModel`]. It is used to instantiate a
    SAM2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a configuration similar to that of the SAM 2.1 Hiera-tiny
    [facebook/sam2.1-hiera-tiny](https://huggingface.co/facebook/sam2.1-hiera-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PretrainedConfig"]`, *optional*):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        backbone_channel_list (`list[int]`, *optional*, defaults to `[768, 384, 192, 96]`):
            The list of channel dimensions for the backbone.
        backbone_feature_sizes (`list[list[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
            The spatial sizes of the feature maps from the backbone.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        fpn_kernel_size (`int`, *optional*, defaults to 1):
            The kernel size for the convolutions in the neck.
        fpn_stride (`int`, *optional*, defaults to 1):
            The stride for the convolutions in the neck.
        fpn_padding (`int`, *optional*, defaults to 0):
            The padding for the convolutions in the neck.
        fpn_top_down_levels (`list[int]`, *optional*, defaults to `[2, 3]`):
            The levels for the top-down FPN connections.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels from the FPN to use.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
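
    Example:

    ```python
    >>> from transformers import Sam2VisionConfig

    >>> # A minimal sketch: with no arguments, a default Hiera-tiny backbone config is
    >>> # created and attached alongside the documented FPN defaults.
    >>> configuration = Sam2VisionConfig()
    >>> configuration.fpn_hidden_size
    256
    >>> configuration.backbone_config.model_type
    'sam2_hiera_det_model'
    ```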

    """

    base_config_key = "vision_config"
    model_type = "sam2_vision_model"
    sub_configs = {"backbone_config": AutoConfig}

    def __init__(
        self,
        backbone_config=None,
        backbone_channel_list=None,
        backbone_feature_sizes=None,
        fpn_hidden_size=256,
        fpn_kernel_size=1,
        fpn_stride=1,
        fpn_padding=0,
        fpn_top_down_levels=None,
        num_feature_levels=3,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
        backbone_channel_list = [768, 384, 192, 96] if backbone_channel_list is None else backbone_channel_list
        backbone_feature_sizes = (
            [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes
        )
        fpn_top_down_levels = [2, 3] if fpn_top_down_levels is None else fpn_top_down_levels
        # The backbone can be passed as a plain dict (e.g. from a serialized config), as a
        # config instance, or omitted entirely, in which case Hiera-tiny defaults are used.
        if isinstance(backbone_config, dict):
            backbone_config["model_type"] = backbone_config.get("model_type", "sam2_hiera_det_model")
            backbone_config = CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config)
        elif backbone_config is None:
            backbone_config = Sam2HieraDetConfig()

        self.backbone_config = backbone_config
        self.backbone_channel_list = backbone_channel_list
        self.backbone_feature_sizes = backbone_feature_sizes
        self.fpn_hidden_size = fpn_hidden_size
        self.fpn_kernel_size = fpn_kernel_size
        self.fpn_stride = fpn_stride
        self.fpn_padding = fpn_padding
        self.fpn_top_down_levels = fpn_top_down_levels
        self.num_feature_levels = num_feature_levels
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range


class Sam2PromptEncoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Sam2PromptEncoder`]. The [`Sam2PromptEncoder`]
    module is used to encode the input 2D points and bounding boxes.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        scale (`float`, *optional*, defaults to 1):
            The scale factor for the prompt encoder.
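
    Example:

    ```python
    >>> from transformers import Sam2PromptEncoderConfig

    >>> # A minimal sketch: the default prompt encoder configuration; points, boxes, and
    >>> # masks are all embedded into `hidden_size`-dimensional vectors.
    >>> configuration = Sam2PromptEncoderConfig()
    >>> configuration.hidden_size
    256
    ```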
    """

    base_config_key = "prompt_encoder_config"

    def __init__(
        self,
        hidden_size=256,
        image_size=1024,
        patch_size=16,
        mask_input_channels=16,
        num_point_embeddings=4,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        scale=1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.mask_input_channels = mask_input_channels
        self.num_point_embeddings = num_point_embeddings
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.scale = scale


class Sam2MaskDecoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Sam2MaskDecoder`]. It is used to instantiate a SAM2
    mask decoder according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the SAM2 mask decoder.
        mlp_dim (`int`, *optional*, defaults to 2048):
            The dimension of the MLP in the two-way transformer.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            The number of hidden layers in the two-way transformer.
        num_attention_heads (`int`, *optional*, defaults to 8):
            The number of attention heads in the two-way transformer.
        attention_downsample_rate (`int`, *optional*, defaults to 2):
            The downsample rate for the attention layers.
        num_multimask_outputs (`int`, *optional*, defaults to 3):
            The number of multimask outputs.
        iou_head_depth (`int`, *optional*, defaults to 3):
            The depth of the IoU head.
        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
            The hidden dimension of the IoU head.
        dynamic_multimask_via_stability (`bool`, *optional*, defaults to `True`):
            Whether to use dynamic multimask via stability.
        dynamic_multimask_stability_delta (`float`, *optional*, defaults to 0.05):
            The stability delta for the dynamic multimask.
        dynamic_multimask_stability_thresh (`float`, *optional*, defaults to 0.98):
            The stability threshold for the dynamic multimask.
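
    Example:

    ```python
    >>> from transformers import Sam2MaskDecoderConfig

    >>> # A minimal sketch: the default mask decoder configuration predicts three mask
    >>> # candidates per prompt (`num_multimask_outputs=3`).
    >>> configuration = Sam2MaskDecoderConfig()
    >>> configuration.num_multimask_outputs
    3
    ```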

    """

    base_config_key = "mask_decoder_config"

    def __init__(
        self,
        hidden_size=256,
        hidden_act="gelu",
        mlp_dim=2048,
        num_hidden_layers=2,
        num_attention_heads=8,
        attention_downsample_rate=2,
        num_multimask_outputs=3,
        iou_head_depth=3,
        iou_head_hidden_dim=256,
        dynamic_multimask_via_stability=True,
        dynamic_multimask_stability_delta=0.05,
        dynamic_multimask_stability_thresh=0.98,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_dim = mlp_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.attention_downsample_rate = attention_downsample_rate
        self.num_multimask_outputs = num_multimask_outputs
        self.iou_head_depth = iou_head_depth
        self.iou_head_hidden_dim = iou_head_hidden_dim
        self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
        self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
        self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh


class Sam2Config(PretrainedConfig):
    r"""
    [`Sam2Config`] is the configuration class to store the configuration of a [`Sam2Model`]. It is used to instantiate a
    SAM2 model according to the specified arguments, defining the memory attention, memory encoder, and image encoder
    configs. Instantiating a configuration with the defaults will yield a configuration similar to that of the SAM 2.1
    Hiera-tiny
    [facebook/sam2.1-hiera-tiny](https://huggingface.co/facebook/sam2.1-hiera-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (Union[`dict`, `Sam2VisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`Sam2VisionConfig`].
        prompt_encoder_config (Union[`dict`, `Sam2PromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`Sam2PromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `Sam2MaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`Sam2MaskDecoderConfig`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for parameter initialization.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     Sam2VisionConfig,
    ...     Sam2PromptEncoderConfig,
    ...     Sam2MaskDecoderConfig,
    ...     Sam2Model,
    ...     Sam2Config,
    ... )

    >>> # Initializing a Sam2Config with `"facebook/sam2.1-hiera-tiny"` style configuration
    >>> configuration = Sam2Config()

    >>> # Initializing a Sam2Model (with random weights) from the `"facebook/sam2.1-hiera-tiny"` style configuration
    >>> model = Sam2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Sam2Config from a Sam2VisionConfig, Sam2PromptEncoderConfig, and Sam2MaskDecoderConfig

    >>> # Initializing SAM2 vision encoder, prompt encoder, and mask decoder configurations
    >>> vision_config = Sam2VisionConfig()
    >>> prompt_encoder_config = Sam2PromptEncoderConfig()
    >>> mask_decoder_config = Sam2MaskDecoderConfig()

    >>> config = Sam2Config(vision_config, prompt_encoder_config, mask_decoder_config)
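    >>> # A sketch of round-tripping the composed config through the standard
    >>> # `PretrainedConfig` serialization helpers (the path is illustrative).
    >>> config.save_pretrained("./sam2-tiny-config")
    >>> restored = Sam2Config.from_pretrained("./sam2-tiny-config")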
    ```Zsam2)r=   rP   rY   Nr   c                    s   t  jf i | |d ur|ni }|d ur.|ni }|d ur>|ni }t|trt|dd|d< t|d  f i |}nt|tr|}t|tr| }t|t	r| }|| _
tf i || _t	f i || _|| _d S )Nr:   r>   )r   r   rC   rD   rE   r   r   rO   to_dictrX   r=   rP   rY   r.   )r/   r=   rP   rY   r.   r0   r1   r3   r4   r     s"    



zSam2Config.__init__)NNNr   )r5   r6   r7   r8   r:   r   rO   rX   rN   r   r;   r3   r3   r1   r4   rf   k  s   2    rf   )rf   r   r<   rO   rX   N)r8   Zconfiguration_utilsr   utilsr   autor   r   Z
get_loggerr5   loggerr   r<   rO   rX   rf   __all__r3   r3   r3   r4   <module>   s   
w^4IX