"""PyTorch EoMT model."""

import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn.functional as F
from torch import Tensor, nn

from ...activations import ACT2FN
from ...file_utils import ModelOutput
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, logging
from ...utils.generic import check_model_inputs
from ..dinov2.modeling_dinov2 import (
    Dinov2Embeddings,
    Dinov2Layer,
    Dinov2LayerScale,
    Dinov2PatchEmbeddings,
)
from ..mask2former.modeling_mask2former import Mask2FormerForUniversalSegmentation, Mask2FormerLoss
from ..siglip.modeling_siglip import SiglipAttention
from ..vit.configuration_vit import ViTConfig


logger = logging.get_logger(__name__)


class EomtConfig(ViTConfig):
    r"""
    This is the configuration class to store the configuration of a [`EomtForUniversalSegmentation`]. It is used to instantiate an EoMT model
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the EoMT
    [tue-mps/coco_panoptic_eomt_large_640](https://huggingface.co/tue-mps/coco_panoptic_eomt_large_640)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the hidden representations.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads in each attention layer.
        mlp_ratio (`int`, *optional*, defaults to 4):
            Ratio of the MLP hidden dimensionality to the hidden size.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings and encoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to 640):
            The size (resolution) of each input image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        layerscale_value (`float`, *optional*, defaults to 1.0):
            Initial value for the LayerScale parameter.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            The stochastic depth rate (drop path) used during training.
        num_upscale_blocks (`int`, *optional*, defaults to 2):
            Number of upsampling blocks used in the decoder or segmentation head.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability applied after attention projection.
        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
            Whether to use the SwiGLU feedforward neural network.
        num_blocks (`int`, *optional*, defaults to 4):
            Number of transformer layers at the end of the encoder that also process the object queries and
            produce intermediate mask predictions.
        no_object_weight (`float`, *optional*, defaults to 0.1):
            Loss weight for the 'no object' class in panoptic/instance segmentation.
        class_weight (`float`, *optional*, defaults to 2.0):
            Loss weight for classification targets.
        mask_weight (`float`, *optional*, defaults to 5.0):
            Loss weight for mask prediction.
        dice_weight (`float`, *optional*, defaults to 5.0):
            Loss weight for the dice loss component.
        train_num_points (`int`, *optional*, defaults to 12544):
            Number of points to sample for mask loss computation during training.
        oversample_ratio (`float`, *optional*, defaults to 3.0):
            Oversampling ratio used in point sampling for mask training.
        importance_sample_ratio (`float`, *optional*, defaults to 0.75):
            Ratio of points to sample based on importance during training.
        num_queries (`int`, *optional*, defaults to 200):
            Number of object queries in the Transformer.
        num_register_tokens (`int`, *optional*, defaults to 4):
            Number of learnable register tokens added to the transformer input.

    Example:

    ```python
    >>> from transformers import EomtConfig, EomtForUniversalSegmentation

    >>> # Initialize configuration
    >>> config = EomtConfig()

    >>> # Initialize model
    >>> model = EomtForUniversalSegmentation(config)

    >>> # Access config
    >>> config = model.config
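
    >>> # Any documented argument can be overridden at construction time (illustrative values)
    >>> config = EomtConfig(image_size=512, num_queries=100)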
    ```"""

    model_type = "eomt"

    def __init__(
        self,
        hidden_size=1024,
        num_hidden_layers=24,
        num_attention_heads=16,
        mlp_ratio=4,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-6,
        image_size=640,
        patch_size=16,
        num_channels=3,
        layerscale_value=1.0,
        drop_path_rate=0.0,
        num_upscale_blocks=2,
        attention_dropout=0.0,
        use_swiglu_ffn=False,
        num_blocks=4,
        no_object_weight: float = 0.1,
        class_weight: float = 2.0,
        mask_weight: float = 5.0,
        dice_weight: float = 5.0,
        train_num_points: int = 12544,
        oversample_ratio: float = 3.0,
        importance_sample_ratio: float = 0.75,
        num_queries=200,
        num_register_tokens=4,
        **kwargs,
    ):
        super().__init__(
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            hidden_dropout_prob=hidden_dropout_prob,
            hidden_act=hidden_act,
            initializer_range=initializer_range,
            layer_norm_eps=layer_norm_eps,
            image_size=image_size,
            patch_size=patch_size,
            num_channels=num_channels,
            **kwargs,
        )

        # Attributes inherited from `ViTConfig` that EoMT does not use.
        del self.intermediate_size
        del self.qkv_bias
        del self.pooler_act
        del self.pooler_output_size
        del self.encoder_stride
        del self.attention_probs_dropout_prob

        self.mlp_ratio = mlp_ratio
        self.attention_dropout = attention_dropout
        self.layerscale_value = layerscale_value
        self.drop_path_rate = drop_path_rate
        self.num_upscale_blocks = num_upscale_blocks
        self.use_swiglu_ffn = use_swiglu_ffn
        self.num_blocks = num_blocks
        self.no_object_weight = no_object_weight
        self.class_weight = class_weight
        self.mask_weight = mask_weight
        self.dice_weight = dice_weight
        self.train_num_points = train_num_points
        self.oversample_ratio = oversample_ratio
        self.importance_sample_ratio = importance_sample_ratio
        self.num_queries = num_queries
        self.num_register_tokens = num_register_tokens


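# Geometry implied by the default configuration (illustrative note, derived from values documented in this
# module): a 640x640 image with patch_size=16 yields a (640 / 16) ** 2 = 40 * 40 = 1600-token patch grid,
# preceded by 1 [CLS] token and num_register_tokens=4 register tokens. The num_queries=200 learnable queries
# are only prepended for the final num_blocks=4 encoder layers, and num_upscale_blocks=2 stride-2 transposed
# convolutions later upsample the 40x40 feature grid to 160x160 mask logits.
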
@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`EomtForUniversalSegmentation`].

    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
    [`~EomtImageProcessor.post_process_instance_segmentation`] or
    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~EomtImageProcessor`] for details regarding usage.
    """
)
class EomtForUniversalSegmentationOutput(ModelOutput):
    r"""
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    patch_offsets (`list[torch.Tensor]`, *optional*):
        list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
    """

    loss: Optional[torch.FloatTensor] = None
    class_queries_logits: Optional[torch.FloatTensor] = None
    masks_queries_logits: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    patch_offsets: Optional[list[torch.Tensor]] = None


class EomtLoss(Mask2FormerLoss):
    pass


class EomtPatchEmbeddings(Dinov2PatchEmbeddings):
    pass


class EomtEmbeddings(Dinov2Embeddings):
    def __init__(self, config: EomtConfig) -> None:
        nn.Module.__init__(self)
        self.config = config
        self.patch_size = config.patch_size

        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size))

        self.patch_embeddings = EomtPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # 1 [CLS] token plus the learnable register tokens precede the patch tokens.
        self.num_prefix_tokens = 1 + config.num_register_tokens
        self.position_embeddings = nn.Embedding(num_patches, config.hidden_size)
        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self):
        raise AttributeError("Not needed for Eomt Model")

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, _, _, _ = pixel_values.shape
        target_dtype = self.patch_embeddings.projection.weight.dtype
        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        register_tokens = self.register_tokens.expand(batch_size, -1, -1)

        embeddings = embeddings + self.position_embeddings(self.position_ids)
        embeddings = torch.cat([cls_tokens, register_tokens, embeddings], dim=1)
        embeddings = self.dropout(embeddings)

        return embeddings


class EomtAttention(SiglipAttention):
    pass


class EomtLayerScale(Dinov2LayerScale):
    pass


class EomtLayer(Dinov2Layer):
    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        hidden_states_norm = self.norm1(hidden_states)
        self_attention_output, _ = self.attention(hidden_states_norm, head_mask)
        self_attention_output = self.layer_scale1(self_attention_output)

        # First residual connection.
        hidden_states = self.drop_path(self_attention_output) + hidden_states

        layer_output = self.norm2(hidden_states)
        layer_output = self.mlp(layer_output)
        layer_output = self.layer_scale2(layer_output)

        # Second residual connection.
        layer_output = self.drop_path(layer_output) + hidden_states

        return layer_output


class EomtLayerNorm2d(nn.LayerNorm):
    def __init__(self, num_channels, eps=1e-6, affine=True):
        super().__init__(num_channels, eps=eps, elementwise_affine=affine)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        hidden_state = hidden_state.permute(0, 2, 3, 1)
        hidden_state = F.layer_norm(hidden_state, self.normalized_shape, self.weight, self.bias, self.eps)
        hidden_state = hidden_state.permute(0, 3, 1, 2)
        return hidden_state


class EomtScaleLayer(nn.Module):
    def __init__(self, config: EomtConfig):
        super().__init__()
        hidden_size = config.hidden_size
        self.conv1 = nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2)
        self.activation = ACT2FN[config.hidden_act]
        self.conv2 = nn.Conv2d(
            hidden_size,
            hidden_size,
            kernel_size=3,
            padding=1,
            groups=hidden_size,
            bias=False,
        )
        self.layernorm2d = EomtLayerNorm2d(hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.conv1(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.conv2(hidden_states)
        hidden_states = self.layernorm2d(hidden_states)
        return hidden_states


class EomtScaleBlock(nn.Module):
    def __init__(self, config: EomtConfig):
        super().__init__()
        self.num_blocks = config.num_upscale_blocks
        self.block = nn.ModuleList([EomtScaleLayer(config) for _ in range(self.num_blocks)])

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for block in self.block:
            hidden_states = block(hidden_states)
        return hidden_states


class EomtMaskHead(nn.Module):
    def __init__(self, config: EomtConfig):
        super().__init__()
        hidden_size = config.hidden_size
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.activation = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.activation(self.fc1(hidden_states))
        hidden_states = self.activation(self.fc2(hidden_states))
        hidden_states = self.fc3(hidden_states)
        return hidden_states


@auto_docstring
class EomtPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config: EomtConfig
    base_model_prefix = "eomt"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = False
    _no_split_modules = ["EomtLayer"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _can_record_outputs = {
        "hidden_states": EomtLayer,
        "attentions": EomtAttention,
    }

    def _init_weights(self, module: nn.Module) -> None:
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
            if module.bias is not None:
                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
                nn.init.uniform_(module.bias, -bound, bound)
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, EomtLayerScale):
            if hasattr(module, "lambda1"):
                module.lambda1.data.fill_(self.config.layerscale_value)
        elif isinstance(module, EomtEmbeddings):
            module.cls_token.data = nn.init.trunc_normal_(
                module.cls_token.data.to(torch.float32), mean=0.0, std=std
            ).to(module.cls_token.dtype)
            module.register_tokens.data.zero_()


@auto_docstring(
    custom_intro="""
    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
    """
)
class EomtForUniversalSegmentation(Mask2FormerForUniversalSegmentation):
    def __init__(self, config: EomtConfig) -> None:
        super().__init__(config)
        self.config = config
        self.num_hidden_layers = config.num_hidden_layers

        self.embeddings = EomtEmbeddings(config)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.query = nn.Embedding(config.num_queries, config.hidden_size)
        self.layers = nn.ModuleList([EomtLayer(config) for _ in range(config.num_hidden_layers)])

        self.upscale_block = EomtScaleBlock(config)
        self.mask_head = EomtMaskHead(config)
        self.class_predictor = nn.Linear(config.hidden_size, config.num_labels + 1)

        self.grid_size = (config.image_size // config.patch_size, config.image_size // config.patch_size)

        self.weight_dict = {
            "loss_cross_entropy": config.class_weight,
            "loss_mask": config.mask_weight,
            "loss_dice": config.dice_weight,
        }
        self.criterion = EomtLoss(config=config, weight_dict=self.weight_dict)

        # Per-layer probability of applying the query attention mask; annealed during training.
        self.register_buffer("attn_mask_probs", torch.ones(config.num_blocks))

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def get_auxiliary_logits(self):
        raise AttributeError("Not needed for Eomt Model.")

    def predict(self, logits: torch.Tensor):
        query_tokens = logits[:, : self.config.num_queries, :]
        class_logits = self.class_predictor(query_tokens)

        prefix_tokens = logits[:, self.config.num_queries + self.embeddings.num_prefix_tokens :, :]
        prefix_tokens = prefix_tokens.transpose(1, 2)
        prefix_tokens = prefix_tokens.reshape(prefix_tokens.shape[0], -1, *self.grid_size)
        prefix_tokens = self.upscale_block(prefix_tokens)

        mask_logits = torch.einsum("bqc, bchw -> bqhw", self.mask_head(query_tokens), prefix_tokens)

        return mask_logits, class_logits

    @staticmethod
    def _disable_attention_mask(attn_mask, prob, num_query_tokens, encoder_start_tokens, device):
        if prob < 1:
            # Unmask some queries at random so the model also learns to predict without the mask.
            random_queries = torch.rand(attn_mask.shape[0], num_query_tokens, device=device) > prob
            attn_mask[:, :num_query_tokens, encoder_start_tokens:][random_queries] = True
        return attn_mask

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Tensor,
        mask_labels: Optional[list[Tensor]] = None,
        class_labels: Optional[list[Tensor]] = None,
        patch_offsets: Optional[list[Tensor]] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> EomtForUniversalSegmentationOutput:
        r"""
        mask_labels (`list[torch.Tensor]`, *optional*):
            list of mask labels of shape `(num_labels, height, width)` to be fed to a model
        class_labels (`list[torch.LongTensor]`, *optional*):
            list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
        """
        masks_queries_logits_per_layer, class_queries_logits_per_layer = (), ()
        attention_mask = None

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)

        for idx, layer_module in enumerate(self.layers):
            if idx == self.num_hidden_layers - self.config.num_blocks:
                # Prepend the learnable query tokens for the final `num_blocks` layers.
                query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device)
                hidden_states = torch.cat((query, hidden_states), dim=1)

            if idx >= self.num_hidden_layers - self.config.num_blocks and (
                self.training or self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks] > 0
            ):
                norm_hidden_states = self.layernorm(hidden_states)

                masks_queries_logits, class_queries_logits = self.predict(norm_hidden_states)
                masks_queries_logits_per_layer += (masks_queries_logits,)
                class_queries_logits_per_layer += (class_queries_logits,)

                attention_mask = torch.ones(
                    hidden_states.shape[0],
                    hidden_states.shape[1],
                    hidden_states.shape[1],
                    device=hidden_states.device,
                    dtype=torch.bool,
                )

                interpolated_logits = F.interpolate(masks_queries_logits, size=self.grid_size, mode="bilinear")
                interpolated_logits = interpolated_logits.view(
                    interpolated_logits.size(0), interpolated_logits.size(1), -1
                )

                num_query_tokens = self.config.num_queries
                encoder_start_tokens = num_query_tokens + self.embeddings.num_prefix_tokens

                # Queries may only attend to patch tokens that fall inside their current mask prediction.
                attention_mask[:, :num_query_tokens, encoder_start_tokens:] = interpolated_logits > 0

                attention_mask = self._disable_attention_mask(
                    attention_mask,
                    prob=self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks],
                    num_query_tokens=num_query_tokens,
                    encoder_start_tokens=encoder_start_tokens,
                    device=attention_mask.device,
                )

                attention_mask = attention_mask[:, None, ...].expand(-1, self.config.num_attention_heads, -1, -1)
                attention_mask = attention_mask.float().masked_fill(~attention_mask, -1e9)

            hidden_states = layer_module(hidden_states, attention_mask)

        sequence_output = self.layernorm(hidden_states)

        masks_queries_logits, class_queries_logits = self.predict(sequence_output)
        masks_queries_logits_per_layer += (masks_queries_logits,)
        class_queries_logits_per_layer += (class_queries_logits,)

        loss = None
        if mask_labels is not None and class_labels is not None:
            loss = 0.0
            for masks_queries_logits, class_queries_logits in zip(
                masks_queries_logits_per_layer, class_queries_logits_per_layer
            ):
                loss_dict = self.get_loss_dict(
                    masks_queries_logits=masks_queries_logits,
                    class_queries_logits=class_queries_logits,
                    mask_labels=mask_labels,
                    class_labels=class_labels,
                    auxiliary_predictions=None,
                )
                loss += self.get_loss(loss_dict)

        return EomtForUniversalSegmentationOutput(
            loss=loss,
            masks_queries_logits=masks_queries_logits,
            class_queries_logits=class_queries_logits,
            last_hidden_state=sequence_output,
            patch_offsets=patch_offsets,
        )


__all__ = ["EomtConfig", "EomtForUniversalSegmentation", "EomtPreTrainedModel"]