a
    h                  	   @   s  d dl Zd dlZd dlmZ d dlmZmZ d dlZ	d dl
Z
d dlm  mZ d dl
mZmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ e rd dl%m&Z& e  rd dl'm(Z( d dl)m*Z* eeddG dd deZ+dLe
je
je
jdddZ,eeedddZ-e
je
je
jdddZ.G dd  d ej/Z0eee1ed!d"d#Z2e
je
je1e
jd!d$d%Z3G d&d' d'ej/Z4G d(d) d)ej/Z5G d*d+ d+ej/Z6dMej/e
je
je
jee
j e7e7d-d.d/Z8G d0d1 d1ej/Z9G d2d3 d3ej/Z:dNe
je7e;e
jd4d5d6Z<G d7d8 d8ej/Z=G d9d: d:ej/Z>G d;d< d<ej/Z?G d=d> d>eZ@G d?d@ d@ejAZBG dAdB dBej/ZCG dCdD dDej/ZDG dEdF dFej/ZEeG dGdH dHeZFedIdG dJdK dKeFZGdHdKgZHdS )O    N)	dataclass)CallableOptional)Tensornn   )ACT2FN)ModelOutputis_scipy_availablerequires_backends)GradientCheckpointingLayer)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_accelerate_available)check_model_inputs   )
EomtConfig)linear_sum_assignment)PartialState)reducea  
    Class for outputs of [`EomtForUniversalSegmentationOutput`].

    This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or
    [`~EomtImageProcessor.post_process_instance_segmentation`] or
    [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see
    [`~EomtImageProcessor] for details regarding usage.
    )Zcustom_introc                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeeej  ed< dZeeej  ed< dZeeej  ed	< dS )
"EomtForUniversalSegmentationOutputa+  
    loss (`torch.Tensor`, *optional*):
        The computed loss, returned when labels are present.
    class_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each
        query. Note the `+ 1` is needed because we incorporate the null class.
    masks_queries_logits (`torch.FloatTensor`):
        A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each
        query.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Last hidden states (final feature map) of the last layer.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model.
    attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Self and Cross Attentions weights from transformer decoder.
    patch_offsets (`list[torch.Tensor]`, *optional*):
        list of tuples indicating the image index and start and end positions of patches for semantic segementation.
    Nlossclass_queries_logitsmasks_queries_logitslast_hidden_statehidden_states
attentionspatch_offsets)__name__
__module____qualname____doc__r   r   torchZFloatTensor__annotations__r   r   r   r   tupler   r    listr    r)   r)   b/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/eomt/modeling_eomt.pyr   2   s   
r   F)input_featurespoint_coordinatesreturnc                 K   sL   |  dkrd}|d}tjjj| d| d fi |}|rH|d}|S )a(  
    A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors.

    Args:
        input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
            A tensor that contains features map on a height * width grid
        point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,:
        2)):
            A tensor that contains [0, 1] * [0, 1] normalized point coordinates
        add_dim (`bool`):
            boolean value to keep track of added dimension

    Returns:
        point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
        height_grid, width_grid):
            A tensor that contains features for points in `point_coordinates`.
    r   T   g       @      ?)dimZ	unsqueezer%   r   
functionalZgrid_samplesqueeze)r+   r,   Zadd_dimkwargsZpoint_featuresr)   r)   r*   sample_point]   s    
 
r4   )inputslabelsr-   c                 C   sd   |   d} dt| |j }| ddddf |ddddf  }d|d |d   }|S )a  
    A pair wise version of the dice loss, see `dice_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        `torch.Tensor`: The computed loss between each pairs.
    r   r.   N)sigmoidflattenr%   matmulTsum)r5   r6   	numeratordenominatorr   r)   r)   r*   pair_wise_dice_loss}   s
    ,r?   c           	      C   sj   | j d }tjdd}|| t| }|| t| }t|| |j}t|| d| j}|| }|S )a  
    A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss between each pairs.
    r   noneZ	reduction)shaper   BCEWithLogitsLossr%   Z	ones_likeZ
zeros_liker:   r;   )	r5   r6   Zheight_and_width	criterionZcross_entropy_loss_posZcross_entropy_loss_negZloss_posZloss_negr   r)   r)   r*   $pair_wise_sigmoid_cross_entropy_loss   s    
rE   c                       s\   e Zd ZdZd
eeeed fddZe ej	ej	ej	ej	e
ee	  ddd	Z  ZS )EomtHungarianMatcheraq  This class computes an assignment between the labels and the predictions of the network.

    For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
    predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).
    r/    1  )
cost_class	cost_mask	cost_dice
num_pointsc                    sF   t    |dkr*|dkr*|dkr*td|| _|| _|| _|| _dS )aH  Creates the matcher

        Params:
            cost_class (`float`, *optional*, defaults to 1.0):
                Relative weight of the classification error in the matching cost.
            cost_mask (`float`, *optional*,  defaults to 1.0):
                This is the relative weight of the focal loss of the binary mask in the matching cost.
            cost_dice (`float`, *optional*, defaults to 1.0):
                This is the relative weight of the dice loss of the binary mask in the matching cost.
            num_points (`int`, *optional*, defaults to 12544):
                No. of points to sample on which the mask loss will be calculated. The same set of K points are
                uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
                matching.
        r   zAll costs can't be 0N)super__init__
ValueErrorrK   rH   rI   rJ   )selfrH   rI   rJ   rK   	__class__r)   r*   rM      s    
zEomtHungarianMatcher.__init__)r   r   mask_labelsclass_labelsr-   c                 C   sh  g }|j d }t|D ]<}|| d}|| }	|dd|| f  }
|| |	}|dddf }|	dddf }	tjd| jd|	jd}||j d dd}t	||dd
d}||	j d dd}t	|	|dd
d}	t|	|}t|	|}| j| | j|
  | j|  }t|td	}t|td
}t|d}t| }|| qdd |D }|S )ao  
        Params:
            masks_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, num_labels` with the classification logits.
            class_queries_logits (`torch.Tensor`):
                A tensor of dim `batch_size, num_queries, height, width` with the predicted masks.
            class_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the
                target) containing the class labels.
            mask_labels (`torch.Tensor`):
                A tensor of dim `num_target_boxes, height, width` containing the target masks.

        Returns:
            matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j)
            where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected labels (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
        r   r7   Nr   r.   deviceFZalign_cornersg    _Bg    _c                 S   s0   g | ](\}}t j|t jd t j|t jd fqS )dtype)r%   	as_tensorint64).0ijr)   r)   r*   
<listcomp>  s   z0EomtHungarianMatcher.forward.<locals>.<listcomp>)rB   rangesoftmaxtor%   randrK   rU   repeatr4   r2   rE   r?   rI   rH   rJ   minimumtensormaximumZ
nan_to_numr   cpuappend)rO   r   r   rR   rS   indices
batch_sizer\   Z
pred_probsZ	pred_maskrH   Ztarget_maskr,   Ztarget_coordinatesZpred_coordinatesrI   rJ   Zcost_matrixZassigned_indicesZmatched_indicesr)   r)   r*   forward   s4    


zEomtHungarianMatcher.forward)r/   r/   r/   rG   )r!   r"   r#   r$   floatintrM   r%   no_gradr   r(   r'   rk   __classcell__r)   r)   rP   r*   rF      s    
rF   )r5   r6   	num_masksr-   c                 C   sX   |   d}d|| d }|d|d }d|d |d   }| | }|S )a4  
    Compute the DICE loss, similar to generalized IOU for masks as follows:

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$

    In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow

    $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).
        num_masks (`int`):
            The number of masks present in the current batch, used for normalization.

    Returns:
        `torch.Tensor`: The computed loss.
    r   r.   r7   )r8   r9   r<   )r5   r6   rp   Zprobsr=   r>   r   r)   r)   r*   	dice_loss  s    rq   c                 C   s,   t jdd}|| |}|d | }|S )a|  
    Args:
        inputs (`torch.Tensor`):
            A float tensor of arbitrary shape.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss.
    r@   rA   r   )r   rC   meanr<   )r5   r6   rp   rD   Zcross_entropy_lossr   r)   r)   r*   sigmoid_cross_entropy_loss7  s    
rs   c                	       sP  e Zd Zeeeef d fddZeee	  ee	 dddZ
ee eeef ddd	Zeee eej eeef d
ddZejeej eej e	eeejf dddZdd Zdd ZejejdddZeje	e	eejdddZd!ejejeej eej eeeejf  eeejf dddZejejejddd Z  ZS )"EomtLossconfigweight_dictc                    s   t    t| dg |j| _|| _|j| _t| jd }| j|d< | 	d| |j
| _|j| _|j| _t|j|j|j| jd| _dS )aH  
        The Eomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
        compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
        of matched ground-truth / prediction (supervise class and mask)

        Args:
            config (`EomtConfig`):
                The configuration for Eomt model also containing loss calculation specific parameters.
            weight_dict (`dict[str, float]`):
                A dictionary of weights to be applied to the different losses.
        Zscipyr   r7   empty_weight)rH   rJ   rI   rK   N)rL   rM   r   
num_labelsrw   Zno_object_weightZeos_coefr%   onesregister_bufferZtrain_num_pointsrK   oversample_ratioimportance_sample_ratiorF   class_weightdice_weightmask_weightmatcher)rO   rv   rw   rx   rP   r)   r*   rM   L  s"    

zEomtLoss.__init__)sizesr-   c                 C   sB   |d }|dd  D ](}t |D ]\}}t|| |||< q q|S )Nr   r   )	enumeratemax)rO   r   ZmaxesZsublistindexitemr)   r)   r*   _max_by_axiso  s
    zEomtLoss._max_by_axis)tensorsr-   c                 C   s   |  dd |D }t|g| }|\}}}}|d j}|d j}	tj|||	d}
tj|||ftj|	d}t||
|D ]\\}}}|d |j	d d |j	d d |j	d f 
| d|d |j	d d |j	d f< qv|
|fS )Nc                 S   s   g | ]}t |jqS r)   )r(   rB   )r[   re   r)   r)   r*   r^   y      z8EomtLoss._pad_images_to_max_in_batch.<locals>.<listcomp>r   rX   rU   r   r.   F)r   lenrX   rU   r%   zerosrz   boolziprB   Zcopy_)rO   r   max_sizeZbatch_shaperj   _heightwidthrX   rU   Zpadded_tensorsZpadding_masksre   Zpadded_tensorZpadding_maskr)   r)   r*   _pad_images_to_max_in_batchw  s    

2"z$EomtLoss._pad_images_to_max_in_batch)r   rS   ri   r-   c                 C   s   |}|j \}}}tj| jd}| |}	tdd t||D }
tj||f| j	tj
|jd}|
||	< |dd}|||}d|i}|S )a  Compute the losses related to the labels using cross entropy.

        Args:
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `batch_size, num_queries, num_labels`
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.

        Returns:
            `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
        )weightc                 S   s   g | ]\}\}}|| qS r)   r)   )r[   targetr   r]   r)   r)   r*   r^     r   z(EomtLoss.loss_labels.<locals>.<listcomp>)Z
fill_valuerX   rU   r   r.   loss_cross_entropy)rB   r   ZCrossEntropyLossrx   $_get_predictions_permutation_indicesr%   catr   fullry   rZ   rU   	transpose)rO   r   rS   ri   Zpred_logitsrj   num_queriesr   rD   idxZtarget_classes_oZtarget_classesZpred_logits_transposedZloss_celossesr)   r)   r*   loss_labels  s    

zEomtLoss.loss_labels)r   rR   ri   rp   r-   c                    s     |} |}|| } |\}}	|| }|dddf }|dddf }t D  | fdd j j j}
t	||
dd
d}W d   n1 s0    Y  t	||
dd
d}t|||t|||d}~~|S )a  Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            indices (`tuple[np.array])`:
                The indices computed by the Hungarian matcher.
            num_masks (`int)`:
                The number of masks, used for normalization.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
            - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth.
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth,
              masks.
        Nc                    s
     | S N)calculate_uncertaintylogitsrO   r)   r*   <lambda>  r   z%EomtLoss.loss_masks.<locals>.<lambda>FrV   r   )	loss_mask	loss_dice)r    _get_targets_permutation_indicesr   r%   rn   sample_points_using_uncertaintyrK   r|   r}   r4   r2   rs   rq   )rO   r   rR   ri   rp   Zsrc_idxZtgt_idxZ
pred_masksZtarget_masksr   r,   Zpoint_labelspoint_logitsr   r)   r   r*   
loss_masks  s.    



2

zEomtLoss.loss_masksc                 C   s4   t dd t|D }t dd |D }||fS )Nc                 S   s    g | ]\}\}}t ||qS r)   r%   Z	full_like)r[   r\   srcr   r)   r)   r*   r^     r   zAEomtLoss._get_predictions_permutation_indices.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r)   r)   )r[   r   r   r)   r)   r*   r^     r   r%   r   r   )rO   ri   batch_indicesZpredictions_indicesr)   r)   r*   r     s    z-EomtLoss._get_predictions_permutation_indicesc                 C   s4   t dd t|D }t dd |D }||fS )Nc                 S   s    g | ]\}\}}t ||qS r)   r   )r[   r\   r   tgtr)   r)   r*   r^     r   z=EomtLoss._get_targets_permutation_indices.<locals>.<listcomp>c                 S   s   g | ]\}}|qS r)   r)   )r[   r   r   r)   r)   r*   r^     r   r   )rO   ri   r   Ztarget_indicesr)   r)   r*   r     s    z)EomtLoss._get_targets_permutation_indices)r   r-   c                 C   s   t | }|S )a  
        In Eomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits'
        for the foreground class in `classes`.

        Args:
            logits (`torch.Tensor`):
            A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is:
            the number of foreground classes. The values are logits.

        Returns:
            scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most
            uncertain locations having the highest uncertainty score.
        )r%   abs)rO   r   Zuncertainty_scoresr)   r)   r*   r     s    zEomtLoss.calculate_uncertainty)r   rK   r|   r}   r-   c                 C   s   |j d }t|| }tj||d|jd}t||dd}	||	}
t|| }|| }tj|
dddddf |ddd }|tj|tj|jd	 }||dddf 7 }|	d
d|	d
ddf 	||d}|dkrtj
|tj||d|jdgdd}|S )a  
        This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The
        uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit
        prediction as input.

        Args:
            logits (`float`):
                Logit predictions for P points.
            uncertainty_function:
                A function that takes logit predictions for P points and returns their uncertainties.
            num_points (`int`):
                The number of points P to sample.
            oversample_ratio (`int`):
                Oversampling parameter.
            importance_sample_ratio (`float`):
                Ratio of points that are sampled via importance sampling.

        Returns:
            point_coordinates (`torch.Tensor`):
                Coordinates for P sampled points.
        r   r.   rT   FrV   Nr   )kr0   r   r7   r0   )rB   rm   r%   rb   rU   r4   Ztopkarangelongviewr   )rO   r   Zuncertainty_functionrK   r|   r}   Z	num_boxesZnum_points_sampledr,   r   Zpoint_uncertaintiesZnum_uncertain_pointsZnum_random_pointsr   shiftr)   r)   r*   r     s"    
&(z(EomtLoss.sample_points_using_uncertaintyNr   r   rR   rS   auxiliary_predictionsr-   c                    s   |  ||||}| j||d jd}i | ||||| |||}|durt|D ]H\ }	|	d }|	d }| ||||}
 fdd|
 D }
||
 qV|S )a  
        This performs the loss computation.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, num_labels)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            class_labels (`list[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*):
                if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from
                the inner layers of the EomtMaskedAttentionDecoder.

        Returns:
            losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
            - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth
              masks.
            if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains additional
            losses for each auxiliary predictions.
        r   rT   Nr   r   c                    s    i | ]\}}| d   |qS )r   r)   )r[   keyvaluer   r)   r*   
<dictcomp>n  r   z$EomtLoss.forward.<locals>.<dictcomp>)	r   get_num_masksrU   r   r   r   rk   itemsupdate)rO   r   r   rR   rS   r   ri   rp   r   Zaux_outputs	loss_dictr)   r   r*   rk   <  s    $zEomtLoss.forward)rS   rU   r-   c                 C   s^   t dd |D }tj|tj|d}d}t rHtji krHt|}t j}tj	|| dd}|S )zk
        Computes the average number of target masks across the batch, for normalization purposes.
        c                 S   s   g | ]}t |qS r)   )r   )r[   classesr)   r)   r*   r^   w  r   z*EomtLoss.get_num_masks.<locals>.<listcomp>r   r   )min)
r<   r%   rY   rl   r   r   Z_shared_stater   Znum_processesclamp)rO   rS   rU   rp   Z
world_sizer)   r)   r*   r   s  s    
zEomtLoss.get_num_masks)N)r!   r"   r#   r   dictstrrl   rM   r(   rm   r   r   r'   r   nparrayr   r%   r   r   r   r   r   r   rk   rU   r   ro   r)   r)   rP   r*   rt   K  s>   #
$>= 7rt   c                       s4   e Zd ZdZ fddZejejdddZ  ZS )EomtPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j }}|j|j }}t|tjj	r8|n||f}t|tjj	rR|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   kernel_sizeZstride)rL   rM   
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)rO   rv   r   r   r   r   r   rP   r)   r*   rM     s    
 zEomtPatchEmbeddings.__init__pixel_valuesr-   c                 C   sH   |j d }|| jkr,td| j d| d| |ddd}|S )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r.   )rB   r   rN   r   r9   r   )rO   r   r   
embeddingsr)   r)   r*   rk     s    

zEomtPatchEmbeddings.forward)	r!   r"   r#   r$   rM   r%   r   rk   ro   r)   r)   rP   r*   r     s   r   c                       s<   e Zd ZdZedd fddZejejdddZ  Z	S )	EomtEmbeddingszM
    Construct the CLS token, mask token, position and patch embeddings.
    Nrv   r-   c                    s   t    || _|j| _ttdd|j| _	tt
d|j|j| _t|| _| jj}t|j| _d|j | _t||j| _| jdt|ddd d S )Nr   position_ids)r   r7   F)
persistent)rL   rM   rv   r   r   	Parameterr%   Zrandnr   	cls_tokenr   Znum_register_tokensregister_tokensr   patch_embeddingsr   ZDropoutZhidden_dropout_probdropoutnum_prefix_tokens	Embeddingposition_embeddingsr{   r   expand)rO   rv   r   rP   r)   r*   rM     s    

zEomtEmbeddings.__init__r   c                 C   s~   |j \}}}}| jjjj}| |j|d}| j|dd}| j|dd}|| 	| j
 }tj|||gdd}| |}|S )NrW   r7   r   r   )rB   r   r   r   rX   ra   r   r   r   r   r   r%   r   r   )rO   r   rj   r   Ztarget_dtyper   Z
cls_tokensr   r)   r)   r*   rk     s    
zEomtEmbeddings.forward)
r!   r"   r#   r$   r   rM   r%   r   rk   ro   r)   r)   rP   r*   r     s   r           )modulequeryr   r   attention_maskscalingr   c           
      K   s|   t ||dd| }|d ur(|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr7   )r0   rX   )ptrainingr   r.   )r%   r:   r   r   r1   r`   float32ra   rX   r   r   
contiguous)
r   r   r   r   r   r   r   r3   attn_weightsattn_outputr)   r)   r*   eager_attention_forward  s    
r   c                       sL   e Zd ZdZ fddZdejeej eejeej f dddZ	  Z
S )	EomtAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkrZtd| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)rL   rM   rv   r   	embed_dimnum_attention_heads	num_headshead_dimrN   scaleZattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrO   rv   rP   r)   r*   rM     s$    

zEomtAttention.__init__N)r   r   r-   c              
   K   s   |j \}}}| |}| |}| |}	|||| j| jdd}|||| j| jdd}|	||| j| jdd}	t}
| j	j
dkrt| j	j
 }
|
| |||	|| j| j| jsdn| jd\}}|||| }| |}||fS )z#Input shape: Batch x Time x Channelr   r.   eagerr   )r   r   r   )rB   r   r   r   r   r   r   r   r   rv   Z_attn_implementationr   r   r   r   r   reshaper   r   )rO   r   r   r3   rj   Z
seq_lengthr   ZquerieskeysvaluesZattention_interfacer   r   r)   r)   r*   rk     s.    




zEomtAttention.forward)N)r!   r"   r#   r$   rM   r%   r   r   r'   rk   ro   r)   r)   rP   r*   r     s    r   c                       s6   e Zd Zdd fddZejejdddZ  ZS )EomtLayerScaleNr-   c                    s(   t    t|jt|j | _d S r   )	rL   rM   r   r   layerscale_valuer%   rz   r   lambda1r   rP   r)   r*   rM     s    
zEomtLayerScale.__init__hidden_stater-   c                 C   s
   || j  S r   )r  rO   r
  r)   r)   r*   rk   #  s    zEomtLayerScale.forwardr!   r"   r#   rM   r%   r   rk   ro   r)   r)   rP   r*   r    s   r  )input	drop_probr   r-   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   r   )rB   ndimr%   rb   rX   rU   Zfloor_div)r  r  r   Z	keep_probrB   Zrandom_tensoroutputr)   r)   r*   	drop_path'  s    
r  c                       sP   e Zd ZdZdee dd fddZejejdddZ	e
d	d
dZ  ZS )EomtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).N)r  r-   c                    s   t    || _d S r   )rL   rM   r  )rO   r  rP   r)   r*   rM   >  s    
zEomtDropPath.__init__r   r-   c                 C   s   t || j| jS r   )r  r  r   rO   r   r)   r)   r*   rk   B  s    zEomtDropPath.forwardr  c                 C   s   d| j  S )Nzp=)r  r   r)   r)   r*   
extra_reprE  s    zEomtDropPath.extra_repr)N)r!   r"   r#   r$   r   rl   rM   r%   r   rk   r   r  ro   r)   r)   rP   r*   r  ;  s   r  c                       s6   e Zd Zdd fddZejejdddZ  ZS )EomtMLPNr  c                    sn   t    |j }}t|j|j }tj||dd| _t|j	t
rPt|j	 | _n|j	| _tj||dd| _d S )NTbias)rL   rM   r   rm   	mlp_ratior   r   fc1r   
hidden_actr   r   
activationfc2rO   rv   Zin_featuresZout_featuresZhidden_featuresrP   r)   r*   rM   J  s    

zEomtMLP.__init__r	  c                 C   s"   |  |}| |}| |}|S r   )r  r  r  r  r)   r)   r*   rk   U  s    


zEomtMLP.forwardr  r)   r)   rP   r*   r  I  s   r  c                       s6   e Zd Zdd fddZejejdddZ  ZS )EomtSwiGLUFFNNr  c                    sl   t    |j }}t|j|j }t|d d d d d }tj|d| dd| _tj||dd| _d S )Nr.   r         Tr  )	rL   rM   r   rm   r  r   r   
weights_inweights_outr  rP   r)   r*   rM   ]  s    

zEomtSwiGLUFFN.__init__r	  c                 C   s6   |  |}|jddd\}}tj|| }| |S )Nr.   r7   r   )r#  chunkr   r1   Zsilur$  )rO   r
  x1Zx2Zhiddenr)   r)   r*   rk   f  s    
zEomtSwiGLUFFN.forwardr  r)   r)   rP   r*   r   \  s   	r   c                       sF   e Zd ZdZedd fddZd	ejeej ejdddZ	  Z
S )
	EomtLayerzCThis corresponds to the Block class in the original implementation.Nr   c                    s   t    tj|j|jd| _t|| _t	|| _
|jdkrFt|jnt | _tj|j|jd| _|jrvt|| _n
t|| _t	|| _d S )Nepsr   )rL   rM   r   	LayerNormr   layer_norm_epsnorm1r   	attentionr  layer_scale1Zdrop_path_rater  ZIdentityr  norm2Zuse_swiglu_ffnr   mlpr  layer_scale2r   rP   r)   r*   rM   p  s    



zEomtLayer.__init__)r   	head_maskr-   c                 C   sb   |  |}| ||\}}| |}| || }| |}| |}| |}| || }|S r   )r,  r-  r.  r  r/  r0  r1  )rO   r   r2  Zhidden_states_normZself_attention_outputr   Zlayer_outputr)   r)   r*   rk     s    




zEomtLayer.forward)N)r!   r"   r#   r$   r   rM   r%   r   r   rk   ro   r)   r)   rP   r*   r'  m  s    r'  c                       s2   e Zd Zd fdd	ZejejdddZ  ZS )	EomtLayerNorm2dư>Tc                    s   t  j|||d d S )N)r)  Zelementwise_affine)rL   rM   )rO   r   r)  ZaffinerP   r)   r*   rM     s    zEomtLayerNorm2d.__init__r	  c                 C   s>   | dddd}t|| j| j| j| j}| dddd}|S )Nr   r.   r   r   )ZpermuteFZ
layer_normZnormalized_shaper   r  r)  r  r)   r)   r*   rk     s    zEomtLayerNorm2d.forward)r4  Tr  r)   r)   rP   r*   r3    s   r3  c                       s6   e Zd Zed fddZejejdddZ  ZS )EomtScaleLayerrv   c                    sV   t    |j}tj||ddd| _t|j | _tj	||dd|dd| _
t|| _d S )Nr.   r   r   r   F)r   paddinggroupsr  )rL   rM   r   r   ConvTranspose2dconv1r   r  r  r   conv2r3  layernorm2drO   rv   r   rP   r)   r*   rM     s    
	zEomtScaleLayer.__init__r  c                 C   s,   |  |}| |}| |}| |}|S r   )r;  r  r<  r=  r  r)   r)   r*   rk     s
    



zEomtScaleLayer.forward	r!   r"   r#   r   rM   r%   r   rk   ro   r)   r)   rP   r*   r6    s   r6  c                       s6   e Zd Zed fddZejejdddZ  ZS )EomtScaleBlockr7  c                    s6   t     j| _t fddt| jD | _d S )Nc                    s   g | ]}t  qS r)   )r6  r[   r   r7  r)   r*   r^     r   z+EomtScaleBlock.__init__.<locals>.<listcomp>)rL   rM   Znum_upscale_blocks
num_blocksr   
ModuleListr_   blockr   rP   r7  r*   rM     s    
zEomtScaleBlock.__init__r  c                 C   s   | j D ]}||}q|S r   )rD  )rO   r   rD  r)   r)   r*   rk     s    

zEomtScaleBlock.forwardr?  r)   r)   rP   r*   r@    s   r@  c                       s6   e Zd Zed fddZejejdddZ  ZS )EomtMaskHeadr7  c                    sJ   t    |j}t||| _t||| _t||| _t|j	 | _
d S r   )rL   rM   r   r   r   r  r  fc3r   r  r  r>  rP   r)   r*   rM     s    
zEomtMaskHead.__init__r  c                 C   s.   |  | |}|  | |}| |}|S r   )r  r  r  rF  r  r)   r)   r*   rk     s    
zEomtMaskHead.forwardr?  r)   r)   rP   r*   rE    s   	rE  c                   @   sP   e Zd ZU dZeed< dZdZdZdgZ	dZ
dZeedZejd	d
ddZd	S )EomtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    rv   Zeomtr   Fr'  T)r   r   N)r   r-   c                 C   sZ  | j j}t|tjtjtjfrtjj|j	t
dd |jd urtj|j	\}}|dkrhdt
| nd}tj|j| | nt|tjr|j	jd |jj  nt|tjr|j	jjddd |jd ur|j	j|j   npt|trt|drV|jj| j j nDt|trVtjj|jjtjd|d|jj |j_|j!j  d S )	N   )ar   r   r/   r   )rr   stdr  )"rv   Zinitializer_ranger   r   r   r   r:  initZkaiming_uniform_r   mathsqrtr  Z_calculate_fan_in_and_fan_outZuniform_r*  dataZfill_Zzero_r   Znormal_Zpadding_idxr  hasattrr  r  r   Ztrunc_normal_r   ra   r%   r   rX   r   )rO   r   rJ  Zfan_inr   boundr)   r)   r*   _init_weights  s0    

z!EomtPreTrainedModel._init_weights)r!   r"   r#   r$   r   r&   Zbase_model_prefixmain_input_nameZsupports_gradient_checkpointingZ_no_split_modulesZ_supports_sdpaZ_supports_flash_attnr'  r   Z_can_record_outputsr   ModulerQ  r)   r)   r)   r*   rG    s   
rG  zV
    The EoMT Model with head on top for instance/semantic/panoptic segmentation.
    c                
       s   e Zd ZdZed fddZeeeeeeef eeef dddZ	eeef edd	d
Z
eedeeee  eee  eee  ee edddZdd ZejdddZedd Z  ZS )EomtForUniversalSegmentationr   r7  c                    s   t     | _ j| _t | _tj j j	d| _
t j j| _t fddt jD | _t | _t | _t j jd | _ j j  j j f| _ j j jd| _t | jd| _ | !dt"# j$ | %  d S )Nr(  c                    s   g | ]}t  qS r)   )r'  rA  r7  r)   r*   r^     r   z9EomtForUniversalSegmentation.__init__.<locals>.<listcomp>r   )r   r   r   ru   attn_mask_probs)&rL   rM   rv   num_hidden_layersr   r   r   r*  r   r+  	layernormr   r   r   rC  r_   layersr@  upscale_blockrE  	mask_headr   ry   class_predictorr   r   	grid_sizer~   r   r   rw   rt   rD   r{   r%   rz   rB  Z	post_initr   rP   r7  r*   rM     s$    
 

z%EomtForUniversalSegmentation.__init__r   c                 C   sN   | j |||||d}| j D ]*\}}| D ]\}	}
||	v r.|
|9 }
q.q|S )Nr   r   rR   rS   r   )rD   rw   r   )rO   r   r   rR   rS   r   r   r   r   Zloss_keyr   r)   r)   r*   get_loss_dict(  s    	z*EomtForUniversalSegmentation.get_loss_dict)r   r-   c                 C   s   t | S r   )r<   r  )rO   r   r)   r)   r*   get_loss@  s    z%EomtForUniversalSegmentation.get_lossN)r   rR   rS   r    r3   r-   c                 K   sx  d\}}d}|du rt d| |}	t| jD ]\}
}|
| j| jj kr| jjdddddf 	|	j
d dd|	j}tj||	fdd}	|
| j| jj kr| js| j|
| j | jj  dkr| |	}| |\}}||f7 }||f7 }tj|	j
d |	j
d |	j
d |	jtjd}tj|| jd	d
}||d|dd}| jj}|| jj }|dk|ddd||df< | j|| j|
| j | jj  |||jd}|ddddf 	d| jjdd}| | d}||	|}	q0| |	}| |\}}||f7 }||f7 }d}|durf|durfd}t ||D ],\}}| j!||||dd}|| "|7 }q8t#|||||dS )ah  
        mask_labels (`list[torch.Tensor]`, *optional*):
            list of mask labels of shape `(num_labels, height, width)` to be fed to a model
        class_labels (`list[torch.LongTensor]`, *optional*):
            list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the
            labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`.
        patch_offsets (`list[torch.Tensor]`, *optional*):
            list of tuples indicating the image index and start and end positions of patches for semantic segementation.
        )r)   r)   Nz You have to specify pixel_valuesr   r7   r   r   )rU   rX   Zbilinear)sizemode)probnum_query_tokensencoder_start_tokensrU   .g    er   r]  )r   r   r   r   r    )$rN   r   r   rX  rV  rv   rB  r   r   r   rB   ra   rU   r%   r   r   rU  rW  predictrz   r   r5  Zinterpolater\  r   r`  r   r   _disable_attention_maskr   rl   Zmasked_fillr   r^  r_  r   )rO   r   rR   rS   r    r3   Zmasks_queries_logits_per_layerZclass_queries_logits_per_layerr   r   r   Zlayer_moduler   Znorm_hidden_statesr   r   Zinterpolated_logitsrc  rd  Zsequence_outputr   r   r)   r)   r*   rk   C  s    
2


	"


z$EomtForUniversalSegmentation.forwardc                 C   s   | j jS r   )r   r   r   r)   r)   r*   get_input_embeddings  s    z1EomtForUniversalSegmentation.get_input_embeddingsr   c                 C   s   |d d d | j jd d f }| |}|d d | j j| jj d d d f }|dd}|j|jd dg| jR  }| 	|}| 
|}td||}||fS )Nr   r.   r   r7   zbqc, bchw -> bqhw)rv   r   r[  r   r   r   r  rB   r\  rZ  rY  r%   Zeinsum)rO   r   Zquery_tokensZclass_logitsZprefix_tokensZmask_logitsr)   r)   r*   re    s    
&

z$EomtForUniversalSegmentation.predictc                 C   sD   |dk r@t j| jd ||d|k}d| d d d ||d f |< | S )Nr   r   rT   )r%   rb   rB   )Z	attn_maskrb  rc  rd  rU   Zrandom_queriesr)   r)   r*   rf    s    z4EomtForUniversalSegmentation._disable_attention_mask)NNN)r!   r"   r#   rR  r   rM   r   r   r   r^  r_  r   r   r   r(   r   r   r   rk   rg  r%   re  staticmethodrf  ro   r)   r)   rP   r*   rT    s6   

   


grT  )F)r   )r   F)Icollections.abcr   rL  dataclassesr   typingr   r   numpyr   r%   Ztorch.nn.functionalr   r1   r5  r   Zactivationsr   Z
file_utilsr	   r
   r   Zmodeling_layersr   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   Zutils.genericr   Zconfiguration_eomtr   Zscipy.optimizer   Z
accelerater   Zaccelerate.utilsr   r   r4   r?   rE   rS  rF   rm   rq   rs   rt   r   r   rl   r   r   r  r   r  r  r  r   r'  r*  r3  r6  r@  rE  rG  rT  __all__r)   r)   r)   r*   <module>   s   
!  j  :!, >	*+ C