a
    h+                     @   s   d dl Z d dlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
mZ e rVd dlmZ de je jeeed	d
dZG dd deZG dd de	ZdddZdS )    N   )center_to_corners_format)is_scipy_available   )HungarianMatcher	ImageLoss_set_aux_lossgeneralized_box_ioulinear_sum_assignment      ?inputstargets	num_boxesalphagammac           
      C   sv   |   }tjj| |dd}|| d| d|   }|d| |  }|dkrj|| d| d|   }	|	| }| | S )aR  
    Loss used in RetinaNet for dense detection: https://huggingface.co/papers/1708.02002.

    Args:
        inputs (`torch.FloatTensor` of arbitrary shape):
            The predictions for each example.
        targets (`torch.FloatTensor` with the same shape as `inputs`)
            A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
            and 1 for the positive class).
        num_boxes (`int`):
            The total number of boxes in the batch.
        alpha (`float`, *optional*, defaults to 0.25):
            Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
        gamma (`int`, *optional*, defaults to 2):
            Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.

    Returns:
        Loss tensor
    none)Z	reductionr   r   )sigmoidnnZ
functionalZ binary_cross_entropy_with_logitssum)
r   r   r   r   r   ZprobZce_lossZp_tlossZalpha_t r   a/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/loss/loss_grounding_dino.pysigmoid_focal_loss   s    r   c                   @   s   e Zd Ze dd ZdS )GroundingDinoHungarianMatcherc                 C   sd  |d j dd \}}|d dd }|d dd}|d }tdd	 t||D }||jd
dd }tdd	 |D }d}	d}
d|	 ||
  d| d    }|	d| |
  |d    }|| |  }tj	||dd}t
t|t| }| j| | j|  | j|  }|||d
 }dd	 |D }dd	 t||d
D }dd	 |D S )a  
        Args:
            outputs (`dict`):
                A dictionary that contains at least these entries:
                * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
                * "label_maps": Tuple of tensors of dim [num_classes, hidden_dim].
            targets (`list[dict]`):
                A list of targets (len(targets) = batch_size), where each target is a dict containing:
                * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
                  ground-truth
                 objects in the target) containing the class labels
                * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.

        Returns:
            `list[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
            - index_i is the indices of the selected predictions (in order)
            - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        logitsNr   r   r   
pred_boxes
label_mapsc                 S   s   g | ]\}}||d   qS )class_labelsr   ).0Z	label_maptargetr   r   r   
<listcomp>b       z9GroundingDinoHungarianMatcher.forward.<locals>.<listcomp>T)dimZkeepdimc                 S   s   g | ]}|d  qS boxesr   r    vr   r   r   r"   g   r#   r          @g:0yE>)pc                 S   s   g | ]}t |d  qS r&   lenr(   r   r   r   r"   {   r#   c                 S   s   g | ]\}}t || qS r   r
   )r    icr   r   r   r"   |   r#   c                 S   s0   g | ](\}}t j|t jd t j|t jd fqS ))dtype)torchZ	as_tensorZint64)r    r.   jr   r   r   r"   }   r#   )shapeflattenr   r1   catzipr   logtZcdistr	   r   	bbox_cost
class_cost	giou_costviewcpu	enumeratesplit)selfoutputsr   Z
batch_sizeZnum_queriesZout_probZout_bboxr   Ztarget_bboxr   r   Zneg_cost_classZpos_cost_classr:   r9   r;   Zcost_matrixsizesindicesr   r   r   forwardD   s&    "z%GroundingDinoHungarianMatcher.forwardN)__name__
__module____qualname__r1   Zno_gradrD   r   r   r   r   r   C   s   r   c                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	GroundingDinoImageLossa  
    This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we
    compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of
    matched ground-truth / prediction (supervise class and box).

    Args:
        matcher (`GroundingDinoHungarianMatcher`):
            Module able to compute a matching between targets and proposals.
        focal_alpha (`float`):
            Alpha parameter in focal loss.
        losses (`list[str]`):
            List of all the losses to be applied. See `get_loss` for a list of all available losses.
    c                 C   s"   t j|  || _|| _|| _d S N)r   Module__init__matcherfocal_alphalosses)r@   rL   rM   rN   r   r   r   rK      s    zGroundingDinoImageLoss.__init__c           	         sr    d }t  fddtt||D }t j d dd}| |}t j||jt jd}|| t j||< |S )z>
        Create one_hot based on the matching indices
        r   c                    sH   g | ]@\}\}\}}|d kr8|d | t  d |  n
|d | qS )r   r   r   r,   )r    r.   r!   _JrA   r   r   r"      s   zFGroundingDinoImageLoss._get_target_classes_one_hot.<locals>.<listcomp>r   r   )r%   )devicer0   )	r1   r5   r>   r6   Z_get_source_permutation_idxZ
zeros_likerR   longto)	r@   rA   r   rC   r   r   r   idxtarget_classes_onehotr   rQ   r   _get_target_classes_one_hot   s    

z2GroundingDinoImageLoss._get_target_classes_one_hotc           
      C   s~   d|vrt dd|vr t d| |||}|d }|d }t||}t||}| }t|||| jdd}d|i}	|	S )z
        Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
        of dim [nb_target_boxes]
        r   z#No logits were found in the outputs	text_maskz&No text_mask were found in the outputsr   r   loss_ce)KeyErrorrW   r1   Zmasked_selectfloatr   rM   )
r@   rA   r   rC   r   rV   Zsource_logitsrX   rY   rN   r   r   r   loss_labels   s&    z"GroundingDinoImageLoss.loss_labelsN)rE   rF   rG   __doc__rK   rW   r\   r   r   r   r   rH      s   rH   c                    sr  t |j|j|jd}g d}t||j|d}|| i }| |d< ||d< ||d< ||d< d }|jrt||}|D ]}||d< ||d< qr||d< ||||j	r|	|
||d	}|||}d
d |
 D }| d|j|jd|j	rdd 
 D }| |jrPi }t|jd D ]" | fdd
 D  q"| tfddD }||fS )N)r:   r9   r;   )labelsr'   Zcardinality)rL   rM   rN   r   r   r   rX   auxiliary_outputs)r   r   r   rX   c                 S   s   i | ]\}}|d  |qS Z_encr   r    kr)   r   r   r   
<dictcomp>   r#   z7GroundingDinoForObjectDetectionLoss.<locals>.<dictcomp>r*   )rY   Z	loss_bboxZ	loss_giouc                 S   s   i | ]\}}|d  |qS r`   r   ra   r   r   r   rc     r#   r   c                    s    i | ]\}}|d    |qS )rO   r   ra   )r.   r   r   rc     r#   c                 3   s&   | ]}|v r | |  V  qd S rI   r   )r    rb   )	loss_dictweight_dictr   r   	<genexpr>  r#   z6GroundingDinoForObjectDetectionLoss.<locals>.<genexpr>)r   r:   r9   r;   rH   rM   rT   Zauxiliary_lossr   Z	two_stageitemsupdateZbbox_loss_coefficientZgiou_loss_coefficientrangeZdecoder_layersr   )r   r^   rR   r   configr   rX   Zoutputs_classZoutputs_coordZencoder_logitsZencoder_pred_boxesrL   rN   	criterionZoutputs_lossr_   Z
aux_outputZencoder_outputs_lossZencoder_loss_dictZenc_weight_dictZaux_weight_dictr   r   )r.   rd   re   r   #GroundingDinoForObjectDetectionLoss   s\    






 
rl   )r   r   )NNNN)r1   Ztorch.nnr   Zimage_transformsr   utilsr   Zloss_for_object_detectionr   r   r   r	   Zscipy.optimizer   ZTensorintr[   r   r   rH   rl   r   r   r   r   <module>   s,   	  '=Q    