"""PyTorch VitPose model."""

from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging
from ...utils.backbone_utils import load_backbone
from ...utils.generic import can_return_tuple
from .configuration_vitpose import VitPoseConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of pose estimation models.
    """
)
class VitPoseEstimatorOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
    heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
        Heatmaps as predicted by the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    Nlossheatmaps.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchZFloatTensor__annotations__r   r   tupler    r   r   h/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/vitpose/modeling_vitpose.pyr   &   s
   
r   c                   @   s@   e Zd ZU eed< dZdZdZee	j
e	je	jf dddZdS )	VitPosePreTrainedModelconfigZvitpixel_valuesT)modulec                 C   s   t |tjtjfrZtjj|jjt	j
d| jjd|jj|j_|jdur|jj  n&t |tjr|jj  |jjd dS )zInitialize the weightsg        )meanZstdNg      ?)
isinstancer   LinearConv2dinitZtrunc_normal_weightdatator   Zfloat32r!   Zinitializer_rangeZdtypebiasZzero_	LayerNormZfill_)selfr#   r   r   r   _init_weightsE   s    
z$VitPosePreTrainedModel._init_weightsN)r   r   r   r   r   Zbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingr   r   r&   r'   r-   r/   r   r   r   r   r    >   s
   
r    gaussian-heatmapc                 C   s   |dvrt d| jdkr"t d| j\}}}}d}|dkrnd}| dddddd	f  | dddddd	f< | |d
|||} |  }| D ]H\}	}
| dd|
d	f |dd|	d	f< | dd|	d	f |dd|
d	f< q|||||f}|d
}|S )a  Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be gaussian-heatmap or combined-target.
            gaussian-heatmap: Classification target with gaussian distribution.
            combined-target: The combination of classification target (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Returns:
        torch.Tensor: heatmaps that flipped back to the original image
    """
    if target_type not in ["gaussian-heatmap", "combined-target"]:
        raise ValueError("target_type should be gaussian-heatmap or combined-target")

    if output_flipped.ndim != 4:
        raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]")

    batch_size, num_keypoints, height, width = output_flipped.shape
    channels = 1
    if target_type == "combined-target":
        channels = 3
        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
    output_flipped = output_flipped.reshape(batch_size, -1, channels, height, width)
    output_flipped_back = output_flipped.clone()

    # Swap the heatmap channels of each mirrored keypoint pair
    for left, right in flip_pairs.tolist():
        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
    output_flipped_back = output_flipped_back.reshape((batch_size, num_keypoints, height, width))
    # Flip the heatmaps horizontally back to the original orientation
    output_flipped_back = output_flipped_back.flip(-1)
    return output_flipped_back


class VitPoseSimpleDecoder(nn.Module):
    """
    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
    feature maps into heatmaps.
    """

    def __init__(self, config) -> None:
        super().__init__()

        self.activation = nn.ReLU()
        self.upsampling = nn.Upsample(scale_factor=config.scale_factor, mode="bilinear", align_corners=False)
        self.conv = nn.Conv2d(
            config.backbone_config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Turn the feature maps of shape (batch_size, hidden_size, height, width) into heatmaps
        hidden_state = self.activation(hidden_state)
        hidden_state = self.upsampling(hidden_state)
        heatmaps = self.conv(hidden_state)

        if flip_pairs is not None:
            heatmaps = flip_back(heatmaps, flip_pairs)

        return heatmaps


class VitPoseClassicDecoder(nn.Module):
    """
    Classic decoding head consisting of 2 deconvolutional blocks, followed by a 1x1 convolution layer,
    turning the feature maps into heatmaps.
    """

    def __init__(self, config: VitPoseConfig):
        super().__init__()

        self.deconv1 = nn.ConvTranspose2d(
            config.backbone_config.hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False
        )
        self.batchnorm1 = nn.BatchNorm2d(256)
        self.relu1 = nn.ReLU()

        self.deconv2 = nn.ConvTranspose2d(256, 256, kernel_size=4, stride=2, padding=1, bias=False)
        self.batchnorm2 = nn.BatchNorm2d(256)
        self.relu2 = nn.ReLU()

        self.conv = nn.Conv2d(256, config.num_labels, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None):
        # Turn the feature maps of shape (batch_size, hidden_size, height, width) into heatmaps
        hidden_state = self.deconv1(hidden_state)
        hidden_state = self.batchnorm1(hidden_state)
        hidden_state = self.relu1(hidden_state)

        hidden_state = self.deconv2(hidden_state)
        hidden_state = self.batchnorm2(hidden_state)
        hidden_state = self.relu2(hidden_state)

        heatmaps = self.conv(hidden_state)

        if flip_pairs is not None:
            heatmaps = flip_back(heatmaps, flip_pairs)

        return heatmaps


@auto_docstring(
    custom_intro="""
    The VitPose model with a pose estimation head on top.
    """
)
class VitPoseForPoseEstimation(VitPosePreTrainedModel):
    def __init__(self, config: VitPoseConfig) -> None:
        super().__init__(config)

        self.backbone = load_backbone(config)

        # The backbone must expose these attributes so its output can be reshaped into feature maps
        if not hasattr(self.backbone.config, "hidden_size"):
            raise ValueError("The backbone should have a hidden_size attribute")
        if not hasattr(self.backbone.config, "image_size"):
            raise ValueError("The backbone should have an image_size attribute")
        if not hasattr(self.backbone.config, "patch_size"):
            raise ValueError("The backbone should have a patch_size attribute")

        self.head = VitPoseSimpleDecoder(config) if config.use_simple_decoder else VitPoseClassicDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        dataset_index: Optional[torch.Tensor] = None,
        flip_pairs: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> VitPoseEstimatorOutput:
        r"""
        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training, e.g. For the single dataset index 0 refers to the corresponding dataset. For the multiple datasets index 0 refers to dataset A (e.g. MPII) and index 1 refers to dataset B (e.g. CrowdPose).
        flip_pairs (`torch.tensor`, *optional*):
            Whether to mirror pairs of keypoints (for example, left ear -- right ear).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
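
        >>> # Optional post-processing sketch (assumes the checkpoint's matching VitPoseImageProcessor):
        >>> # the raw heatmaps can be converted to per-person keypoint coordinates in the original image.
        >>> pose_results = processor.post_process_pose_estimation(outputs, boxes=boxes)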
        ```"""
        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported")

        outputs = self.backbone.forward_with_filtered_kwargs(pixel_values, dataset_index=dataset_index, **kwargs)

        # Turn the sequence of patch embeddings back into a feature map of shape
        # (batch_size, hidden_size, patch_height, patch_width) before decoding it into heatmaps
        sequence_output = outputs.feature_maps[-1]
        batch_size = sequence_output.shape[0]
        patch_height = self.config.backbone_config.image_size[0] // self.config.backbone_config.patch_size[0]
        patch_width = self.config.backbone_config.image_size[1] // self.config.backbone_config.patch_size[1]
        sequence_output = (
            sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous()
        )

        heatmaps = self.head(sequence_output, flip_pairs=flip_pairs)

        return VitPoseEstimatorOutput(
            loss=loss,
            heatmaps=heatmaps,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VitPosePreTrainedModel", "VitPoseForPoseEstimation"]