"""PyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.
"""

import collections.abc
from dataclasses import dataclass
from typing import Callable, Optional

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput, SemanticSegmenterOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging, torch_int
from ...utils.backbone_utils import load_backbone
from ...utils.generic import can_return_tuple, check_model_inputs
from .configuration_dpt import DPTConfig


logger = logging.get_logger(__name__)
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.
    """
)
class BaseModelOutputWithIntermediateActivations(ModelOutput):
    r"""
    last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    """

    last_hidden_states: Optional[torch.FloatTensor] = None
    intermediate_activations: Optional[tuple[torch.FloatTensor, ...]] = None
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.
    """
)
class BaseModelOutputWithPoolingAndIntermediateActivations(ModelOutput):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) after further processing
        through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
        the classification token after processing through a linear layer and a tanh activation function. The linear
        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
    intermediate_activations: Optional[tuple[torch.FloatTensor, ...]] = None
class DPTViTHybridEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config: DPTConfig, feature_size: Optional[tuple[int, int]] = None):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])

        self.backbone = load_backbone(config)
        feature_dim = self.backbone.channels[-1]
        if len(self.backbone.channels) != 3:
            raise ValueError(f"Expected backbone to have 3 output features, got {len(self.backbone.channels)}")
        self.residual_feature_map_index = [0, 1]  # take the output of the first and second backbone stage

        if feature_size is None:
            feat_map_shape = config.backbone_featmap_shape
            feature_size = feat_map_shape[-2:]
            feature_dim = feat_map_shape[1]
        else:
            feature_size = (
                feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size)
            )
            feature_dim = self.backbone.channels[-1]

        self.image_size = image_size
        self.patch_size = patch_size[0]
        self.num_channels = num_channels

        self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=1)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))

    def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1):
        posemb_tok = posemb[:, :start_index]
        posemb_grid = posemb[0, start_index:]

        old_grid_size = torch_int(len(posemb_grid) ** 0.5)

        posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2)
        posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear")
        posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1)

        posemb = torch.cat([posemb_tok, posemb_grid], dim=1)

        return posemb

    def forward(
        self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False
    ) -> BaseModelOutputWithIntermediateActivations:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if not interpolate_pos_encoding:
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )

        position_embeddings = self._resize_pos_embed(
            self.position_embeddings, height // self.patch_size, width // self.patch_size
        )

        backbone_output = self.backbone(pixel_values)

        features = backbone_output.feature_maps[-1]

        # retrieve also the intermediate activations to use them at later stages
        output_hidden_states = [backbone_output.feature_maps[index] for index in self.residual_feature_map_index]

        embeddings = self.projection(features).flatten(2).transpose(1, 2)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        embeddings = embeddings + position_embeddings

        return BaseModelOutputWithIntermediateActivations(
            last_hidden_states=embeddings,
            intermediate_activations=output_hidden_states,
        )
class DPTViTEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings.

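    Example (an illustrative sketch, not part of the public API; shapes assume the default `DPTConfig`, i.e.
    384x384 images and patch size 16, giving 576 patch tokens plus one [CLS] token):

    ```python
    >>> import torch
    >>> from transformers import DPTConfig

    >>> embeddings = DPTViTEmbeddings(DPTConfig())
    >>> embeddings(torch.randn(1, 3, 384, 384)).last_hidden_states.shape
    torch.Size([1, 577, 768])
    ```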
    """

    def __init__(self, config):
        super().__init__()
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.patch_embeddings = DPTViTPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1):
        posemb_tok = posemb[:, :start_index]
        posemb_grid = posemb[0, start_index:]

        old_grid_size = torch_int(len(posemb_grid) ** 0.5)

        posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2)
        posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear")
        posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1)

        posemb = torch.cat([posemb_tok, posemb_grid], dim=1)

        return posemb

    def forward(self, pixel_values: torch.Tensor) -> BaseModelOutputWithIntermediateActivations:
        batch_size, num_channels, height, width = pixel_values.shape

        # possibly interpolate position encodings to handle varying image sizes
        patch_size = self.config.patch_size
        position_embeddings = self._resize_pos_embed(
            self.position_embeddings, height // patch_size, width // patch_size
        )

        embeddings = self.patch_embeddings(pixel_values)

        batch_size, seq_len, _ = embeddings.size()

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        embeddings = embeddings + position_embeddings

        embeddings = self.dropout(embeddings)

        return BaseModelOutputWithIntermediateActivations(last_hidden_states=embeddings)
class DPTViTPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(self, config: DPTConfig):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # raw attention scores: scaled dot product between "query" and "key"
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # normalize the attention scores to probabilities
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class DPTSelfAttention(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def forward(
        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = hidden_states.shape[0]
        new_shape = (batch_size, -1, self.num_attention_heads, self.attention_head_size)
        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        return context_layer, attention_probs


class DPTViTSelfOutput(nn.Module):
    """
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: DPTConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class DPTViTAttention(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()
        self.attention = DPTSelfAttention(config)
        self.output = DPTViTSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: set[int]):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        self_attn_output, _ = self.attention(hidden_states, head_mask)
        attention_output = self.output(self_attn_output, hidden_states)
        return attention_output


class DPTViTIntermediate(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class DPTViTOutput(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class DPTViTLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: DPTConfig):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = DPTViTAttention(config)
        self.intermediate = DPTViTIntermediate(config)
        self.output = DPTViTOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # in ViT, layernorm is applied before self-attention
        hidden_states_norm = self.layernorm_before(hidden_states)
        attention_output = self.attention(hidden_states_norm, head_mask)

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in ViT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection
        layer_output = self.output(layer_output, hidden_states)

        return layer_output


class DPTViTEncoder(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([DPTViTLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_hidden_states: bool = False,
    ) -> BaseModelOutput:
        all_hidden_states = [hidden_states] if output_hidden_states else None

        for i, layer_module in enumerate(self.layer):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            hidden_states = layer_module(hidden_states, layer_head_mask)
            if output_hidden_states:
                all_hidden_states.append(hidden_states)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=tuple(all_hidden_states) if output_hidden_states else None,
        )


class DPTReassembleStage(nn.Module):
    """
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resize the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
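
    Example (an illustrative sketch, not part of the public API; with the default `DPTConfig`,
    `neck_hidden_sizes = [96, 192, 384, 768]` and `reassemble_factors = [4, 2, 1, 0.5]`, so four identical token
    sequences come back as four feature maps at four resolutions):

    ```python
    >>> import torch
    >>> from transformers import DPTConfig

    >>> stage = DPTReassembleStage(DPTConfig())
    >>> hidden_states = [torch.randn(1, 577, 768) for _ in range(4)]
    >>> [out.shape for out in stage(hidden_states, patch_height=24, patch_width=24)]
    [torch.Size([1, 96, 96, 96]), torch.Size([1, 192, 48, 48]), torch.Size([1, 384, 24, 24]), torch.Size([1, 768, 12, 12])]
    ```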
    """

    def __init__(self, config: DPTConfig):
        super().__init__()

        self.config = config
        self.layers = nn.ModuleList()
        if config.is_hybrid:
            self._init_reassemble_dpt_hybrid(config)
        else:
            self._init_reassemble_dpt(config)

        self.neck_ignore_stages = config.neck_ignore_stages

    def _init_reassemble_dpt_hybrid(self, config):
        r"""
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        """
        for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors):
            if i <= 1:
                self.layers.append(nn.Identity())
            elif i > 1:
                self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor))

        if config.readout_type != "project":
            raise ValueError(f"Readout type {config.readout_type} is not supported for DPT-Hybrid.")

        self.readout_projects = nn.ModuleList()
        hidden_size = _get_backbone_hidden_size(config)
        for i in range(len(config.neck_hidden_sizes)):
            if i <= 1:
                self.readout_projects.append(nn.Sequential(nn.Identity()))
            elif i > 1:
                self.readout_projects.append(
                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
                )

    def _init_reassemble_dpt(self, config):
        for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors):
            self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor))

        if config.readout_type == "project":
            self.readout_projects = nn.ModuleList()
            hidden_size = _get_backbone_hidden_size(config)
            for _ in range(len(config.neck_hidden_sizes)):
                self.readout_projects.append(
                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
                )

    def forward(self, hidden_states: list[torch.Tensor], patch_height=None, patch_width=None) -> list[torch.Tensor]:
        """
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        """
        out = []

        for i, hidden_state in enumerate(hidden_states):
            if i not in self.neck_ignore_stages:
                # reshape to (batch_size, num_channels, height, width)
                cls_token, hidden_state = hidden_state[:, 0], hidden_state[:, 1:]
                batch_size, sequence_length, num_channels = hidden_state.shape
                if patch_height is not None and patch_width is not None:
                    hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels)
                else:
                    size = torch_int(sequence_length**0.5)
                    hidden_state = hidden_state.reshape(batch_size, size, size, num_channels)
                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()

                feature_shape = hidden_state.shape
                if self.config.readout_type == "project":
                    # reshape to (batch_size, height*width, num_channels)
                    hidden_state = hidden_state.flatten(2).permute((0, 2, 1))
                    readout = cls_token.unsqueeze(1).expand_as(hidden_state)
                    # concatenate the readout token to the hidden states and project
                    hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1))
                    # reshape back to (batch_size, num_channels, height, width)
                    hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape)
                elif self.config.readout_type == "add":
                    hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1)
                    hidden_state = hidden_state.reshape(feature_shape)
                hidden_state = self.layers[i](hidden_state)
            out.append(hidden_state)

        return out


def _get_backbone_hidden_size(config):
    if config.backbone_config is not None and config.is_hybrid is False:
        return config.backbone_config.hidden_size
    else:
        return config.hidden_size


class DPTReassembleLayer(nn.Module):
    def __init__(self, config: DPTConfig, channels: int, factor: int):
        super().__init__()
        # projection
        hidden_size = _get_backbone_hidden_size(config)
        self.projection = nn.Conv2d(in_channels=hidden_size, out_channels=channels, kernel_size=1)

        # up/down sampling depending on factor
        if factor > 1:
            self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0)
        elif factor == 1:
            self.resize = nn.Identity()
        elif factor < 1:
            # so should downsample
            self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1)

    def forward(self, hidden_state):
        hidden_state = self.projection(hidden_state)
        hidden_state = self.resize(hidden_state)
        return hidden_state


class DPTFeatureFusionStage(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(len(config.neck_hidden_sizes)):
            self.layers.append(DPTFeatureFusionLayer(config))

    def forward(self, hidden_states):
        # reversing the hidden_states, we start from the last
        hidden_states = hidden_states[::-1]

        fused_hidden_states = []
        fused_hidden_state = None

        for hidden_state, layer in zip(hidden_states, self.layers):
            if fused_hidden_state is None:
                # first layer only uses the last hidden_state
                fused_hidden_state = layer(hidden_state)
            else:
                fused_hidden_state = layer(fused_hidden_state, hidden_state)
            fused_hidden_states.append(fused_hidden_state)

        return fused_hidden_states


class DPTPreActResidualLayer(nn.Module):
    """
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
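
    Example (an illustrative sketch, not part of the public API; the unit preserves the
    `(batch_size, fusion_hidden_size, height, width)` shape of its input):

    ```python
    >>> import torch
    >>> from transformers import DPTConfig

    >>> layer = DPTPreActResidualLayer(DPTConfig())
    >>> layer(torch.randn(1, 256, 24, 24)).shape
    torch.Size([1, 256, 24, 24])
    ```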
    """

    def __init__(self, config: DPTConfig):
        super().__init__()

        self.use_batch_norm = config.use_batch_norm_in_fusion_residual
        use_bias_in_fusion_residual = (
            config.use_bias_in_fusion_residual
            if config.use_bias_in_fusion_residual is not None
            else not self.use_batch_norm
        )

        self.activation1 = nn.ReLU()
        self.convolution1 = nn.Conv2d(
            config.fusion_hidden_size,
            config.fusion_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias_in_fusion_residual,
        )

        self.activation2 = nn.ReLU()
        self.convolution2 = nn.Conv2d(
            config.fusion_hidden_size,
            config.fusion_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias_in_fusion_residual,
        )

        if self.use_batch_norm:
            self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size)
            self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        residual = hidden_state
        hidden_state = self.activation1(hidden_state)
        hidden_state = self.convolution1(hidden_state)

        if self.use_batch_norm:
            hidden_state = self.batch_norm1(hidden_state)

        hidden_state = self.activation2(hidden_state)
        hidden_state = self.convolution2(hidden_state)

        if self.use_batch_norm:
            hidden_state = self.batch_norm2(hidden_state)

        return hidden_state + residual


class DPTFeatureFusionLayer(nn.Module):
    """Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corners setting for bilinear upsampling.
    """

    def __init__(self, config: DPTConfig, align_corners: bool = True):
        super().__init__()

        self.align_corners = align_corners

        self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True)

        self.residual_layer1 = DPTPreActResidualLayer(config)
        self.residual_layer2 = DPTPreActResidualLayer(config)

    def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
        if residual is not None:
            if hidden_state.shape != residual.shape:
                residual = nn.functional.interpolate(
                    residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False
                )
            hidden_state = hidden_state + self.residual_layer1(residual)

        hidden_state = self.residual_layer2(hidden_state)
        hidden_state = nn.functional.interpolate(
            hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners
        )
        hidden_state = self.projection(hidden_state)

        return hidden_state


@auto_docstring
class DPTPreTrainedModel(PreTrainedModel):
    config: DPTConfig
    base_model_prefix = "dpt"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {"attentions": DPTSelfAttention}

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, (DPTViTEmbeddings, DPTViTHybridEmbeddings)):
            module.cls_token.data.zero_()
            module.position_embeddings.data.zero_()


@auto_docstring
class DPTModel(DPTPreTrainedModel):
    def __init__(self, config: DPTConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        # vit encoder
        if config.is_hybrid:
            self.embeddings = DPTViTHybridEmbeddings(config)
        else:
            self.embeddings = DPTViTEmbeddings(config)
        self.encoder = DPTViTEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = DPTViTPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        if self.config.is_hybrid:
            return self.embeddings
        else:
            return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
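        (for example, `model._prune_heads({0: [0, 1], 2: [3]})` removes heads 0 and 1 of layer 0 and head 3 of
        layer 2; in practice this is invoked through `PreTrainedModel.prune_heads`)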
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        head_mask: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        **kwargs,
    ) -> BaseModelOutputWithPoolingAndIntermediateActivations:
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        # prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)
        embedding_last_hidden_states = embedding_output.last_hidden_states

        encoder_outputs = self.encoder(
            embedding_last_hidden_states, head_mask=head_mask, output_hidden_states=output_hidden_states
        )

        sequence_output = encoder_outputs.last_hidden_state
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPoolingAndIntermediateActivations(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            intermediate_activations=embedding_output.intermediate_activations,
            hidden_states=encoder_outputs.hidden_states,
        )


class DPTViTPooler(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
        self.activation = ACT2FN[config.pooler_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # "pool" the model by simply taking the hidden state corresponding to the first token
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class DPTNeck(nn.Module):
    """
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
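
    Example (an illustrative sketch, not part of the public API; with the default `DPTConfig` the four reassembled
    stages are fused into feature maps that all have `fusion_hidden_size = 256` channels):

    ```python
    >>> import torch
    >>> from transformers import DPTConfig

    >>> neck = DPTNeck(DPTConfig())
    >>> hidden_states = [torch.randn(1, 577, 768) for _ in range(4)]
    >>> [out.shape for out in neck(hidden_states, patch_height=24, patch_width=24)]
    [torch.Size([1, 256, 24, 24]), torch.Size([1, 256, 48, 48]), torch.Size([1, 256, 96, 96]), torch.Size([1, 256, 192, 192])]
    ```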
    """

    def __init__(self, config: DPTConfig):
        super().__init__()
        self.config = config

        # postprocessing: only required in case of a non-hierarchical backbone (e.g. ViT, BEiT)
        if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]:
            self.reassemble_stage = None
        else:
            self.reassemble_stage = DPTReassembleStage(config)

        self.convs = nn.ModuleList()
        for channel in config.neck_hidden_sizes:
            self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False))

        # fusion
        self.fusion_stage = DPTFeatureFusionStage(config)

    def forward(self, hidden_states: list[torch.Tensor], patch_height=None, patch_width=None) -> list[torch.Tensor]:
        """
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        """
        if not isinstance(hidden_states, (tuple, list)):
            raise TypeError("hidden_states should be a tuple or list of tensors")

        if len(hidden_states) != len(self.config.neck_hidden_sizes):
            raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")

        # postprocess hidden states
        if self.reassemble_stage is not None:
            hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)

        features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)]

        # fusion blocks
        output = self.fusion_stage(features)

        return output


class DPTDepthEstimationHead(nn.Module):
    """
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
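
    Example (an illustrative sketch, not part of the public API; the head consumes the fused feature maps and keeps
    the one selected by `config.head_in_index`, -1 by default):

    ```python
    >>> import torch
    >>> from transformers import DPTConfig

    >>> head = DPTDepthEstimationHead(DPTConfig())
    >>> head([torch.randn(1, 256, 24, 24)] * 4).shape
    torch.Size([1, 48, 48])
    ```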
    """

    def __init__(self, config: DPTConfig):
        super().__init__()

        self.config = config

        self.projection = None
        if config.add_projection:
            self.projection = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

        features = config.fusion_hidden_size
        self.head = nn.Sequential(
            nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
            nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
            nn.ReLU(),
        )

    def forward(self, hidden_states: list[torch.Tensor]) -> torch.Tensor:
        # use last features
        hidden_states = hidden_states[self.config.head_in_index]

        if self.projection is not None:
            hidden_states = self.projection(hidden_states)
            hidden_states = nn.ReLU()(hidden_states)

        predicted_depth = self.head(hidden_states)
        predicted_depth = predicted_depth.squeeze(dim=1)

        return predicted_depth


@auto_docstring(
    custom_intro="""
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    """
)
class DPTForDepthEstimation(DPTPreTrainedModel):
    def __init__(self, config: DPTConfig):
        super().__init__(config)

        self.backbone = None
        if config.is_hybrid is False and (config.backbone_config is not None or config.backbone is not None):
            self.backbone = load_backbone(config)
        else:
            self.dpt = DPTModel(config, add_pooling_layer=False)

        # Neck
        self.neck = DPTNeck(config)

        # Depth estimation head
        self.head = DPTDepthEstimationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        head_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        **kwargs,
    ) -> DepthEstimatorOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```"""
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not implemented yet")

        if self.backbone is not None:
            outputs = self.backbone.forward_with_filtered_kwargs(pixel_values, output_hidden_states=True, **kwargs)
            hidden_states = outputs.feature_maps
        else:
            outputs = self.dpt(pixel_values, head_mask=head_mask, output_hidden_states=True, **kwargs)
            hidden_states = outputs.hidden_states

            if not self.config.is_hybrid:
                hidden_states = [
                    feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices
                ]
            else:
                backbone_hidden_states = outputs.intermediate_activations
                backbone_hidden_states.extend(
                    feature
                    for idx, feature in enumerate(hidden_states[1:])
                    if idx in self.config.backbone_out_indices[2:]
                )
                hidden_states = backbone_hidden_states

        patch_height, patch_width = None, None
        if self.config.backbone_config is not None and self.config.is_hybrid is False:
            _, _, height, width = pixel_values.shape
            patch_size = self.config.backbone_config.patch_size
            patch_height = height // patch_size
            patch_width = width // patch_size

        hidden_states = self.neck(hidden_states, patch_height, patch_width)

        predicted_depth = self.head(hidden_states)

        return DepthEstimatorOutput(
            loss=loss,
            predicted_depth=predicted_depth,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


class DPTSemanticSegmentationHead(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()

        self.config = config

        features = config.fusion_hidden_size
        self.head = nn.Sequential(
            nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(features),
            nn.ReLU(),
            nn.Dropout(config.semantic_classifier_dropout),
            nn.Conv2d(features, config.num_labels, kernel_size=1),
            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
        )

    def forward(self, hidden_states: list[torch.Tensor]) -> torch.Tensor:
        # use last features
        hidden_states = hidden_states[self.config.head_in_index]

        logits = self.head(hidden_states)

        return logits


class DPTAuxiliaryHead(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()

        features = config.fusion_hidden_size
        self.head = nn.Sequential(
            nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(features),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Conv2d(features, config.num_labels, kernel_size=1),
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        logits = self.head(hidden_states)

        return logits


@auto_docstring
class DPTForSemanticSegmentation(DPTPreTrainedModel):
    def __init__(self, config: DPTConfig):
        super().__init__(config)

        self.dpt = DPTModel(config, add_pooling_layer=False)

        # Neck
        self.neck = DPTNeck(config)

        # Segmentation head(s)
        self.head = DPTSemanticSegmentationHead(config)
        self.auxiliary_head = DPTAuxiliaryHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        **kwargs,
    ) -> SemanticSegmenterOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```"""
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        if labels is not None and self.config.num_labels == 1:
            raise ValueError("The number of labels should be greater than one")

        outputs = self.dpt(pixel_values, head_mask=head_mask, output_hidden_states=True, **kwargs)

        hidden_states = outputs.hidden_states

        if not self.config.is_hybrid:
            hidden_states = [
                feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices
            ]
        else:
            backbone_hidden_states = outputs.intermediate_activations
            backbone_hidden_states.extend(
                feature
                for idx, feature in enumerate(hidden_states[1:])
                if idx in self.config.backbone_out_indices[2:]
            )
            hidden_states = backbone_hidden_states

        hidden_states = self.neck(hidden_states=hidden_states)

        logits = self.head(hidden_states)

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(hidden_states[-1])

        loss = None
        if labels is not None:
            # upsample logits to the images' original size
            upsampled_logits = nn.functional.interpolate(
                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
            )
            if auxiliary_logits is not None:
                upsampled_auxiliary_logits = nn.functional.interpolate(
                    auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
                )
            # compute weighted loss
            loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
            main_loss = loss_fct(upsampled_logits, labels)
            auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
            loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


__all__ = ["DPTForDepthEstimation", "DPTForSemanticSegmentation", "DPTModel", "DPTPreTrainedModel"]