"""PyTorch UperNet model. Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation."""

from typing import Optional, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ...modeling_outputs import SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring
from ...utils.backbone_utils import load_backbone
from .configuration_upernet import UperNetConfig
je
jd	d
dZ  ZS )UperNetConvModulez
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
    r   Fr   N)in_channelsout_channelskernel_sizepaddingbiasdilationreturnc                    s<   t    tj||||||d| _t|| _t | _d S )N)r   r   r   r   r   r   )	super__init__r   Conv2dconvBatchNorm2d
batch_normZReLU
activation)selfr   r   r   r   r   r   	__class__ h/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/upernet/modeling_upernet.pyr   $   s    	
zUperNetConvModule.__init__inputr   c                 C   s"   |  |}| |}| |}|S N)r   r   r   )r   r"   outputr   r   r    forward9   s    


zUperNetConvModule.forward)r   Fr   )__name__
__module____qualname____doc__intr   tuplestrboolr   torchTensorr%   __classcell__r   r   r   r    r      s   
   r   c                       s<   e Zd Zeeedd fddZejejdddZ  ZS )UperNetPyramidPoolingBlockN)
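
# --- Added illustration (not part of the upstream module) -----------------------------------
# A minimal shape check for the conv/norm/activation block above, assuming a 3x3 kernel with
# padding=1 so the spatial size is preserved. It is wrapped in a function so importing this
# module stays side-effect free; the function name and values are purely illustrative.
def _example_conv_module_shapes():
    block = UperNetConvModule(in_channels=3, out_channels=8, kernel_size=3, padding=1)
    pixel_like = torch.randn(2, 3, 64, 64)
    out = block(pixel_like)
    # Conv2d(3 -> 8, 3x3, padding=1) keeps H and W; BatchNorm2d and ReLU keep the shape.
    assert out.shape == (2, 8, 64, 64)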

class UperNetPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            UperNetConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state
ej	 ddd	Z  ZS )
UperNetPyramidPoolingModulea}  
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (`tuple[int]`):
            Pooling scales used in Pooling Pyramid Module.
        in_channels (`int`):
            Input channels.
        channels (`int`):
            Channels after modules, before conv_seg.
        align_corners (`bool`):
            align_corners argument of F.interpolate.
    .N)pool_scalesr   r3   align_cornersr   c                    sh   t    || _|| _|| _|| _g | _t|D ]2\}}t|||d}| j	| | 
t|| q0d S )N)r2   r   r3   )r   r   r;   r<   r   r3   blocksr6   r1   appendr7   r,   )r   r;   r   r3   r<   r8   r2   blockr   r   r    r   a   s    
z$UperNetPyramidPoolingModule.__init__)xr   c                 C   sH   g }| j D ]8}||}tjj|| dd  d| jd}|| q
|S )N   bilinearsizemoder<   )r=   r   
functionalinterpolaterD   r<   r>   )r   r@   Zppm_outsppmZppm_outZupsampled_ppm_outr   r   r    r%   m   s    
z#UperNetPyramidPoolingModule.forward)r&   r'   r(   r)   r+   r*   r-   r   r.   r/   listr%   r0   r   r   r   r    r:   R   s   "r:   c                       s<   e Zd ZdZ fddZdd ZejejdddZ  Z	S )	UperNetHeadz
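
# --- Added illustration (not part of the upstream module) -----------------------------------
# A minimal sketch of what the pyramid pooling module produces: one tensor per pooling scale,
# each projected to `channels` and upsampled back to the input's spatial size. The values used
# below (16 input channels, 8 output channels, 32x32 features) are arbitrary assumptions.
def _example_pyramid_pooling_outputs():
    ppm = UperNetPyramidPoolingModule(pool_scales=(1, 2, 3, 6), in_channels=16, channels=8, align_corners=False)
    # Batch of 2 so the BatchNorm layers inside UperNetConvModule see more than one value per channel.
    features = torch.randn(2, 16, 32, 32)
    outputs = ppm(features)
    assert len(outputs) == 4
    assert all(out.shape == (2, 8, 32, 32) for out in outputs)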

class UperNetHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://huggingface.co/papers/1807.10221).
    """

    def __init__(self, config, in_channels):
        super().__init__()

        self.config = config
        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = in_channels
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = UperNetPyramidPoolingModule(
            self.pool_scales,
            self.in_channels[-1],
            self.channels,
            align_corners=self.align_corners,
        )
        self.bottleneck = UperNetConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = UperNetConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = UperNetConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = UperNetConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]

        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output
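
# --- Added illustration (not part of the upstream module) -----------------------------------
# A rough sketch of how the UPerNet head consumes multi-scale backbone features. The config
# object here is a hypothetical stand-in carrying only the attributes the head reads
# (pool_scales, hidden_size, num_labels); real usage goes through UperNetConfig. Channel and
# spatial sizes below are arbitrary assumptions mimicking a hierarchical backbone.
def _example_upernet_head_logits():
    from types import SimpleNamespace

    config_stub = SimpleNamespace(pool_scales=(1, 2, 3, 6), hidden_size=64, num_labels=19)
    head = UperNetHead(config_stub, in_channels=[32, 64, 128, 256])
    # Four feature maps from a hierarchical backbone, strides 4/8/16/32 of a 128x128 input.
    features = [
        torch.randn(2, 32, 32, 32),
        torch.randn(2, 64, 16, 16),
        torch.randn(2, 128, 8, 8),
        torch.randn(2, 256, 4, 4),
    ]
    logits = head(features)
    # Logits come out at the resolution of the highest-resolution feature map.
    assert logits.shape == (2, 19, 32, 32)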
dZ
  ZS )UperNetFCNHeada  
    Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
    [FCNNet](https://huggingface.co/papers/1411.4038>).

    Args:
        config:
            Configuration.
        in_channels (int):
            Number of input channels.
        kernel_size (int):
            The kernel size for convs in the head. Default: 3.
        dilation (int):
            The dilation rate for convs in the head. Default: 1.
    rA   r   r   N)in_indexr   r   r   c           	   
      s  t    || _|jd u r"|| n|j| _|j| _|j| _|j	| _
|| _|d | }g }|t| j| j|||d t| jd D ] }|t| j| j|||d q| jdkrt | _ntj| | _| j
rt| j| j | j||d d| _tj| j|jdd| _d S )NrA   )r   r   r   r   r   rL   r4   )r   r   rM   Zauxiliary_in_channelsr   Zauxiliary_channelsr3   Zauxiliary_num_convsZ	num_convsZauxiliary_concat_inputconcat_inputre   r>   r   rb   r   ZIdentityconvsZ
Sequentialconv_catr   rN   rO   )	r   rM   r   re   r   r   Zconv_paddingrg   r8   r   r   r    r      s:    

zUperNetFCNHead.__init__r\   c                 C   s@   || j  }| |}| jr2| tj||gdd}| |}|S )Nr   rV   )re   rg   rf   rh   r.   rY   rO   )r   r]   hidden_statesr$   r   r   r    r%     s    


zUperNetFCNHead.forward)rA   r   r   )r&   r'   r(   r)   r*   r   r+   r   r.   r/   r%   r0   r   r   r   r    rd      s    &rd   c                   @   s&   e Zd ZU eed< dZg Zdd ZdS )UperNetPreTrainedModelrM   pixel_valuesc                 C   sd   t |tjr:|jjjd| jjd |jd ur`|jj	  n&t |tj
r`|jjd |jj	  d S )Ng        )meanZstdg      ?)
isinstancer   r   weightdataZnormal_rM   Zinitializer_ranger   Zzero_r   Zfill_)r   moduler   r   r    _init_weights  s    
z$UperNetPreTrainedModel._init_weightsN)r&   r'   r(   r   __annotations__Zmain_input_nameZ_no_split_modulesrq   r   r   r   r    rj     s   
rj   zW

@auto_docstring(
    custom_intro="""
    UperNet framework leveraging any vision backbone e.g. for ADE20k, CityScapes.
    """
)
class UperNetForSemanticSegmentation(UperNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.backbone = load_backbone(config)

        # Semantic segmentation head(s)
        self.decode_head = UperNetHead(config, in_channels=self.backbone.channels)
        self.auxiliary_head = (
            UperNetFCNHead(config, in_channels=self.backbone.channels) if config.use_auxiliary_head else None
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-tiny")
        >>> model = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-tiny")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/fixtures_ade20k", filename="ADE_val_00000001.jpg", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits  # shape (batch_size, num_labels, height, width)
        >>> list(logits.shape)
        [1, 150, 512, 512]
        ```"""
        if labels is not None and self.config.num_labels == 1:
            raise ValueError("The number of labels should be greater than one")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
        )
        features = outputs.feature_maps

        logits = self.decode_head(features)
        logits = nn.functional.interpolate(logits, size=pixel_values.shape[2:], mode="bilinear", align_corners=False)

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(features)
            auxiliary_logits = nn.functional.interpolate(
                auxiliary_logits, size=pixel_values.shape[2:], mode="bilinear", align_corners=False
            )

        loss = None
        if labels is not None:
            # compute weighted loss
            loss_fct = CrossEntropyLoss(ignore_index=self.config.loss_ignore_index)
            loss = loss_fct(logits, labels)
            if auxiliary_logits is not None:
                auxiliary_loss = loss_fct(auxiliary_logits, labels)
                loss += self.config.auxiliary_loss_weight * auxiliary_loss

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

__all__ = ["UperNetForSemanticSegmentation", "UperNetPreTrainedModel"]