"""PyTorch TextNet model."""

from typing import Any, Optional, Union

import torch
from torch import Tensor, nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers import PreTrainedModel
from transformers.activations import ACT2CLS
from transformers.modeling_outputs import (
    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from transformers.models.textnet.configuration_textnet import TextNetConfig
from transformers.utils import logging
from transformers.utils.backbone_utils import BackboneMixin

from ...utils import auto_docstring


logger = logging.get_logger(__name__)


class TextNetConvLayer(nn.Module):
    def __init__(self, config: TextNetConfig):
        super().__init__()
        self.kernel_size = config.stem_kernel_size
        self.stride = config.stem_stride
        self.activation_function = config.stem_act_func

        padding = (
            (config.stem_kernel_size[0] // 2, config.stem_kernel_size[1] // 2)
            if isinstance(config.stem_kernel_size, tuple)
            else config.stem_kernel_size // 2
        )

        self.conv = nn.Conv2d(
            config.stem_num_channels,
            config.stem_out_channels,
            kernel_size=config.stem_kernel_size,
            stride=config.stem_stride,
            padding=padding,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm2d(config.stem_out_channels, config.batch_norm_eps)

        self.activation = nn.Identity()
        if self.activation_function is not None:
            self.activation = ACT2CLS[self.activation_function]()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.conv(hidden_states)
        hidden_states = self.batch_norm(hidden_states)
        return self.activation(hidden_states)
S )TextNetRepConvLayera  
    This layer supports re-parameterization by combining multiple convolutional branches
    (e.g., main convolution, vertical, horizontal, and identity branches) during training.
    At inference time, these branches can be collapsed into a single convolution for
    efficiency, as per the re-parameterization paradigm.

    The "Rep" in the name stands for "re-parameterization" (introduced by RepVGG).
    (An illustrative fusion sketch, not part of the original module, follows this class definition.)
    """

    def __init__(
        self, config: TextNetConfig, in_channels: int, out_channels: int, kernel_size: tuple[int, int], stride: int
    ):
        super().__init__()
        self.num_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride

        padding = ((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2)

        self.activation_function = nn.ReLU()

        # Main branch: full (k_h, k_w) convolution.
        self.main_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels, eps=config.batch_norm_eps)

        vertical_padding = ((kernel_size[0] - 1) // 2, 0)
        horizontal_padding = (0, (kernel_size[1] - 1) // 2)

        # Vertical branch: (k_h, 1) convolution, built only when the kernel is wider than one column.
        if kernel_size[1] != 1:
            self.vertical_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(kernel_size[0], 1),
                stride=stride,
                padding=vertical_padding,
                bias=False,
            )
            self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels, eps=config.batch_norm_eps)
        else:
            self.vertical_conv, self.vertical_batch_norm = None, None

        # Horizontal branch: (1, k_w) convolution, built only when the kernel is taller than one row.
        if kernel_size[0] != 1:
            self.horizontal_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(1, kernel_size[1]),
                stride=stride,
                padding=horizontal_padding,
                bias=False,
            )
            self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels, eps=config.batch_norm_eps)
        else:
            self.horizontal_conv, self.horizontal_batch_norm = None, None

        # Identity branch: a plain BatchNorm, valid only when input and output shapes match.
        self.rbr_identity = (
            nn.BatchNorm2d(num_features=in_channels, eps=config.batch_norm_eps)
            if out_channels == in_channels and stride == 1
            else None
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        main_outputs = self.main_conv(hidden_states)
        main_outputs = self.main_batch_norm(main_outputs)

        if self.vertical_conv is not None:
            vertical_outputs = self.vertical_conv(hidden_states)
            vertical_outputs = self.vertical_batch_norm(vertical_outputs)
            main_outputs = main_outputs + vertical_outputs

        if self.horizontal_conv is not None:
            horizontal_outputs = self.horizontal_conv(hidden_states)
            horizontal_outputs = self.horizontal_batch_norm(horizontal_outputs)
            main_outputs = main_outputs + horizontal_outputs

        if self.rbr_identity is not None:
            id_out = self.rbr_identity(hidden_states)
            main_outputs = main_outputs + id_out

        return self.activation_function(main_outputs)
t||	||D ]}|
t|g|R   qnt	
|
| _d S )Nr   )r   r   conv_layer_kernel_sizesZconv_layer_strideslenhidden_sizeszipappendr9   r"   
ModuleListstage)r*   r   rI   r   r   Z
num_layersZstage_in_channel_sizeZstage_out_channel_sizer:   r;   rP   Zstage_configr+   r-   r.   r      s    




zTextNetStage.__init__c                 C   s   | j D ]}||}q|S r2   )rP   )r*   hidden_stateblockr-   r-   r.   r3      s    

zTextNetStage.forward)r4   r5   r6   r   rG   r   r3   r8   r-   r-   r+   r.   rH      s   rH   c                       sB   e Zd Zed fddZdejee ee e	dddZ
  ZS )	TextNetEncoderr   c                    sF   t    g }t|j}t|D ]}|t|| q t|| _	d S r2   )
r   r   rK   rJ   rangerN   rH   r"   rO   stages)r*   r   rU   Z
num_stagesZstage_ixr+   r-   r.   r      s    

zTextNetEncoder.__init__N)rQ   output_hidden_statesreturn_dictr1   c                 C   sL   |g}| j D ]}||}|| q|s@|f}|r<||f S |S t||dS )N)last_hidden_stater0   )rU   rN   r   )r*   rQ   rV   rW   r0   rP   outputr-   r-   r.   r3      s    
zTextNetEncoder.forward)NN)r4   r5   r6   r   r   r7   r   r   boolr   r3   r8   r-   r-   r+   r.   rS      s     rS   c                   @   s&   e Zd ZU eed< dZdZdd ZdS )TextNetPreTrainedModelr   textnetpixel_valuesc                 C   st   t |tjtjfr@|jjjd| jjd |j	d urp|j	j
  n0t |tjrp|jjd |j	d urp|j	j
  d S )Ng        )meanZstdg      ?)r    r"   Linearr#   weightdataZnormal_r   Zinitializer_ranger   Zzero_r%   Zfill_)r*   moduler-   r-   r.   _init_weights   s    

z$TextNetPreTrainedModel._init_weightsN)r4   r5   r6   r   __annotations__Zbase_model_prefixZmain_input_namerc   r-   r-   r-   r.   r[      s   
r[   c                
       sX   e Zd Z fddZedeee ee ee	e
ee
 f e	e
 ef dddZ  ZS )TextNetModelc                    s8   t  | t|| _t|| _td| _| 	  d S )N)r   r   )
r   r   r   stemrS   encoderr"   AdaptiveAvgPool2dpooler	post_initr*   r   r+   r-   r.   r      s
    

zTextNetModel.__init__Nr]   rV   rW   r1   c           	      C   s   |d ur|n| j j}|d ur |n| j j}| |}| j|||d}|d }| |}|sv||f}|rr||d f S |S t|||r|d nd dS )NrV   rW   r   r   )rX   Zpooler_outputr0   )r   use_return_dictrV   rf   rg   ri   r   )	r*   r]   rV   rW   rQ   Zencoder_outputsrX   Zpooled_outputrY   r-   r-   r.   r3      s"    

zTextNetModel.forward)NN)r4   r5   r6   r   r   r   r   rZ   r   r!   r   listr   r3   r8   r-   r-   r+   r.   re      s    re   z
    TextNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )Zcustom_introc                       sL   e Zd Z fddZedeej eej ee	 ee	 e
dddZ  ZS )TextNetForImageClassificationc                    s|   t  | |j| _t|| _td| _t | _	|jdkrRt
|jd |jnt | _t| j| j	g| _|   d S )N)r   r   r   )r   r   
num_labelsre   r\   r"   rh   Zavg_poolZFlattenflattenr_   rL   r(   fcrO   
classifierrj   rk   r+   r-   r.   r     s    

(z&TextNetForImageClassification.__init__N)r]   labelsrV   rW   r1   c                 C   sv  |dur|n| j j}| j|||d}|d }| jD ]}||}q2| |}d}	|dur6| j jdu r| jdkrxd| j _n4| jdkr|jtj	ks|jtj
krd| j _nd| j _| j jdkrt }
| jdkr|
| | }	n
|
||}	nN| j jdkrt }
|
|d| j|d}	n| j jdkr6t }
|
||}	|sf|f|d	d  }|	durb|	f| S |S t|	||jd
S )al  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> import torch
        >>> import requests
        >>> from transformers import TextNetForImageClassification, TextNetImageProcessor
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = TextNetImageProcessor.from_pretrained("czczup/textnet-base")
        >>> model = TextNetForImageClassification.from_pretrained("czczup/textnet-base")

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> outputs.logits.shape
        torch.Size([1, 2])
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.textnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
        last_hidden_state = outputs[0]
        for layer in self.classifier:
            last_hidden_state = layer(last_hidden_state)
        logits = self.fc(last_hidden_state)

        loss = None
        if labels is not None:
            # Infer the problem type once from `num_labels` and the label dtype.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return (loss,) + output if loss is not None else output

        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)


@auto_docstring(
    custom_intro="""
    TextNet backbone, to be used with frameworks like DETR and MaskFormer.
    """
)
class TextNetBackbone(TextNetPreTrainedModel, BackboneMixin):
    has_attentions = False

    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.textnet = TextNetModel(config)
        self.num_features = config.hidden_sizes

        self.post_init()

    @auto_docstring
    def forward(
        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
    ) -> Union[tuple[tuple], BackboneOutput]:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, AutoBackbone

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("czczup/textnet-base")
        >>> model = AutoBackbone.from_pretrained("czczup/textnet-base")

        >>> inputs = processor(image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        # Hidden states of every stage are always requested so the feature maps can be selected.
        outputs = self.textnet(pixel_values, output_hidden_states=True, return_dict=return_dict)

        hidden_states = outputs.hidden_states if return_dict else outputs[2]

        feature_maps = ()
        for idx, stage in enumerate(self.stage_names):
            if stage in self.out_features:
                feature_maps += (hidden_states[idx],)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                hidden_states = outputs.hidden_states if return_dict else outputs[2]
                output += (hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )


__all__ = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel", "TextNetForImageClassification"]
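# ---------------------------------------------------------------------------
# Illustrative smoke test, not part of the upstream module. It assumes that the
# default `TextNetConfig()` (which mirrors textnet-base) builds a valid model;
# run the file directly to sanity-check shapes on a dummy batch.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    config = TextNetConfig()
    model = TextNetModel(config).eval()
    pixel_values = torch.randn(1, config.stem_num_channels, 224, 224)
    with torch.no_grad():
        outputs = model(pixel_values)
    # Expected: a (batch, channels, height, width) feature map and a (batch, channels, 2, 2) pooled output.
    print(outputs.last_hidden_state.shape, outputs.pooler_output.shape)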
"Z%U@