a
    hqH                     @   s  d Z ddlmZ ddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ eeZd-ejeeejdddZG dd dejZ G dd dejZ!G dd dej"Z#G dd dejZ$G dd dejZ%G dd dejZ&G dd  d ejZ'eG d!d" d"eZ(eG d#d$ d$e(Z)ed%d&G d'd( d(e(Z*ed)d&G d*d+ d+e(eZ+g d,Z,dS ).zPyTorch ConvNextV2 model.    )OptionalN)nn   )ACT2FN)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging)BackboneMixin)can_return_tuple   )ConvNextV2Config        F)input	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchZrandr   r   Zfloor_div)r   r   r   Z	keep_probr   Zrandom_tensoroutput r   n/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/convnextv2/modeling_convnextv2.py	drop_path)   s    
r   c                       sP   e Zd ZdZdee dd fddZejejdddZ	e
d	d
dZ  ZS )ConvNextV2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).N)r   r   c                    s   t    || _d S N)super__init__r   )selfr   	__class__r   r   r#   A   s    
zConvNextV2DropPath.__init__hidden_statesr   c                 C   s   t || j| jS r!   )r   r   r   )r$   r(   r   r   r   forwardE   s    zConvNextV2DropPath.forward)r   c                 C   s   d| j  S )Nzp=)r   )r$   r   r   r   
extra_reprH   s    zConvNextV2DropPath.extra_repr)N)__name__
__module____qualname____doc__r   floatr#   r   Tensorr)   strr*   __classcell__r   r   r%   r   r    >   s   r    c                       s:   e Zd ZdZed fddZejejdddZ  Z	S )ConvNextV2GRNz)GRN (Global Response Normalization) layer)dimc                    s>   t    ttddd|| _ttddd|| _d S )Nr   )r"   r#   r   	Parameterr   Zzerosweightbias)r$   r4   r%   r   r   r#   O   s    
zConvNextV2GRN.__init__r'   c                 C   sF   t jj|dddd}||jdddd  }| j||  | j | }|S )N   )r   r8   T)ordr4   keepdim)r4   r:   ư>)r   ZlinalgZvector_normmeanr6   r7   )r$   r(   Zglobal_featuresZnorm_featuresr   r   r   r)   T   s    zConvNextV2GRN.forward)
r+   r,   r-   r.   intr#   r   FloatTensorr)   r2   r   r   r%   r   r3   L   s   r3   c                       s@   e Zd ZdZddd fdd
Zejejd fdd	Z  ZS )
ConvNextV2LayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    r<   channels_lastepsdata_formatc                   s8   t  j|fd|i| |dvr.td| || _d S )NrC   )rA   channels_firstzUnsupported data format: )r"   r#   NotImplementedErrorrD   )r$   Znormalized_shaperC   rD   kwargsr%   r   r   r#   d   s    zConvNextV2LayerNorm.__init__featuresr   c                    sH   | j dkr8|dddd}t |}|dddd}nt |}|S )z
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        rE   r   r8   r   r   )rD   permuter"   r)   )r$   rI   r%   r   r   r)   j   s    
zConvNextV2LayerNorm.forward	r+   r,   r-   r.   r#   r   r0   r)   r2   r   r   r%   r   r@   ^   s   r@   c                       s4   e Zd ZdZ fddZejejdddZ  Z	S )ConvNextV2EmbeddingszThis class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    c                    sL   t    tj|j|jd |j|jd| _t|jd ddd| _	|j| _d S )Nr   kernel_sizestrider<   rE   rB   )
r"   r#   r   Conv2dnum_channelshidden_sizesZ
patch_sizepatch_embeddingsr@   	layernormr$   configr%   r   r   r#   ~   s    
zConvNextV2Embeddings.__init__)pixel_valuesr   c                 C   s4   |j d }|| jkrtd| |}| |}|S )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)r   rQ   
ValueErrorrS   rT   )r$   rW   rQ   
embeddingsr   r   r   r)      s    



zConvNextV2Embeddings.forward)
r+   r,   r-   r.   r#   r   r?   r0   r)   r2   r   r   r%   r   rL   y   s   rL   c                       s6   e Zd ZdZd fdd	ZejejdddZ  ZS )	ConvNextV2Layera5  This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
    H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    r   c                    s   t    tj||dd|d| _t|dd| _t|d| | _t	|j
 | _td| | _td| || _|dkr|t|nt | _d S )N   r   )rN   paddinggroupsr<   rC      r   )r"   r#   r   rP   dwconvr@   rT   Linearpwconv1r   Z
hidden_actactr3   grnpwconv2r    Identityr   )r$   rV   r4   r   r%   r   r   r#      s    
zConvNextV2Layer.__init__rH   c                 C   sr   |}|  |}|dddd}| |}| |}| |}| |}| |}|dddd}|| | }|S )Nr   r8   r   r   )r`   rJ   rT   rb   rc   rd   re   r   )r$   rI   Zresidualr   r   r   r)      s    





zConvNextV2Layer.forward)r   rK   r   r   r%   r   rZ      s   rZ   c                       s6   e Zd ZdZd	 fdd	ZejejdddZ  ZS )
ConvNextV2Stagea  ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`list[float]`): Stochastic depth rates for each layer.
    r8   Nc              	      s   t    |ks|dkrDtt|dddtj|||dg| _n
t | _pZdg| t fddt|D | _d S )	Nr   r<   rE   rB   rM   r   c                    s   g | ]}t  | d qS ))r4   r   )rZ   ).0jrV   drop_path_ratesout_channelsr   r   
<listcomp>       z,ConvNextV2Stage.__init__.<locals>.<listcomp>)	r"   r#   r   
ModuleListr@   rP   downsampling_layerrangelayers)r$   rV   in_channelsrl   rN   rO   depthrk   r%   rj   r   r#      s    

zConvNextV2Stage.__init__rH   c                 C   s,   | j D ]}||}q| jD ]}||}q|S r!   )rp   rr   )r$   rI   layerr   r   r   r)      s
    



zConvNextV2Stage.forward)r8   r8   r8   NrK   r   r   r%   r   rg      s   
rg   c                       s6   e Zd Z fddZdejee edddZ	  Z
S )ConvNextV2Encoderc              	      s   t    t | _dd tjd|jt|j	dd
|j	D }|jd }t|jD ]H}|j| }t||||dkrvdnd|j	| || d}| j| |}qTd S )	Nc                 S   s   g | ]}|  qS r   )tolist)rh   xr   r   r   rm      s   z.ConvNextV2Encoder.__init__.<locals>.<listcomp>r   cpu)r   r8   r   )rs   rl   rO   rt   rk   )r"   r#   r   ro   stagesr   ZlinspaceZdrop_path_ratesumZdepthssplitrR   rq   Z
num_stagesrg   append)r$   rV   rk   Zprev_chsiZout_chsstager%   r   r   r#      s$    

 

zConvNextV2Encoder.__init__F)r(   output_hidden_statesr   c                 C   s@   |r
|gnd }| j D ]}||}|d ur|| qt||dS )N)last_hidden_stater(   )rz   r}   r   )r$   r(   r   Zall_hidden_statesZlayer_moduler   r   r   r)      s    
zConvNextV2Encoder.forward)F)r+   r,   r-   r#   r   r0   r   boolr   r)   r2   r   r   r%   r   rv      s    
rv   c                   @   s,   e Zd ZU eed< dZdZdgZdd ZdS )ConvNextV2PreTrainedModelrV   
convnextv2rW   rZ   c                 C   s   t |tjtjfr@|jjjd| jjd |j	dur|j	j
  nNt |tjtfrl|j	j
  |jjd n"t |tr|jj
  |j	j
  dS )zInitialize the weightsr   )r=   ZstdNg      ?)
isinstancer   ra   rP   r6   dataZnormal_rV   Zinitializer_ranger7   Zzero_	LayerNormr@   Zfill_r3   )r$   moduler   r   r   _init_weights  s    

z'ConvNextV2PreTrainedModel._init_weightsN)	r+   r,   r-   r   __annotations__Zbase_model_prefixZmain_input_nameZ_no_split_modulesr   r   r   r   r   r     s
   
r   c                       sB   e Zd Z fddZeedeej ee	 e
dddZ  ZS )ConvNextV2Modelc                    sJ   t  | || _t|| _t|| _tj|j	d |j
d| _|   d S )Nr;   r^   )r"   r#   rV   rL   rY   rv   encoderr   r   rR   Zlayer_norm_epsrT   	post_initrU   r%   r   r   r#     s    

zConvNextV2Model.__init__NrW   r   r   c                 C   sb   |d u r| j j}|d u r td| |}| j||d}|j}| |ddg}t|||j	dS )Nz You have to specify pixel_valuesr   r;   )r   pooler_outputr(   )
rV   r   rX   rY   r   r   rT   r=   r   r(   )r$   rW   r   embedding_outputZencoder_outputsr   pooled_outputr   r   r   r)   ,  s    
zConvNextV2Model.forward)NN)r+   r,   r-   r#   r   r   r   r   r?   r   r   r)   r2   r   r   r%   r   r     s    r   z
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )Zcustom_introc                       sD   e Zd Z fddZeedeej eej	 e
dddZ  ZS ) ConvNextV2ForImageClassificationc                    sV   t  | |j| _t|| _|jdkr@t|jd |j| _n
t	 | _| 
  d S )Nr   r;   )r"   r#   Z
num_labelsr   r   r   ra   rR   
classifierrf   r   rU   r%   r   r   r#   O  s    


z)ConvNextV2ForImageClassification.__init__N)rW   labelsr   c                 K   sP   | j |fi |}|j}| |}d}|dur@| j||| jd}t|||jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   Zpooled_logitsrV   )losslogitsr(   )r   r   r   Zloss_functionrV   r	   r(   )r$   rW   r   rG   outputsr   r   r   r   r   r   r)   ^  s    
z(ConvNextV2ForImageClassification.forward)NN)r+   r,   r-   r#   r   r   r   r   r?   Z
LongTensorr	   r)   r2   r   r   r%   r   r   G  s    r   zT
    ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
    c                       sB   e Zd ZdZ fddZeedeje	e
 edddZ  ZS )	ConvNextV2BackboneFc                    s   t  | t  | t|| _t|| _|jd g|j | _i }t	| j
| jD ]\}}t|dd||< qRt|| _|   d S )Nr   rE   )rD   )r"   r#   Z_init_backbonerL   rY   rv   r   rR   Znum_featureszipZ_out_featuresZchannelsr@   r   Z
ModuleDicthidden_states_normsr   )r$   rV   r   r   rQ   r%   r   r   r#     s    

zConvNextV2Backbone.__init__Nr   c           	      C   s   |du r| j j}| |}| j|dd}|j}g }t| j|D ]*\}}|| jv r>| j| |}|	| q>t
t||rz|nddS )ar  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr   )feature_mapsr(   )rV   r   rY   r   r(   r   Zstage_namesZout_featuresr   r}   r   tuple)	r$   rW   r   r   r   r(   r   r   Zhidden_stater   r   r   r)     s    


zConvNextV2Backbone.forward)N)r+   r,   r-   Zhas_attentionsr#   r   r   r   r0   r   r   r   r)   r2   r   r   r%   r   r   x  s    r   )r   r   r   r   )r   F)-r.   typingr   r   Ztorch.utils.checkpointr   Zactivationsr   Zmodeling_outputsr   r   r   r	   Zmodeling_utilsr
   utilsr   r   Zutils.backbone_utilsr   Zutils.genericr   Zconfiguration_convnextv2r   Z
get_loggerr+   loggerr0   r/   r   r   Moduler    r3   r   r@   rL   rZ   rg   rv   r   r   r   r   __all__r   r   r   r   <module>   sB   
,%#)*@