"""PyTorch ConvNext model."""

from typing import Optional

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ...utils.backbone_utils import BackboneMixin
from ...utils.generic import can_return_tuple
from .configuration_convnext import ConvNextConfig


logger = logging.get_logger(__name__)


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize into a per-sample keep/drop mask
    output = input.div(keep_prob) * random_tensor
    return output


class ConvNextDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"
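

# A minimal behaviour sketch for `drop_path` (illustrative doctest helper only,
# not used by the model classes below): each sample in the batch is either
# zeroed or rescaled by 1 / keep_prob, so the expected activation is unchanged.
def _drop_path_doctest() -> None:
    """
    >>> x = torch.ones(8, 3, 4, 4)
    >>> out = drop_path(x, drop_prob=0.5, training=True)
    >>> bool(((out == 0.0) | (out == 2.0)).all())  # survivors are scaled by 1/0.5
    True
    """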


class ConvNextLayerNorm(nn.LayerNorm):
    """LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs):
        super().__init__(normalized_shape, eps=eps, **kwargs)
        if data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError(f"Unsupported data format: {data_format}")
        self.data_format = data_format

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        """
        if self.data_format == "channels_first":
            features = features.permute(0, 2, 3, 1)
            features = super().forward(features)
            features = features.permute(0, 3, 1, 2)
        else:
            features = super().forward(features)
        return features


class ConvNextEmbeddings(nn.Module):
    """This class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    """

    def __init__(self, config):
        super().__init__()
        self.patch_embeddings = nn.Conv2d(
            config.num_channels, config.hidden_sizes[0], kernel_size=config.patch_size, stride=config.patch_size
        )
        self.layernorm = ConvNextLayerNorm(config.hidden_sizes[0], eps=1e-6, data_format="channels_first")
        self.num_channels = config.num_channels

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        embeddings = self.patch_embeddings(pixel_values)
        embeddings = self.layernorm(embeddings)
        return embeddings


class ConvNextLayer(nn.Module):
    """This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: (1) [DwConv, LayerNorm (channels_first), 1x1 Conv, GELU, 1x1 Conv], all
    in (N, C, H, W); (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear], Permute back.

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextConfig`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    """

    def __init__(self, config, dim, drop_path=0.0):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
        self.layernorm = ConvNextLayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, 4 * dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = ACT2FN[config.hidden_act]
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.layer_scale_parameter = (
            nn.Parameter(config.layer_scale_init_value * torch.ones((dim)), requires_grad=True)
            if config.layer_scale_init_value > 0
            else None
        )
        self.drop_path = ConvNextDropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        residual = features
        features = self.dwconv(features)
        features = features.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
        features = self.layernorm(features)
        features = self.pwconv1(features)
        features = self.act(features)
        features = self.pwconv2(features)
        if self.layer_scale_parameter is not None:
            features = self.layer_scale_parameter * features
        features = features.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
        features = residual + self.drop_path(features)
        return features


class ConvNextStage(nn.Module):
    """ConvNeXT stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextConfig`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates (`list[float]`): Stochastic depth rates for each layer.
    """

    def __init__(self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None):
        super().__init__()

        if in_channels != out_channels or stride > 1:
            self.downsampling_layer = nn.ModuleList(
                [
                    ConvNextLayerNorm(in_channels, eps=1e-6, data_format="channels_first"),
                    nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride),
                ]
            )
        else:
            self.downsampling_layer = nn.ModuleList()
        drop_path_rates = drop_path_rates or [0.0] * depth
        self.layers = nn.ModuleList(
            [ConvNextLayer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        for layer in self.downsampling_layer:
            features = layer(features)
        for layer in self.layers:
            features = layer(features)
        return features


class ConvNextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.stages = nn.ModuleList()
        drop_path_rates = [
            x.tolist()
            for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu").split(config.depths)
        ]
        prev_chs = config.hidden_sizes[0]
        for i in range(config.num_stages):
            out_chs = config.hidden_sizes[i]
            stage = ConvNextStage(
                config,
                in_channels=prev_chs,
                out_channels=out_chs,
                stride=2 if i > 0 else 1,
                depth=config.depths[i],
                drop_path_rates=drop_path_rates[i],
            )
            self.stages.append(stage)
            prev_chs = out_chs

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_hidden_states: Optional[bool] = False,
    ) -> BaseModelOutputWithNoAttention:
        all_hidden_states = [hidden_states] if output_hidden_states else None

        for layer_module in self.stages:
            hidden_states = layer_module(hidden_states)
            if all_hidden_states is not None:
                all_hidden_states.append(hidden_states)

        return BaseModelOutputWithNoAttention(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
        )


@auto_docstring
class ConvNextPreTrainedModel(PreTrainedModel):
    config: ConvNextConfig
    base_model_prefix = "convnext"
    main_input_name = "pixel_values"
    _no_split_modules = ["ConvNextLayer"]
    _can_record_outputs = {}

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, ConvNextLayerNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, ConvNextLayer):
            if module.layer_scale_parameter is not None:
                module.layer_scale_parameter.data.fill_(self.config.layer_scale_init_value)


@auto_docstring
class ConvNextModel(ConvNextPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = ConvNextEmbeddings(config)
        self.encoder = ConvNextEncoder(config)

        # final layernorm layer
        self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPoolingAndNoAttention:
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state

        # global average pooling, (N, C, H, W) -> (N, C)
        pooled_output = self.layernorm(last_hidden_state.mean([-2, -1]))

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )


@auto_docstring(
    custom_intro="""
    ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """
)
class ConvNextForImageClassification(ConvNextPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.convnext = ConvNextModel(config)

        # Classifier head
        if config.num_labels > 0:
            self.classifier = nn.Linear(config.hidden_sizes[-1], config.num_labels)
        else:
            self.classifier = nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> ImageClassifierOutputWithNoAttention:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.convnext(pixel_values, **kwargs)

        pooled_output = outputs.pooler_output

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels=labels, pooled_logits=logits, config=self.config)

        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )
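

# Classification counterpart of the sketch above (illustrative doctest helper
# only; the facebook/convnext-tiny-224 checkpoint ships a 1000-class ImageNet
# head, so `num_labels > 0` and the linear classifier branch is taken):
def _convnext_classifier_usage_doctest() -> None:
    """
    >>> model = ConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224")
    >>> logits = model(pixel_values=torch.randn(1, 3, 224, 224)).logits
    >>> tuple(logits.shape)
    (1, 1000)
    """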


@auto_docstring(
    custom_intro="""
    ConvNeXt backbone, to be used with frameworks like DETR and MaskFormer.
    """
)
class ConvNextBackbone(ConvNextPreTrainedModel, BackboneMixin):
    has_attentions = False

    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.embeddings = ConvNextEmbeddings(config)
        self.encoder = ConvNextEncoder(config)
        self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes

        # Add layer norms to hidden states of out_features
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = ConvNextLayerNorm(num_channels, data_format="channels_first")
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnext-tiny-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(embedding_output, output_hidden_states=True)

        hidden_states = outputs.hidden_states

        feature_maps = []
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                feature_maps.append(hidden_state)

        return BackboneOutput(
            feature_maps=tuple(feature_maps),
            hidden_states=hidden_states if output_hidden_states else None,
        )


__all__ = ["ConvNextForImageClassification", "ConvNextModel", "ConvNextPreTrainedModel", "ConvNextBackbone"]