a
    hf                     @   s  d Z ddlZddlmZ ddlmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ eeZeeddG dd deZG dd de	jZG dd de	jZ G dd de	jZ!G dd de	jZ"G dd de	jZ#G dd de	jZ$G dd de	jZ%G dd  d e	jZ&G d!d" d"e	jZ'G d#d$ d$e	jZ(G d%d& d&e	jZ)eG d'd( d(eZ*eG d)d* d*e*Z+ed+dG d,d- d-e*Z,ed.dG d/d0 d0e*Z-g d1Z.dS )2zPyTorch LeViT model.    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionModelOutput)PreTrainedModel)auto_docstringlogging   )LevitConfigzD
    Output type of [`LevitForImageClassificationWithTeacher`].
    )Zcustom_introc                   @   s^   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dS ),LevitForImageClassificationWithTeacherOutputan  
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores as the average of the `cls_logits` and `distillation_logits`.
    cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
        class token).
    distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
        distillation token).
    Nlogits
cls_logitsdistillation_logitshidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   tuple r    r    d/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/levit/modeling_levit.pyr   (   s
   
r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )LevitConvEmbeddingsz[
    LeViT Conv Embeddings with Batch Norm, used in the initial patch embedding layer.
    r   c	           	   
      s6   t    tj|||||||dd| _t|| _d S )NF)dilationgroupsbias)super__init__r   Conv2dconvolutionBatchNorm2d
batch_norm)	selfZin_channelsZout_channelskernel_sizestridepaddingr#   r$   bn_weight_init	__class__r    r!   r'   E   s
    
zLevitConvEmbeddings.__init__c                 C   s   |  |}| |}|S N)r)   r+   )r,   
embeddingsr    r    r!   forwardN   s    

zLevitConvEmbeddings.forward)r   r   r   r   r   r   r   r'   r5   __classcell__r    r    r1   r!   r"   @   s    	r"   c                       s(   e Zd ZdZ fddZdd Z  ZS )LevitPatchEmbeddingsz
    LeViT patch embeddings, for final embeddings to be passed to transformer blocks. It consists of multiple
    `LevitConvEmbeddings`.
    c                    s   t    t|j|jd d |j|j|j| _t	
 | _t|jd d |jd d |j|j|j| _t	
 | _t|jd d |jd d |j|j|j| _t	
 | _t|jd d |jd |j|j|j| _|j| _d S )Nr            )r&   r'   r"   num_channelshidden_sizesr-   r.   r/   embedding_layer_1r   	Hardswishactivation_layer_1embedding_layer_2activation_layer_2embedding_layer_3activation_layer_3embedding_layer_4r,   configr1   r    r!   r'   Z   s"    

$
$
 zLevitPatchEmbeddings.__init__c                 C   st   |j d }|| jkrtd| |}| |}| |}| |}| |}| |}| 	|}|
dddS )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r;   )shaper<   
ValueErrorr>   r@   rA   rB   rC   rD   rE   flatten	transpose)r,   pixel_valuesr<   r4   r    r    r!   r5   p   s    








zLevitPatchEmbeddings.forwardr6   r    r    r1   r!   r8   T   s   r8   c                       s&   e Zd Zd fdd	Zdd Z  ZS )MLPLayerWithBNr   c                    s,   t    tj||dd| _t|| _d S )NF)Zin_featuresZout_featuresr%   )r&   r'   r   LinearlinearBatchNorm1dr+   )r,   	input_dim
output_dimr0   r1   r    r!   r'      s    
zMLPLayerWithBN.__init__c                 C   s&   |  |}| |dd|}|S )Nr   r   )rO   r+   rJ   Z
reshape_asr,   hidden_stater    r    r!   r5      s    
zMLPLayerWithBN.forward)r   r   r   r   r'   r5   r7   r    r    r1   r!   rM      s   rM   c                       s$   e Zd Z fddZdd Z  ZS )LevitSubsamplec                    s   t    || _|| _d S r3   )r&   r'   r.   
resolution)r,   r.   rW   r1   r    r!   r'      s    
zLevitSubsample.__init__c                 C   sL   |j \}}}||| j| j|d d d d | jd d | jf |d|}|S )N)rH   viewrW   r.   reshape)r,   rT   
batch_size_Zchannelsr    r    r!   r5      s    zLevitSubsample.forwardrU   r    r    r1   r!   rV      s   rV   c                       sB   e Zd Z fddZe d
 fdd	Zdd Zdd	 Z  Z	S )LevitAttentionc                    sB  t    || _|d | _|| _|| _|| | || d  | _|| | | _t|| j| _	t
 | _t| j|dd| _ttt|t|}t|}i g  }}	|D ]X}
|D ]N}t|
d |d  t|
d |d  f}||vrt|||< |	||  qqi | _tj
t|t|| _| jdt|	||dd d S )	N      r;   r   )r0   r   attention_bias_idxsF
persistent)r&   r'   num_attention_headsscalekey_dimattention_ratioout_dim_keys_valuesout_dim_projectionrM   queries_keys_valuesr   r?   
activation
projectionlist	itertoolsproductrangelenabsappendattention_bias_cacher   	Parameterzerosattention_biasesregister_buffer
LongTensorrY   )r,   r=   rd   rb   re   rW   points
len_pointsattention_offsetsindicesp1p2offsetr1   r    r!   r'      s0    



(zLevitAttention.__init__Tc                    s    t  | |r| jri | _d S r3   r&   trainrr   r,   moder1   r    r!   r      s    
zLevitAttention.trainc                 C   sT   | j r| jd d | jf S t|}|| jvrF| jd d | jf | j|< | j| S d S r3   trainingru   r_   strrr   r,   deviceZ
device_keyr    r    r!   get_attention_biases   s    
z#LevitAttention.get_attention_biasesc           
      C   s   |j \}}}| |}|||| jdj| j| j| j| j gdd\}}}|dddd}|dddd}|dddd}||dd | j	 | 
|j }	|	jdd}	|	| dd||| j}| | |}|S NrX   r	   dimr   r;   r   )rH   rh   rY   rb   splitrd   re   permuterK   rc   r   r   softmaxrZ   rg   rj   ri   )
r,   rT   r[   
seq_lengthr\   rh   querykeyvalue	attentionr    r    r!   r5      s    
"zLevitAttention.forward)T
r   r   r   r'   r   Zno_gradr   r   r5   r7   r    r    r1   r!   r]      s
   	r]   c                       sB   e Zd Z fddZe d
 fdd	Zdd Zdd	 Z  Z	S )LevitAttentionSubsamplec	                    s  t    || _|d | _|| _|| _|| | ||  | _|| | | _|| _t	|| j| _
t||| _t	||| | _t | _t	| j|| _i | _ttt|t|}	ttt|t|}
t|	t|
 }}i g  }}|
D ]~}|	D ]t}d}t|d | |d  |d d  t|d | |d  |d d  f}||vrVt|||< |||  qqtjt|t|| _| jdt| ||dd d S )Nr^   r   r   r;   r_   Fr`   )!r&   r'   rb   rc   rd   re   rf   rg   resolution_outrM   keys_valuesrV   queries_subsamplequeriesr   r?   ri   rj   rr   rk   rl   rm   rn   ro   rp   rq   r   rs   rt   ru   rv   rw   rY   )r,   rQ   rR   rd   rb   re   r.   resolution_inr   rx   Zpoints_ry   Zlen_points_rz   r{   r|   r}   sizer~   r1   r    r!   r'      s:    



H
z LevitAttentionSubsample.__init__Tc                    s    t  | |r| jri | _d S r3   r   r   r1   r    r!   r     s    
zLevitAttentionSubsample.trainc                 C   sT   | j r| jd d | jf S t|}|| jvrF| jd d | jf | j|< | j| S d S r3   r   r   r    r    r!   r     s    
z,LevitAttentionSubsample.get_attention_biasesc           	      C   s   |j \}}}| |||| jdj| j| j| j gdd\}}|dddd}|dddd}| | 	|}||| j
d | j| jdddd}||dd | j | |j }|jdd}|| dd|d| j}| | |}|S r   )rH   r   rY   rb   r   rd   re   r   r   r   r   rK   rc   r   r   r   rZ   rg   rj   ri   )	r,   rT   r[   r   r\   r   r   r   r   r    r    r!   r5     s$    

"zLevitAttentionSubsample.forward)Tr   r    r    r1   r!   r      s
   -	r   c                       s(   e Zd ZdZ fddZdd Z  ZS )LevitMLPLayerzE
    MLP Layer with `2X` expansion in contrast to ViT with `4X`.
    c                    s0   t    t||| _t | _t||| _d S r3   )r&   r'   rM   	linear_upr   r?   ri   linear_down)r,   rQ   
hidden_dimr1   r    r!   r'   0  s    

zLevitMLPLayer.__init__c                 C   s"   |  |}| |}| |}|S r3   )r   ri   r   rS   r    r    r!   r5   6  s    


zLevitMLPLayer.forwardr6   r    r    r1   r!   r   +  s   r   c                       s(   e Zd ZdZ fddZdd Z  ZS )LevitResidualLayerz"
    Residual Block for LeViT
    c                    s   t    || _|| _d S r3   )r&   r'   module	drop_rate)r,   r   r   r1   r    r!   r'   B  s    
zLevitResidualLayer.__init__c                 C   sr   | j r\| jdkr\tj|ddd|jd}|| jd| j  }|| 	||  }|S || 	| }|S d S )Nr   r   )r   )
r   r   r   Zrandr   r   Zge_divdetachr   )r,   rT   Zrndr    r    r!   r5   G  s    zLevitResidualLayer.forwardr6   r    r    r1   r!   r   =  s   r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )
LevitStagezP
    LeViT Stage consisting of `LevitMLPLayer` and `LevitAttention` layers.
    c                    sH  t    g | _|| _|
| _t|D ]R}| jtt|||||
| jj	 |dkr$|| }| jtt
||| jj	 q$|	d dkr6| jd |	d  d | _| jt| jj||d  |	d |	d |	d |	d |
| jd | j| _|	d dkr6| jj|d  |	d  }| jtt
| jj|d  || jj	 t| j| _d S )	Nr   Z	Subsampler      r;   r	   )rd   rb   re   r.   r   r   r:   )r&   r'   layersrG   r   rn   rq   r   r]   Zdrop_path_rater   r   r   r=   r   
ModuleList)r,   rG   idxr=   rd   depthsrb   re   	mlp_ratiodown_opsr   r\   r   r1   r    r!   r'   W  sL    
zLevitStage.__init__c                 C   s   | j S r3   )r   )r,   r    r    r!   get_resolution  s    zLevitStage.get_resolutionc                 C   s   | j D ]}||}q|S r3   )r   )r,   rT   layerr    r    r!   r5     s    

zLevitStage.forward)r   r   r   r   r'   r   r5   r7   r    r    r1   r!   r   R  s   7r   c                       s*   e Zd ZdZ fddZdddZ  ZS )	LevitEncoderzC
    LeViT Encoder consisting of multiple `LevitStage` stages.
    c                    s   t    || _| jj| jj }g | _| jjdg tt	|j
D ]\}t|||j| |j| |j
| |j| |j| |j| |j| |
}| }| j| qDt| j| _d S )N )r&   r'   rG   Z
image_sizeZ
patch_sizestagesr   rq   rn   ro   r   r   r=   rd   rb   re   r   r   r   r   )r,   rG   rW   Z	stage_idxstager1   r    r!   r'     s*    
zLevitEncoder.__init__FTc                 C   sb   |rdnd }| j D ]}|r$||f }||}q|r<||f }|sVtdd ||fD S t||dS )Nr    c                 s   s   | ]}|d ur|V  qd S r3   r    ).0vr    r    r!   	<genexpr>      z'LevitEncoder.forward.<locals>.<genexpr>)last_hidden_stater   )r   r   r
   )r,   rT   output_hidden_statesreturn_dictZall_hidden_statesr   r    r    r!   r5     s    



zLevitEncoder.forward)FTr6   r    r    r1   r!   r     s   r   c                       s(   e Zd ZdZ fddZdd Z  ZS )LevitClassificationLayerz$
    LeViT Classification Layer
    c                    s(   t    t|| _t||| _d S r3   )r&   r'   r   rP   r+   rN   rO   )r,   rQ   rR   r1   r    r!   r'     s    
z!LevitClassificationLayer.__init__c                 C   s   |  |}| |}|S r3   )r+   rO   )r,   rT   r   r    r    r!   r5     s    

z LevitClassificationLayer.forwardr6   r    r    r1   r!   r     s   r   c                   @   s,   e Zd ZU eed< dZdZdgZdd ZdS )LevitPreTrainedModelrG   levitrL   r   c                 C   sp   t |tjtjfr@|jjjd| jjd |j	durl|j	j
  n,t |tjtjfrl|j	j
  |jjd dS )zInitialize the weightsg        )meanZstdNg      ?)
isinstancer   rN   r(   weightdataZnormal_rG   Zinitializer_ranger%   Zzero_rP   r*   Zfill_)r,   r   r    r    r!   _init_weights  s    
z"LevitPreTrainedModel._init_weightsN)	r   r   r   r   r   Zbase_model_prefixZmain_input_nameZ_no_split_modulesr   r    r    r    r!   r     s
   
r   c                       sL   e Zd Z fddZedeej ee ee e	e
ef dddZ  ZS )
LevitModelc                    s2   t  | || _t|| _t|| _|   d S r3   )r&   r'   rG   r8   patch_embeddingsr   encoder	post_initrF   r1   r    r!   r'     s
    

zLevitModel.__init__NrL   r   r   returnc                 C   s   |d ur|n| j j}|d ur |n| j j}|d u r8td| |}| j|||d}|d }|jdd}|s~||f|dd   S t|||jdS )Nz You have to specify pixel_valuesr   r   r   r   r   )r   Zpooler_outputr   )	rG   r   use_return_dictrI   r   r   r   r   r   )r,   rL   r   r   r4   Zencoder_outputsr   Zpooled_outputr    r    r!   r5     s(    
zLevitModel.forward)NNN)r   r   r   r'   r   r   r   r   boolr   r   r   r5   r7   r    r    r1   r!   r     s      
r   z
    Levit Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                	       sT   e Zd Z fddZedeej eej ee	 ee	 e
eef dddZ  ZS )LevitForImageClassificationc                    sX   t  | || _|j| _t|| _|jdkr@t|jd |jntj	
 | _|   d S Nr   rX   )r&   r'   rG   
num_labelsr   r   r   r=   r   r   Identity
classifierr   rF   r1   r    r!   r'     s    
z$LevitForImageClassification.__init__N)rL   labelsr   r   r   c                 C   sl  |dur|n| j j}| j|||d}|d }|d}| |}d}|dur,| j jdu r| jdkrnd| j _n4| jdkr|jtj	ks|jtj
krd| j _nd| j _| j jdkrt }	| jdkr|	| | }n
|	||}nN| j jdkrt }	|	|d| j|d}n| j jdkr,t }	|	||}|s\|f|d	d  }
|durX|f|
 S |
S t|||jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationrX   r;   )lossr   r   )rG   r   r   r   r   Zproblem_typer   Zdtyper   longintr   Zsqueezer   rY   r   r   r   )r,   rL   r   r   r   outputssequence_outputr   r   Zloss_fctoutputr    r    r!   r5   /  s@    




"


z#LevitForImageClassification.forward)NNNN)r   r   r   r'   r   r   r   r   rw   r   r   r   r   r5   r7   r    r    r1   r!   r     s       
r   ap  
    LeViT Model transformer with image classification heads on top (a linear layer on top of the final hidden state and
    a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet. .. warning::
           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                       sL   e Zd Z fddZedeej ee ee e	e
ef dddZ  ZS )&LevitForImageClassificationWithTeacherc                    s   t  | || _|j| _t|| _|jdkr@t|jd |jntj	
 | _|jdkrht|jd |jntj	
 | _|   d S r   )r&   r'   rG   r   r   r   r   r=   r   r   r   r   classifier_distillr   rF   r1   r    r!   r'   o  s    
z/LevitForImageClassificationWithTeacher.__init__Nr   c           
      C   s   |d ur|n| j j}| j|||d}|d }|d}| || | }}|| d }|sv|||f|dd   }	|	S t||||jdS )Nr   r   r   r;   )r   r   r   r   )rG   r   r   r   r   r   r   r   )
r,   rL   r   r   r   r   r   Zdistill_logitsr   r   r    r    r!   r5     s    
z.LevitForImageClassificationWithTeacher.forward)NNN)r   r   r   r'   r   r   r   r   r   r   r   r   r5   r7   r    r    r1   r!   r   f  s   	   
r   )r   r   r   r   )/r   rl   dataclassesr   typingr   r   r   Ztorch.utils.checkpointr   Ztorch.nnr   r   r   Zmodeling_outputsr
   r   r   r   Zmodeling_utilsr   utilsr   r   Zconfiguration_levitr   Z
get_loggerr   loggerr   Moduler"   r8   rM   rV   r]   r   r   r   r   r   r   r   r   r   r   __all__r    r    r    r!   <module>   sR   
,>SE..H2