"""PyTorch AltCLIP model."""

import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
    BaseModelOutputWithPoolingAndProjection,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig


logger = logging.get_logger(__name__)


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


@dataclass
@auto_docstring
class AltCLIPOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class AltRobertaEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased so that existing checkpoint variable names keep loading.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # When token_type_ids is not provided, reuse the registered all-zeros buffer (sliced and expanded) so that
        # tracing/exporting the model without token_type_ids keeps working.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


class AltRobertaSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.attention_head_size)

        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
        key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

        # Raw attention scores between every pair of positions.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Additive mask precomputed in the model's forward (large negative values on padded positions).
            attention_scores = attention_scores + attention_mask

        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        return outputs


class AltRobertaSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


ALT_ROBERTA_SELF_ATTENTION_CLASSES = {
    "eager": AltRobertaSelfAttention,
}


class AltRobertaAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = ALT_ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type
        )
        self.output = AltRobertaSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class AltRobertaIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class AltRobertaOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class AltRobertaLayer(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = AltRobertaAttention(config)
        self.intermediate = AltRobertaIntermediate(config)
        self.output = AltRobertaOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> tuple[torch.Tensor]:
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            **kwargs,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs
        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class AltRobertaEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([AltRobertaLayer(config) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                head_mask=layer_head_mask,
                output_attentions=output_attentions,
                **kwargs,
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class AltRobertaPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # "Pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class AltCLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # The CLIP-style text tower uses both `causal_attention_mask` and `attention_mask`; when the flash attention
        # kernel is used the causal mask cannot be materialized, so causality is signalled via `is_causal` instead.
        if self.config._attn_implementation != "flash_attention_2":
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class AltCLIPMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class AltCLIPEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: AltCLIPConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = AltCLIPAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = AltCLIPMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class AltCLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].

    Args:
        config: AltCLIPConfig
    """

    def __init__(self, config: AltCLIPConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class AltCLIPVisionEmbeddings(nn.Module):
    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


@auto_docstring
class AltCLIPPreTrainedModel(PreTrainedModel):
    config: AltCLIPConfig
    base_model_prefix = "altclip"
    supports_gradient_checkpointing = True
    _no_split_module = []

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, AltCLIPVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, AltCLIPAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, AltCLIPMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, AltCLIPModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            module.text_projection._is_hf_initialized = True
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
            module.visual_projection._is_hf_initialized = True
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class AltCLIPVisionTransformer(nn.Module):
    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = AltCLIPVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = AltCLIPEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class AltCLIPVisionModel(AltCLIPPreTrainedModel):
    config: AltCLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__(config)
        self.vision_model = AltCLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPVisionModel

        >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


@auto_docstring(
    custom_intro="""
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    """
)
class AltRobertaModel(AltCLIPPreTrainedModel):
    config: AltCLIPTextConfig

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = AltRobertaEmbeddings(config)
        self.encoder = AltRobertaEncoder(config)

        self.pooler = AltRobertaPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # Make the 2D padding mask broadcastable to [batch_size, num_heads, seq_length, seq_length] and turn it into
        # an additive mask with large negative values on padded positions.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # Prepare head mask if needed (1.0 in head_mask indicates we keep the head).
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class AltCLIPTextModel(AltCLIPPreTrainedModel):
    config: AltCLIPTextConfig

    def __init__(self, config):
        super().__init__(config)
        self.roberta = AltRobertaModel(config, add_pooling_layer=False)
        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
        self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.roberta.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.roberta.embeddings.word_embeddings = value

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
        return super().resize_token_embeddings(new_num_tokens)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPoolingAndProjection]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPTextModel

        >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> texts = ["it's a cat", "it's a dog"]

        >>> inputs = processor(text=texts, padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        # last module outputs
        sequence_output = outputs[0]

        # project every module
        sequence_output = self.pre_LN(sequence_output)

        # pooler
        projection_state = self.transformation(sequence_output)
        pooler_output = projection_state[:, 0]

        return BaseModelOutputWithPoolingAndProjection(
            last_hidden_state=projection_state,
            pooler_output=pooler_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AltCLIPModel(AltCLIPPreTrainedModel):
    config: AltCLIPConfig

    def __init__(self, config: AltCLIPConfig):
        super().__init__(config)

        if not isinstance(config.vision_config, AltCLIPVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type AltCLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )
        if not isinstance(config.text_config, AltCLIPTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type AltCLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.project_dim
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = AltCLIPTextModel(text_config)
        self.vision_model = AltCLIPVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )
        pooled_output = vision_outputs[1]  # pooled CLS state
        image_features = self.visual_projection(pooled_output)

        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, AltCLIPOutput]:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.T

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return AltCLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor
        padding_idx: int
        past_key_values_length: int

    Returns: torch.Tensor
    """
    # The int/type_as/long dance keeps this compatible with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


__all__ = ["AltCLIPPreTrainedModel", "AltCLIPVisionModel", "AltCLIPTextModel", "AltCLIPModel"]
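
# Illustrative sketch only, not part of the original module: a minimal, self-contained check of the symmetric
# contrastive objective defined by `contrastive_loss` / `clip_loss` above. It assumes nothing beyond this file and
# PyTorch; the tensor shapes and values below are made up for demonstration. The similarity matrix is square, with
# row i / column i corresponding to the matched text-image pair, so the cross-entropy targets are the diagonal.
if __name__ == "__main__":
    text_embeds = nn.functional.normalize(torch.randn(4, 8), dim=-1)
    image_embeds = nn.functional.normalize(torch.randn(4, 8), dim=-1)
    similarity = text_embeds @ image_embeds.t()  # (text_batch, image_batch)
    # clip_loss averages the text->image and image->text cross-entropy terms.
    print(clip_loss(similarity))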