"""PyTorch CLIP model."""

from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig


logger = logging.get_logger(__name__)


# symmetric cross-entropy over a similarity matrix; the matching pairs sit on the diagonal
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
    """
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    """
    square_tensor = torch.pow(tensor, 2)
    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
    normed_tensor = torch.pow(sum_tensor, 0.5)
    return normed_tensor
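
# Illustrative sketch (not part of the upstream module): how the helpers above fit together.
# `clip_loss` expects a square text-to-image similarity matrix; the batch size (4) and embedding
# width (512) below are arbitrary assumptions for the demonstration.
def _example_symmetric_contrastive_loss() -> torch.Tensor:
    text_embeds = torch.randn(4, 512)
    image_embeds = torch.randn(4, 512)
    # L2-normalize with the export-friendly helper, then score every text against every image
    text_embeds = text_embeds / _get_vector_norm(text_embeds)
    image_embeds = image_embeds / _get_vector_norm(image_embeds)
    logits_per_text = torch.matmul(text_embeds, image_embeds.t())  # shape (4, 4)
    # cross-entropy in both directions, averaged; matching pairs are assumed to be index-aligned
    return clip_loss(logits_per_text)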
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class CLIPVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class CLIPTextModelOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
@dataclass
@auto_docstring
class CLIPOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
class CLIPVisionEmbeddings(nn.Module):
    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings
  ZS )	CLIPTextEmbeddingsrR   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )NrU   rV   FrW   )rY   rZ   r[   r   rd   Z
vocab_sizetoken_embeddingZmax_position_embeddingsre   rf   r#   r$   rg   rN   rS   r\   ri   r&   r'   rZ      s    
zCLIPTextEmbeddings.__init__N)	input_idsrU   inputs_embedsr   c                 C   s   |d ur|j d n|j d }| jjj d }||krFtd| d| |d u rd| jd d d |f }|d u rv| |}| |}|| }|S )Nr.   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rq   re   rr   r|   rU   r   )rN   r   rU   r   
seq_lengthZmax_position_embeddingZposition_embeddingsrk   r&   r&   r'   r      s"    

zCLIPTextEmbeddings.forward)NNN)r9   r:   r;   r   rZ   r   r#   
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    output_attentions: bool = True,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights
class CLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Union[CLIPTextConfig, CLIPVisionConfig]):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)

        # CLIP's text model uses both `causal_attention_mask` and `attention_mask`; when a flash-attention
        # kernel is used, causality is signalled through `is_causal` instead of an explicit additive mask
        if self.config._attn_implementation == "flash_attention_2":
            self.is_causal = causal_attention_mask is not None
        else:
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
            output_attentions=output_attentions,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class CLIPMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states
zCLIPMLP.forward)r9   r:   r;   rZ   r#   r   r   r   r&   r&   ri   r'   r   h  s   r   c                       sR   e Zd Zeeef d fddZdejejeje	e
 eej dddZ  ZS )	CLIPEncoderLayerrR   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S Neps)rY   rZ   r[   r\   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rh   ri   r&   r'   rZ   x  s    


zCLIPEncoderLayer.__init__Fr   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r`||f7 }|S )aI  
        Args:
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


@auto_docstring
class CLIPPreTrainedModel(PreTrainedModel):
    config: CLIPConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, CLIPTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, CLIPVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, CLIPAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, CLIPMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, CLIPModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, CLIPVisionModelWithProjection):
            nn.init.normal_(
                module.visual_projection.weight,
                std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, CLIPTextModelWithProjection):
            nn.init.normal_(
                module.text_projection.weight,
                std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, CLIPForImageClassification):
            nn.init.normal_(
                module.classifier.weight,
                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
            )

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
class CLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    """

    def __init__(self, config: CLIPConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
zCLIPEncoder.forward)NNNN)r9   r:   r;   r<   r   rZ   r   r#   r   r   r   r   r   r&   r&   ri   r'   r     s   	    r   c                	       sZ   e Zd Zed fddZedeej eej eej ee	 ee	 e
dddZ  ZS )	CLIPTextTransformerrR   c                    sT   t    || _|j}t|| _t|| _tj	||j
d| _|j| _|jdk| _d S )Nr   r   )rY   rZ   rS   r[   r   rk   r   encoderr   r   r   final_layer_normeos_token_idr   _use_flash_attention_2r   ri   r&   r'   rZ   C  s    


zCLIPTextTransformer.__init__Nr   r   rU   r   r   r   c                 C   s@  |d ur|n| j j}|d ur |n| j j}|d u r8td| }|d|d }| j||d}t||j|j	d}|d ur| j
st||j}| j|||||d}	|	j}
| |
}
| jdkr|
tj|
jd |
j	d|jtj|
j	djdd	f }n>|
tj|
jd |
j	d|jtj|
j	d| jk jdd	f }t|
||	j|	jd
S )NzYou have to specify input_idsr.   )r   rU   r    )r   r   r   r   r   r-   r   )r{   r!   rp   r6   pooler_outputr7   r8   )rS   r   r   r|   rn   ru   rk   r   r{   r!   r   r   r   r6   r   r   r#   r$   rq   r}   r   Zargmaxr   r7   r8   )rN   r   r   rU   r   r   Zinput_shaper7   r   encoder_outputsr6   pooled_outputr&   r&   r'   r   Q  sR    	


	zCLIPTextTransformer.forward)NNNNN)r9   r:   r;   r   rZ   r   r   r#   r   r   r   r   r   r&   r&   ri   r'   r   B  s        r   zI
@auto_docstring(
    custom_intro="""
    The text model from CLIP without any head or projection on top.
    """
)
class CLIPTextModel(CLIPPreTrainedModel):
    config: CLIPTextConfig

    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)
        self.text_model = CLIPTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


class CLIPVisionTransformer(nn.Module):
    def __init__(self, config: CLIPVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = CLIPVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    The vision model from CLIP without any head or projection on top.
    """
)
class CLIPVisionModel(CLIPPreTrainedModel):
    config: CLIPVisionConfig
    main_input_name = "pixel_values"
    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)
        self.vision_model = CLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> BaseModelOutputWithPooling:
        r"""
        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )


@auto_docstring
class CLIPModel(CLIPPreTrainedModel):
    config: CLIPConfig
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]

    def __init__(self, config: CLIPConfig):
        super().__init__(config)

        if not isinstance(config.text_config, CLIPTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type CLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, CLIPVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        text_model = CLIPTextModel._from_config(text_config)
        self.text_model = text_model.text_model

        vision_model = CLIPVisionModel._from_config(vision_config)
        self.vision_model = vision_model.vision_model

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()
    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        pooled_output = text_outputs.pooler_output
        text_features = self.text_projection(pooled_output)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        pooled_output = vision_outputs.pooler_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> CLIPOutput:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        # cosine similarity as logits, scaled by the learned temperature
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) * logit_scale.to(
            text_embeds.device
        )
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)

        return CLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
@auto_docstring
class CLIPTextModelWithProjection(CLIPPreTrainedModel):
    config: CLIPTextConfig

    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)

        text_model = CLIPTextModel._from_config(config)
        self.text_model = text_model.text_model

        self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> CLIPTextModelOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```"""
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        pooled_output = text_outputs.pooler_output
        text_embeds = self.text_projection(pooled_output)

        return CLIPTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
            hidden_states=text_outputs.hidden_states,
            attentions=text_outputs.attentions,
        )


@auto_docstring
class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
    config: CLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)

        vision_model = CLIPVisionModel._from_config(config)
        self.vision_model = vision_model.vision_model

        self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> CLIPVisionModelOutput:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection

        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        pooled_output = vision_outputs.pooler_output
        image_embeds = self.visual_projection(pooled_output)

        return CLIPVisionModelOutput(
            image_embeds=image_embeds,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    """
)
class CLIPForImageClassification(CLIPPreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        vision_model = CLIPVisionModel._from_config(config.vision_config)
        self.vision_model = vision_model.vision_model

        # Classifier head
        self.classifier = (
            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens (the CLS token at index 0 is excluded)
        sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1)
        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "CLIPModel",
    "CLIPPreTrainedModel",
    "CLIPTextModel",
    "CLIPTextModelWithProjection",
    "CLIPVisionModel",
    "CLIPVisionModelWithProjection",
    "CLIPForImageClassification",
]