import collections.abc
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging, torch_int
from ..clip.modeling_clip import CLIPMLP
from ..janus.modeling_janus import JanusVisionAttention
from ..llama.modeling_llama import LlamaRMSNorm
from ..llava.modeling_llava import (
    LlavaCausalLMOutputWithPast,
    LlavaForConditionalGeneration,
    LlavaModel,
    LlavaModelOutputWithPast,
    LlavaPreTrainedModel,
)
from .configuration_internvl import InternVLConfig, InternVLVisionConfig


logger = logging.get_logger(__name__)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = key
    value_states = value

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class InternVLVisionRMSNorm(LlamaRMSNorm):
    pass


class InternVLVisionAttention(JanusVisionAttention):
    def __init__(self, config: InternVLVisionConfig):
        super().__init__(config)
        del self.num_key_value_groups
        self.is_causal = False

        qk_norm = config.use_qk_norm
        self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
        self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ):
        batch_size, seq_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scale,
            is_causal=False,
            **kwargs,
        )
        attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)

        output = self.projection_layer(attn_output)
        output = self.projection_dropout(output)

        outputs = (output, attn_weights) if output_attentions else (output, None)
        return outputs


@auto_docstring
class InternVLVisionPreTrainedModel(PreTrainedModel):
    config: InternVLVisionConfig
    base_model_prefix = "internvl_vision"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["InternVLVisionLayer"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        """Initialize the weights"""
        super()._init_weights(module)
        if isinstance(module, InternVLVisionEmbeddings):
            module.cls_token.data.zero_()
            if module.mask_token is not None:
                module.mask_token.data.zero_()
            if module.position_embeddings is not None:
                module.position_embeddings.data.zero_()
        elif isinstance(module, InternVLVisionLayer):
            module.lambda_1.data.fill_(self.config.layer_scale_init_value)
            module.lambda_2.data.fill_(self.config.layer_scale_init_value)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`InternVLVisionModel`].
    """
)
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    """


class InternVLVisionPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)


class InternVLVisionEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size[0]
        new_width = width // self.patch_size[1]

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
    ) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)


class InternVLVisionMLP(CLIPMLP):
    pass


NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm}


class InternVLVisionLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InternVLVisionAttention(config)
        self.mlp = InternVLVisionMLP(config)
        # the normalization layer is selected through `config.norm_type` (LayerNorm or RMSNorm)
        self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        attention_output, attention_weights = self.attention(
            self.layernorm_before(hidden_states),  # layernorm is applied before self-attention
            output_attentions=output_attentions,
        )
        attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = attention_output + hidden_states

        # layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.mlp(layer_output)
        layer_output = self.dropout(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = layer_output + hidden_states

        return layer_output, attention_weights


class InternVLVisionEncoder(nn.Module):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([InternVLVisionLayer(config) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(hidden_states, output_attentions)
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class InternVLVisionModel(InternVLVisionPreTrainedModel):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__(config)
        self.config = config

        self.embeddings = InternVLVisionEmbeddings(config)
        self.encoder = InternVLVisionEncoder(config)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Union[tuple, InternVLVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = encoder_outputs.last_hidden_state
        sequence_output = self.layernorm(sequence_output)

        return InternVLVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class InternVLPreTrainedModel(LlavaPreTrainedModel):
    pass


INTERNVL_INPUTS_DOCSTRING = None


class InternVLMultiModalProjector(nn.Module):
    def __init__(self, config: InternVLConfig):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2)
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size)

    def forward(self, image_features):
        hidden_states = self.layer_norm(image_features)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


class InternVLModelOutputWithPast(LlavaModelOutputWithPast):
    pass


class InternVLModel(LlavaModel):
    def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
        """Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        """
        batch_size, width, height, channels = vision_features.size()

        if height % scale_factor != 0 or width % scale_factor != 0:
            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")

        # Reshape so the height dimension is downsampled and the freed positions move into channels
        vision_features = vision_features.view(
            batch_size, width, int(height * scale_factor), int(channels / scale_factor)
        )
        # Permute so the downsampled height axis is adjacent to the width axis
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
        # Downsample the width dimension, folding the remaining positions into channels
        vision_features = vision_features.view(
            batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
        )
        # Swap height and width back to the original orientation
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        return vision_features

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        **kwargs,
    ):
        """
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`int` or `list[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        pixel_values = pixel_values.to(dtype=self.dtype)
        downsample_ratio = self.config.downsample_ratio
        if vision_feature_layer == -1:
            vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
        else:
            vision_features = self.vision_tower(pixel_values=pixel_values).hidden_states[vision_feature_layer]
        if vision_feature_select_strategy == "default":
            vision_features = vision_features[:, 1:]

        # Calculate dimensions based on vision features
        channels = vision_features.shape[1]
        feature_size = int(channels**0.5)
        batch_size = vision_features.shape[0]

        # Reshape tensor to spatial dimensions
        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)

        # Apply downsampling using pixel shuffle
        vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)

        # Reshape tensor to prepare for projection
        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])

        vision_features = self.multi_modal_projector(vision_features)

        return vision_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, InternVLModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
            )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            return_dict=True,
            **kwargs,
        )

        return InternVLModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )


class InternVLCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
    pass


class InternVLForConditionalGeneration(LlavaForConditionalGeneration):
    def forward(self, **super_kwargs):
        r"""
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```N)rE   r[   )Zsuper_kwargsrM   r:   r;   r[     s    $z(InternVLForConditionalGeneration.forward)r?   r@   rA   r[   r]   r:   r:   rM   r;   r     s   r   )r^   r   r   r   r   )r    )Kcollections.abcr   dataclassesr   typingr   r   r   r.   Ztorch.nnr2   Ztorch.utils.checkpointZactivationsr   Zcache_utilsr   Zmodeling_flash_attention_utilsr	   Zmodeling_layersr
   Zmodeling_outputsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   Zclip.modeling_clipr   Zjanus.modeling_janusr   Zllama.modeling_llamar   Zllava.modeling_llavar   r   r   r   r   Zconfiguration_internvlr   r   Z
get_loggerr?   loggerModuler\   r   r<   r=   rB   r^   rl   rn   rc   r   r   r   r`   r   r   r   ZINTERNVL_INPUTS_DOCSTRINGr   r   r   r   r   __all__r:   r:   r:   r;   <module>   sn   

 8	&^0&5 $(