# Modular definition of the VipLlava model. VipLlava reuses the Llava
# architecture and overrides only the multimodal projector, which consumes the
# concatenation of several vision-tower hidden-state layers.

from typing import Optional, Union

import torch
from torch import nn

from transformers.models.llava.modeling_llava import (
    LlavaCausalLMOutputWithPast,
    LlavaForConditionalGeneration,
    LlavaModel,
    LlavaModelOutputWithPast,
    LlavaPreTrainedModel,
)

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...utils import auto_docstring, logging
from .configuration_vipllava import VipLlavaConfig


logger = logging.get_logger(__name__)


class VipLlavaModelOutputWithPast(LlavaModelOutputWithPast):
    pass


class VipLlavaCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
    pass


class VipLlavaMultiModalProjector(nn.Module):
    def __init__(self, config: VipLlavaConfig):
        super().__init__()
        num_feature_layers = 1 if isinstance(config.vision_feature_layers, int) else len(config.vision_feature_layers)
        self.projector_layernorm = nn.LayerNorm(
            num_feature_layers * config.vision_config.hidden_size, eps=config.projector_layernorm_eps
        )
        self.linear_1 = nn.Linear(
            num_feature_layers * config.vision_config.hidden_size,
            config.text_config.hidden_size,
            bias=True,
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)

    def forward(self, hidden_states):
        hidden_states = self.projector_layernorm(hidden_states)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
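
# Shape sketch for VipLlavaMultiModalProjector above (a hedged illustration, not
# executable library code). With VipLlava's default vision_feature_layers of
# [-2, -5, -8, -11, 6], five vision-tower layers are concatenated along the
# feature axis, so the projector input is num_feature_layers * hidden_size wide:
#
#   input:                       (num_images, num_patches, 5 * vision_config.hidden_size)
#   projector_layernorm:         same shape, normalized over the concatenated features
#   linear_1 -> act -> linear_2: (num_images, num_patches, text_config.hidden_size)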

class VipLlavaPreTrainedModel(LlavaPreTrainedModel):
    pass

class VipLlavaModel(LlavaModel):
    def get_image_features(
        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
    ):
        """
        Obtains image last hidden states from the vision tower and applies multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, list[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )
        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

        # If multiple feature layers are requested, drop the CLS token from each
        # selected layer and concatenate the layers along the feature dimension.
        if isinstance(vision_feature_layers, int):
            image_features = image_outputs.hidden_states[vision_feature_layers][:, 1:]
        else:
            image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
            image_features = torch.cat(image_features, dim=-1)
        image_features = self.multi_modal_projector(image_features)
        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaModelOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
            )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            # Replace the <image> placeholder embeddings with the projected image features.
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()

class VipLlavaForConditionalGeneration(LlavaForConditionalGeneration):
    def get_image_features(
        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
    ):
        return self.model.get_image_features(pixel_values=pixel_values, vision_feature_layers=vision_feature_layers)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaCausalLMOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            vision_feature_layers=vision_feature_layers,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute logits for the positions we actually need (saves memory at generation time).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )


__all__ = ["VipLlavaModel", "VipLlavaForConditionalGeneration", "VipLlavaPreTrainedModel"]