"""PyTorch Fuyu model."""

from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...models.auto.modeling_auto import AutoModel
from ...utils import auto_docstring, can_return_tuple, logging
from .configuration_fuyu import FuyuConfig


logger = logging.get_logger(__name__)
@auto_docstring
class FuyuPreTrainedModel(PreTrainedModel):
    config: FuyuConfig
    base_model_prefix = "fuyu"
    supports_gradient_checkpointing = True
    _supports_attention_backend = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _no_split_modules = []
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
r   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )Zcustom_introc                       s   e Zd ZddiZed fddZdd Zdd	 Zd
d Zdd Z	e
jee
j e
je
jdddZe
jdddZe
je
je
jdddZede
je
je
jee
j ee
j ee ee
j ee ee ee ee eeef dddZ  ZS )	FuyuModelzlanguage_model.modellanguage_modelr   c                    s\   t  | |j| _|jj| _t|j| _t	
|j|j |j |j| _d| _|   d S )NF)super__init__Zpad_token_idr   text_config
vocab_sizer
   from_configr%   r   r   Z
patch_sizeZnum_channelshidden_sizevision_embed_tokensZgradient_checkpointing	post_initr   r   	__class__r   r   r(   C   s    
zFuyuModel.__init__c                 C   s
   | j  S N)r%   get_input_embeddingsr   r   r   r   r3   P   s    zFuyuModel.get_input_embeddingsc                 C   s   | j | d S r2   )r%   set_input_embeddingsr   valuer   r   r   r5   S   s    zFuyuModel.set_input_embeddingsc                 C   s
   || _ d S r2   r%   r   decoderr   r   r   set_decoderV   s    zFuyuModel.set_decoderc                 C   s   | j S r2   r8   r4   r   r   r   get_decoderY   s    zFuyuModel.get_decoder)word_embeddingscontinuous_embeddingsimage_patch_input_indicesreturnc              	   C   s   |j d t|ks0tdt|d|j d | }t|j d D ]}tj|| dkddd }|| | }|j d || j d krtd|| j d|j d| d	|| | |j|||f< qF|S )
a  This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
                indices in image_patch_input_indices for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchZnonzerotodevice)r   r=   r>   r?   Zoutput_embeddingsZ	batch_idxZdst_indicesZsrc_indicesr   r   r   gather_continuous_embeddings\   s(    z&FuyuModel.gather_continuous_embeddings)pixel_valuesc                    s    fdd|D }|S )a$  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        c                    s(   g | ] }  | j jjd qS )r   )r-   rI   r   dtypeZsqueeze).0patchr4   r   r   
<listcomp>   s   z0FuyuModel.get_image_features.<locals>.<listcomp>r   )r   rL   kwargspatch_embeddingsr   r4   r   get_image_features   s    
zFuyuModel.get_image_features)	input_idsinputs_embedsimage_featuresc                 C   s   |du r8||   tj| jjtj|jdk}|d}n|| jjk}| }|	d
||j}|jd |jd  }||  | krtd| d| |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        NrM   rJ   r   r   z6Image features and image tokens do not match: tokens: z, features )r3   rH   Ztensorr   Zimage_token_idlongrJ   allsum	unsqueezeZ	expand_asrI   rC   ZnumelrE   )r   rT   rU   rV   special_image_maskZn_image_tokensZn_image_featuresr   r   r   get_placeholder_mask   s    zFuyuModel.get_placeholder_maskN)rT   image_patchesimage_patches_indicesattention_maskposition_idsr   rU   	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictr@   c                 K   sv  |	dur|	n| j j}	|
dur |
n| j j}
|dur4|n| j j}|durH|n| j j}|durj|durjtdn2|dur~|j\}}n|dur|j\}}}ntd|du r|dur|jn|j}|dur| nd}t	j
||| t	j|d}|d}|du r| j |}|durP| |}t	j|dd|j|j}| j|||d}|||}| jf |||||	|
||d|}|S )	a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez4You have to specify either input_is or inputs_embedsr   rW   )dim)rU   rV   )rU   ra   rb   r   rd   re   rc   rf   )r   rd   re   rc   use_return_dictrE   rC   rJ   Zget_seq_lengthrH   ZarangerY   r\   r%   r3   rS   catrI   rM   r^   Zmasked_scatter)r   rT   r_   r`   ra   rb   r   rU   rc   rd   re   rf   rQ   Z
batch_sizeZ
seq_length_rJ   Zpast_key_values_lengthrR   r]   outputsr   r   r   forward   sR    




	zFuyuModel.forward)NNNNNNNNNNN)r    r!   r"   _checkpoint_conversion_mappingr   r(   r3   r5   r;   r<   rH   TensorlistrK   FloatTensorrS   
LongTensorr^   r   r   r   boolr   tupler   rl   __classcell__r   r   r0   r   r$   ;   sP   ,           
@auto_docstring(
    custom_intro="""
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    """
)
class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_embed_tokens": "model.vision_embed_tokens",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.model = FuyuModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        image_patches: Optional[torch.Tensor] = None,
        image_patches_indices: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100`
            are ignored (masked); the loss is only computed for the tokens with labels in
            `[0, ..., config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=True,
        )

        hidden_states = outputs[0]
        # Only compute logits for the requested trailing positions (an int selects a suffix).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        image_patches=None,
        image_patches_indices=None,
        cache_position=None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            cache_position=cache_position,
            **kwargs,
        )

        if cache_position[0] != 0:
            # Image inputs are only needed for the prefill step; later decoding steps
            # reuse the merged embeddings carried by the cache.
            model_inputs["image_patches_indices"] = None
            model_inputs["image_patches"] = None

        return model_inputs
__all__ = ["FuyuForCausalLM", "FuyuPreTrainedModel", "FuyuModel"]