a
    h.                     @   s   d Z ddlmZ ddlmZmZmZ ddlmZ ddl	m
Z
mZmZ ddlmZmZmZmZ ddlmZmZ dd	lmZ erdd
lmZ eeZedddZdd ZG dd deddZG dd deddZG dd deZ dgZ!dS )z
Processor class for IDEFICS2.
    )
accumulate)TYPE_CHECKINGOptionalUnion   )BatchFeature)
ImageInputis_valid_image
load_image)ImagesKwargsProcessingKwargsProcessorMixinUnpack)
AddedToken	TextInput)logging)PreTokenizedInput)returnc                 C   s   t | to| dS )Nhttp)
isinstancestr
startswith)val r   l/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/idefics2/processing_idefics2.pyis_url)   s    r   c                 C   s   t | pt| S N)r   r	   )elemr   r   r   is_image_or_image_url-   s    r   c                   @   s   e Zd ZU ee ed< dS )Idefics2ImagesKwargsimage_seq_lenN)__name__
__module____qualname__r   int__annotations__r   r   r   r   r   1   s   
r   F)totalc                   @   s(   e Zd ZU eed< ddddi dZdS )Idefics2ProcessorKwargsimages_kwargsTF)add_special_tokenspaddingZis_split_into_words)text_kwargsr(   N)r!   r"   r#   r   r%   	_defaultsr   r   r   r   r'   5   s   
r'   c                       s   e Zd ZdZddgZdZdZdeee	 d fd	d
Z
dd Zdeeee eee  f eedee ed f ee edddZ  ZS )Idefics2Processora  
    Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor.

    [`IdeficsProcessor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See
    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.

    Args:
        image_processor (`Idefics2ImageProcessor`):
            An instance of [`Idefics2ImageProcessor`]. The image processor is a required input.
        tokenizer (`PreTrainedTokenizerBase`, *optional*):
            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
        image_seq_len (`int`, *optional*, defaults to 64):
            The length of the image sequence i.e. the number of <image> tokens per image in the input.
            This parameter is used to build the string from the input prompt and image tokens and should match the
            config.perceiver_config.resampler_n_latents value for the model used.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    image_processor	tokenizerZIdefics2ImageProcessorZAutoTokenizerN@   )r    chat_templatec                    s   |d u rt d|d u r t dt|dsxtddddj| _tddddj| _d	| j| jgi}|| || j| _n|j	| _|j| _|j| _td
ddd| _
|d	| j
gi || _t j|||d d S )Nz)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.image_tokenz<fake_token_around_image>FT)
normalizedZspecialz<image>Zadditional_special_tokensz<end_of_utterance>)r1   )
ValueErrorhasattrr   contentfake_image_tokenr2   r)   Zconvert_tokens_to_idsZimage_token_idZimage_boundary_tokenZend_of_utterance_tokenr    super__init__)selfr.   r/   r    r1   kwargsZtokens_to_add	__class__r   r   r9   Z   s"    

zIdefics2Processor.__init__c                 C   sT   g }|D ]F}g }|D ].}t |r,|| qt|r|t| q|| q|S r   )r	   appendr   r
   )r:   ZpromptsZprompt_imagespromptimagesr   r   r   r   _extract_images_from_promptss   s    z.Idefics2Processor._extract_images_from_promptsr   )r@   textr;   r   c              
      s  |du rdu rt d| jtfd| jji|}|d dd}|durN|n| j}|d dd}g }	i }
|dur\t|tr|g}n t|t	st|d tst d	| j
}| j}| ||  | }| jjr|d
 }|d
9 }g }|D ]@}|	|| |||}|| | | }|| q| j|fi |d }| j||dgd |
| durtrzggntt	tfrtd r|durt|	tkrt d| dt|	 d| dt d	dgt	t|	   fddtt|	D ngn>tt	tfsTtd t	tfsTtd d sTt ddd D }|dur||	kst d|	 d| ddd D | jfi |d }|
| t|
|dS )a
  
        Processes the input prompts and returns a BatchEncoding.

        Example:

        ```python
        >>> import requests
        >>> from transformers import Idefics2Processor
        >>> from transformers.image_utils import load_image

        >>> processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2)
        >>> processor.image_processor.do_image_splitting = False  # Force as False to simplify the example

        >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
        >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"

        >>> image1, image2 = load_image(url1), load_image(url2)
        >>> images = [[image1], [image2]]

        >>> text = [
        ...     "<image>In this image, we see",
        ...     "bla bla bla<image>",
        ... ]
        >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True)
        >>> input_ids = outputs.input_ids
        >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
        >>> print(input_tokens)
        ['<s><fake_token_around_image><image><image><fake_token_around_image> In this image, we see', '<s> bla bla bla<fake_token_around_image><image><image><fake_token_around_image>']
        ```

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, *optional*):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. If is of type `list[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
            text (`Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).

                Wherever an image token, `<image>` is encountered it is expanded to
                `<fake_token_around_image>` + `<image>` * `image_seq_len` * <fake_token_around_image>`.
            return_tensors (`Union[str, TensorType]`, *optional*):
                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
                information.

        Nz+You must provide either `text` or `images`.Ztokenizer_init_kwargsr(   r    r+   return_tensorsr   zAInvalid input text. Please provide a string, or a list of strings   image)Z
modalitieszThe total number of zP tokens in the prompts should be the same as the number of images passed. Found  z tokens and z images.c                    s$   g | ]} |  |d    qS )   r   ).0iZcumsum_images_in_textr@   r   r   
<listcomp>   s   z.Idefics2Processor.__call__.<locals>.<listcomp>zdInvalid input images. Please provide a single image or a list of images or a list of list of images.c                 S   s   g | ]}t |qS r   )lenrH   sampler   r   r   rK          z!The number of images in the text z and images  z should be the same.c                 S   s   g | ]}d d |D qS )c                 S   s   g | ]}t |qS r   )r
   )rH   Zimr   r   r   rK     rO   z9Idefics2Processor.__call__.<locals>.<listcomp>.<listcomp>r   rM   r   r   r   rK     rO   )Ztensor_type)r4   Z_merge_kwargsr'   r/   Zinit_kwargspopr    r   r   listr7   r2   r.   Zdo_image_splittingr>   countreplaceZ_check_special_mm_tokensupdater   tuplesumrL   r   ranger   )r:   r@   rB   ZaudioZvideosr;   Zoutput_kwargsr    rC   Zn_images_in_textinputsr7   r2   Z	image_strZprompt_stringsrN   Ztext_inputsZn_images_in_imagesZimage_inputsr   rJ   r   __call__   s    6








zIdefics2Processor.__call__)Nr0   N)NNNN)r!   r"   r#   __doc__
attributesZimage_processor_classZtokenizer_classr$   r   r   r9   rA   r   r   rQ   r   r   r'   r   rY   __classcell__r   r   r<   r   r-   B   s&        r-   N)"rZ   	itertoolsr   typingr   r   r   Zfeature_extraction_utilsr   Zimage_utilsr   r	   r
   Zprocessing_utilsr   r   r   r   Ztokenization_utils_baser   r   utilsr   r   Z
get_loggerr!   loggerboolr   r   r   r'   r-   __all__r   r   r   r   <module>   s"   
 I