a
    hM(                     @   s   d dl mZmZmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZmZ e
 rnd dlmZ ddlmZ e rdd	lmZ e	 rd d
lZddlmZ eeZeedddG dd deZd
S )    )AnyUnionoverload   )GenerationConfig)add_end_docstringsis_tf_availableis_torch_availableis_vision_availableloggingrequires_backends   )Pipelinebuild_pipeline_init_args)Image)
load_image)'TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMESN)$MODEL_FOR_VISION_2_SEQ_MAPPING_NAMEST)Zhas_tokenizerZhas_image_processorc                       s   e Zd ZdZdZdZdZdZdZe	ddZ
 fddZdd	d
Zeeedf eeeeef  dddZeeee ed f eeeeeef   dddZeeee ded f d fddZdddZdd Zdd Z  ZS )ImageToTextPipelinea  
    Image To Text pipeline using a `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image to text pipeline can currently be loaded from pipeline() using the following task identifier:
    "image-to-text".

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
    TF   )max_new_tokensc                    s8   t  j|i | t| d | | jdkr.tnt d S )NZvisiontf)super__init__r   Zcheck_model_type	frameworkr   r   )selfargskwargs	__class__ `/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/pipelines/image_to_text.pyr   T   s
    
zImageToTextPipeline.__init__Nc                 C   s   i }i }|d ur||d< |d ur(||d< |d ur8||d< |d urb|d urXd|v rXt d|| | jd urv| j|d< | jd ur| j|d< | j|d< ||i fS )Nprompttimeoutr   zp`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use only 1 versionassistant_model	tokenizerassistant_tokenizer)
ValueErrorupdater$   r&   r%   )r   r   generate_kwargsr"   r#   Zforward_paramsZpreprocess_paramsr    r    r!   _sanitize_parameters[   s(    





z(ImageToTextPipeline._sanitize_parameterszImage.Image)inputsr   returnc                 K   s   d S Nr    r   r+   r   r    r    r!   __call__v   s    zImageToTextPipeline.__call__c                 K   s   d S r-   r    r.   r    r    r!   r/   y   s    )r+   c                    s6   d|v r| d}|du r"tdt j|fi |S )a  
        Assign labels to the image(s) passed as inputs.

        Args:
            inputs (`str`, `list[str]`, `PIL.Image` or `list[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The amount of maximum tokens to generate. By default it will use `generate` default.

            generate_kwargs (`Dict`, *optional*):
                Pass it to send all of these arguments directly to `generate` allowing full control of this function.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
        imagesNzBCannot call the image-to-text pipeline without an inputs argument!)popr'   r   r/   r.   r   r    r!   r/   |   s
    
c                 C   s  t ||d}|d urFtd t|ts>tdt| d| jjj	}|dkr| j
|| jd}| jdkrv|| j}| j|dd	j}| jjg| }t|d
}|d|i n|dkr| j
||| jd}| jdkr|| j}n^|dkr4| j
|| jd}| jdkr|| j}| j|| jd}|| ntd| dn(| j
|| jd}| jdkrn|| j}| jjj	dkr|d u rd |d< |S )N)r#   u   Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48 of 🤗 Transformers. Use the `image-text-to-text` pipeline insteadz&Received an invalid text input, got - zy - but expected a single string. Note also that one single text can be provided for conditional image to text generation.git)r0   return_tensorsptF)textZadd_special_tokensr   	input_idsZ
pix2struct)r0   Zheader_textr3   zvision-encoder-decoder)r3   zModel type z- does not support conditional text generation)r   loggerZwarning_once
isinstancestrr'   typemodelconfig
model_typeZimage_processorr   toZdtyper%   r6   Zcls_token_idtorchZtensorZ	unsqueezer(   )r   imager"   r#   r=   model_inputsr6   Ztext_inputsr    r    r!   
preprocess   sF    





zImageToTextPipeline.preprocessc                 K   sp   d|v r4t |d tr4tdd |d D r4d |d< d|vrF| j|d< || jj}| jj|fi ||}|S )Nr6   c                 s   s   | ]}|d u V  qd S r-   r    ).0xr    r    r!   	<genexpr>       z/ImageToTextPipeline._forward.<locals>.<genexpr>generation_config)r8   listallrG   r1   r;   Zmain_input_namegenerate)r   rA   r)   r+   model_outputsr    r    r!   _forward   s    
zImageToTextPipeline._forwardc                 C   s0   g }|D ]"}d| j j|ddi}|| q|S )NZgenerated_textT)Zskip_special_tokens)r%   decodeappend)r   rK   recordsZ
output_idsrecordr    r    r!   postprocess   s    zImageToTextPipeline.postprocess)NNNN)NN)__name__
__module____qualname____doc__Z_pipeline_calls_generateZ_load_processorZ_load_image_processorZ_load_feature_extractorZ_load_tokenizerr   Z_default_generation_configr   r*   r   r   r9   r   rH   dictr/   rB   rL   rQ   __classcell__r    r    r   r!   r   .   s&   
(4&$
3r   )typingr   r   r   Z
generationr   utilsr   r   r	   r
   r   r   baser   r   ZPILr   Zimage_utilsr   Zmodels.auto.modeling_tf_autor   r?   Zmodels.auto.modeling_autor   Z
get_loggerrR   r7   r   r    r    r    r!   <module>   s    
