a
    h!&                     @   s   d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZmZ ddlmZmZ dd	lmZ d
dlmZ eeZG dd deZdgZdS )zq
Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
    N)OptionalUnion   )BatchFeature)ProcessorMixin)
AddedTokenPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypelogging)
VideoInput   )AutoTokenizerc                       s   e Zd ZdZg dZdZdZdZd fdd	Zde	e
eeee ee f ee
eeef e
eeef ee eee ee eeeeeeee
eef  edddZedd Z fddZe fddZ  ZS )InstructBlipVideoProcessora  
    Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single
    processor.

    [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoImageProcessor`] and [`AutoTokenizer`]. See the
    docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information.

    Args:
        video_processor (`InstructBlipVideoVideoProcessor`):
            An instance of [`InstructBlipVideoVideoProcessor`]. The video processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
        qformer_tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
    )video_processor	tokenizerqformer_tokenizerZAutoVideoProcessorr   Nc                    sP   t |ds.tdddd| _|j| jgdd n|j| _|| _t ||| d S )Nvideo_tokenz<video>FT)
normalizedZspecial)Zspecial_tokens)hasattrr   r   Z
add_tokensnum_query_tokenssuper__init__)selfr   r   r   r   kwargs	__class__ ~/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.pyr   ?   s    
z#InstructBlipVideoProcessor.__init__TFr   )imagestextadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsreturn_lengthverbosereturn_tensorsreturnc                    s  |du r|du rt di }|durPt|tr8|g}n t|tsXt|d tsXt d| jf ||||||||	|
||||||d|}|d|d< |d|d	< |dur|| j8 }| jf ||||||||	|
|||||dd|}|durF| jj	| j d
 }| j|d|	|
||||dd	|D ]"  fdd|  D | < q"|
| |durr| j||d}|
| t||d}|S )a%  
        This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        Nz3You have to specify at least one of images or text.r   zAInvalid input text. Please provide a string, or a list of strings)r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   Z	input_idsqformer_input_idsZattention_maskqformer_attention_mask   F)r#   r)   r*   r+   r,   r-   r.   r0   c                    s   g | ]}  | qS r   r   ).0samplekZvideo_text_encodingr   r    
<listcomp>       z7InstructBlipVideoProcessor.__call__.<locals>.<listcomp>)r0   )Ztensor_type)
ValueError
isinstancestrlistr   popr   r   r   contentupdater   r   )r   r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r   encodingZqformer_text_encodingZtext_encodingZvideo_tokensZimage_encodingr   r7   r    __call__H   s    



 


z#InstructBlipVideoProcessor.__call__c                 C   s$   | j j}| jj}ddg}|| | S )Nr2   r3   )r   model_input_namesr   )r   Ztokenizer_input_namesZvideo_processor_input_namesZqformer_input_namesr   r   r    rD      s    z,InstructBlipVideoProcessor.model_input_namesc                    s   t j|rtd| dt j|dd t j|d}| j| d| jv }|r^| j	d t
 j|fi |}|r|  jdg7  _|S )NzProvided path (z#) should be a directory, not a fileT)exist_okr   )ospathisfiler;   makedirsjoinr   save_pretrained
attributesremover   )r   Zsave_directoryr   Zqformer_tokenizer_pathZqformer_presentoutputsr   r   r    rK      s    
z*InstructBlipVideoProcessor.save_pretrainedc                    s>   t  j|fi |}t|tr&|d }tj|dd}||_|S )Nr   r   )Z	subfolder)r   from_pretrainedr<   tupler   r   )clsZpretrained_model_name_or_pathr   	processorr   r   r   r    rO      s    
z*InstructBlipVideoProcessor.from_pretrained)N)NNTFNNr   NNFFFFFTN)__name__
__module____qualname____doc__rL   Zvideo_processor_classZtokenizer_classZqformer_tokenizer_classr   r   r   r
   r	   r>   boolr=   r   r   r   intr   r   rC   propertyrD   rK   classmethodrO   __classcell__r   r   r   r    r   '   s\                   h
r   )rV   rF   typingr   r   Zimage_processing_utilsr   Zprocessing_utilsr   Ztokenization_utils_baser   r   r	   r
   r   utilsr   r   Zvideo_utilsr   autor   Z
get_loggerrS   loggerr   __all__r   r   r   r    <module>   s   
 1