a
    h                     @   s   d dl mZmZ d dlZddlmZ ddlmZm	Z	 ddl
mZmZmZmZmZ ddlmZmZ G dd	 d	eZG d
d deddZG dd deZdgZdS )    )OptionalUnionN   )BatchFeature)
ImageInputmake_nested_list_of_images)AudioKwargsImagesKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   @   s   e Zd ZU ee ed< dS )Gemma3nImagesKwargsZdo_convert_rgbN)__name__
__module____qualname__r   bool__annotations__ r   r   j/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/gemma3n/processing_gemma3n.pyr      s   
r   c                   @   s*   e Zd ZU eed< eed< dddiiZdS )Gemma3nProcessorKwargsaudio_kwargsimages_kwargstext_kwargspaddingFN)r   r   r   r   r   r   	_defaultsr   r   r   r   r      s   
r   F)totalc                       s   e Zd ZdZg dZdZdZdZdeed	 fd
dZ	de
eeeee ee f eeejee eej eee  f  ee edddZ  ZS )Gemma3nProcessorat  
    A processor for Gemma 3n, wrapping the full capabilities of a feature extractor, image processor, and tokenizer
    into a single processor.

    Args:
        feature_extractor (`Gemma3nAudioFeatureExtractor`):
            Feature extractor that converts raw audio waveforms into MEL spectrograms for the audio encoder. This
            should return a `BatchFeature` with `input_features` and `input_features_mask` features.
        image_processor (`SiglipImageProcessorFast`):
            Image processor that prepares batches of images for the vision encoder. This should return a `BatchFeature`
            with a `pixel_values` feature.
        tokenizer (`GemmaTokenizerFast`):
            The text tokenizer for the model.
        chat_template (`string`, *optional*):
            A Jinja template for generating text prompts from a set of messages.
        audio_seq_length (int, *optional*, defaults to 188):
            The number of audio soft tokens that will be added to the text prompt
        image_seq_length (int, *optional*, defaults to 256):
            The number of image soft tokens that should be added to
    )feature_extractorimage_processor	tokenizerZAutoFeatureExtractorZAutoImageProcessorZAutoTokenizerN      )audio_seq_lengthimage_seq_lengthc           
         s   || _ |j| _|j| _|j| _d|jg| }d|j | |j d| _|| _|j| _|j	| _	|j
| _
d|j
g| }	d|j	 |	 |j d| _t jf ||||d| d S )N z

)r   r    r!   chat_template)r$   audio_token_idZ	boa_tokenaudio_tokenjoinZ	eoa_tokenfull_audio_sequencer%   image_token_idZ	boi_tokenimage_tokenZ	eoi_tokenfull_image_sequencesuper__init__)
selfr   r    r!   r'   r$   r%   kwargsZaudio_tokens_expandedZimage_tokens_expanded	__class__r   r   r0   C   s(    
zGemma3nProcessor.__init__)imagestextaudior2   returnc                    s  |d u r |d u r |d u r t d jtfd jji|}t|trL|g}n t|tslt|d tslt d|d ur j|fi |d }|s fdd|D } fdd|D }ni }|d urH j	
|}t|} j	|fi |d	 }	|s fd
d|D }t|t|kr4t dt| dt| d fdd|D }ni }	|d dd }
 jf d|i|d ddi} j||dgd |d }t|}d|| jk< d|| jk< dd | D }| |d< ti ||	||
dS )Nz5Provide at least one of `text`, `images`, or `audio`.Ztokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr   c                    s   g | ]
} j qS r   )r)   ).0_r1   r   r   
<listcomp>}       z-Gemma3nProcessor.__call__.<locals>.<listcomp>c                    s   g | ]}|  j jqS r   )replacer)   r+   r9   promptr;   r   r   r<      r=   r   c                    s"   g | ]}d   jgt| qS ) )r*   r-   len)r9   r5   r;   r   r   r<      r=   z1Received inconsistently sized batches of images (z) and text (z).c                    s   g | ]}|  j jqS r   )r>   r-   r.   r?   r;   r   r   r<      r=   r   return_tensorsr6   npimage)Z
modalitiesZ	input_ids   r   c                 S   s   i | ]\}}||  qS r   )tolist)r9   kvr   r   r   
<dictcomp>   r=   z-Gemma3nProcessor.__call__.<locals>.<dictcomp>token_type_ids)dataZtensor_type)
ValueErrorZ_merge_kwargsr   r!   Zinit_kwargs
isinstancestrlistr   r    Zfetch_imagesr   rB   popZ_check_special_mm_tokensrD   Z
zeros_liker,   r(   itemsrG   r   )r1   r5   r6   r7   Zvideosr2   Zoutput_kwargsZaudio_inputsZbatched_imagesZimage_inputsrC   Ztext_inputsZ	array_idsrK   r   r;   r   __call__c   sR    

 
zGemma3nProcessor.__call__)Nr"   r#   )NNNN)r   r   r   __doc__
attributesZfeature_extractor_classZimage_processor_classZtokenizer_classintr0   r   r   r   r   rP   r   rD   Zndarrayfloatr   r   r   rS   __classcell__r   r   r3   r   r   (   s.      "    &r   )typingr   r   numpyrD   Zfeature_extraction_utilsr   Zimage_utilsr   r   Zprocessing_utilsr   r	   r
   r   r   Ztokenization_utils_baser   r   r   r   r   __all__r   r   r   r   <module>   s   
}