a
    h                     @   s   d dl Z d dlmZmZ d dlZddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZ ddlmZmZ ddlmZ G d	d
 d
eZG dd deddZG dd deZdgZdS )    N)OptionalUnion   )BatchFeature)
ImageInputmake_nested_list_of_images)ImagesKwargsMultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)	to_py_objc                   @   sJ   e Zd ZU ee ed< ee ed< ee ed< ee ed< ee ed< dS )Gemma3ImagesKwargsdo_pan_and_scanpan_and_scan_min_crop_sizepan_and_scan_max_num_crops"pan_and_scan_min_ratio_to_activatedo_convert_rgbN)__name__
__module____qualname__r   bool__annotations__intfloat r   r   h/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/gemma3/processing_gemma3.pyr      s
   
r   c                   @   s2   e Zd ZU eed< dddddddddd	Zd
S )Gemma3ProcessorKwargsimages_kwargsFT)paddingreturn_mm_token_type_ids      g333333?)r   r   r   r   r   )text_kwargsr    N)r   r   r   r   r   	_defaultsr   r   r   r   r   $   s   
r   F)totalc                       sx   e Zd ZddgZdZdZded fdd	Zdee	e
eee
 ee f ee ed
ddZdddZedd Z  ZS )Gemma3Processorimage_processor	tokenizerZAutoImageProcessorZAutoTokenizerNr#   )image_seq_lengthc                    sh   || _ |j| _|j| _|j| _d|jg| }d|j | |j d| _t jf |||d| d S )N z

)r)   r*   chat_template)	r+   image_token_id	boi_tokenZimage_tokenjoinZ	eoi_tokenfull_image_sequencesuper__init__)selfr)   r*   r-   r+   kwargsZimage_tokens_expanded	__class__r   r   r3   :   s    zGemma3Processor.__init__)imagestextr5   returnc                    s  |d u r|d u rt djtfdjji|}t|trD|g}n t|tsdt|d tsdtdi }|d urj	
|}t|}j	|fi |d }|sfdd|D }t|t|krt dt| d	t| d
t|d  fdd|D }	tt|||	D ]\}
\}} dd tj|D }t|t|krft dt| dt| dttt |D ]^\}}|rxdj ddjg|  }|d | | ||tj d   }|||
< qxqfdd|D }|d dd }|d dd}jf d|i|d }j||dgd |rpt|d }t|}d||jk< | |d< ti |||dS ) Nz+Provide at least one of `text` or `images`.Ztokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr    c                    s"   g | ]}d   jgt| qS ) )r0   r/   len.0r8   r4   r   r   
<listcomp>n       z,Gemma3Processor.__call__.<locals>.<listcomp>z1Received inconsistently sized batches of images (z) and text (z).	num_cropsc                    s&   g | ]} fd dt t|D qS )c                    s   g | ]}  d qS )r   )pop)r>   _rB   r   r   r@   w   rA   z7Gemma3Processor.__call__.<locals>.<listcomp>.<listcomp>)ranger<   r=   rE   r   r   r@   w   rA   c                 S   s   g | ]}|  qS r   )start)r>   mr   r   r   r@   y   rA   zPrompt contained z image tokens but received z images.zHere is the original image z0 and here are some crops to help you see better r;   c                    s   g | ]}|  j jqS r   )replacer/   r1   )r>   promptr?   r   r   r@      rA   r%   return_tensorsr"   Fr9   image)Z
modalitiesZ	input_ids   token_type_ids)dataZtensor_type)
ValueErrorZ_merge_kwargsr   r*   Zinit_kwargs
isinstancestrlist	TypeErrorr)   Zfetch_imagesr   r<   r   rC   	enumerateziprefinditerr/   reversedr0   Z_check_special_mm_tokensnparrayZ
zeros_liker.   tolistr   )r4   r8   r9   ZvideosZaudior5   Zoutput_kwargsZimage_inputsZbatched_imagesZbatch_num_cropsZ	batch_idxrJ   Zimage_indexesnumidxZformatted_image_textrK   r"   Ztext_inputsZ	array_idsZmm_token_type_idsr   )rB   r4   r   __call__P   sf    

&
zGemma3Processor.__call__c                 K   sH   i }|dur:| j gt| }dgt| }|||d tf i |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        NrM   )num_image_tokensnum_image_patches)r+   r<   updater	   )r4   Zimage_sizesr5   Zvision_datar`   ra   r   r   r   _get_num_multimodal_tokens   s    z*Gemma3Processor._get_num_multimodal_tokensc                 C   s0   | j jdg }| jj}dd |D }t|| S )NrN   c                 S   s   g | ]}|d kr|qS rE   r   )r>   namer   r   r   r@      rA   z5Gemma3Processor.model_input_names.<locals>.<listcomp>)r*   model_input_namesr)   rS   )r4   Ztokenizer_input_namesZimage_processor_input_namesr   r   r   re      s    z!Gemma3Processor.model_input_names)Nr#   )NNNN)N)r   r   r   
attributesZimage_processor_classZtokenizer_classr   r3   r   r   r   r   rS   r   r   r   r_   rc   propertyre   __classcell__r   r   r6   r   r(   5   s*         K
r(   )rW   typingr   r   numpyrZ   Zfeature_extraction_utilsr   Zimage_utilsr   r   Zprocessing_utilsr   r	   r
   r   r   Ztokenization_utils_baser   r   utilsr   r   r   r(   __all__r   r   r   r   <module>   s    