a
    hq"                     @   s   d Z ddlZddlZddlmZ ddlmZmZ ddlm	Z	 ddl
mZmZmZ ddlmZmZ dd	lmZ G d
d deddZeeZG dd deZdgZdS )z
Processor class for Donut.
    N)contextmanager)OptionalUnion   )
ImageInput)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)loggingc                   @   s   e Zd Zi ZdS )DonutProcessorKwargsN)__name__
__module____qualname__	_defaults r   r   f/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/donut/processing_donut.pyr      s   r   F)totalc                       s   e Zd ZdZddgZdZdZd fdd	Zdee	e
eee eef  ee d	d
dZedd Zedd ZdddZedd Zedd Z  ZS )DonutProcessora  
    Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single
    processor.

    [`DonutProcessor`] offers all the functionalities of [`DonutImageProcessor`] and
    [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and
    [`~DonutProcessor.decode`] for more information.

    Args:
        image_processor ([`DonutImageProcessor`], *optional*):
            An instance of [`DonutImageProcessor`]. The image processor is a required input.
        tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*):
            An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
    image_processor	tokenizerZAutoImageProcessorZAutoTokenizerNc                    sr   d }d|v r"t dt |d}|d ur.|n|}|d u rBtd|d u rRtdt || | j| _d| _	d S )Nfeature_extractorzhThe `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.z)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.F)
warningswarnFutureWarningpop
ValueErrorsuper__init__r   current_processor_in_target_context_manager)selfr   r   kwargsr   	__class__r   r   r   9   s    
zDonutProcessor.__init__)imagestextr#   c           	      K   s   | j r| j||fi |S |du r2|du r2td| jtfd| jji|}|durj| j|fi |d }|dur|dur|d dd | j|fi |d }|du r|S |du r|S |d |d	< |d |d< |S dS )
a  
        When used in normal mode, this method forwards all its arguments to AutoImageProcessor's
        [`~AutoImageProcessor.__call__`] and returns its output. If used in the context
        [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's
        [`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
        NzBYou need to specify either an `images` or `text` input to process.Ztokenizer_init_kwargsZimages_kwargsZtext_kwargsZadd_special_tokensF	input_idslabels)	r!   r    r   Z_merge_kwargsr   r   Zinit_kwargsr   
setdefault)	r"   r&   r'   ZaudioZvideosr#   Zoutput_kwargsinputs	encodingsr   r   r   __call__M   s0    zDonutProcessor.__call__c                 C   s   | j j}t|ddg S )Nr(   r)   )r   model_input_nameslist)r"   Zimage_processor_input_namesr   r   r   r.   w   s    z DonutProcessor.model_input_namesc                 c   s0   t d d| _| j| _dV  | j| _d| _dS )z
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR.
        z`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your labels by using the argument `text` of the regular `__call__` method (either in the same call as your images inputs, or in a separate call.TNF)r   r   r!   r   r    r   r"   r   r   r   as_target_processor}   s    z"DonutProcessor.as_target_processorFc                 C   sX  |du r| j  }i }|r,td|tj}|du r8q,|| d }d|vrTq,|d|dd  }|tdtd  }t|}td| d|tj}	|	du r|	|d}q|	
 }	t|}
t|	}t|
 d| |tjtjB }|dur|
d }d|v rVd|v rV| j|d|d	}|rt|dkrL|d
 }|||< ng ||< |dD ]R}| }||v r|d
 dkr|dd dkr|dd }|| | qht|| dkr|| d
 ||< |||	t|	 d  }|dd dkr|g| j|dd d|d	 S q|rB|r>|gS |S |rLg S d|iS dS )zS
        Convert a (generated) token sequence into an ordered JSON format.
        Nz<s_>   z</s_ z(.*?)T)is_inner_valueadded_vocabr   z<sep/><z/>   Ztext_sequence)r   Zget_added_vocabresearch
IGNORECASEstartindexlenescapereplacegroupDOTALLstrip
token2jsonsplitappendfind)r"   tokensr5   r6   outputZpotential_startZstart_tokenkeyZkey_escapedZ	end_tokenZstart_token_escapedZend_token_escapedcontentvalueleafr   r   r   rE      sX    





* zDonutProcessor.token2jsonc                 C   s   t dt | jS )Nzg`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.)r   r   r   image_processor_classr0   r   r   r   feature_extractor_class   s
    z&DonutProcessor.feature_extractor_classc                 C   s   t dt | jS )Nz[`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.)r   r   r   r   r0   r   r   r   r      s
    z DonutProcessor.feature_extractor)NN)NNNN)FN)r   r   r   __doc__
attributesrO   Ztokenizer_classr   r   r   r   strr/   r   r
   r	   r   r-   propertyr.   r   r1   rE   rP   r   __classcell__r   r   r$   r   r   %   s.       *


:
r   )rQ   r:   r   
contextlibr   typingr   r   Zimage_utilsr   Zprocessing_utilsr   r   r	   Ztokenization_utils_baser
   r   utilsr   r   Z
get_loggerr   loggerr   __all__r   r   r   r   <module>   s   
 4