a
    hN                     @   s*   d Z ddlmZ G dd deZdgZdS )z$Speech processor class for SpeechT5.   )ProcessorMixinc                       s8   e Zd ZdZdZdZ fddZdd Zdd	 Z  Z	S )
SpeechT5Processora}  
    Constructs a SpeechT5 processor which wraps a feature extractor and a tokenizer into a single processor.

    [`SpeechT5Processor`] offers all the functionalities of [`SpeechT5FeatureExtractor`] and [`SpeechT5Tokenizer`]. See
    the docstring of [`~SpeechT5Processor.__call__`] and [`~SpeechT5Processor.decode`] for more information.

    Args:
        feature_extractor (`SpeechT5FeatureExtractor`):
            An instance of [`SpeechT5FeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`SpeechT5Tokenizer`):
            An instance of [`SpeechT5Tokenizer`]. The tokenizer is a required input.
    ZSpeechT5FeatureExtractorZSpeechT5Tokenizerc                    s   t  || d S )N)super__init__)selffeature_extractor	tokenizer	__class__ l/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/speecht5/processing_speecht5.pyr   %   s    zSpeechT5Processor.__init__c                 O   sn  | dd}| dd}| dd}| dd}| dd}|durT|durTtd|durl|durltd|du r|du r|du r|du rtd	|dur| j|g|R d|i|}n |dur| j|fi |}nd}|dur| j|||d
|}	|	d }
n*|dur*| j|fi |}	|	d }
nd}	|du r<|	S |	durj|
|d< |	d}|durj||d< |S )a  
        Processes audio and text input, as well as audio and text targets.

        You can process audio by using the argument `audio`, or process audio targets by using the argument
        `audio_target`. This forwards the arguments to SpeechT5FeatureExtractor's
        [`~SpeechT5FeatureExtractor.__call__`].

        You can process text by using the argument `text`, or process text labels by using the argument `text_target`.
        This forwards the arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.__call__`].

        Valid input combinations are:

        - `text` only
        - `audio` only
        - `text_target` only
        - `audio_target` only
        - `text` and `audio_target`
        - `audio` and `audio_target`
        - `text` and `text_target`
        - `audio` and `text_target`

        Please refer to the docstring of the above two methods for more information.
        audioNtexttext_targetaudio_targetsampling_ratez\Cannot process both `audio` and `text` inputs. Did you mean `audio_target` or `text_target`?z\Cannot process both `audio_target` and `text_target` inputs. Did you mean `audio` or `text`?zaYou need to specify either an `audio`, `audio_target`, `text`, or `text_target` input to process.)r   r   input_values	input_idslabelsattention_maskdecoder_attention_mask)pop
ValueErrorr   r   get)r   argskwargsr   r   r   r   r   inputstargetsr   r   r   r   r   __call__(   sJ     







zSpeechT5Processor.__call__c           
      O   sl  | dd}| dd}| dd}|dur<|dur<td|du r\|du r\|du r\td|dur| jj|g|R i |}n"|dur| jj|fi |}nd}|dur(d|v st|trd|d v r| jj|fi |}|d }n>| jj}| jj| j_| jj|g|R i |}|| j_|d }nd}|du r:|S |durh||d< |	d}	|	durh|	|d	< |S )
au  
        Collates the audio and text inputs, as well as their targets, into a padded batch.

        Audio inputs are padded by SpeechT5FeatureExtractor's [`~SpeechT5FeatureExtractor.pad`]. Text inputs are padded
        by SpeechT5Tokenizer's [`~SpeechT5Tokenizer.pad`].

        Valid input combinations are:

        - `input_ids` only
        - `input_values` only
        - `labels` only, either log-mel spectrograms or text tokens
        - `input_ids` and log-mel spectrogram `labels`
        - `input_values` and text `labels`

        Please refer to the docstring of the above two methods for more information.
        r   Nr   r   z:Cannot process both `input_values` and `input_ids` inputs.zZYou need to specify either an `input_values`, `input_ids`, or `labels` input to be padded.    r   r   )
r   r   r   padr   
isinstancelistZfeature_sizeZnum_mel_binsr   )
r   r   r   r   r   r   r   r   Zfeature_size_hackr   r   r   r   r    o   s@    






zSpeechT5Processor.pad)
__name__
__module____qualname____doc__Zfeature_extractor_classZtokenizer_classr   r   r    __classcell__r   r   r	   r   r      s   Gr   N)r&   Zprocessing_utilsr   r   __all__r   r   r   r   <module>   s    