a
    h@                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlZddl	m
Z
mZ e rNd dlZe
 r\d dlZddlmZmZ ddlmZ ddlmZmZmZmZ dd	lmZmZ G d
d deddZG dd deddZG dd deZdgZdS )    N)Path)AnyOptionalUnion   )is_soundfile_availableis_torch_available)
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   @   s"   e Zd ZU eeeef  ed< dS )CsmAudioKwargsencoded_length_kwargsN)__name__
__module____qualname__r   dictstrr   __annotations__ r   r   b/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/csm/processing_csm.pyr   %   s   
r   F)totalc                   @   sJ   e Zd ZU eed< ddddg dg dg ddd	d
dddidZdS )CsmProcessorKwargsaudio_kwargsTleftF)paddingZpadding_sideZadd_special_tokens)   r         r   r"   
   r   r"      r   r"      r      )r"   r"   r"   r'   r"   r"      r"   r"      r"   r"   r#   r"      )r"   r"   r"   r"   r"   r"   r"   r"   r"   r"   r"   r"   r"   r"   r"   )kernel_sizesstrides	dilationsuse_causal_convi]  )r   sampling_ratereturn_tensorspt)text_kwargsr   common_kwargsN)r   r   r   r   r   	_defaultsr   r   r   r   r   )   s   
	r   c                       s   e Zd ZdZddgZdZdZd fdd	Zedd	d
Z	e
eeeeeeef  f ee dddZdeeeeee ee f  ee
 ee ee ee dddZedd Z  ZS )CsmProcessora  
    Constructs a Csm processor which wraps [`EncodecFeatureExtractor`] and
    [`PretrainedTokenizerFast`] into a single processor that inherits both the audio feature extraction and
    tokenizer functionalities. See the [`~CsmProcessor.__call__`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
        ```python
        from transformers import CsmProcessor
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        audio = ds[0]["audio"]["array"]

        processor = CsmProcessor.from_pretrained("sesame/csm-1b")

        processor(
            text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"],
            audio=audio,
            text_kwargs = {"padding": False},
            audio_kwargs = {"sampling_rate": 16000},
            common_kwargs = {"return_tensors": "pt"},
        )
        # this should error out because EncodecFeatureExtractor expects a 24kHz audio :)
        ```

    Args:
        feature_extractor ([`EncodecFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.

    feature_extractor	tokenizerZEncodecFeatureExtractorZPreTrainedTokenizerFastNc                    sv   t |ds d| _|| j| _n|j| _|j| _t |dsPd| _|| j| _n|j| _|j| _t j|||d d S )Naudio_tokenz	<|AUDIO|>audio_eos_tokenz<|audio_eos|>)chat_template)hasattrr8   Zconvert_tokens_to_idsaudio_token_idr9   audio_eos_token_idsuper__init__)selfr6   r7   r:   	__class__r   r   r?   f   s    

zCsmProcessor.__init__c                 C   s   | }|du s$|du s$|du s$|du r(|S t |||D ]\}}}|d | d }	|| }
|
d }|
| }||	 |
 | d }t|d }|| | |
 }|| }|r|
}|}n|}|| }|| | }|||d   d | d }q4|S )a|  
        Compute the length of the encoded audio sequence.

        Args:
            audio_length (int): The length of the audio sequence.
            kernel_sizes (list[int]): The kernel sizes for the convolutional layers.
            strides (list[int]): The strides for the convolutional layers.
            use_causal_conv (bool): Whether to use causal convolutions.
        Nr"   r*   )zipmathceil)Zaudio_lengthr+   r,   r-   r.   Z
cur_lengthZkernel_sizeZstrideZdilationZeffective_kernel_sizeZpadding_totalpadding_rightpadding_leftZn_framesZideal_lengthZextra_paddingr   r   r   _get_encoded_length|   s(     z CsmProcessor._get_encoded_length)audiosaving_pathkwargsc           	      K   s   t  stdt|}t|ttfr,|g}n(t|ttfrLtdd |D sTt	dt
|t
|krlt	d| jtfi |}|d }|d }t||D ]2\}}t|tjr|   }t||| qd S )Nz/Please install `soundfile` to save audio files.c                 s   s   | ]}t |ttfV  qd S N)
isinstancer   r   ).0pr   r   r   	<genexpr>       z*CsmProcessor.save_audio.<locals>.<genexpr>zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer   r/   )r   ImportErrorr
   rM   r   r   listtupleall
ValueErrorlen_merge_kwargsr   rC   torchTensorcpufloatnumpysfwrite)	r@   rI   rJ   rK   output_kwargsr   r/   Zaudio_valuerO   r   r   r   
save_audio   s(     zCsmProcessor.save_audioF      ?)textrI   output_labelsdepth_decoder_labels_ratiorK   c              
      s  j tfdjji|}|d }|d }|d }	|	dd}
|
dkrXtjj dt|t	rj|g}n(t|t
tfrtd	d
 |D stdfdd|D }d}|durt|}t|}t|dkr|t|kr|du rtdntd| d| d|dur|di   fdd|D }| }g }|D ]t}g }j|v r|d}j| }|| |jdd}qFd|v r|d|dd}q|| q>|}j|fi |}i }|| |dur|dd g g  }}d}|D ]}|dkr2|td |tdg n`|tjdd ||||  D dd |tdd ||||  D jdd ||7 }q j|fi |}|dd || tdd
 |D fdd|D }tj|dd|d < |r|d! jk }|j d }|d"krHt!|dt"|d|   }|| }n|}t#|d! jk|d! j$kB |d! d#}d$||dddf |dddf f< ||d%< t%||
d&S )'a  
        Main method to prepare text(s) and audio to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode
        the text. To prepare the audio, this method forwards the `audio` arguments to
        EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`]. Please refer
        to the docstring of the above two methods for more information.

        Args:
            audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
                tensor.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            output_labels (bool, *optional*, default=False):
                Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`.
                - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
                - `-100` will be ignored in the loss computation
                - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)
            depth_decoder_labels_ratio (float, *optional*, default=1.0):
                The ratio of audio frames to keep for the depth decoder labels.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **labels** -- List of labels for the audio frames. Returned when `output_labels=True`.
        Ztokenizer_init_kwargsr2   r   r3   r0   Nr1   z% only supports `return_tensors='pt'`.c                 s   s   | ]}t |tV  qd S rL   )rM   r   rN   tr   r   r   rP     rQ   z(CsmProcessor.__call__.<locals>.<genexpr>zAInvalid input text. Please provide a string, or a list of stringsc                    s   g | ]}|  jqS r   )countr8   rf   )r@   r   r   
<listcomp>  rQ   z)CsmProcessor.__call__.<locals>.<listcomp>r   z@No audio were provided, but there are audio tokens in the promptz)The number of audio tokens in each text (z7) should be the same as the number of provided audios (z).r   c                    s$   g | ]}j |jd  fi  qS )rH   shape)rN   Zaudio_array)r   r@   r   r   ri     s   z<placeholder>r"   Zreturn_attention_maskrk   c                 S   s(   g | ] }t |tjr |  n|qS r   )rM   rY   rZ   r[   r]   rN   elr   r   r   ri   >  s   )Zaxisc                 S   s   g | ]}|j d  qS rj   rl   rm   r   r   r   ri   F  rQ   )dimpadding_maskc                 s   s   | ]}|j d  V  qdS )rk   Nro   rN   Zcut_idxsr   r   r   rP   O  rQ   c                    s.   g | ]&}t jjj|d  |jd  fddqS )r   rk   )value)rY   nnZ
functionalpadrl   rr   )max_lenr   r   ri   P  s   input_values_cutoffsZ	input_idsrb   iilabels)dataZtensor_type)&rX   r   r7   Zinit_kwargspoprV   rB   r   rM   r   rS   rT   rU   r
   rW   sumcopyr8   appendreplaceupdatenpZzerosrY   ZtensorZconcatenateZcumsumr6   maxstackr<   Znonzerorl   Zrandpermintwherer=   r   )r@   rc   rI   rd   re   rK   r`   r2   r   r3   r0   Zn_audio_in_textZn_audioZnum_audio_tokens_listZnum_audio_tokens_list_copyZexpanded_textsampleZreplace_strZnum_audio_tokensZexpanded_audio_tokenencodingry   Zconcatenated_audiorw   offsetZaudio_inputsZaudio_frame_idxsZn_audio_framesZ	rand_idxsZskip_frames_idxsrx   r   )r   rv   r@   r   __call__   s    /
 









	&




$zCsmProcessor.__call__c                 C   s0   | j j}| jj}dd |D }t|| dg S )Nc                 S   s   g | ]}|d kr|qS )rq   r   )rN   namer   r   r   ri   r  rQ   z2CsmProcessor.model_input_names.<locals>.<listcomp>rw   )r7   model_input_namesr6   rS   )r@   Ztokenizer_input_namesZfeature_extractor_input_namesr   r   r   r   k  s    zCsmProcessor.model_input_names)N)NNNN)NFrb   )r   r   r   __doc__
attributesZfeature_extractor_classZtokenizer_classr?   staticmethodrH   r	   r   r   r   rS   r   r   ra   r   r   r   boolr\   r   propertyr   __classcell__r   r   rA   r   r5   >   s2   # (%    'r5   ) rD   pathlibr   typingr   r   r   r]   r   utilsr   r   rY   Z	soundfiler^   Zaudio_utilsr	   r
   Zfeature_extraction_utilsr   Zprocessing_utilsr   r   r   r   Ztokenization_utils_baser   r   r   r   r5   __all__r   r   r   r   <module>   s$     :