a
    h
6                     @   s   d Z ddlZddlZddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ d
dlmZ eeZG dd de
ZdgZdS )z
Processor class for Bark
    N)Optional   )BatchFeature)ProcessorMixin)BatchEncoding)logging)cached_file   )AutoTokenizerc                       s   e Zd ZdZdZdgZddddZd fdd		ZedddZ	de
d fddZd ee dddZd!ee dddZd"edddZ  ZS )#BarkProcessora	  
    Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor.

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`].
        speaker_embeddings (`dict[dict[str]]`, *optional*):
            Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g
            `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`
            embeddings. The values correspond to the path of the corresponding `np.ndarray`. See
            [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for
            a list of `voice_preset_names`.

    r
   	tokenizer   r	   Zsemantic_promptZcoarse_promptZfine_promptNc                    s   t  | || _d S )N)super__init__speaker_embeddings)selfr   r   	__class__ d/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/bark/processing_bark.pyr   =   s    zBarkProcessor.__init__speaker_embeddings_path.jsonc                 K   s   |durt |||dd|dd|dd|dd|dd|dd|d	d|d
ddddd}|du rtdtj|| d d}qt|}t	|}W d   q1 s0    Y  nd}t
j|fi |}| ||dS )a  
        Instantiate a Bark processor associated with a pretrained model.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on
                  huggingface.co.
                - a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`]
                  method, e.g., `./my_model_directory/`.
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file containing the speaker_embeddings dictionary located in
                `pretrained_model_name_or_path`. If `None`, no speaker_embeddings is loaded.
            **kwargs
                Additional keyword arguments passed along to both
                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
        N	subfolder	cache_dirforce_downloadFproxiesresume_downloadlocal_files_onlyuse_auth_tokenrevisionr   r   r   r   r   r   tokenr   Z _raise_exceptions_for_gated_repoZ%_raise_exceptions_for_missing_entriesZ'_raise_exceptions_for_connection_errors`z` does not exists
                    , no preloaded speaker embeddings will be used - Make sure to provide a correct path to the json
                    dictionary if wanted, otherwise set `speaker_embeddings_dict_path=None`.)r   r   )r   poploggerwarningospathjoinopenjsonloadr
   from_pretrained)clsZ!pretrained_processor_name_or_pathspeaker_embeddings_dict_pathkwargsZspeaker_embeddings_pathr   Zspeaker_embeddings_jsonr   r   r   r   r,   B   s4    








*zBarkProcessor.from_pretrainedr   F)push_to_hubc              
      s  | j durtjtj||ddd i }||d< | j D ]~}|dkr6| |}i }	| j | D ]P}
tjtj|d || d|
 ||
 dd tj|| d|
 d	|	|
< qZ|	||< q6ttj||d
}t	
|| W d   n1 s0    Y  t j||fi | dS )a|  
        Saves the attributes of this processor (tokenizer...) in the specified directory so that it can be reloaded
        using the [`~BarkProcessor.from_pretrained`] method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the tokenizer files and the speaker embeddings will be saved (directory will be created
                if it does not exist).
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file that will contains the speaker_embeddings nested path dictionary, if it
                exists, and that will be located in `pretrained_model_name_or_path/speaker_embeddings_directory`.
            speaker_embeddings_directory (`str`, *optional*, defaults to `"speaker_embeddings/"`):
                The name of the folder in which the speaker_embeddings arrays will be saved.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs:
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        NZv2T)exist_okrepo_or_path_F)Zallow_picklez.npyw)r   r&   makedirsr'   r(   _load_voice_presetnpsaver)   r*   dumpr   save_pretrained)r   Zsave_directoryr.   Zspeaker_embeddings_directoryr0   r/   Zembeddings_dictZ
prompt_keyvoice_presetZtmp_dictkeyfpr   r   r   r:   z   s*    


 
*zBarkProcessor.save_pretrained)r;   c                 K   s   | j | }i }dD ]}||vr4td| d| dt| j dd|| |dd |dd |d	d
|dd |dd |dd
|dd |dd d
d
d
d}|d u rtdtj| j dd||  d| dt	|||< q|S )Nr   #Voice preset unrecognized, missing z% as a key in self.speaker_embeddings[z].r2   /r   r   r   Fr   r   r   r   r   r    r"   z{` does not exists
                    , no preloaded voice preset will be used - Make sure to provide correct paths to the z 
                    embeddings.)
r   
ValueErrorr   getr#   r&   r'   r(   r7   r+   )r   r;   r/   Zvoice_preset_pathsZvoice_preset_dictr<   r'   r   r   r   r6      s<    








 z BarkProcessor._load_voice_presetc                 C   s   dD ]}||vr t d| dt|| tjsNt| dt| j|  dt|| j| j| krt | dt| j|  dqd S )Nr   r>   z
 as a key.z voice preset must be a z
D ndarray.)	r@   
isinstancer7   Zndarray	TypeErrorstrpreset_shapelenshape)r   r;   r<   r   r   r   _validate_voice_preset_dict   s    z)BarkProcessor._validate_voice_preset_dictpt   T)returnc           
   	   K   s   |durbt |tsbt |tr<| jdur<|| jv r<| |}n&t |trX|dsX|d }t|}|dur| j|fi | t	||d}| j
|f|d||||d|}	|dur||	d< |	S )a#  
        Main method to prepare for the model one or several sequences(s). This method forwards the `text` and `kwargs`
        arguments to the AutoTokenizer's [`~AutoTokenizer.__call__`] to encode the text. The method also proposes a
        voice preset which is a dictionary of arrays that conditions `Bark`'s output. `kwargs` arguments are forwarded
        to the tokenizer and to `cached_file` method if `voice_preset` is a valid filename.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            voice_preset (`str`, `dict[np.ndarray]`):
                The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g
                `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or
                it can be a valid file name of a local `.npz` single voice preset.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] object containing the output of the `tokenizer`.
            If a voice preset is provided, the returned object will include a `"history_prompt"` key
            containing a [`BatchFeature`], i.e the voice preset with the right tensors type.
        Nz.npz)dataZtensor_type
max_length)return_tensorspaddingrM   return_attention_maskreturn_token_type_idsadd_special_tokensZhistory_prompt)rB   dictrD   r   r6   endswithr7   r+   rH   r   r   )
r   textr;   rN   rM   rR   rP   rQ   r/   Zencoded_textr   r   r   __call__   s:    %
zBarkProcessor.__call__)N)r   )r   r   F)N)N)NNrI   rJ   FTF)__name__
__module____qualname____doc__Ztokenizer_class
attributesrE   r   classmethodr,   boolr:   r   rD   r6   rS   rH   r   rV   __classcell__r   r   r   r   r   $   s:    :   9$       
r   )rZ   r*   r&   typingr   numpyr7   Zfeature_extraction_utilsr   Zprocessing_utilsr   Ztokenization_utils_baser   utilsr   Z	utils.hubr   autor
   Z
get_loggerrW   r$   r   __all__r   r   r   r   <module>   s   
  