a
    h$                     @   s   d Z ddlZddlZddlZddlmZmZmZ ddlm	Z	 ddl
mZmZmZ e r^ddlZe rlddlZeeZddiZd	d
 ZG dd de	ZdgZdS )zTokenization class for VITS.    N)AnyOptionalUnion   )PreTrainedTokenizer)is_phonemizer_availableis_uroman_availablelogging
vocab_filez
vocab.jsonc                 C   s    t d}|| }|d u}|S )Nz[^\x00-\x7F])recompilesearch)input_stringZnon_roman_patternmatchZhas_non_roman r   f/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/vits/tokenization_vits.pyhas_non_roman_characters%   s    

r   c                	       s   e Zd ZdZeZddgZd$dd	 fd
dZedd Z	dd Z
dd Zdd Zd%eeee eeeeef f dddZeee dddZee edddZdd Zdd  Zd&eee eee df d!d"d#Z  ZS )'VitsTokenizera  
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
    Z	input_idsZattention_mask<pad><unk>NTF)returnc	              
      s   t |dd}
t|
| _W d    n1 s.0    Y  dd | j D | _|| _|| _|| _|| _	|| _
t jf |||||||d|	 d S )Nutf-8encodingc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>W       z*VitsTokenizer.__init__.<locals>.<dictcomp>)	pad_token	unk_tokenlanguage	add_blank	normalize	phonemize	is_uroman)openjsonloadencoderitemsdecoderr!   r"   r#   r$   r%   super__init__)selfr
   r   r    r!   r"   r#   r$   r%   kwargsZvocab_handle	__class__r   r   r-   H   s&    *zVitsTokenizer.__init__c                 C   s
   t | jS N)lenr)   r.   r   r   r   
vocab_sizej   s    zVitsTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r   )Zconvert_ids_to_tokens)r   ir4   r   r   r   o   r   z+VitsTokenizer.get_vocab.<locals>.<dictcomp>)ranger5   updateadded_tokens_encoder)r.   Zvocabr   r4   r   	get_vocabn   s    zVitsTokenizer.get_vocabc                 C   s   t | j t | j  }d}d}|t|k rd}|D ]8}|||t|  |kr8||7 }|t|7 }d} qrq8|s$|||  7 }|d7 }q$|S )zfLowercase the input string, respecting any special token ids that may be part or entirely upper-cased. r   FT   )listr)   keysr9   r3   lower)r.   r   Zall_vocabularyfiltered_textr6   Zfound_matchwordr   r   r   normalize_texts   s    
zVitsTokenizer.normalize_textc                 C   s   | j dkr|dd}|S )z4Special treatment of characters in certain languagesZronu   țu   ţ)r!   replace)r.   textr   r   r   _preprocess_char   s    
zVitsTokenizer._preprocess_char)rD   is_split_into_wordsr#   r   c                    s   |dur|n j }|r  |} |}t|r\ jr\t sJtd nt	 }|
|} jrt sptdtj|dddddd}tdd	|}n$|rd
tt fdd| }||fS )a  
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `tuple[str, dict[str, Any]]`: The prepared text and the unused kwargs.
        NaC  Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman` Note `uroman` requires python version >= 3.10Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uromanzEPlease install the `phonemizer` Python package to use this tokenizer.zen-usZespeakT)r!   backendstripZpreserve_punctuationZwith_stressz\s+ r;   c                    s
   |  j v S r2   )r)   )charr4   r   r   <lambda>   r   z8VitsTokenizer.prepare_for_tokenization.<locals>.<lambda>)r#   rB   rE   r   r%   r   loggerwarningurZUromanZromanize_stringr$   r   ImportError
phonemizerr   subjoinr=   filterrH   )r.   rD   rF   r#   r/   r@   uromanr   r4   r   prepare_for_tokenization   s4    


 z&VitsTokenizer.prepare_for_tokenization)rD   r   c                 C   s@   t |}| jr<| dgt|d d  }||ddd< |}|S )z]Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters.r      r<   N)r=   r"   _convert_id_to_tokenr3   )r.   rD   tokensZinterspersedr   r   r   	_tokenize   s    zVitsTokenizer._tokenize)rX   r   c                 C   s*   | j r t|dkr |dd d }d|S )Nr<   rV   r;   )r"   r3   rR   )r.   rX   r   r   r   convert_tokens_to_string   s    z&VitsTokenizer.convert_tokens_to_stringc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r)   getr    )r.   tokenr   r   r   _convert_token_to_id   s    z"VitsTokenizer._convert_token_to_idc                 C   s   | j |S )z=Converts an index (integer) in a token (str) using the vocab.)r+   r[   )r.   indexr   r   r   rW      s    z"VitsTokenizer._convert_id_to_token)save_directoryfilename_prefixr   c              	   C   s   t j|s"td| d d S t j||r6|d ndtd  }t|ddd.}|t	j
| jd	d
ddd  W d    n1 s0    Y  |fS )NzVocabulary path (z) should be a directory-r;   r
   wr   r   rV   TF)indent	sort_keysensure_ascii
)ospathisdirrL   errorrR   VOCAB_FILES_NAMESr&   writer'   dumpsr)   )r.   r_   r`   r
   fr   r   r   save_vocabulary   s    <zVitsTokenizer.save_vocabulary)r   r   NTTTF)FN)N)__name__
__module____qualname____doc__rk   Zvocab_files_namesZmodel_input_namesr-   propertyr5   r:   rB   rE   strboolr   tupledictr   rU   r=   rY   rZ   r]   rW   r   ro   __classcell__r   r   r0   r   r   /   s8          "
 
Ar   )rs   r'   rg   r   typingr   r   r   Ztokenization_utilsr   utilsr   r   r	   rP   rT   rN   Z
get_loggerrp   rL   rk   r   r   __all__r   r   r   r   <module>   s    

 H