"""Tokenization class for Dia."""

from typing import Optional

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)


class DiaTokenizer(PreTrainedTokenizer):
    """
    Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        unk_token (`str`, *optional*, defaults to `"<pad>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequences when encoding. Sequences longer than this will be truncated.
        offset (`int`, *optional*, defaults to 0):
            The offset of the tokenizer.
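
    Example (a minimal sketch; the ids shown assume the default `offset=0`, in which case raw
    utf-8 byte values map directly to ids and the reserved special tokens occupy ids 0-2):

    ```python
    >>> from transformers import DiaTokenizer

    >>> tokenizer = DiaTokenizer()
    >>> tokenizer("[S1] Hi")["input_ids"]  # "[S1]" -> id 1, then one id per utf-8 byte
    [1, 32, 72, 105]
    ```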
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        pad_token: Optional[str] = "<pad>",
        unk_token: Optional[str] = "<pad>",
        max_length: Optional[int] = 1024,
        offset: int = 0,
        **kwargs,
    ):
        pad_token = AddedToken(pad_token) if isinstance(pad_token, str) else pad_token
        unk_token = AddedToken(unk_token) if isinstance(unk_token, str) else unk_token

        # Byte-level vocabulary: one token per possible utf-8 byte value
        self._utf_vocab_size = 2**8

        # Reserved ids: 0 is the padding token, 1 and 2 are the speaker tokens [S1] and [S2]
        self._added_tokens_decoder = {0: pad_token, 1: AddedToken("[S1]"), 2: AddedToken("[S2]")}
        self.offset = offset
        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            max_length=max_length,
            offset=offset,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return self._utf_vocab_size

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
        vocab.update(self.added_tokens_encoder)
        return vocab
    def _tokenize(self, text: str) -> list[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        tokens = [chr(i) for i in text.encode("utf-8")]
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if len(token) != 1:
            token_id = None
        else:
            token_id = ord(token) + self.offset
        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = chr(index - self.offset)
        return token
|d}||7 }q|jddd}|S )z:Converts a sequence of tokens (string) in a single string.r"   r+   ignore)errors)Zadded_tokens_decoderr   r,   r%   decode)r   r-   bstringr1   Zadded_token_objZ
tok_stringstringr   r   r   convert_tokens_to_stringb   s    




z%DiaTokenizer.convert_tokens_to_stringN)save_directoryfilename_prefixr(   c                 C   s   dS )Nr   r   )r   r;   r<   r   r   r   save_vocabularyr   s    zDiaTokenizer.save_vocabulary)r   r   r	   r   )N)__name__
__module____qualname____doc__Zmodel_input_namesr   r   intr   propertyr   r&   listr.   r2   r4   r:   tupler=   __classcell__r   r   r   r   r      s(       

r   N)rA   typingr   Ztokenization_utilsr   r   utilsr   Z
get_loggerr>   loggerr   __all__r   r   r   r   <module>   s   
\