a
    h 1                     @   s   d Z ddlZddlmZ ddlmZmZ ddlZddl	m
Z
 ddlmZ ddlmZ eeZd	Zd
diZeddG dd de
ZdgZdS )zTokenization classes for .    N)copyfile)AnyOptional   )PreTrainedTokenizer)logging)requiresu   ▁
vocab_filezsentencepiece.bpe.model)sentencepiece)backendsc                       s   e Zd ZdZeZddgZd)eee	e
f  dd	 fd
dZdd Zdd Zd*ee eee  ee dddZd+ee eee  eee d fddZd,ee eee  ee dddZedd Zdd Ze	ee	 dddZd d! Zd"d# Zd$d% Zd-e	ee	 ee	 d&d'd(Z  ZS ).XGLMTokenizera  
    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    Z	input_idsZattention_mask<s></s><unk><pad>N)sp_model_kwargsreturnc	              
      s  |d u ri n|_ d_dd tjD }
 dg p:g  d<  d   fdd|
D 7  < tjf i j _jt| |_	d_
dddd	d
_tjfddtjD }
j|
 dd j D _t jf ||||||j d  d S )N   c                 S   s   g | ]}d | dqS z<madeupword> .0ir   r   f/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/xglm/tokenization_xglm.py
<listcomp>w       z*XGLMTokenizer.__init__.<locals>.<listcomp>additional_special_tokensc                    s   g | ]}| d  vr|qS )r   r   )r   word)kwargsr   r   r   z   s      r      r   )r   r   r   r   c                    s$   i | ]}d | d|  j  qS r   )fairseq_offsetr   )selfsp_sizer   r   
<dictcomp>   r   z*XGLMTokenizer.__init__.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r   r   )r   kvr   r   r   r%      r   )	bos_token	eos_token	unk_token	sep_token	cls_token	pad_tokenr   )r   num_madeup_wordsrangegetspmSentencePieceProcessorsp_modelLoadstrr	   r"   fairseq_tokens_to_idslenupdateitemsfairseq_ids_to_tokenssuper__init__)r#   r	   r(   r)   r+   r,   r*   r-   r   r   Zmadeup_words	__class__)r   r#   r$   r   r<   g   s6    
	
zXGLMTokenizer.__init__c                 C   s$   | j  }d |d< | j |d< |S )Nr3   sp_model_proto)__dict__copyr3   serialized_model_proto)r#   stater   r   r   __getstate__   s    
zXGLMTokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjf i | j| _| j| j d S )Nr   )r@   hasattrr   r1   r2   r3   ZLoadFromSerializedProtor?   )r#   dr   r   r   __setstate__   s
    
zXGLMTokenizer.__setstate__)token_ids_0token_ids_1r   c                 C   s0   |du r| j g| S | j g}|| | | | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An XLM-RoBERTa sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)sep_token_idr#   rH   rI   sepr   r   r    build_inputs_with_special_tokens   s    z.XGLMTokenizer.build_inputs_with_special_tokensF)rH   rI   already_has_special_tokensr   c                    s\   |rt  j||ddS |du r2dgdgt|  S dgdgt|  ddg dgt|  S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rH   rI   rN   Nr    r   )r;   get_special_tokens_maskr7   )r#   rH   rI   rN   r=   r   r   rO      s    z%XGLMTokenizer.get_special_tokens_maskc                 C   s@   | j g}|du r"t|| dg S t|| | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.

        Nr   )rJ   r7   rK   r   r   r   $create_token_type_ids_from_sequences   s    z2XGLMTokenizer.create_token_type_ids_from_sequencesc                 C   s   t | j| j | j S )N)r7   r3   r"   r.   r#   r   r   r   
vocab_size   s    zXGLMTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r   )Zconvert_ids_to_tokensr   rQ   r   r   r%     r   z+XGLMTokenizer.get_vocab.<locals>.<dictcomp>)r/   rR   r8   Zadded_tokens_encoder)r#   Zvocabr   rQ   r   	get_vocab   s    zXGLMTokenizer.get_vocab)textr   c                 C   s   | j j|tdS )N)Zout_type)r3   encoder5   )r#   rT   r   r   r   	_tokenize  s    zXGLMTokenizer._tokenizec                 C   s4   || j v r| j | S | j|}|r.|| j S | jS )z0Converts a token (str) in an id using the vocab.)r6   r3   Z	PieceToIdr"   Zunk_token_id)r#   tokenZspm_idr   r   r   _convert_token_to_id  s    

z"XGLMTokenizer._convert_token_to_idc                 C   s&   || j v r| j | S | j|| j S )z=Converts an index (integer) in a token (str) using the vocab.)r:   r3   Z	IdToPiecer"   )r#   indexr   r   r   _convert_id_to_token  s    

z"XGLMTokenizer._convert_id_to_tokenc                 C   s   d |td }|S )zIConverts a sequence of tokens (strings for sub-words) in a single string.  )joinreplaceSPIECE_UNDERLINEstrip)r#   tokensZ
out_stringr   r   r   convert_tokens_to_string  s    z&XGLMTokenizer.convert_tokens_to_string)save_directoryfilename_prefixr   c                 C   s   t j|s"td| d d S t j||r6|d ndtd  }t j| jt j|krzt j	| jrzt
| j| nLt j	| jst|d$}| j }|| W d    n1 s0    Y  |fS )NzVocabulary path (z) should be a directory-r[   r	   wb)ospathisdirloggererrorr]   VOCAB_FILES_NAMESabspathr	   isfiler   openr3   rB   write)r#   rc   rd   Zout_vocab_filefiZcontent_spiece_modelr   r   r   save_vocabulary  s    (
(zXGLMTokenizer.save_vocabulary)r   r   r   r   r   r   N)N)NF)N)N)__name__
__module____qualname____doc__rl   Zvocab_files_namesZmodel_input_namesr   dictr5   r   r<   rD   rG   listintrM   boolrO   rP   propertyrR   rS   rV   rX   rZ   rb   tuplerr   __classcell__r   r   r=   r   r   #   sN   ?       	8   
	r   )rv   rg   shutilr   typingr   r   r
   r1   Ztokenization_utilsr   utilsr   Zutils.import_utilsr   Z
get_loggerrs   rj   r_   rl   r   __all__r   r   r   r   <module>   s   
  