a
    hM                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
ZddlmZ ddlmZ ddlmZ erxdd	lmZ dd
lmZ ddlmZ eeZddiZdZeddG dd deZdgZdS )z Tokenization class for model T5.    N)copyfile)TYPE_CHECKINGAnyOptional   )import_protobuf)PreTrainedTokenizer)
AddedToken)	TextInput)logging)requires
vocab_filezspiece.modelu   ▁)sentencepiece)backendsc                       sv  e Zd ZdZeZddgZd:eee	e
f  dd
 fddZd;ddZedd Zedd Zdd Zd<ee eee  eee d fddZdd Zdd Zee ee dddZd=ee eee  ee d d!d"Zd>ee eee  ee d d#d$Zd%d& Zd'd( Zd)ee	 d* fd+d,Zed-d. Zd/d0 Zd1d2 Z d3d4 Z!d5d6 Z"d?e	ee	 e#e	 d7d8d9Z$  Z%S )@T5Tokenizera  
    Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
           Add a number of extra ids added to the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be
            retrieved by calling get_sentinel_tokens method and token ids can be by calling get_sentinel_token_ids
            method
         additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        legacy (`bool`, *optional*):
            Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
            and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
            example:

            - `legacy=True`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
            >>> tokenizer.encode("Hello <extra_id_0>.")
            [8774, 32099, 3, 5, 1]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
            [8774, 32099, 5, 1]
            ```
            Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    Z	input_idsZattention_mask</s><unk><pad>d   NT)sp_model_kwargsreturnc
                    s  t |trt|ddn|}t |tr0t|ddn|}t |trJt|ddn|}|d u rZi n|| _|| _|| _tjf i | j| _| j	| |d urdd |D }t
|dk r|dd t|D 7 }n*|dkr|t
|krtd| d	| d
ndd t|D }|}i | _tt
|D ]:}td| ddddddd| jt
| jd | | < q|d u rxtd| j d d}|| _| |
dd| _|	| _t jf |||||| j||	d|
 d S )NT)specialc                 S   s   g | ]}d t |v r|qS )
<extra_id_)str).0x r   b/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/t5/tokenization_t5.py
<listcomp>       z(T5Tokenizer.__init__.<locals>.<listcomp>   c                 S   s   g | ]}d | dqS r   >r   r   ir   r   r   r      r   r   zBoth extra_ids (z!) and additional_special_tokens (zk) are provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokensc                 S   s   g | ]}d | dqS r!   r   r#   r   r   r   r      r   r   r"   F)Zsingle_wordlstriprstripr   
normalizedz2You are using the default legacy behaviour of the a_  . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565	from_slow)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensr   legacyadd_prefix_space)
isinstancer   r	   r   r   Z
_extra_idsspmSentencePieceProcessorsp_modelLoadlenrange
ValueErrorZ_added_tokens_decoderloggerZwarning_once	__class__r.   get_spm_processorpopr/   super__init__)selfr   r)   r*   r+   r,   r-   r   r.   r/   kwargsZextra_tokensr$   r9   r   r   r=      sV    "
	zT5Tokenizer.__init__Fc                 C   s   t jf i | j}| js|r,|| j |S t| jdd}| }td| j	j
 d}|j|}| }d|_|j| | }|| W d    n1 s0    Y  |S )NrbzThe new behaviour of z (with `self.legacy = False`)F)r1   r2   r   r.   r4   r   openreadr   r9   __name__Z
ModelProtoZ
FromStringZNormalizerSpecZadd_dummy_prefixnormalizer_specZ	MergeFromZSerializeToStringZLoadFromSerializedProto)r>   r(   	tokenizerfr3   Z	model_pb2modelrE   r   r   r   r:      s    
(zT5Tokenizer.get_spm_processorc                 C   sZ   | t jv rVt j|  }|d ur(||kr(|S |d u rVtd| d|  d| d| d	t |S )NzGThis tokenizer was incorrectly instantiated with a model max length of z which will be corrected in Transformers v5.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on z( automatically truncating your input to zM when padding/encoding.
- If you want to encode/pad to sequences longer than z you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.)r   Zmax_model_input_sizeswarningswarnFutureWarning)Zpretrained_model_name_or_pathZmax_model_lengthZinit_max_model_lengthZdeprecated_max_model_lengthr   r   r   !_eventually_correct_t5_max_length   s$    

	z-T5Tokenizer._eventually_correct_t5_max_lengthc                 C   s
   | j  S N)r3   Zget_piece_sizer>   r   r   r   
vocab_size   s    zT5Tokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r   )Zconvert_ids_to_tokensr#   rN   r   r   
<dictcomp>   r   z)T5Tokenizer.get_vocab.<locals>.<dictcomp>)r6   rO   updateZadded_tokens_encoder)r>   Zvocabr   rN   r   	get_vocab   s    zT5Tokenizer.get_vocab)token_ids_0token_ids_1already_has_special_tokensr   c                    sZ   |rt  j||ddS |du r2dgt| dg S dgt| dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rS   rT   rU   Nr   r    )r<   get_special_tokens_maskr5   )r>   rS   rT   rU   r@   r   r   rV      s    z#T5Tokenizer.get_special_tokens_maskc                 C   s   t ttdd | jS )Nc                 S   s   t td| d uS )Nz<extra_id_\d+>)boolresearch)r   r   r   r   <lambda>  r   z1T5Tokenizer.get_sentinel_tokens.<locals>.<lambda>)listsetfilterr-   rN   r   r   r   get_sentinel_tokens  s    zT5Tokenizer.get_sentinel_tokensc                    s    fdd   D S )Nc                    s   g | ]}  |qS r   )Zconvert_tokens_to_ids)r   tokenrN   r   r   r     r   z6T5Tokenizer.get_sentinel_token_ids.<locals>.<listcomp>)r^   rN   r   rN   r   get_sentinel_token_ids  s    z"T5Tokenizer.get_sentinel_token_ids)	token_idsr   c                 C   sB   t |dkr2|d | jkr2td| j d |S || jg S dS )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.N)r5   eos_token_idrI   rJ   r)   )r>   ra   r   r   r   _add_eos_if_not_present  s    z#T5Tokenizer._add_eos_if_not_present)rS   rT   r   c                 C   s<   | j g}|du r"t|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        Nr   )rc   r5   )r>   rS   rT   Zeosr   r   r   $create_token_type_ids_from_sequences'  s    z0T5Tokenizer.create_token_type_ids_from_sequencesc                 C   s,   |  |}|du r|S |  |}|| S dS )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)rd   )r>   rS   rT   r   r   r    build_inputs_with_special_tokens=  s
    

z,T5Tokenizer.build_inputs_with_special_tokensc                 C   s   | j  }d |d< |S )Nr3   )__dict__copy)r>   stater   r   r   __getstate__W  s    
zT5Tokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjf i | j| _| j| j d S )Nr   )rg   hasattrr   r1   r2   r3   r4   r   )r>   dr   r   r   __setstate__\  s
    
zT5Tokenizer.__setstate__r
   )textr   c                    s   | j st|dkr&t j|fi |S |td}| jr@t| }t j|fi |}t|dkr|d tkr|d | jv r|dd }|S )z
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        r    r    N)r.   r5   r<   tokenizereplaceSPIECE_UNDERLINEr/   all_special_tokensr>   rn   r?   tokensr@   r   r   rp   f  s    &zT5Tokenizer.tokenizec                 C   s   t | jt| jS rM   )r5   r3   encoder   r*   rN   r   r   r   unk_token_lengthx  s    zT5Tokenizer.unk_token_lengthc                 K   sZ   | j s|tdfs$| jj|tdS | jj| j| td}t|| jkrV|| jd S |S )u(  
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        ro   )Zout_typeN)	r.   
startswithrr   r3   rv   r   r*   r5   rw   rt   r   r   r   	_tokenize|  s    
zT5Tokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r3   Zpiece_to_id)r>   r_   r   r   r   _convert_token_to_id  s    z T5Tokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r3   Z	IdToPiece)r>   indexr_   r   r   r   _convert_id_to_token  s    z T5Tokenizer._convert_id_to_tokenc                 C   s   |d  tr(| jr(|d dd |d< g }d}d}|D ]F}|| jv rp|sR|d7 }|| j|| 7 }d}g }q8|| d}q8|| j|7 }| S )z:Converts a sequence of tokens (string) in a single string.r   r    N Fro   T)rx   rr   r/   rs   r3   decodeappendstrip)r>   ru   Zcurrent_sub_tokensZ
out_stringZprev_is_specialr_   r   r   r   convert_tokens_to_string  s     

z$T5Tokenizer.convert_tokens_to_string)save_directoryfilename_prefixr   c                 C   s   t j|s"td| d d S t j||r6|d ndtd  }t j| jt j|krzt j	| jrzt
| j| nLt j	| jst|d$}| j }|| W d    n1 s0    Y  |fS )NzVocabulary path (z) should be a directory-r}   r   wb)ospathisdirr8   errorjoinVOCAB_FILES_NAMESabspathr   isfiler   rB   r3   Zserialized_model_protowrite)r>   r   r   Zout_vocab_filefiZcontent_spiece_modelr   r   r   save_vocabulary  s    (
(zT5Tokenizer.save_vocabulary)r   r   r   r   NNNT)F)NF)N)N)N)&rD   
__module____qualname____doc__r   Zvocab_files_namesZmodel_input_namesr   dictr   r   r=   r:   staticmethodrL   propertyrO   rR   r[   intrW   rV   r^   r`   rd   re   rf   rj   rm   rp   rw   ry   rz   r|   r   tupler   __classcell__r   r   r@   r   r   ,   sb   N        I


   

r   )r   r   rX   rI   shutilr   typingr   r   r   r   r1   Zconvert_slow_tokenizerr   Ztokenization_utilsr   Ztokenization_utils_baser	   r
   utilsr   Zutils.import_utilsr   Z
get_loggerrD   r8   r   rr   r   __all__r   r   r   r   <module>   s,   
   