a
    h7                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlZddl	m
Z
mZ ddlmZ ddlmZ erhddlmZ eeZd	d
iZdZeddG dd deZdgZdS )    N)copyfile)TYPE_CHECKINGAnyOptional   )
AddedTokenPreTrainedTokenizer)logging)requires)	TextInput
vocab_fileztokenizer.modelu   ▁)sentencepiece)backendsc                
       s(  e Zd ZdZeZddgZd0eee	e
f  d fddZdd Zdd Zedd Zdd Zdee	 d fddZdd Zdd Zdd Zd d! Zd1ee	 ee	 d"d#d$Zd2d%d&Zd3ee eee  eee d' fd(d)Zd4ee eee  ee d*d+d,Zd5ee eee	d-d.d/Z  ZS )6GemmaTokenizera
  
    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`dict[str, Any]`, `Optional`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add an `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Gemma should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
    Z	input_idsZattention_mask<unk><bos><eos><pad>NTF)sp_model_kwargsc                    s   |d u ri n|| _ t|tr*t|dddn|}t|trFt|dddn|}t|trbt|dddn|}t|tr~t|dddn|}|| _|| _|| _|
| _tj	f i | j | _
| j
| t jf ||||||||	|
|d
| d S )NFT)
normalizedZspecial)
	bos_token	eos_token	unk_token	pad_tokenadd_bos_tokenadd_eos_tokenr   clean_up_tokenization_spacesuse_default_system_promptspaces_between_special_tokens)r   
isinstancestrr   r   r   r   r   spmSentencePieceProcessorsp_modelLoadsuper__init__)selfr   r   r   r   r   r   r   r   r   r   r   kwargs	__class__ h/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/gemma/tokenization_gemma.pyr&   ^   s2    zGemmaTokenizer.__init__c                 C   s$   | j  }d |d< | j |d< |S )Nr#   sp_model_proto)__dict__copyr#   serialized_model_proto)r'   stater+   r+   r,   __getstate__   s    
zGemmaTokenizer.__getstate__c                 C   s2   | j | tjf i | j| _| j| j d S N)r.   updater!   r"   r   r#   ZLoadFromSerializedProtor-   )r'   dr+   r+   r,   __setstate__   s    zGemmaTokenizer.__setstate__c                 C   s
   | j  S )zReturns vocab size)r#   Zget_piece_sizer'   r+   r+   r,   
vocab_size   s    zGemmaTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )zReturns vocab as a dictc                    s   i | ]}  ||qS r+   )Zconvert_ids_to_tokens).0ir7   r+   r,   
<dictcomp>       z,GemmaTokenizer.get_vocab.<locals>.<dictcomp>)ranger8   r4   Zadded_tokens_encoder)r'   Zvocabr+   r7   r,   	get_vocab   s    zGemmaTokenizer.get_vocabr   )textreturnc                    s   t  j|fi |S )ze
        Args:
            text: TextInput
        Simply calls PreTrainedTokenizer's method
        )r%   tokenizer'   r?   r(   r)   r+   r,   rA      s    zGemmaTokenizer.tokenizec                 K   s   | j j|tdS )z
        Args:
            text: TextInput
        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
        )Zout_type)r#   encoder    rB   r+   r+   r,   	_tokenize   s    zGemmaTokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r#   Zpiece_to_id)r'   tokenr+   r+   r,   _convert_token_to_id   s    z#GemmaTokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r#   Z	IdToPiece)r'   indexrE   r+   r+   r,   _convert_id_to_token   s    z#GemmaTokenizer._convert_id_to_tokenc                 C   sT   g }d}|D ]2}|| j v r4|| j|| 7 }g }q|| q|| j|7 }|S )z:Converts a sequence of tokens (string) in a single string. )Z_added_tokens_encoderr#   decodeappend)r'   tokensZcurrent_sub_tokensZ
out_stringrE   r+   r+   r,   convert_tokens_to_string   s    
z'GemmaTokenizer.convert_tokens_to_string)filename_prefixr@   c                 C   s   t j|s"td| d dS t j||r6|d ndtd  }t j| jt j|krzt j	| jrzt
| j| nLt j	| jst|d$}| j }|| W d   n1 s0    Y  |fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        zVocabulary path (z) should be a directoryN-rI   r   wb)ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   isfiler   openr#   r0   write)r'   Zsave_directoryrN   Zout_vocab_filefiZcontent_spiece_modelr+   r+   r,   save_vocabulary   s    (
(zGemmaTokenizer.save_vocabularyc                 C   sL   | j r| jgng }| jr | jgng }|| | }|d urH|| | | }|S r3   )r   bos_token_idr   eos_token_idr'   token_ids_0token_ids_1r^   r_   outputr+   r+   r,    build_inputs_with_special_tokens   s    z/GemmaTokenizer.build_inputs_with_special_tokens)ra   rb   already_has_special_tokensr@   c                    s   |rt  j||ddS | jr"dgng }| jr2dgng }|du rT|dgt|  | S |dgt|  | | dgt|  | S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)ra   rb   re      Nr   )r%   get_special_tokens_maskr   r   len)r'   ra   rb   re   r^   r_   r)   r+   r,   rg      s(    z&GemmaTokenizer.get_special_tokens_mask)ra   rb   r@   c                 C   s`   | j r| jgng }| jr | jgng }dgt|| |  }|dur\|dgt|| |  7 }|S )a  
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`list[int]`):
                List of ids.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        r   Nrf   )r   r^   r   r_   rh   r`   r+   r+   r,   $create_token_type_ids_from_sequences  s    z3GemmaTokenizer.create_token_type_ids_from_sequences)	token_idsskip_special_tokensr   r@   c                 K   s   g }g }|D ]V}|r || j v r q|| jv rX|r@|| j| || j| j g }q|| q|rz|| j| |rd|}n
d|}|tdS )N rI   )	Zall_special_idsZ_added_tokens_decoderrK   r#   rJ   contentrV   replaceSPIECE_UNDERLINE)r'   rj   rk   r   r(   Z	sub_textsZcurrent_sub_textZidsr+   r+   r,   _decode1  s"    

zGemmaTokenizer._decode)
r   r   r   r   NTFFFF)N)N)NF)N)FF) __name__
__module____qualname____doc__rW   Zvocab_files_namesZmodel_input_namesr   dictr    r   r&   r2   r6   propertyr8   r>   listrA   rD   rF   rH   rM   tupler]   rd   intboolrg   ri   rp   __classcell__r+   r+   r)   r,   r   +   s\   .          *

 & $  r   )rQ   shutilr   typingr   r   r   r   r!   Ztokenization_utilsr   r   utilsr	   Zutils.import_utilsr
   Ztokenization_utils_baser   Z
get_loggerrq   rT   rW   ro   r   __all__r+   r+   r+   r,   <module>   s    
  %