a
    h                     @   s^  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZ d
dlmZ d
dlmZ d
dlm Z  d
dl!m"Z" d
dl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d
dl.m/Z/m0Z0m1Z1 e12e3Z4dZ5dZ6dZ7dZ8dZ9e$d7 Z$eeeedZ:e5e8dZ;e0e$G dd de)Z<dS )z
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
    N)defaultdict)Iterable)AnyOptionalUnion)Encoding)	Tokenizer)Decoder)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainer   )convert_slow_tokenizer)convert_gguf_tokenizer)load_gguf_checkpoint)PreTrainedTokenizer)
INIT_TOKENIZER_DOCSTRING
AddedTokenBatchEncodingPreTokenizedInputPreTokenizedInputPairPreTrainedTokenizerBaseSpecialTokensMixin	TextInputTextInputPairTruncationStrategy)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
)BPEUnigram	WordLevel	WordPiece)tokenizer_file
vocab_filec                       s  e Zd ZU dZeZdZeed<  fddZ	e
edddZe
edd	d
Ze
edddZeeef dddZe
eeef dddZe
eeef dddZe
eeef dddZeeef dddZedddZedddZe
edddZe
edddZdMeee ee eeeeee eee!f e"e f d!	d"d#Z#e$ee%e f e$ee"e f d$d%d&Z&eed'd(d)Z'eee d*d+d,Z(dNe"e$eef  ed-d.d/Z)dOeed0d1d2Z*dPe$ee"e f ee$ee"e f d3d4d5Z+dQeee ee"e d6d7d8Z,e-e.eeee ee d9d:d;Z/d e-j0e.j1dd<ddddddddddd dfe$e"e2 e"e3 e"e4 e"e5 f ee-e.ee eeee ee ee ee ee eeeeeee6d=d>d?Z7dd e-j0e.j1dd<ddddddddddd dfe$e2e4f ee$e2e4f  ee-e.ee eeee ee ee ee ee eeeeeee6d@dAdBZ8e"e ed$dCdDZ9dRe$ee"e f eee edEdFdGZ:dSe$ee;j<f e e ee ee e e dHdIdJZ=dTdKdLZ>  Z?S )UPreTrainedTokenizerFastaQ  
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    Nslow_tokenizer_classc                    s  | dd }| dd }| dd }| dd }| dd}| di }|dd| _|rt|d u rt| jd u rttd	|d urt|}	n|d ur|st|}	n|rt	|}	n|d urt
|d
}
|
d d }|
d }|
d }t||\}	}|| t|dkr~|| nj| jd urD|durD| j|i |}t	|}	n:|sv|d
| _|dg | _t	| dd}	d }ntd|	| _|d ur||j d| _| jj}|d ur| jjf i | |d|d  |d|d  |d|d  |d|d  n
| j  | jj}|d ur| jjf i | |d|d  |d|d  |d|d  |d|d  |d|d  t jf i | | j| j_d d! | jD   fd"d#t| d$d% d&D t | j!" d'd# D  fd(d#| j#D 7 tdkrg }| j$}D ]\}t%|t&rH|j'pRt(||v n
t(||v }t%|t(rnt&||d)}n||_'|)| q$|r| *| zXt+,| j-j./ }|d| j| jkrt0t1| d*}| j|d< |f i || j-_.W n t2y   Y n0 d S )+Ntokenizer_objectZ__slow_tokenizer	gguf_filer$   	from_slowFadded_tokens_decoderadd_prefix_spacezCannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you have sentencepiece installed.r%   configZ
model_type	tokenizertokenizer_configr   additional_special_tokensT)Zfrom_tiktokena9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.
max_lengthtruncation_side	directionstridetruncation_strategystrategy	pad_tokenpad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofc                 S   s   h | ]}t t|qS  hashrepr.0tokenr=   r=   `/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py	<setcomp>       z3PreTrainedTokenizerFast.__init__.<locals>.<setcomp>c                    s$   g | ]\}}t t| vr|qS r=   r>   )rB   indexrC   )added_tokens_decoder_hashr=   rD   
<listcomp>   s   z4PreTrainedTokenizerFast.__init__.<locals>.<listcomp>c                 S   s   | d S Nr   r=   )xr=   r=   rD   <lambda>   rF   z2PreTrainedTokenizerFast.__init__.<locals>.<lambda>keyc                 S   s   g | ]}t |qS r=   )strrA   r=   r=   rD   rI      rF   c                    s    g | ]}| vr|vr|qS r=   r=   rA   )encodertokens_to_addr=   rD   rI      s   )specialtype)3popgetr,   r'   
ValueErrorcopydeepcopyTokenizerFast	from_filer   r   r   updatelenr%   r0   
_tokenizerinit_kwargs_decode_use_source_tokenizer
truncationenable_truncation
setdefaultno_truncationpaddingenable_paddingsuper__init__split_special_tokensencode_special_tokensr+   sorteditemslistadded_tokens_encoderkeysZall_special_tokens_extendedZall_special_tokens
isinstancer   rR   rO   append
add_tokensjsonloadsbackend_tokenizerpre_tokenizer__getstate__getattrpre_tokenizers_fast	Exception)selfargskwargsr(   Zslow_tokenizerr)   Zfast_tokenizer_filer*   r+   Zfast_tokenizerZ
gguf_paramarchitectureZtokenizer_dictr/   Zadditional_kwargs_truncation_paddingtokensspecial_tokensrC   Z
is_specialZpre_tok_stateZpre_tok_class	__class__)rH   rP   rQ   rD   rg   b   s    












z PreTrainedTokenizerFast.__init__)returnc                 C   s   dS )NTr=   rz   r=   r=   rD   is_fast   s    zPreTrainedTokenizerFast.is_fastc                 C   sD   d| j v r<| j d dr<t| dr8| jr8tj| jS dS dS dS )z
        `bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        r%   z.modelFTN)vocab_files_namesendswithhasattrr%   ospathisfiler   r=   r=   rD   can_save_slow_tokenizer   s
    z/PreTrainedTokenizerFast.can_save_slow_tokenizerc                 C   s   | j jddS )zP
        `int`: Size of the base vocabulary (without the added tokens).
        FZwith_added_tokensr]   Zget_vocab_sizer   r=   r=   rD   
vocab_size   s    z"PreTrainedTokenizerFast.vocab_sizec                 C   s   | j jddS )NTr   )r]   	get_vocabr   r=   r=   rD   r      s    z!PreTrainedTokenizerFast.get_vocabc                 C   s   |   S N)r   r   r=   r=   rD   vocab   s    zPreTrainedTokenizerFast.vocabc                 C   s    dd t | j dd dD S )z
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        c                 S   s   i | ]\}}|j |qS r=   contentrB   vkr=   r=   rD   
<dictcomp>  rF   z@PreTrainedTokenizerFast.added_tokens_encoder.<locals>.<dictcomp>c                 S   s   | d S rJ   r=   itemr=   r=   rD   rL     rF   z>PreTrainedTokenizerFast.added_tokens_encoder.<locals>.<lambda>rM   rj   r+   rk   r   r=   r=   rD   rm      s    z,PreTrainedTokenizerFast.added_tokens_encoderc                 C   s
   | j  S )z
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        )r]   Zget_added_tokens_decoderr   r=   r=   rD   r+     s    z,PreTrainedTokenizerFast.added_tokens_decoderc                 C   s    dd t | j dd dD S )z
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `dict[str, int]`: The added tokens.
        c                 S   s   i | ]\}}|j |qS r=   r   r   r=   r=   rD   r     rF   z;PreTrainedTokenizerFast.get_added_vocab.<locals>.<dictcomp>c                 S   s   | d S rJ   r=   r   r=   r=   rD   rL     rF   z9PreTrainedTokenizerFast.get_added_vocab.<locals>.<lambda>rM   r   r   r=   r=   rD   get_added_vocab  s    z'PreTrainedTokenizerFast.get_added_vocabc                 C   s   dS )zN
        Returns True, to avoid expensive `assert tokenizer` gotchas.
        Tr=   r   r=   r=   rD   __bool__  s    z PreTrainedTokenizerFast.__bool__c                 C   s   | j jddS )zD
        Size of the full vocabulary with the added tokens.
        Tr   r   r   r=   r=   rD   __len__  s    zPreTrainedTokenizerFast.__len__c                 C   s   | j S )zc
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        )r]   r   r=   r=   rD   rt   %  s    z)PreTrainedTokenizerFast.backend_tokenizerc                 C   s   | j jS )zU
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        )r]   decoderr   r=   r=   rD   r   ,  s    zPreTrainedTokenizerFast.decoderFT)	encodingreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverboser   c	                 C   s   |du rd| j v }|du r$d| j v }|r@|jdur@|g|j }	n|g}	tt}
|	D ]|}|
d |j |rz|
d |j |r|
d |j |r|
d |j |r|
d |j	 |rR|
d t
|j qR|
|	fS )a  
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        NZtoken_type_idsattention_mask	input_idsspecial_tokens_maskZoffset_mappingr;   )Zmodel_input_namesZoverflowingr   rl   rp   idsZtype_idsr   r   offsetsr\   )rz   r   r   r   r   r   r   r   r   	encodingsZencoding_dicter=   r=   rD   _convert_encoding3  s*    

z)PreTrainedTokenizerFast._convert_encoding)r   r   c                    s&   t |tr |S  fdd|D S )aX  
        Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `list[int]`: The token id or list of token ids.
        c                    s   g | ]}  |qS r=   )#_convert_token_to_id_with_added_vocrA   r   r=   rD   rI   p  rF   zAPreTrainedTokenizerFast.convert_tokens_to_ids.<locals>.<listcomp>)ro   rO   r   rz   r   r=   r   rD   convert_tokens_to_idsb  s    

z-PreTrainedTokenizerFast.convert_tokens_to_ids)rC   r   c                 C   s   | j |}|d u r| jS |S r   )r]   token_to_idZunk_token_id)rz   rC   rG   r=   r=   rD   r   r  s    z;PreTrainedTokenizerFast._convert_token_to_id_with_added_voc)rG   r   c                 C   s   | j t|S r   )r]   id_to_tokenint)rz   rG   r=   r=   rD   _convert_id_to_tokenx  s    z,PreTrainedTokenizerFast._convert_id_to_token)
new_tokensr   c                 C   s   |r| j |S | j |S r   )r]   add_special_tokensrq   )rz   r   r   r=   r=   rD   _add_tokens{  s    z#PreTrainedTokenizerFast._add_tokens)pairr   c                 C   s   | j |S )aG  
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        )r]   num_special_tokens_to_add)rz   r   r=   r=   rD   r     s    z1PreTrainedTokenizerFast.num_special_tokens_to_add)r   skip_special_tokensr   c                 C   s`   t |tr| j|S g }|r(t| jnt }|D ](}t|}||v rHq2|| j| q2|S )a  
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        )ro   r   r]   r   setZall_special_idsrp   )rz   r   r   r   Zids_to_skiprG   r=   r=   rD   convert_ids_to_tokens  s    
z-PreTrainedTokenizerFast.convert_ids_to_tokens)textr   r   r   c                 K   s   | j f |||d| S )N)r   	text_pairr   )Zencode_plusr   )rz   r   r   r   r|   r=   r=   rD   tokenize  s    z PreTrainedTokenizerFast.tokenizepadding_strategyr5   r1   r4   r<   r:   c                    s   | j j | j j}|tjkr. durz| j   nL|||j| jd} du rNd}	n fdd|D }	|	|krz| j jf i | |t	j
kr|dur| j   nR|t	jkr|nd}
|
|dur|n| j| j| j| j|d}||kr| j jf i | dS )a  
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        N)r1   r4   r6   r3   c                    s   i | ]}|  |d qS r   rU   )rB   r   r~   r=   rD   r     rF   zFPreTrainedTokenizerFast.set_truncation_and_padding.<locals>.<dictcomp>)r;   r3   Zpad_idr7   r9   r<   )r]   r`   rd   r   DO_NOT_TRUNCATErc   valuer2   ra   r   
DO_NOT_PADZ
no_paddingZ
MAX_LENGTHr:   Zpad_token_idr7   r8   re   )rz   r   r5   r1   r4   r<   r:   r   targetcurrentr;   r=   r   rD   set_truncation_and_padding  s8    !

z2PreTrainedTokenizerFast.set_truncation_and_paddingr   )batch_text_or_text_pairsr   r   r5   r1   r4   is_split_into_wordsr<   r:   return_tensorsr   r   r   r   r   r   r   rh   r   c                    s$  t |ttfs"tdt| dj||||||	d jj|krL|j_jj|||d}fdd|D }i }|d d D ]  fdd|D }|| < qd	d |D }rg }t	|D ]"\}\}}||gt
|d
  7 }q||d< |d
 D ]}|| q t|||
dS )Nz:batch_text_or_text_pairs has to be a list or a tuple (got )r   )r   Zis_pretokenizedc                    s&   g | ]}j | d qS ))r   r   r   r   r   r   r   r   )r   )rB   r   )r   r   r   r   r   r   rz   r   r=   rD   rI   5  s   z>PreTrainedTokenizerFast._batch_encode_plus.<locals>.<listcomp>r   c                    s"   g | ]\}}|  D ]}|qqS r=   r=   )rB   r   _r   rM   r=   rD   rI   K  rF   c                 S   s   g | ]\}}|D ]}|qqS r=   r=   )rB   r   r   r   r=   r=   rD   rI   M  rF   r   overflow_to_sample_mapping)Ztensor_type)ro   tuplerl   	TypeErrorrS   r   r]   ri   Zencode_batch	enumerater\   &_eventual_warn_about_too_long_sequencer   )rz   r   r   r   r5   r1   r4   r   r<   r:   r   r   r   r   r   r   r   r   rh   r   Ztokens_and_encodingsZsanitized_tokensstackZsanitized_encodingsr   itoksr   r   r=   )	rN   r   r   r   r   r   r   rz   r   rD   _batch_encode_plus   sF    	
z*PreTrainedTokenizerFast._batch_encode_plus)r   r   r   r   r5   r1   r4   r   r<   r:   r   r   r   r   r   r   r   r   rh   r   c                 K   s   |r||fgn|g}| j |f|||||||	|
|||||||||d|}|d u rp|sptdd | D |j}| |d || |S )N)r   r   r   r5   r1   r4   r<   r:   r   r   r   r   r   r   r   r   rh   c                 S   s8   i | ]0\}}|t |d kr0t|d  tr0|d  n|qS )r   )r\   ro   rl   )rB   rN   r   r=   r=   rD   r     s   z8PreTrainedTokenizerFast._encode_plus.<locals>.<dictcomp>r   )r   r   rk   r   r   )rz   r   r   r   r   r5   r1   r4   r   r<   r:   r   r   r   r   r   r   r   r   rh   r|   Zbatched_inputZbatched_outputr=   r=   rD   _encode_plus[  sB    z$PreTrainedTokenizerFast._encode_plusc                 C   s$   | j jd ur| j j|S d|S )N )rt   r   decodejoinr   r=   r=   rD   convert_tokens_to_string  s    
z0PreTrainedTokenizerFast.convert_tokens_to_string)	token_idsr   clean_up_tokenization_spacesr   c                 K   sZ   | dd| _t|tr|g}| jj||d}|d ur:|n| j}|rR| |}|S |S d S )NZuse_source_tokenizerF)r   )rT   r_   ro   r   r]   r   r   Zclean_up_tokenization)rz   r   r   r   r|   r   Z
clean_textr=   r=   rD   _decode  s    

zPreTrainedTokenizerFast._decode)save_directory
file_nameslegacy_formatfilename_prefixr   c                    s:  t |} jdu r"|du r"td|du s2|du o@ jduo@ j}|du pP|du }|rtj||rj|d ndt } fdd j	 D }|rt
|d	d
d0}	tj|ddddd }
|	|
 W d   n1 s0    Y   j||d}|| |f }|r6tj||r|d ndt } j| ||f }|S )z
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        NTzYour tokenizer does not have a legacy version defined and therefore cannot register this version. You might consider leaving the legacy_format at `None` or setting it to `False`.F- c                    s    i | ]\}}| j kr||qS r=   )r   )rB   tokrG   r   r=   rD   r     rF   z<PreTrainedTokenizerFast._save_pretrained.<locals>.<dictcomp>wzutf-8)r      )indent	sort_keysensure_ascii
)r   )rO   r'   rV   r   r   r   r   ADDED_TOKENS_FILErm   rk   openrr   dumpswriteZsave_vocabularyTOKENIZER_FILErt   save)rz   r   r   r   r   Z	save_slowZ	save_fastZadded_tokens_fileZadded_vocabfZout_strZvocab_filesr$   r=   r   rD   _save_pretrained  s:    (
z(PreTrainedTokenizerFast._save_pretrainedc              	      s>  t | j }|d}|d}	d}
|d d dkrRi |d d< g |d d< n|d d d	kr|d d
 dur|d d
 }|d d | d }
 dur|
 v r |
 }
d|d d
< |
dgg|d d< n6|d d dv ri |d d< ntd|d d  d durBd|d v rB|d d  v rB |d d  |d d< tt |g }|D ]v}|dd}|dd}|d d d	kr|sqZ dur|d  v r |d  |d< |	t
f i | qZ|dur|| |d d dkr$d|vr$|d d dur$|d d |d< |d d dkrbd|vrb|d d durb|d d |d< |d d d	kr|
dur|
|d< |d dur|d d dks|d d dkrd|d v rtdd |d d D rtj |d< t|d d  }|f ||d|}j|||d |	durVt  }d|	v r|	d D ]}|	d | d  } dur~ fd!d"|D }||	d | d < |D ]"}|}|du rtd#qfd$d"|D |	d | d%< qNd&D ]`}||	v r|	| \}} dur| v r | }|}|du r.td#||g|	|< q|	|d< tt || j }tj }|d' |D ]}t| |durxt| |} dur| v r | }| j|d}t|t
rt
||j|j|j|jd(d)||< n|||< qx| j }|dur|| t!|dkr*||d'< | j"f d*i|S )+uf  
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `list[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        added_tokenspost_processorNmodelrS   r    r   Zmergesr!   unk_idr   g        )r"   r#   z;This method does not support this type of tokenizer (found z-) only BPE, Unigram, WordLevel and WordPiece.	unk_tokenrR   idr   Zcontinuing_subword_prefixZend_of_word_suffixru   	ByteLevelSequenceZpretokenizersc                 s   s   | ]}|d  dkV  qdS )rS   r   Nr=   )rB   Zpretokenizerr=   r=   rD   	<genexpr>P  s   zBPreTrainedTokenizerFast.train_new_from_iterator.<locals>.<genexpr>Zinitial_alphabet)r   r   )r;   trainerr   r   c                    s   g | ]}  ||qS r=   r   rA   )special_tokens_mapr=   rD   rI   b  rF   zCPreTrainedTokenizerFast.train_new_from_iterator.<locals>.<listcomp>zQAttempted to set a token in the post processor that does not exist in the mappingc                    s   g | ]}  |qS r=   )r   rA   )r.   r=   rD   rI   k  rF   r   )clssepr0   T)single_wordlstriprstrip
normalizedrR   r(   )#rr   rs   r]   Zto_strrT   rV   rY   Zfrom_strr   rp   r   extendanyrx   r   alphabetMODEL_TO_TRAINER_MAPPINGZtrain_from_iteratorr   r^   rW   r   ZSPECIAL_TOKENS_ATTRIBUTESremoverw   Z_special_tokens_maprU   ro   r  r  r  r  r0   r\   r   )rz   Ztext_iteratorr   r;   Znew_special_tokensr  r|   Ztokenizer_jsonr   r   r   r   r   Zadded_tokenrR   r   Ztrainer_classr  Ztrained_tokenizer_jsonrN   r   rC   Ztoken_idZspecial_tokenZspecial_tokens_listZspecial_token_fullr0   r=   )r  r.   rD   train_new_from_iterator  s    "






	




"






	

z/PreTrainedTokenizerFast.train_new_from_iterator)NNFFFFT)F)F)F)NF)FN)NN)NNN)@__name__
__module____qualname____doc__VOCAB_FILES_NAMESr   r'   r   __annotations__rg   propertyboolr   r   r   r   dictrO   r   r   rm   r   r+   r   r   r   rY   rt   DecoderFastr   EncodingFastr   r   r   rl   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   PathLiker   r  __classcell__r=   r=   r   rD   r&   Q   sD  
|			       /( P^
=
    5   r&   )=r  rW   rr   r   collectionsr   collections.abcr   typingr   r   r   Ztokenizers.pre_tokenizersZpre_tokenizersrx   Z
tokenizersr   r  r   rY   Ztokenizers.decodersr	   r  Ztokenizers.trainersr
   r   r   r   r   Zintegrations.ggmlr   Zmodeling_gguf_pytorch_utilsr   Ztokenization_utilsr   Ztokenization_utils_baser   r   r   r   r   r   r   r   r   r   utilsr   r   r   Z
get_loggerr  loggerr   ZSPECIAL_TOKENS_MAP_FILEZTOKENIZER_CONFIG_FILEZTIKTOKEN_VOCAB_FILEr   r  r  r&   r=   r=   r=   rD   <module>   s@   0


