a
    h                     @   st   d dl mZmZmZmZmZmZ d dlmZm	Z	m
Z
mZmZ d dlmZ d dlmZ ddlmZ G dd deZd	S )
    )DictIteratorListOptionalTupleUnion)
AddedToken	Tokenizerdecoderspre_tokenizerstrainers)BPE)NFKC   )BaseTokenizerc                
       s   e Zd ZdZdeeeeeef f  eeee	e
eef  f  eeef eeee ee d fdd	Zeeed
ddZdddgdg dfeee	e f eee	eeef  ee	e edddZdddgdg ddfeee eee  f eee	eeef  ee	e eee dddZ  ZS )SentencePieceBPETokenizerzrSentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    N<unk>   ▁TF)vocabmerges	unk_tokenreplacementadd_prefix_spacedropoutfuse_unkc                    s   |d ur(|d ur(t t|||||d}nt t|||d}|t|d ur\|t|g t |_|rldnd}	tj||	d|_	t
j||	d|_d||||d}
t ||
 d S )N)r   r   r   alwaysnever)r   prepend_schemeZSentencePieceBPE)modelr   r   r   r   )r	   r   Ztoken_to_idstrZadd_special_tokensr   Z
normalizerr   Z	MetaspaceZpre_tokenizerr
   decodersuper__init__)selfr   r   r   r   r   r   r   	tokenizerr   
parameters	__class__ h/var/www/html/assistant/venv/lib/python3.9/site-packages/tokenizers/implementations/sentencepiece_bpe.pyr"      s     
z"SentencePieceBPETokenizer.__init__)vocab_filenamemerges_filenamec                 K   s"   t | |\}}t||fi |S )N)r   	read_filer   )r*   r+   kwargsr   r   r(   r(   r)   	from_file1   s    z#SentencePieceBPETokenizer.from_filei0u     i  )files
vocab_sizemin_frequencyspecial_tokenslimit_alphabetinitial_alphabetshow_progressc           	      C   s:   t j||||||d}t|tr&|g}| jj||d dS )z%Train the model using the given filesr1   r2   r3   r4   r5   r6   )trainerN)r   
BpeTrainer
isinstancer   
_tokenizertrain)	r#   r0   r1   r2   r3   r4   r5   r6   r8   r(   r(   r)   r<   6   s    
zSentencePieceBPETokenizer.train)iteratorr1   r2   r3   r4   r5   r6   lengthc	           
      C   s,   t j||||||d}	| jj||	|d dS )z(Train the model using the given iteratorr7   )r8   r>   N)r   r9   r;   train_from_iterator)
r#   r=   r1   r2   r3   r4   r5   r6   r>   r8   r(   r(   r)   r?   N   s    z-SentencePieceBPETokenizer.train_from_iterator)NNr   r   TNF)__name__
__module____qualname____doc__r   r   r   r   intr   r   r   boolfloatr"   staticmethodr.   r<   r   r?   __classcell__r(   r(   r&   r)   r   
   sf          
!r   N)typingr   r   r   r   r   r   Z
tokenizersr   r	   r
   r   r   Ztokenizers.modelsr   Ztokenizers.normalizersr   Zbase_tokenizerr   r   r(   r(   r(   r)   <module>   s
    