a
    h                     @   s   d dl mZmZmZmZmZmZ d dlmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZmZmZ ddlmZ G dd deZd	S )
    )DictIteratorListOptionalTupleUnion)
AddedToken	Tokenizerdecoderspre_tokenizers
processorstrainers)BPE)	LowercaseSequenceunicode_normalizer_from_str   )BaseTokenizerc                       s  e Zd ZdZdeeeeeef f  eeee	e
eef  f  eeee ee ee ee ed	 fddZeeeddd	Zd
ddg feee	e f eeee	eeef  dddZd
ddg dfeee eee  f eeee	eeef  ee dddZ  ZS )ByteLevelBPETokenizerzjByteLevelBPETokenizer

    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    NF)	vocabmergesadd_prefix_space	lowercasedropoutunicode_normalizercontinuing_subword_prefixend_of_word_suffixtrim_offsetsc
              	      s   |d ur0|d ur0t t||||p d|p&dd}
n
t t }
g }|rP|t|g7 }|r`|t g7 }t|dkrt|dkrt||
_n
|d |
_tj|d|
_	t
 |
_tj|	d|
_d|||||||	d}t |
| d S )	N )r   r   r   r   r   )r   )r   ZByteLevelBPE)modelr   r   r   r   r   r   r   )r	   r   r   r   lenr   Z
normalizerr   	ByteLevelZpre_tokenizerr
   decoderr   Zpost_processorsuper__init__)selfr   r   r   r   r   r   r   r   r   	tokenizerZnormalizers
parameters	__class__ e/var/www/html/assistant/venv/lib/python3.9/site-packages/tokenizers/implementations/byte_level_bpe.pyr$      sB    



zByteLevelBPETokenizer.__init__)vocab_filenamemerges_filenamec                 K   s"   t | |\}}t||fi |S )N)r   	read_filer   )r,   r-   kwargsr   r   r*   r*   r+   	from_fileJ   s    zByteLevelBPETokenizer.from_filei0u     T)files
vocab_sizemin_frequencyshow_progressspecial_tokensc                 C   s>   t j||||tj d}t|tr*|g}| jj||d dS )z%Train the model using the given filesr3   r4   r5   r6   Zinitial_alphabet)trainerN)	r   
BpeTrainerr   r!   alphabet
isinstancestr
_tokenizertrain)r%   r2   r3   r4   r5   r6   r8   r*   r*   r+   r>   O   s    

zByteLevelBPETokenizer.train)iteratorr3   r4   r5   r6   lengthc                 C   s0   t j||||tj d}| jj|||d dS )z(Train the model using the given iteratorr7   )r8   r@   N)r   r9   r   r!   r:   r=   train_from_iterator)r%   r?   r3   r4   r5   r6   r@   r8   r*   r*   r+   rA   d   s    z)ByteLevelBPETokenizer.train_from_iterator)	NNFFNNNNF)__name__
__module____qualname____doc__r   r   r<   r   intr   r   boolfloatr$   staticmethodr0   r   r>   r   rA   __classcell__r*   r*   r(   r+   r   
   s^            :r   N)typingr   r   r   r   r   r   Z
tokenizersr   r	   r
   r   r   r   Ztokenizers.modelsr   Ztokenizers.normalizersr   r   r   Zbase_tokenizerr   r   r*   r*   r*   r+   <module>   s
     