a
    h                  :   @   s  d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlmZmZmZ ddlmZmZmZmZ dd	lmZ eeZdrddZeedddZdd Z G dd dZ!G dd de!Z"eedddZ#G dd dZ$G dd de$Z%G dd de$Z&G dd  d e$Z'G d!d" d"e$Z(G d#d$ d$e$Z)G d%d& d&e$Z*G d'd( d(e$Z+G d)d* d*e$Z,G d+d, d,e$Z-G d-d. d.e$Z.G d/d0 d0e$Z/G d1d2 d2e$Z0G d3d4 d4e0Z1G d5d6 d6e0Z2G d7d8 d8e0Z3G d9d: d:e0Z4G d;d< d<e0Z5G d=d> d>e0Z6G d?d@ d@e0Z7G dAdB dBe0Z8G dCdD dDe0Z9G dEdF dFe0Z:G dGdH dHe0Z;G dIdJ dJe0Z<G dKdL dLe0Z=G dMdN dNe0Z>G dOdP dPe0Z?G dQdR dRe0Z@G dSdT dTe$ZAG dUdV dVe0ZBG dWdX dXe$ZCG dYdZ dZe$ZDG d[d\ d\e$ZEG d]d^ d^e0ZFG d_d` d`e0ZGG dadb dbe0ZHG dcdd dde$ZIG dedf dfe0ZJG dgdh dhe0ZKdidj ZLG dkdl dlZMe1e-e2e%eBeEe3eCe*e%e/e4e%e%e%e%e%e1e'e*e+e%e%e-e9e-e-e%eIe5e6e(e%e-e7e)e>e,e%e;e<e%e-e.e8e%e?e@eAe9e:e&eFeHeHeGeHdm9ZNdse	dodpdqZOdS )tz
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)Optional)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR c                 C   sl   t  rddlm} |S t rZdd l}t|jjtdk rJddl	m} nddl	m
} |S tt| d S )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   Zgoogle.protobufr   parseprotobuf__version__Ztransformers.utilsr   ImportErrorr   format)error_messager   Zgoogle r   _/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/convert_slow_tokenizer.pyimport_protobuf#   s    r    )add_prefix_spacereturnc                 C   s"   | rd}t |ddsd}nd}|S )NalwayslegacyTfirstnever)getattr)r!   original_tokenizerprepend_schemer   r   r   _get_prepend_scheme4   s    r*   c           
         s   |d u}|rt |n }g }| D ]x\}}g }tdt|D ]>}|d | ||d   }}	| v r>|	 v r>|||	|f q>t| fddd}|| q$t|dd |d}dd |D }|S )	Nr   c                    s    | d   | d  fS Nr   r   r   xvocabr   r   <lambda>I       z!generate_merges.<locals>.<lambda>keyc                 S   s   | d t | d t | d fS )N   r   r   )lenvalr   r   r   r0   L   r1   r3   reversec                 S   s   g | ]}|d  |d fqS r   r   r   .0r7   r   r   r   
<listcomp>M   r1   z#generate_merges.<locals>.<listcomp>)dictitemsranger5   appendsortedextend)
r/   vocab_scoresr9   mergesmergeZpiece_scorelocalindexpiece_lpiece_rr   r.   r   generate_merges>   s    rK   c                   @   sB   e Zd ZdZedddZd	eeeef e	e f dddZ
dS )
SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    )modelc                 C   s.   t | d ddlm} | | _| j| d S )Nr   r   )SentencePieceProcessor)r   r   rN   spLoad)selfrM   rN   r   r   r   __init__V   s    
zSentencePieceExtractor.__init__Nr"   c                    s2   | j   fddt  D }t||}||fS )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        c                    s   i | ]}  ||qS r   Zid_to_piecer<   rH   rO   r   r   
<dictcomp>c   r1   z2SentencePieceExtractor.extract.<locals>.<dictcomp>)rO   r@   GetPieceSizerK   rQ   rD   r/   rE   r   rW   r   extract]   s    
zSentencePieceExtractor.extract)N)__name__
__module____qualname____doc__strrR   tupler>   intlistr[   r   r   r   r   rL   Q   s   rL   c                   @   s0   e Zd Zdeeeef ee f dddZdS )GemmaSentencePieceExtractorNrS   c                    sH   | j   fddt  D }d|vr6|d|d< t||}||fS )rT   c                    s   i | ]}  ||qS r   rU   rV   rW   r   r   rX   q   r1   z7GemmaSentencePieceExtractor.extract.<locals>.<dictcomp>	<0x09>)rO   r@   rY   getrK   rZ   r   rW   r   r[   k   s    
z#GemmaSentencePieceExtractor.extract)N)	r\   r]   r^   ra   r>   r`   rb   rc   r[   r   r   r   r   rd   j   s   rd   )piecer"   c                 C   s&   t | dk p$| d dkp$| d   S )Nr4   ,)r5   isdigit)rh   r   r   r   check_number_comma{   s    rm   c                   @   s"   e Zd Zdd ZedddZdS )	Converterc                 C   s
   || _ d S N)r(   )rQ   r(   r   r   r   rR      s    zConverter.__init__rS   c                 C   s
   t  d S ro   )NotImplementedErrorrQ   r   r   r   	converted   s    zConverter.convertedN)r\   r]   r^   rR   r   rr   r   r   r   r   rn      s   rn   c                   @   s   e Zd ZedddZdS )BertConverterrS   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j drV| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )N	unk_tokenFbasic_tokenizerTZ
clean_textZhandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr(   r/   r   r   r`   ru   hasattrrv   tokenize_chinese_charsrx   do_lower_caser   BertNormalizer
normalizerr	   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr
   TemplateProcessingpost_processorr   decoder
rQ   r/   	tokenizerr   rx   r   clssepr   r   r   r   r   rr      s:    



zBertConverter.convertedNr\   r]   r^   r   rr   r   r   r   r   rs      s   rs   c                   @   s   e Zd ZedddZdS )SplinterConverterrS   c              
   C   sZ  | j j}tt|t| j jd}d}d}d}t| j drV| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}t| j j}d}	| j j}
| j j}| j j}| j d}| j jdkr| d| d	|	 d	| d
| d
}n"| d| d
| d	|	 d	| d
}tj| d| d|||
f||f||f|	|fgd|_tjdd|_|S )Nrt   Frv   Trw   .rightrz    r|   r}   r{   r~   r   r   )r(   r/   r   r   r`   ru   r   rv   r   rx   r   r   r   r   r	   r   r   r   r   Zquestion_tokenr   r   question_token_idconvert_tokens_to_idsZpadding_sider
   r   r   r   r   )rQ   r/   r   r   rx   r   r   r   questiondotr   r   r   Zdot_token_idr   r   r   r   rr      sL    



$"
zSplinterConverter.convertedNr   r   r   r   r   r      s   r   c                   @   s   e Zd ZedddZdS )FunnelConverterrS   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j drV| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )Nrt   Frv   Trw   z:2 $A:0 r{   r|   r}   r~   r   r   r   r   r   r   r   rr      s:    



zFunnelConverter.convertedNr   r   r   r   r   r      s   r   c                   @   s   e Zd ZedddZdS )MPNetConverterrS   c           
   
   C   s   | j j}tt|t| j jd}d}d}d}t| j drV| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	| d
||f||	fgd|_tjdd|_|S )Nrt   Frv   Trw   rz   r{   z:0 r|   r}   r~   r   r   r   r   r   r   r   rr     s:    



zMPNetConverter.convertedNr   r   r   r   r   r     s   r   c                   @   s   e Zd ZedddZdS )OpenAIGPTConverterrS   c              	   C   s   | j j}t| j j }| j j}tt||d t|ddd}|	t|d ur^|
t|g tjdd|_t |_tjdd|_|S )N</w>F)r/   rE   dropoutru   end_of_word_suffixfuse_unkT)ry   suffix)r(   encoderrc   	bpe_rankskeysru   r   r   r`   Ztoken_to_idadd_special_tokensr   r   r   r	   r   r   r   
BPEDecoderr   rQ   r/   rE   ru   r   r   r   r   rr   /  s&    
zOpenAIGPTConverter.convertedNr   r   r   r   r   r   .  s   r   c                   @   s<   e Zd Zdeeeef  eeeeef   e	dddZ
dS )GPT2ConverterNr/   rE   r"   c              	   C   s   |s| j j}|st| j j}tt||d dddd}t| j dd}tj|d|_	t
 |_t| j ddr| j j}| j j}tj| d| d||fgd	|_ntjdd
|_|S )Nr   Fr/   rE   r   continuing_subword_prefixr   r   r!   r!   Zadd_bos_tokenz:0 $A:0z:0 $A:0 $B:1r~   trim_offsets)r(   r   rc   r   r   r   r'   r	   	ByteLevelr   r   r   	bos_tokenbos_token_idr
   r   r   )rQ   r/   rE   r   r!   Zbosr   r   r   r   rr   J  s8    


zGPT2Converter.converted)NNr\   r]   r^   r   r>   r`   rb   rc   ra   r   rr   r   r   r   r   r   I  s
     r   c                   @   s   e Zd ZedddZdS )HerbertConverterrS   c                 C   s   d}d}| j j}t| j j }||d d v r<|dd  }tt||d | j j|d}tj	ddd|_
t |_tj|d|_tj| j j| j jf| j j| j jfd	|_|S )
Nz	#version:r   r   r   )r   ru   r   F)ry   rx   r   )r   r   )r(   r   rc   r   r   r   r   ru   r   r   r   r	   r   r   r   r   r   r
   ZBertProcessingr   r   r   r   r   )rQ   Ztokenizer_info_strZtoken_suffixr/   rE   r   r   r   r   rr   r  s.    

zHerbertConverter.convertedNr   r   r   r   r   r   q  s   r   c                   @   s<   e Zd Zdeeeef  eeeeef   e	dddZ
dS )Qwen2ConverterNr   c                 C   s   |s| j j}|s t| j j }tt||d d ddddd}t |_	t
t
jtddddt
jt| j ddddg|_t |_tjdd	|_|S )
Nr   F)r/   rE   r   ru   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr!   r!   Z	use_regexr   )r(   r   rc   r   r   r   r   r   NFCr   r	   SequenceSplitr   r   r'   r   r   r   r
   r   )rQ   r/   rE   r   r   r   r   rr     sD    

zQwen2Converter.converted)NNr   r   r   r   r   r     s
     r   c                   @   s   e Zd ZedddZdS )RobertaConverterrS   c              	   C   sv   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tj|j|jf|j|jf|j	dd|_|S )Nr   Fr   r   Tr   r   r!   r   )r(   r   rc   r   r   r   r   r	   r   r!   r   r   r   r
   RobertaProcessingr   r   r   r   r   rQ   otr/   rE   r   r   r   r   rr     s,    


zRobertaConverter.convertedNr   r   r   r   r   r     s   r   c                   @   s   e Zd ZedddZdS )RoFormerConverterrS   c           
      C   s   ddl m} | jj}tt|t| jjd}d}d}t| jdrT| jj	j
}| jj	j}tjdd||d|_tj|||_t| jj}t| jj}| jj}| jj}	tj| d| d	| d| d
| d||f||	fgd|_tjdd|_|S )Nr   )JiebaPreTokenizerrt   Frv   Trw   rz   r{   r|   r}   r~   r   r   )Z"models.roformer.tokenization_utilsr   r(   r/   r   r   r`   ru   r   rv   rx   r   r   r   r   r	   ZPreTokenizerZcustomr   r   r   r   r   r
   r   r   r   r   )
rQ   r   r/   r   rx   r   r   r   r   r   r   r   r   rr     s8    

zRoFormerConverter.convertedNr   r   r   r   r   r     s   r   c                   @   s   e Zd ZedddZdS )DebertaConverterrS   c              	   C   s~   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjddd| j dfd| j dfgd	|_|S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r~   )r(   r   rc   r   r   r   r   r	   r   r!   r   r   r   r
   r   r   r   r   r   r   r   rr     s.    
	zDebertaConverter.convertedNr   r   r   r   r   r     s   r   c                       sn   e Zd ZdZeZi Z fddZdd Zdd Z	dd	 Z
d
d Zdd Zdd Zdd ZedddZ  ZS )SpmConverterFc                    s   t | d t j|  t }| }t| jjd}||	  W d    n1 sV0    Y  || _
| j
jjr| jstd d S )Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrR   r    
ModelProtoopenr(   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rQ   args	model_pb2mf	__class__r   r   rR   &  s    
,zSpmConverter.__init__c                 C   s   dd |j D S )Nc                 S   s   g | ]}|j |jfqS r   rh   scorer<   rh   r   r   r   r=   <  r1   z&SpmConverter.vocab.<locals>.<listcomp>piecesrQ   r   r   r   r   r/   ;  s    zSpmConverter.vocabc                 C   s   |j jS ro   )r   unk_idr   r   r   r   r   >  s    zSpmConverter.unk_idc           	   	      s   |j j} |}|dkr6tt| | jd}nZ|dkr  jj	
|\}}dd t|D }tt|||j jd jd d}ntd fd	d
t|jD }|dd
 t|dd dD  |S )Nr   r   r   r4   c                 S   s   i | ]\}\}}||qS r   r   )r<   iwordr   r   r   r   rX   P  r1   z*SpmConverter.tokenizer.<locals>.<dictcomp>T)ru   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                    s8   g | ]0\}}|j d v r||j|j dkp0|j jv fqS )      r   typerh   r   r<   idprq   r   r   r=   e  s   
z*SpmConverter.tokenizer.<locals>.<listcomp>c                 S   s    g | ]\}}}t |d |dqS )F
normalizedspecialr   r<   r   tokenr   r   r   r   r=   k  s   c                 S   s   | d S Nr   r   r,   r   r   r   r0   m  r1   z(SpmConverter.tokenizer.<locals>.<lambda>r2   )r   
model_typer/   r   r   r   r   SpmExtractorr(   r   r[   	enumerater   Z	unk_piece	Exceptionr   
add_tokensrB   )	rQ   r   r   rD   r   _rE   Z	bpe_vocabspm_added_tokensr   rq   r   r   A  sF    

zSpmConverter.tokenizerc                 C   sN   |j j}tjdddttddg}|s4t|S tt|g| S d S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr   StripReplacer   r   PrecompiledrQ   r   r	  Z_normalizersr   r   r   r   s  s    
zSpmConverter.normalizerc                 C   s   t || j}tj||dS Nreplacementr)   )r*   r(   r	   	MetaspacerQ   r  r!   r)   r   r   r   r   ~  s    zSpmConverter.pre_tokenizerc                 C   s   d S ro   r   rq   r   r   r   r     s    zSpmConverter.post_processorc                 C   s   t || j}tj||dS r  )r*   r(   r   r  r  r   r   r   r     s    zSpmConverter.decoderrS   c                 C   s   |  | j}| | j}|d ur&||_d}d}t| jdrB| jj}| ||}|d ur\||_| |||_|  }|r|||_|S )Nr  Tr!   )	r   r   r   r   r(   r!   r   r   r   )rQ   r   r   r  r!   r   r   r   r   r   rr     s     zSpmConverter.converted)r\   r]   r^   r   rL   r   r   rR   r/   r   r   r   r   r   r   r   rr   __classcell__r   r   r   r   r   !  s   2r   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )AlbertConverterc                 C   s   dd |j D S )Nc                 S   s2   g | ]*}t |jr|j|jfn|j|jd  fqS d   rm   rh   r   r   r   r   r   r=     s   z)AlbertConverter.vocab.<locals>.<listcomp>r   r   r   r   r   r/     s    zAlbertConverter.vocabc                 C   s   t ddt ddg}| jjs<|t   |t   | jjrR|t   |j	j
}|rn|t | |t tdd t |S Nz``"z''r  r   r   r  r(   keep_accentsrA   NFKDStripAccentsr   	Lowercaser  r	  r  r   r   rQ   r   Zlist_normalizersr	  r   r   r   r     s    

zAlbertConverter.normalizerc                 C   s,   t jddd| jdfd| jdfgdS Nr   r   r   r   r~   r
   r   r(   r   rq   r   r   r   r     s    zAlbertConverter.post_processorNr\   r]   r^   r/   r   r   r   r   r   r   r    s   r  c                   @   s   e Zd Zdd Zdd ZdS )BarthezConverterc                 C   s   d}|S Nr   r   rQ   r   r   r   r   r   r     s    zBarthezConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r~   r!  rq   r   r   r   r     s    zBarthezConverter.post_processorN)r\   r]   r^   r   r   r   r   r   r   r#    s   r#  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )CamembertConverterc                 C   s2   g d}|dd |j dd  D 7 }|dg7 }|S )N))z
<s>NOTUSED        <pad>r*  )z</s>NOTUSEDr*  z<unk>r*  )z<unk>NOTUSEDic                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=     r1   z,CamembertConverter.vocab.<locals>.<listcomp>r   z<mask>r*  r   rQ   r   r/   r   r   r   r/     s    
zCamembertConverter.vocabc                 C   s   dS r$  r   r   r   r   r   r     s    zCamembertConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS r&  r!  rq   r   r   r   r     s    z!CamembertConverter.post_processorNr\   r]   r^   r/   r   r   r   r   r   r   r)    s   r)  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )DebertaV2Converterc                 C   sH   g }| j jr|tjdd t|| j }|tj||d t|S )Nr   )r   r  )r(   Zsplit_by_punctrA   r	   Punctuationr*   r  r   )rQ   r  r!   Zlist_pretokenizersr)   r   r   r   r     s    z DebertaV2Converter.pre_tokenizerc                 C   sd   g }| j jr|t  |t  |jj}|rD|t| |t	t
dd t|S )Nr  r   )r(   r   rA   r   r  r
  r  r	  r  r  r   r   r  r   r   r   r     s    zDebertaV2Converter.normalizerc                 C   s,   t jddd| jdfd| jdfgdS r   r!  rq   r   r   r   r   
  s    z!DebertaV2Converter.post_processorN)r\   r]   r^   r   r   r   r   r   r   r   r1    s   r1  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )MBartConverterc                 C   s>   g d}|dd |j dd  D 7 }|g d7 }|dg7 }|S )Nr'  r*  r+  r(  r*  r-  c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=     r1   z(MBartConverter.vocab.<locals>.<listcomp>r   )Zar_ARr*  cs_CZr*  de_DEr*  en_XXr*  Zes_XXr*  et_EEr*  fi_FIr*  Zfr_XXr*  gu_INr*  hi_INr*  it_ITr*  Zja_XXr*  kk_KZr*  ko_KRr*  lt_LTr*  lv_LVr*  Zmy_MMr*  ne_NPr*  Znl_XXr*  ro_ROr*  ru_RUr*  si_LKr*  tr_TRr*  vi_VNr*  zh_CNr*  r.  r   r/  r   r   r   r/     s
    
zMBartConverter.vocabc                 C   s   dS r$  r   r   r   r   r   r   <  s    zMBartConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS )Nz$A </s> en_XXz$A $B </s> en_XXr=  r(  r~   r!  rq   r   r   r   r   ?  s    zMBartConverter.post_processorNr0  r   r   r   r   r3    s   &r3  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )MBart50Converterc                 C   s>   g d}|dd |j dd  D 7 }|g d7 }|dg7 }|S )Nr4  c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=   R  r1   z*MBart50Converter.vocab.<locals>.<listcomp>r   )4r7  r8  r:  r<  r>  r?  rA  rC  rD  rF  rH  rJ  rK  rM  rO  rQ  rS  rT  rV  rW  rY  r[  r]  r_  ra  )af_ZAr*  )az_AZr*  )bn_INr*  )fa_IRr*  )he_ILr*  )hr_HRr*  )id_IDr*  )ka_GEr*  )Zkm_KHr*  )mk_MKr*  )ml_INr*  )mn_MNr*  )mr_INr*  )pl_PLr*  )ps_AFr*  )Zpt_XXr*  )sv_SEr*  )sw_KEr*  )ta_INr*  )te_INr*  )th_THr*  )Ztl_XXr*  )uk_UAr*  )ur_PKr*  )xh_ZAr*  )gl_ESr*  )sl_SIr*  r.  r   r/  r   r   r   r/   K  s
    
zMBart50Converter.vocabc                 C   s   dS r$  r   r   r   r   r   r   W  s    zMBart50Converter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS )Nzen_XX $A </s>zen_XX $A $B </s>r=  r(  r~   r!  rq   r   r   r   r   Z  s    zMBart50Converter.post_processorNr0  r   r   r   r   rc  J  s   rc  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )NllbConverterc                 C   s(   g d}|dd |j dd  D 7 }|S )Nr4  c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=   m  r1   z'NllbConverter.vocab.<locals>.<listcomp>r   r   r/  r   r   r   r/   f  s    zNllbConverter.vocabc                 C   s   dS r$  r   r   r   r   r   r   p  s    zNllbConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS )Nzeng_Latn $A </s>zeng_Latn $A $B </s>Zeng_Latnr(  r~   r!  rq   r   r   r   r   s  s    zNllbConverter.post_processorNr0  r   r   r   r   r|  e  s   
r|  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )SeamlessM4TConverterc                 C   s(   g d}|dd |j dd  D 7 }|S )N)r+  r-  r5  r6  c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=     r1   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>r   r   r/  r   r   r   r/     s    zSeamlessM4TConverter.vocabc                 C   s   | j jS ro   )r(   Zunk_token_idr   r   r   r   r     s    zSeamlessM4TConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS )Nz__eng__ $A </s>z__eng__ $A $B </s>Z__eng__r(  r~   r!  rq   r   r   r   r     s    z#SeamlessM4TConverter.post_processorNr0  r   r   r   r   r}  ~  s   
r}  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )XLMRobertaConverterc                 C   s2   g d}|dd |j dd  D 7 }|dg7 }|S )Nr4  c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=     r1   z-XLMRobertaConverter.vocab.<locals>.<listcomp>r   r.  r   r/  r   r   r   r/     s    
zXLMRobertaConverter.vocabc                 C   s   d}|S r$  r   r%  r   r   r   r     s    zXLMRobertaConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS r&  r!  rq   r   r   r   r     s    z"XLMRobertaConverter.post_processorNr0  r   r   r   r   r~    s   r~  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )XLNetConverterc                 C   s   dd |j D S )Nc                 S   s2   g | ]*}t |jr|j|jfn|j|jd  fqS r  r  r   r   r   r   r=     s   z(XLNetConverter.vocab.<locals>.<listcomp>r   r   r   r   r   r/     s    zXLNetConverter.vocabc                 C   s   t ddt ddg}| jjs<|t   |t   | jjrR|t   |j	j
}|rn|t | |t tdd t |S r  r  r  r   r   r   r     s    

zXLNetConverter.normalizerc                 C   s,   t jddd| jdfd| jdfgdS )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r~   r!  rq   r   r   r   r     s    zXLNetConverter.post_processorNr"  r   r   r   r   r    s   r  c                   @   s   e Zd ZdS )ReformerConverterNr\   r]   r^   r   r   r   r   r    s   r  c                   @   s   e Zd Zdd Zdd ZdS )RemBertConverterc                 C   s   t ddt ddt tddg}| jjsJ|t   |t   | jjr`|t 	  |j
j}|r||t | t |S r  )r   r  r   r(   r  rA   r  r  r   r  r  r	  r  r   r  r   r   r   r     s    

zRemBertConverter.normalizerc                 C   s,   t jddd| jdfd| jdfgdS r   r!  rq   r   r   r   r     s    zRemBertConverter.post_processorN)r\   r]   r^   r   r   r   r   r   r   r    s   r  c                   @   s   e Zd ZdS )BertGenerationConverterNr  r   r   r   r   r    s   r  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
PegasusConverterc                 C   s   | j jdf| j jdfg}| j jd ur6|| j jdfg7 }| j jd urd| j j| j jk rd|| j jdfg7 }|dd td| j jD 7 }|dd |jdd  D 7 }|S )Nr*  c                 S   s   g | ]}d | ddfqS )z<unk_>g      Yr   r<   r   r   r   r   r=     r1   z*PegasusConverter.vocab.<locals>.<listcomp>r4   c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=     r1   )	r(   	pad_token	eos_tokenZmask_token_sentZ
mask_tokenZmask_token_idoffsetr@   r   r/  r   r   r   r/      s    


zPegasusConverter.vocabc                 C   s   |j j| jj S ro   )r   r   r(   r  r   r   r   r   r     s    zPegasusConverter.unk_idc                 C   s(   t || j}tt tj||dgS r  )r*   r(   r	   r   ZWhitespaceSplitr  r  r   r   r   r     s    zPegasusConverter.pre_tokenizerc                 C   s0   | j j}|| j jfg}tjd|gdd|g|dS )N$A$Br~   )r(   r  eos_token_idr
   r   )rQ   eosr   r   r   r   r     s    
zPegasusConverter.post_processorN)r\   r]   r^   r/   r   r   r   r   r   r   r   r    s   	r  c                   @   s   e Zd Zdd Zdd ZdS )T5Converterc                 C   s:   | j j}dd |jD }|dd t|d ddD 7 }|S )Nc                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=   *  r1   z%T5Converter.vocab.<locals>.<listcomp>c                 S   s   g | ]}d | ddfqS )z
<extra_id_r  r*  r   r  r   r   r   r=   +  r1   r   ri   )r(   Z
_extra_idsr   r@   )rQ   r   Znum_extra_idsr/   r   r   r   r/   (  s    zT5Converter.vocabc                 C   s&   t jddgg dd| jdfgdS Nr  r(  )r  r(  r  r(  r~   r!  rq   r   r   r   r   .  s    zT5Converter.post_processorN)r\   r]   r^   r/   r   r   r   r   r   r  '  s   r  c                   @   s   e Zd Zdd ZdS )UdopConverterc                 C   s&   t jddgg dd| jdfgdS r  r!  rq   r   r   r   r   9  s    zUdopConverter.post_processorNr\   r]   r^   r   r   r   r   r   r  8  s   r  c                   @   s   e Zd ZedddZdS )WhisperConverterrS   c           	   	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j j}| j |}| j j}| j j}ddd |D }tj| d| d	| d
| d||fgt||d|_|S )Nr   Fr   r   r   c                 S   s   g | ]}| d qS )r{   r   r<   r   r   r   r   r=   Z  r1   z.WhisperConverter.converted.<locals>.<listcomp>z $A:0 r{   z $A:0 $B:1 r}   r~   )r(   r   rc   r   r   r   r   r	   r   r!   r   r   r   Zprefix_tokensconvert_ids_to_tokensr  r  joinr
   r   zipr   )	rQ   r/   rE   r   Zprefix_token_idsprefixesr  r  Zprefix_templater   r   r   rr   D  s8    
	zWhisperConverter.convertedNr   r   r   r   r   r  C  s   r  c                   @   s   e Zd Zdd ZdS )BigBirdConverterc                 C   s,   t jddd| jdfd| jdfgdS r   r!  rq   r   r   r   r   h  s    zBigBirdConverter.post_processorNr  r   r   r   r   r  g  s   r  c                   @   s   e Zd ZedddZdS )CLIPConverterrS   c              
   C   s   | j j}t| j j }| j j}tt||d dddt|d}t	
t	 t	tddt	 g|_t
tjtddd	d
tjddg|_t |_tj| j j| j jf| j j| j jfddd|_|S )Nr   r   Fr/   rE   r   r   r   r   ru   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+ZremovedTr   r   r   )r(   r   rc   r   r   ru   r   r   r`   r   r   r   r  r   r  r   r	   r   r   r   r   r   r
   r   r  r  r   r   r   r   r   r   r   rr   t  sD    


zCLIPConverter.convertedNr   r   r   r   r   r  s  s   r  c                   @   s   e Zd ZedddZdS )LayoutLMv2ConverterrS   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j drV| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )Nrt   FTrv   rw   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   rr     s:    



zLayoutLMv2Converter.convertedNr   r   r   r   r   r    s   r  c                   @   s   e Zd ZedddZdS )BlenderbotConverterrS   c              	   C   st   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjd|j d|j|jfgd|_|S )Nr   Fr   r   z$A:0 r{   )r   r   )r(   r   rc   r   r   r   r   r	   r   r!   r   r   r   r
   r   r  r  r   r   r   r   r   rr     s*    

zBlenderbotConverter.convertedNr   r   r   r   r   r    s   r  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )XGLMConverterc                 C   s4   g d}|dd |j dd  D 7 }|g d7 }|S )Nr4  c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=     r1   z'XGLMConverter.vocab.<locals>.<listcomp>r   ))z<madeupword0>r*  )z<madeupword1>r*  )z<madeupword2>r*  )z<madeupword3>r*  )z<madeupword4>r*  )z<madeupword5>r*  )z<madeupword6>r*  r   r/  r   r   r   r/     s    zXGLMConverter.vocabc                 C   s   d}|S r$  r   r%  r   r   r   r     s    zXGLMConverter.unk_idc                 C   s,   t jddd| jdfd| jdfgdS )Nz</s> $Az</s> $A </s> </s> $Br'  r(  r~   r!  rq   r   r   r   r     s    zXGLMConverter.post_processorNr0  r   r   r   r   r    s   r  c                   @   sD   e Zd ZdZeZddhZdd Zdd Zdd	 Z	d
d Z
dd ZdS )GemmaConverterTz<start_of_turn>z<end_of_turn>c                 C   s   t ddS Nr   r  )r   r  r   r   r   r   r     s    zGemmaConverter.normalizerc                 C   s|   | j jdf| j jdf| j jdfg}|dd |jdd  D 7 }tdd |D sxtdd t|D d }|d urxd||< |S )	Nr*  c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=     r1   z(GemmaConverter.vocab.<locals>.<listcomp>r   c                 s   s   | ]}|d  dkV  qdS )r   re   Nr   )r<   r-   r   r   r   	<genexpr>  r1   z'GemmaConverter.vocab.<locals>.<genexpr>c                 s   s"   | ]\}}|d  dkr|V  qdS )r   rf   Nr   )r<   r   r-   r   r   r   r    r1   )re   r*  )r(   r  r  r   r   anynextr   )rQ   r   r/   Zoverride_indexr   r   r   r/     s    


zGemmaConverter.vocabc                 C   s   t ddS )Nr   Zmerged_with_previous)r	   r   rQ   r  r!   r   r   r   r      s    zGemmaConverter.pre_tokenizerc                 C   s   d}|S r$  r   r%  r   r   r   r   #  s    zGemmaConverter.unk_idc                 C   s    t t ddt  t  gS )Nr  r   )r   r   r  ByteFallbackFuser  r   r   r   r   '  s    
zGemmaConverter.decoderN)r\   r]   r^   r   rd   r   r   r   r/   r   r   r   r   r   r   r   r    s   r  c                   @   s@   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dS )LlamaConverterTc                 C   sN   | j ddf| j ddf| j ddfg}|dd |jdd  D 7 }|S )Nr   r*  r   r4   c                 S   s   g | ]}|j |jfqS r   r   r   r   r   r   r=   :  r1   z(LlamaConverter.vocab.<locals>.<listcomp>r   )r(   r  r   r/  r   r   r   r/   4  s    zLlamaConverter.vocabc                 C   s   d}|S r   r   r%  r   r   r   r   =  s    zLlamaConverter.unk_idc                 C   s<   t ddt  t  g}|r2|t jdddg7 }t |S Nr  r   r   )contentr  r   r  r  r  r
  r   rQ   r  r!   sequencer   r   r   r   A  s    
zLlamaConverter.decoderc                 C   sT   t | jddrPg }t | jddr2|tjddg7 }|tjdddg7 }t|S d S )Nr$   Tr!   r  )prependr   )patternr  )r'   r(   r   Prependr  r   )rQ   r   r  r   r   r   r   K  s    
zLlamaConverter.normalizerc                 C   s.   t | jdds*t|| j}tj||ddS d S )Nr$   TFr  r)   split)r'   r(   r*   r	   r  r  r   r   r   r   T  s    zLlamaConverter.pre_tokenizerc                 C   s   d S ro   r   rq   r   r   r   r   Z  s    zLlamaConverter.post_processorN)
r\   r]   r^   r   r/   r   r   r   r   r   r   r   r   r   r  1  s   	
	r  c                   @   s   e Zd ZedddZdS )MarkupLMConverterrS   c           	   
   C   s   | j }|j}t|j }tt||d ddd| j jd}tj	|j
d|_t	 |_t| j j}t| j j}| j j}| j j}tj| d| | d| d| ||f||fgd|_|S )Nr   Fr  r   z $A z $B r~   )r(   r   rc   r   r   r   r   ru   r	   r   r!   r   r   r   r`   r   r   r   r   r
   r   r   )	rQ   r   r/   rE   r   r   r   r   r   r   r   r   rr   `  s8    
	zMarkupLMConverter.convertedNr   r   r   r   r   r  _  s   r  c                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )MoshiConverterTNc                 K   sf   t | d t| | t }| }t|d}||  W d    n1 sR0    Y  || _d S Nr   r   	r   rn   rR   r    r   r   r   r   r   )rQ   r   Zmodel_max_lengthkwargsr   r   r   r   r   r   rR     s    
,zMoshiConverter.__init__c                 C   s>   |j j}tddg}|s$t|S tt|g| S d S r  )r  r	  r   r  r   r  r  r   r   r   r     s    

zMoshiConverter.normalizerc                 C   s<   t ddt  t  g}|r2|t jdddg7 }t |S r  r  r  r   r   r   r     s    
zMoshiConverter.decoderc                 C   s   d}t j||ddS )Nr%   Fr  )r	   r  r  r   r   r   r     s    zMoshiConverter.pre_tokenizer)N)r\   r]   r^   r   rR   r   r   r   r   r   r   r   r    s
   


r  c                   @   sR   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd ZdS )HeliumConverterTNc                 G   sf   t | d t| | t }| }t|d}||  W d    n1 sR0    Y  || _d S r  r  )rQ   r   r   r   r   r   r   r   r   rR     s    
,zHeliumConverter.__init__c                    s     |}tt| | jd} fddt|jD }|dd t|dd dD  |t	dd	d	d
g |j
ddd |S )Nr   c                    s8   g | ]0\}}|j d v r||j|j dkp0|j jv fqS r   r   r   rq   r   r   r=     s   
z-HeliumConverter.tokenizer.<locals>.<listcomp>c                 S   s"   g | ]\}}}t |d |ddqS )FT)r   r   Zsingle_wordr   r   r   r   r   r=     s   c                 S   s   | d S r   r   r,   r   r   r   r0     r1   z+HeliumConverter.tokenizer.<locals>.<lambda>r2   
Fr   r,  r   )r  Zpad_id)r/   r   r   r   r   r   r   r  rB   r   Zenable_padding)rQ   r   rD   r   r  r   rq   r   r     s&    

zHeliumConverter.tokenizerc                 C   sB   g }|j D ]2}|jdkr*|d|jfg7 }q
||j|jfg7 }q
|S )Nz<0x0A>r  )r   rh   r   )rQ   r   r/   rh   r   r   r   r/     s    

zHeliumConverter.vocabc                 C   s   d}|S r   r   r%  r   r   r   r     s    zHeliumConverter.unk_idc                 C   s8   t ddt  t  g}|t jdddg7 }t |S r  r  r  r   r   r   r     s    
zHeliumConverter.decoderc                 C   s   t t dt ddgS r  )r   r   r  r  r   r   r   r   r     s    zHeliumConverter.normalizerc                 C   s   t t ddgS )Nr  
contiguous)r	   r   r   r  r   r   r   r     s    zHeliumConverter.pre_tokenizerc                 C   s   t jddgg ddgdS )Nr'  r  )r'  r  r'  r  )r'  r   r~   )r
   r   rq   r   r   r   r     s    zHeliumConverter.post_processor)N)r\   r]   r^   r   rR   r   r/   r   r   r   r   r   r   r   r   r   r    s   
		r  c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ],}|| vrf| | |d
|  |d7 }qfdd |D }tt| |S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r   )chr)r<   nr   r   r   r=     r1   z$bytes_to_unicode.<locals>.<listcomp>)rc   r@   ordrA   r>   r  )bscsr  br   r   r   bytes_to_unicode  s    L

r  c                       sF   e Zd ZdZd fdd	Zeddd	Zd
d ZedddZ	  Z
S )TikTokenConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                    s:   t  j|  || _|| _|| _t|tr0| n|| _d S ro   )	r   rR   r   r  r!   
isinstancer>   r   additional_special_tokens)rQ   r   r  r!   r  r   r  r   r   r   rR   %  s    	
zTikTokenConverter.__init__)tiktoken_urlc                    s$  zddl m} W n ty*   tdY n0 || t fddg }i }  D ]\}}|||< t|dkrxqVg }tdt|D ]J}|d | ||d   }	}
|	 v r|
 v r|	|
  v r||	|
|f qt	| fddd	d
}|
| qVt	|dd d	d
}fdd|D }||fS )Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c                    s   d  fdd| dD S )Nr   c                    s   g | ]} t | qS r   )r  )r<   charbyte_encoderr   r   r=   D  r1   zdTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>zlatin-1)r  decode)r  r  r   r   token_bytes_to_stringC  s    zPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringr   c                    s    | d   | d  fS r+   r   r,   )r   r   r   r0   Q  r1   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>Fr8   c                 S   s   | d S )Nr4   r   r6   r   r   r   r0   S  r1   c                    s$   g | ]} |d   |d fqS r:   r   r;   )r  r   r   r=   T  r1   zETikTokenConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)Ztiktoken.loadr  r  
ValueErrorr  r?   r5   r@   rA   rB   rC   )rQ   r  r  rE   r/   r   ZrankrG   rH   rI   rJ   r   )r   r  r  r   extract_vocab_merges_from_model8  s2    
z1TikTokenConverter.extract_vocab_merges_from_modelc                 C   s:   |  | j\}}tt||dd}t|jdr6d|j_|S )NF)r   ignore_mergesT)r  r   r   r   r   rM   r  )rQ   rD   rE   r   r   r   r   r   W  s
    zTikTokenConverter.tokenizerrS   c                 C   sh   |   }ttjt| jdddtj| jddg|_t	 |_
|dd | jD  tjdd|_|S )Nr   Fr   r   c                 S   s   g | ]}t |d ddqS )FTr   r   r  r   r   r   r=   i  r1   z/TikTokenConverter.converted.<locals>.<listcomp>r   )r   r	   r   r   r   r  r   r!   r   r   r   r   r  r
   r   )rQ   r   r   r   r   rr   ^  s    
zTikTokenConverter.converted)Nr  FN)r\   r]   r^   r_   rR   r`   r  r   r   rr   r  r   r   r   r   r     s       r  )9ZAlbertTokenizerZBartTokenizerZBarthezTokenizerZBertTokenizerZBigBirdTokenizerZBlenderbotTokenizerZCamembertTokenizerZCLIPTokenizerZCodeGenTokenizerZConvBertTokenizerZDebertaTokenizerZDebertaV2TokenizerZDistilBertTokenizerZDPRReaderTokenizerZDPRQuestionEncoderTokenizerZDPRContextEncoderTokenizerZElectraTokenizerZFNetTokenizerZFunnelTokenizerZGPT2TokenizerZHerbertTokenizerZLayoutLMTokenizerZLayoutLMv2TokenizerZLayoutLMv3TokenizerZLayoutXLMTokenizerZLongformerTokenizerZLEDTokenizerZLxmertTokenizerZMarkupLMTokenizerZMBartTokenizerZMBart50TokenizerZMPNetTokenizerZMobileBertTokenizerZMvpTokenizerZNllbTokenizerZOpenAIGPTTokenizerZPegasusTokenizerZQwen2TokenizerZRealmTokenizerZReformerTokenizerZRemBertTokenizerZRetriBertTokenizerZRobertaTokenizerZRoFormerTokenizerZSeamlessM4TTokenizerZSqueezeBertTokenizerZT5TokenizerZUdopTokenizerZWhisperTokenizerZXLMRobertaTokenizerZXLNetTokenizerZSplinterTokenizerZXGLMTokenizerZLlamaTokenizerZCodeLlamaTokenizerZGemmaTokenizerZPhi3TokenizerFrS   c                 C   sv   | j j}|tv r(|s(t| }||  S z td t| j| jd W S  t	yp   t
dtt  Y n0 dS )a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zConverting from Tiktoken)r   r  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: N)r   r\   SLOW_TO_FAST_CONVERTERSrr   loggerinfor  r   r  r  r  rc   r   )Ztransformer_tokenizerZfrom_tiktokenZtokenizer_class_nameZconverter_classr   r   r   convert_slow_tokenizer  s     

r  )r   )F)Pr_   r   typingr   	packagingr   Z
tokenizersr   r   r   r   r   r	   r
   Ztokenizers.modelsr   r   r   utilsr   r   r   r   Zutils.import_utilsr   Z
get_loggerr\   r  r    boolr`   r*   rK   rL   rd   rm   rn   rs   r   r   r   r   r   r   r   r   r   r   r   r  r#  r)  r1  r3  rc  r|  r}  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r   <module>   s   $


'2''(.' %!5% ($+'4.&)ZR=