import json
import os
import re
import unicodedata
from typing import Optional

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def replace_unicode_punct(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    """
    text = text.replace("，", ",")
    text = re.sub(r"。\s*", ". ", text)
    text = text.replace("、", ",")
    text = text.replace("”", '"')
    text = text.replace("“", '"')
    text = text.replace("∶", ":")
    text = text.replace("：", ":")
    text = text.replace("？", "?")
    text = text.replace("《", '"')
    text = text.replace("》", '"')
    text = text.replace("）", ")")
    text = text.replace("！", "!")
    text = text.replace("（", "(")
    text = text.replace("；", ";")
    text = text.replace("１", "1")
    text = text.replace("」", '"')
    text = text.replace("「", '"')
    text = text.replace("０", "0")
    text = text.replace("３", "3")
    text = text.replace("２", "2")
    text = text.replace("５", "5")
    text = text.replace("６", "6")
    text = text.replace("９", "9")
    text = text.replace("７", "7")
    text = text.replace("８", "8")
    text = text.replace("４", "4")
    text = re.sub(r"．\s*", ". ", text)
    text = text.replace("～", "~")
    text = text.replace("’", "'")
    text = text.replace("…", "...")
    text = text.replace("━", "-")
    text = text.replace("〈", "<")
    text = text.replace("〉", ">")
    text = text.replace("【", "[")
    text = text.replace("】", "]")
    text = text.replace("％", "%")
    return text


def remove_non_printing_char(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
    """
    output = []
    for char in text:
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            continue
        output.append(char)
    return "".join(output)


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

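
# Illustrative sketch, not part of the upstream module: a quick sanity check of the helper
# functions above. The function is only defined for documentation purposes and is never
# called on import; the sample inputs are arbitrary.
def _demo_helpers():
    # get_pairs works on a word given as a tuple of symbols and returns its adjacent symbol pairs.
    assert get_pairs(("h", "e", "l", "l", "o")) == {("h", "e"), ("e", "l"), ("l", "l"), ("l", "o")}
    # whitespace_tokenize is a plain strip-and-split on whitespace.
    assert whitespace_tokenize("  Ala ma kota ") == ["Ala", "ma", "kota"]
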

class BasicTokenizer:
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*):
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
        """
        # union() returns a new set combining the instance-level and call-level never_split tokens.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # Prevents treating the same character with different unicode codepoints as different characters.
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # "Chinese character" here means anything in the CJK Unified Ideographs Unicode blocks.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

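
# Illustrative sketch, not part of the upstream module: what the BERT-style pre-tokenizer above
# produces. HerBERT instantiates it with do_lower_case=False; the default settings are used here
# purely for illustration. The function is never called on import.
def _demo_basic_tokenizer():
    tokenizer = BasicTokenizer(do_lower_case=True)
    # Punctuation becomes its own token and the text is lowercased.
    assert tokenizer.tokenize("Hello, World!") == ["hello", ",", "world", "!"]
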

class HerbertTokenizer(PreTrainedTokenizer):
    """
    Construct a BPE tokenizer for HerBERT.

    Peculiarities:

    - uses BERT's pre-tokenizer: BasicTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
      punctuation character will be treated separately.

    - Such pretokenized input is BPE subtokenized

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
    to the superclass for more information regarding those methods.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        merges_file,
        tokenizer_file=None,
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        sep_token="</s>",
        bos_token="<s>",
        do_lowercase_and_remove_accent=False,
        additional_special_tokens=[
            "<special0>",
            "<special1>",
            "<special2>",
            "<special3>",
            "<special4>",
            "<special5>",
            "<special6>",
            "<special7>",
            "<special8>",
            "<special9>",
        ],
        lang2id=None,
        id2lang=None,
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use HerbertTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses

        # Cache of sm.MosesPunctNormalizer and sm.MosesTokenizer instances, keyed by language.
        self.cache_moses_punct_normalizer = {}
        self.cache_moses_tokenizer = {}
        self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
        self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
        self.lang2id = lang2id
        self.id2lang = id2lang
        if lang2id is not None and id2lang is not None:
            assert len(lang2id) == len(id2lang)

        self.ja_word_tokenizer = None
        self.zh_word_tokenizer = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            lang2id=lang2id,
            id2lang=id2lang,
            do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
            tokenizer_file=None,
            **kwargs,
        )

        self.bert_pre_tokenizer = BasicTokenizer(
            do_lower_case=False,
            never_split=self.all_special_tokens,
            tokenize_chinese_chars=False,
            strip_accents=False,
        )

    @property
    def do_lower_case(self):
        return self.do_lowercase_and_remove_accent

    def moses_punct_norm(self, text, lang):
        if lang not in self.cache_moses_punct_normalizer:
            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
            self.cache_moses_punct_normalizer[lang] = punct_normalizer
        else:
            punct_normalizer = self.cache_moses_punct_normalizer[lang]
        return punct_normalizer.normalize(text)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        else:
            moses_tokenizer = self.cache_moses_tokenizer[lang]
        return moses_tokenizer.tokenize(text, return_str=False, escape=False)

    def moses_pipeline(self, text, lang):
        text = replace_unicode_punct(text)
        text = self.moses_punct_norm(text, lang)
        text = remove_non_printing_char(text)
        return text

    def ja_tokenize(self, text):
        if self.ja_word_tokenizer is None:
            try:
                import Mykytea

                self.ja_word_tokenizer = Mykytea.Mykytea(
                    f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
                )
            except (AttributeError, ImportError):
                logger.error(
                    "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
                    " (https://github.com/chezou/Mykytea-python) with the following steps"
                )
                logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
                logger.error("2. autoreconf -i")
                logger.error("3. ./configure --prefix=$HOME/local")
                logger.error("4. make && make install")
                logger.error("5. pip install kytea")
                raise
        return list(self.ja_word_tokenizer.getWS(text))

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        # A word is the tuple of its characters, with "</w>" appended to the last one to mark the word end.
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # Greedily merge the lowest-ranked (most frequent) bigram until no known merge remains.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        pre_tokens = self.bert_pre_tokenizer.tokenize(text)

        split_tokens = []
        for token in pre_tokens:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = "".join(tokens).replace("</w>", " ").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An XLM sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        bos = [self.bos_token_id]
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return bos + token_ids_0 + sep
        return bos + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self,
        token_ids_0: list[int],
        token_ids_1: Optional[list[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> list[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def __getstate__(self):
        # sacremoses module objects cannot be pickled; drop the handle and restore it in __setstate__.
        state = self.__dict__.copy()
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use XLMTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses


__all__ = ["HerbertTokenizer"]

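
# A minimal usage sketch, not part of the upstream module, runnable only when this file is
# executed directly. It assumes the public "allegro/herbert-base-cased" checkpoint is reachable
# on the Hugging Face Hub, that `sacremoses` is installed, and the Polish sentence is arbitrary.
if __name__ == "__main__":
    tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-base-cased")

    # BERT-style pre-tokenization followed by BPE sub-tokenization.
    print(tokenizer.tokenize("Zażółć gęślą jaźń."))

    # A single sequence is wrapped as `<s> X </s>` by build_inputs_with_special_tokens.
    encoded = tokenizer("Zażółć gęślą jaźń.")
    print(encoded["input_ids"])
    print(tokenizer.decode(encoded["input_ids"]))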
þ,
 "  V