"""Tokenization classes for CPMAnt."""

import collections
import os
from typing import Optional

from transformers.utils import is_jieba_available, requires_backends


if is_jieba_available():
    import jieba

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab
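
# The vocabulary file read above is plain text with one token per line; a
# token's id is simply its zero-based line number. A minimal sketch of the
# expected format (the tokens shown are illustrative, not the real vocabulary):
#
#     vocab.txt:          load_vocab("vocab.txt") ->
#         <pad>               {"<pad>": 0,
#         <unk>                "<unk>": 1,
#         北京                  "北京": 2}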

class WordpieceTokenizer:
    def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, token):
        chars = list(token)
        if len(chars) > self.max_input_chars_per_word:
            return [self.unk_token]

        start = 0
        sub_tokens = []
        while start < len(chars):
            end = len(chars)
            cur_substr = None
            while start < end:
                substr = "".join(chars[start:end])
                if substr in self.vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                sub_tokens.append(self.unk_token)
                start += 1
            else:
                sub_tokens.append(cur_substr)
                start = end

        return sub_tokens
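
# tokenize() above is a greedy longest-match-first scan: at each position it
# tries the longest substring still in the vocabulary, emits it, and resumes
# after the match; an unmatchable character becomes unk_token. A toy trace
# (vocabulary and inputs are illustrative, not the real CPMAnt vocabulary):
#
#     wp = WordpieceTokenizer(vocab={"北京": 0, "欢": 1, "迎": 2, "你": 3})
#     wp.tokenize("北京欢迎你")  # -> ["北京", "欢", "迎", "你"]
#     wp.tokenize("北京x")      # -> ["北京", "<unk>"]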

class CpmAntTokenizer(PreTrainedTokenizer):
    """
    Construct a CPMAnt tokenizer. Based on `jieba` word segmentation followed by greedy longest-match wordpiece lookup.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
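
    Example (a minimal sketch; the checkpoint name is an assumption, and `jieba`
    must be installed):

    ```python
    >>> from transformers import CpmAntTokenizer

    >>> tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
    >>> tokenizer.tokenize("今天天气真好")  # jieba cut, then wordpiece per segment
    ```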
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    add_prefix_space = False

    def __init__(
        self,
        vocab_file,
        bod_token="<d>",
        eod_token="</d>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        line_token="</n>",
        space_token="</_>",
        padding_side="left",
        **kwargs,
    ):
        requires_backends(self, ["jieba"])
        self.bod_token = bod_token
        self.eod_token = eod_token
        self.encoder = load_vocab(vocab_file)
        # Remap the literal space and newline characters onto the ids of the
        # dedicated space/line tokens, then drop the token spellings themselves
        # (save_vocabulary undoes this remapping when writing the file back).
        self.encoder[" "] = self.encoder[space_token]
        self.encoder["\n"] = self.encoder[line_token]

        del self.encoder[space_token]
        del self.encoder[line_token]

        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

        super().__init__(
            bod_token=bod_token,
            eod_token=eod_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            line_token=line_token,
            space_token=space_token,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def bod_token_id(self):
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        return self.encoder[self.eod_token]
   | j d S )Nr   rP   rY   r   r   r   
newline_id   s    zCpmAntTokenizer.newline_id)returnc                 C   s
   t | jS r   )r'   rP   rY   r   r   r   
vocab_size   s    zCpmAntTokenizer.vocab_sizec                 C   s   t | jfi | jS r   )dictrP   Zadded_tokens_encoderrY   r   r   r   	get_vocab   s    zCpmAntTokenizer.get_vocabc                 C   s.   g }t j|ddD ]}|| j| q|S )zTokenize a string.F)Zcut_all)r;   cutextendrT   r.   )r"   textZoutput_tokensr?   r   r   r   	_tokenize   s    zCpmAntTokenizer._tokenizec                    s4   dd |D } fdd|D }t  j|fi |S )zDecode ids into a string.c                 S   s   g | ]}|d kr|qS )r   r   )rD   ir   r   r   
<listcomp>   rA   z+CpmAntTokenizer._decode.<locals>.<listcomp>c                    s.   g | ]&}| j kr| jkr| jkr|qS r   )Zpad_token_idZeos_token_idbos_token_id)rD   r?   rY   r   r   rg      s   )rU   _decode)r"   Z	token_idsrV   rW   rY   r   ri      s
    
zCpmAntTokenizer._decodec                 C   s
   || j v S r   r\   r"   r   r   r   r   check   s    zCpmAntTokenizer.check)r   r^   c                 C   s
   d |S )Nr$   )r(   )r"   r   r   r   r   convert_tokens_to_string   s    z(CpmAntTokenizer.convert_tokens_to_stringc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)rP   getr    rj   r   r   r   _convert_token_to_id   s    z$CpmAntTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)rS   rm   r    )r"   r   r   r   r   _convert_id_to_token   s    z$CpmAntTokenizer._convert_id_to_tokenN)save_directoryfilename_prefixr^   c                 C   s(  t j|r0t j||r |d ndtd  }n|r<|d nd| }d}d| jv rj| jd | jd< | jd= d| jv r| jd | jd< | jd= tt| j	 d	d
 d| _t
|dddX}| j	 D ]<\}}||krtd| d |}||d  |d7 }qW d    n1 s0    Y  |fS )N-r$   r   r   r<   r9   r   r8   c                 S   s   | d S r=   r   r>   r   r   r   r@      rA   z1CpmAntTokenizer.save_vocabulary.<locals>.<lambda>rB   wr
   r   zSaving vocabulary to z\: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!r%   )ospathisdirr(   VOCAB_FILES_NAMESrP   r   r   rQ   rR   r   loggerwarningwrite)r"   rp   rq   r   r   writerr   Ztoken_indexr   r   r   save_vocabulary   s.    


*zCpmAntTokenizer.save_vocabulary)token_ids_0token_ids_1r^   c                 C   s,   |du r| j g| S | j g| | j g | S )a1  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.

        Args:
            token_ids_0 (`list[int]`): The first tokenized sequence to which special tokens will be added.
            token_ids_1 (`list[int]`): The optional second tokenized sequence to which special tokens will be added.

        Returns:
            `list[int]`: The model input with special tokens.
        """
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0
        return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`): List of IDs.
            token_ids_1 (`list[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))


__all__ = ["CpmAntTokenizer"]
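
# A quick sketch of the two special-token helpers above (token ids and the
# `tok` instance are illustrative; `bos_id` stands for tok.bos_token_id):
#
#     tok.build_inputs_with_special_tokens([5, 6])       # -> [bos_id, 5, 6]
#     tok.build_inputs_with_special_tokens([5, 6], [7])  # -> [bos_id, 5, 6, bos_id, 7]
#     tok.get_special_tokens_mask([5, 6], [7])           # -> [1, 0, 0, 1, 0]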