a
    h                  8   @   s   d Z ddlZddlZddlmZ ddlZddlmZ ddl	m
Z
 e
eZddd	Zd
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dA7ZdBdC ZG dDdE dEeZdEgZdS )Fz)Tokenization classes for Salesforce CTRL.    N)Optional   )PreTrainedTokenizer)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_filei i  i i  i  i#j  iv i~ i6  i  iv  i i.  i i  iך  iͨ  i  i%  i  i  i3  iR- in  iS.  iK  i iw  i  i[  i*  i  i  i/  i?  i in1  i  ip i  i i iϒ  i	  i) i- i( i  iK i  i iǢ  i  ih  i )7Z	PregnancyZChristianityZExplainZFitnessZSavingZAskZAssZJokeZ	QuestionsZThoughtsZRetailZFeminismZWritingZAtheismZNetflixZ	ComputingZOpinionZAloneFunnyZGamingZHumanZIndiaZJokerZDietZLegalZNormanZTipZWeightZMoviesZRunningZScienceZHorrorZ
ConfessionZFinanceZPoliticsZScaryZSupportZTechnologiesZTeenageEventZLearnedZNotionZ	WikipediaZBooksZExtractZConfessionsZ
ConspiracyZLinksZ	NarcissusZRelationshipZRelationshipsZReviewsZNewsZTranslationZmultilingualc                 C   s>   t  }| d }| dd D ]}|||f |}qt |}|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairsZ	prev_charchar r   f/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/ctrl/tokenization_ctrl.py	get_pairs^   s    r   c                       s   e Zd ZdZeZeZd fdd	Ze	dd Z
dd Zd	d
 Zdd Zdd Zdd Zdd Zdeee ee dddZ  ZS )CTRLTokenizera`  
    Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    <unk>c                    s   t |dd}t|| _W d    n1 s.0    Y  dd | j D | _t |dd&}| ddd }W d    n1 s0    Y  dd	 |D }tt	|t
t|| _i | _t jf d
|i| d S )Nutf-8encodingc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>       z*CTRLTokenizer.__init__.<locals>.<dictcomp>
r
   c                 S   s   g | ]}t | qS r   )tuplesplit)r   merger   r   r   
<listcomp>   r   z*CTRLTokenizer.__init__.<locals>.<listcomp>	unk_token)openjsonloadencoderitemsdecoderreadr    dictziprangelen	bpe_rankscachesuper__init__)selfr   r   r#   kwargsZvocab_handleZmerges_handleZmerges	__class__r   r   r2      s    *4zCTRLTokenizer.__init__c                 C   s
   t | jS N)r.   r'   r3   r   r   r   
vocab_size   s    zCTRLTokenizer.vocab_sizec                 C   s   t | jfi | jS r7   )r+   r'   Zadded_tokens_encoderr8   r   r   r   	get_vocab   s    zCTRLTokenizer.get_vocabc           
         s  | j v r j | S t|}tt|d d |d d g }t|}|sN|S t| fddd}| jvrpql|\}}g }d}|t|k rBz|||}	W n* ty   |	||d   Y qBY n0 |	|||	  |	}|| |kr*|t|d k r*||d  |kr*|
||  |d7 }q|
||  |d7 }qt|}|}t|dkrbqlqNt|}qNd	|}|d d
 }| j |< |S )Nr   z</w>c                    s    j | tdS )Ninf)r/   getfloat)pairr8   r   r   <lambda>   r   z#CTRLTokenizer.bpe.<locals>.<lambda>keyr   r
      @@ )r0   r   listr   minr/   r.   index
ValueErrorextendappendjoin)
r3   tokenr   r   ZbigramfirstsecondZnew_wordijr   r8   r   bpe   sF    

"
2




zCTRLTokenizer.bpec                 C   s8   g }t d|}|D ]}|t| |d q|S )zTokenize a string.z\S+\n? )refindallrI   rE   rQ   r    )r3   textZsplit_tokenswordsrL   r   r   r   	_tokenize   s
    zCTRLTokenizer._tokenizec                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r'   r<   r#   )r3   rL   r   r   r   _convert_token_to_id   s    z"CTRLTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r)   r<   r#   )r3   rG   r   r   r   _convert_id_to_token   s    z"CTRLTokenizer._convert_id_to_tokenc                 C   s   d |dd }|S )z:Converts a sequence of tokens (string) in a single string.rR   rC    )rK   replacestrip)r3   tokensZ
out_stringr   r   r   convert_tokens_to_string   s    z&CTRLTokenizer.convert_tokens_to_stringN)save_directoryfilename_prefixreturnc           
   	   C   sT  t j|s"td| d d S t j||r6|d ndtd  }t j||rX|d ndtd  }t|ddd	.}|t	j
| jd
dddd  W d    n1 s0    Y  d}t|ddd	v}|d t| j dd dD ]D\}}	||	krtd| d |	}|d|d  |d7 }qW d    n1 sB0    Y  ||fS )NzVocabulary path (z) should be a directory-rZ   r   r   wr   r   rB   TF)indent	sort_keysensure_asciir   r   z#version: 0.2
c                 S   s   | d S )Nr
   r   )kvr   r   r   r?      r   z/CTRLTokenizer.save_vocabulary.<locals>.<lambda>r@   zSaving vocabulary to zZ: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!rR   r
   )ospathisdirloggererrorrK   VOCAB_FILES_NAMESr$   writer%   dumpsr'   sortedr/   r(   warning)
r3   r_   r`   r   Z
merge_filefrG   writerZ
bpe_tokensZtoken_indexr   r   r   save_vocabulary   s.    <


*zCTRLTokenizer.save_vocabulary)r   )N)__name__
__module____qualname____doc__rm   Zvocab_files_namesCONTROL_CODEScontrol_codesr2   propertyr9   r:   rQ   rW   rX   rY   r^   strr   r   rt   __classcell__r   r   r5   r   r   n   s   
,
r   )rx   r%   rh   typingr   regexrS   Ztokenization_utilsr   utilsr   Z
get_loggerru   rk   rm   ry   r   r   __all__r   r   r   r   <module>   s   
; 