"""Tokenization classes for ESM."""

import os
from typing import Optional

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab_file(vocab_file):
    with open(vocab_file, "r") as f:
        lines = f.read().splitlines()
        return [l.strip() for l in lines]
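
# A sketch of the vocab file layout that load_vocab_file expects: plain text,
# one token per line. The token set and ordering shown here are illustrative
# assumptions; real ESM checkpoints ship their own vocab.txt.
#
#     <cls>
#     <pad>
#     <eos>
#     <unk>
#     L
#     A
#     G
#     ...
#     <mask>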
	Zee	dddZ
e	edddZdd Zdd Ze	edddZee	dddZd'ee eee  ee dddZd(eee eee ddd Zd!d" Zeed#d$d%Z  ZS ))EsmTokenizerz&
    Constructs an ESM tokenizer.
    Z	input_idsZattention_mask<unk><cls><pad><mask><eos>c                    sf   t || _tt| j| _dd t| jD | _t jf |||||d| | j| _| 	| j d S )Nc                 S   s   i | ]\}}||qS r	   r	   )r   indtokr	   r	   r   
<dictcomp>7   r   z)EsmTokenizer.__init__.<locals>.<dictcomp>)	unk_token	cls_token	pad_token
mask_token	eos_token)
r   
all_tokensdict	enumerate_id_to_token_token_to_idsuper__init__Zunique_no_split_tokensZ_update_trie)selfr   r   r    r!   r"   r#   kwargs	__class__r	   r   r*   +   s    

zEsmTokenizer.__init__)indexreturnc                 C   s   | j || jS Nr'   getr   r+   r/   r	   r	   r   _convert_id_to_tokenG   s    z!EsmTokenizer._convert_id_to_token)tokenr0   c                 C   s   | j || j | jS r1   r(   r3   r   r+   r6   r	   r	   r   _convert_token_to_idJ   s    z!EsmTokenizer._convert_token_to_idc                 K   s   |  S r1   )split)r+   textr,   r	   r	   r   	_tokenizeM   s    zEsmTokenizer._tokenizec                 C   s   | j  }|| j |S r1   )r(   copyupdateZadded_tokens_encoder)r+   Z
base_vocabr	   r	   r   	get_vocabP   s    
zEsmTokenizer.get_vocabc                 C   s   | j || j | jS r1   r7   r8   r	   r	   r   token_to_idU   s    zEsmTokenizer.token_to_idc                 C   s   | j || jS r1   r2   r4   r	   r	   r   id_to_tokenX   s    zEsmTokenizer.id_to_tokenN)token_ids_0token_ids_1r0   c                 C   s^   | j g}| jg}|d u r8| jd u r*|| S || | S n| jd u rJtd|| | | | S )Nz=Cannot tokenize multiple sequences when EOS token is not set!)Zcls_token_idZeos_token_id
    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        cls = [self.cls_token_id]
        sep = [self.eos_token_id]  # No sep token in ESM vocabulary
        if token_ids_1 is None:
            if self.eos_token_id is None:
                return cls + token_ids_0
            else:
                return cls + token_ids_0 + sep
        elif self.eos_token_id is None:
            raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!")
        return cls + token_ids_0 + sep + token_ids_1 + sep  # Multiple inputs always have an EOS token

    def get_special_tokens_mask(
        self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`list[int]`):
                List of ids of the first sequence.
            token_ids_1 (`list[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        NzYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.c                    s   g | ]}| j v rd ndqS )   r   )Zall_special_ids)r   r6   r+   r	   r   r      r   z8EsmTokenizer.get_special_tokens_mask.<locals>.<listcomp>rI   r   )rD   len)r+   rB   rC   rH   maskr	   rJ   r   get_special_tokens_maski   s    z$EsmTokenizer.get_special_tokens_maskc                 C   s`   t j||r|d ndd }t|d"}|d| j W d    n1 sP0    Y  |fS )N- r   w
)ospathjoinr   writer$   )r+   Zsave_directoryZfilename_prefixr   r   r	   r	   r   save_vocabulary   s    0zEsmTokenizer.save_vocabulary)r0   c                 C   s
   t | jS r1   )rK   r$   rJ   r	   r	   r   
vocab_size   s    zEsmTokenizer.vocab_size)r   r   r   r   r   )N)NF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESZvocab_files_namesZmodel_input_namesr*   intstrr5   r9   r<   r?   r@   rA   listr   rG   boolrM   rV   propertyrW   __classcell__r	   r	   r-   r   r   #   s8          
r   )r[   rR   typingr   Ztokenization_utilsr   utilsr   Z
get_loggerrX   loggerr\   r   r   __all__r	   r	   r	   r   <module>   s   
p
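

# Illustrative smoke test, not part of the original module. The token set and
# ordering below are assumptions for demonstration only; real ESM checkpoints
# ship their own vocab.txt.
if __name__ == "__main__":
    import tempfile

    demo_tokens = ["<cls>", "<pad>", "<eos>", "<unk>"] + list("LAGVSERTIDPKQNFYMHWC") + ["<mask>"]
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
        f.write("\n".join(demo_tokens))

    tokenizer = EsmTokenizer(vocab_file=f.name)
    encoding = tokenizer("MKTAYIA")
    # Expected token layout: <cls> M K T A Y I A <eos>
    print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))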