import json
import os
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Any, Optional, Union

import sentencepiece

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "source_spm": "source.spm",
    "target_spm": "target.spm",
    "vocab": "vocab.json",
    "target_vocab_file": "target_vocab.json",
    "tokenizer_config_file": "tokenizer_config.json",
}

SPIECE_UNDERLINE = "▁"


@requires(backends=("sentencepiece",))
class MarianTokenizer(PreTrainedTokenizer):
    r"""
    Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        source_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the source language.
        target_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the target language.
        source_lang (`str`, *optional*):
            A string representing the source language.
        target_lang (`str`, *optional*):
            A string representing the target language.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        model_max_length (`int`, *optional*, defaults to 512):
            The maximum sentence length the model accepts.
        additional_special_tokens (`list[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
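
            For illustration, subword regularization could plausibly be enabled with
            `sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}`
            (hypothetical values shown, not library defaults).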

    Examples:

    ```python
    >>> from transformers import MarianForCausalLM, MarianTokenizer

    >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
    >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
    >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)

    >>> outputs = model(**inputs)  # should work
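
    >>> # A hypothetical follow-up (not part of the original example): round-trip the source ids
    >>> src_roundtrip = tokenizer.batch_decode(inputs["input_ids"], skip_special_tokens=True)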
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        source_spm,
        target_spm,
        vocab,
        target_vocab_file=None,
        source_lang=None,
        target_lang=None,
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
        model_max_length=512,
        sp_model_kwargs: Optional[dict[str, Any]] = None,
        separate_vocabs=False,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"

        self.separate_vocabs = separate_vocabs
        self.encoder = load_json(vocab)
        if str(unk_token) not in self.encoder:
            raise KeyError("<unk> token must be in the vocab")
        assert str(pad_token) in self.encoder

        if separate_vocabs:
            self.target_encoder = load_json(target_vocab_file)
            self.decoder = {v: k for k, v in self.target_encoder.items()}
            self.supported_language_codes = []
        else:
            self.decoder = {v: k for k, v in self.encoder.items()}
            self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")]

        self.source_lang = source_lang
        self.target_lang = target_lang
        self.spm_files = [source_spm, target_spm]

        # load SentencePiece models for pre-processing
        self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
        self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

        self._setup_normalizer()

        super().__init__(
            source_lang=source_lang,
            target_lang=target_lang,
            unk_token=unk_token,
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            sp_model_kwargs=self.sp_model_kwargs,
            target_vocab_file=target_vocab_file,
            separate_vocabs=separate_vocabs,
            **kwargs,
        )

    def _setup_normalizer(self):
        try:
            from sacremoses import MosesPunctNormalizer

            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
        except (ImportError, FileNotFoundError):
            warnings.warn("Recommended: pip install sacremoses.")
            self.punc_normalizer = lambda x: x

    def normalize(self, x: str) -> str:
        """Cover moses empty string edge case. They return empty list for '' input!"""
        return self.punc_normalizer(x) if x else ""

    def _convert_token_to_id(self, token):
        return self.current_encoder.get(token, self.current_encoder[self.unk_token])

    def remove_language_code(self, text: str):
        """Remove language codes like >>fr<< before sentencepiece"""
        code = []
        if text.startswith(">>") and (end_loc := text.find("<<")) != -1:
            code.append(text[: end_loc + 2])
            text = text[end_loc + 2 :]
        return code, text

    def _tokenize(self, text: str) -> list[str]:
        code, text = self.remove_language_code(text)
        pieces = self.current_spm.encode(text, out_type=str)
        return code + pieces

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the decoder."""
        return self.decoder.get(index, self.unk_token)

    def batch_decode(self, sequences, **kwargs):
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[list[int], list[list[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `list[str]`: The list of decoded sentences.
        """
        return super().batch_decode(sequences, **kwargs)

    def decode(self, token_ids, **kwargs):
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, list[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        return super().decode(token_ids, **kwargs)
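
    # For illustration (hypothetical token ids, not drawn from a real checkpoint):
    #     tokenizer.decode([21, 4, 63, 0], skip_special_tokens=True)
    # would drop special ids such as a trailing </s> and merge the remaining
    # SentencePiece pieces back into a plain string.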

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Uses source spm if _decode_use_source_tokenizer is True, and target spm otherwise"""
        sp_model = self.spm_source if self._decode_use_source_tokenizer else self.spm_target
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                out_string += sp_model.decode_pieces(current_sub_tokens) + token + " "
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += sp_model.decode_pieces(current_sub_tokens)
        out_string = out_string.replace(SPIECE_UNDERLINE, " ")
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def _switch_to_input_mode(self):
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

    def _switch_to_target_mode(self):
        self.current_spm = self.spm_target
        if self.separate_vocabs:
            self.current_encoder = self.target_encoder

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        saved_files = []

        if self.separate_vocabs:
            out_src_vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
            )
            out_tgt_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["target_vocab_file"],
            )
            save_json(self.encoder, out_src_vocab_file)
            save_json(self.target_encoder, out_tgt_vocab_file)
            saved_files.append(out_src_vocab_file)
            saved_files.append(out_tgt_vocab_file)
        else:
            out_vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
            )
            save_json(self.encoder, out_vocab_file)
            saved_files.append(out_vocab_file)

        for spm_save_filename, spm_orig_path, spm_model in zip(
            [VOCAB_FILES_NAMES["source_spm"], VOCAB_FILES_NAMES["target_spm"]],
            self.spm_files,
            [self.spm_source, self.spm_target],
        ):
            spm_save_path = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + spm_save_filename
            )
            if os.path.abspath(spm_orig_path) != os.path.abspath(spm_save_path) and os.path.isfile(spm_orig_path):
                copyfile(spm_orig_path, spm_save_path)
                saved_files.append(spm_save_path)
            elif not os.path.isfile(spm_orig_path):
                with open(spm_save_path, "wb") as fi:
                    content_spiece_model = spm_model.serialized_model_proto()
                    fi.write(content_spiece_model)
                saved_files.append(spm_save_path)

        return tuple(saved_files)

    def get_vocab(self) -> dict:
        return self.get_src_vocab()

    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def get_tgt_vocab(self):
        return dict(self.target_encoder, **self.added_tokens_decoder)

    def __getstate__(self) -> dict:
        state = self.__dict__.copy()
        state.update(
            dict.fromkeys(["spm_source", "spm_target", "current_spm", "punc_normalizer", "target_vocab_file"])
        )
        return state

    def __setstate__(self, d: dict) -> None:
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files)
        self.current_spm = self.spm_source
        self._setup_normalizer()

    def num_special_tokens_to_add(self, *args, **kwargs):
        """Just EOS"""
        return 1

    def _special_token_mask(self, seq):
        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
        return [1 if x in all_special_ids else 0 for x in seq]

    def get_special_tokens_mask(
        self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
        if already_has_special_tokens:
            return self._special_token_mask(token_ids_0)
        elif token_ids_1 is None:
            return self._special_token_mask(token_ids_0) + [1]
        else:
            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]


def load_spm(path: str, sp_model_kwargs: dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    spm.Load(path)
    return spm


def save_json(data, path: str) -> None:
    with open(path, "w") as f:
        json.dump(data, f, indent=2)


def load_json(path: str) -> Union[dict, list]:
    with open(path) as f:
        return json.load(f)


__all__ = ["MarianTokenizer"]
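

# Minimal usage sketch (illustrative only; assumes the "Helsinki-NLP/opus-mt-en-de"
# checkpoint is available to `from_pretrained`, which this module does not itself provide):
#
#     from transformers import MarianTokenizer
#
#     tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
#     enc = tok(["I am a small frog."])
#     tok.batch_decode(enc["input_ids"], skip_special_tokens=True)  # -> ["I am a small frog."]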