a
    h>                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlZddlmZ ddlmZ ddlmZ erdd	lmZ dd
lmZmZ ddlmZ eeZddiZdZeddG dd deZdgZdS )z$Tokenization class for SigLIP model.    N)copyfile)TYPE_CHECKINGAnyOptional   )import_protobuf)PreTrainedTokenizer)
AddedToken)	TextInput)loggingrequires_backends)requires
vocab_filezspiece.modelu   ▁)sentencepiece)backendsc                       sx  e Zd ZdZeZddgZd8eee	e
f  dd	 fd
dZdd Zedd Zdd Zd9ee eee  eee d fddZee ee dddZd:ee eee  ee dddZd;ee eee  ee dddZdd Zd d! Ze	e	d"d#d$Zdd%d&d'Zd<d(ee	 d" fd)d*Zed+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Z d=e	ee	 e!e	 d5d6d7Z"  Z#S )>SiglipTokenizera  
    Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"</s>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`list[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        model_max_length (`int`, *optional*, defaults to 64):
            The maximum length (in number of tokens) for model inputs.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
    Z	input_idsZattention_mask</s><unk>N@   T)sp_model_kwargsreturnc	           
   
      s   t | d t|tr&t|dddddn|}t|trFt|dddddn|}t|trft|dddddn|}|d u rvi n|| _|| _|| _|  | _|| _t	 j
f ||||| j||d|	 d S )NprotobufTF)rstriplstrip
normalizedZspecial)	eos_token	unk_token	pad_tokenadditional_special_tokensr   model_max_lengthdo_lower_case)r   
isinstancestrr	   r   r    r   get_spm_processorsp_modelsuper__init__)
selfr   r   r   r   r   r   r   r    kwargs	__class__ j/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/siglip/tokenization_siglip.pyr&   Z   s:    

zSiglipTokenizer.__init__c                 C   s   t jf i | j}t| jdV}| }t }|j|}|	 }d|_
|j| | }|| W d    n1 sz0    Y  |S )NrbF)spmSentencePieceProcessorr   openr   readr   Z
ModelProtoZ
FromStringZNormalizerSpecZadd_dummy_prefixnormalizer_specZ	MergeFromZSerializeToStringZLoadFromSerializedProto)r'   	tokenizerfr$   Z	model_pb2modelr2   r+   r+   r,   r#      s    (z!SiglipTokenizer.get_spm_processorc                 C   s
   | j  S N)r$   Zget_piece_sizer'   r+   r+   r,   
vocab_size   s    zSiglipTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r+   )Zconvert_ids_to_tokens).0ir7   r+   r,   
<dictcomp>       z-SiglipTokenizer.get_vocab.<locals>.<dictcomp>)ranger8   updateZadded_tokens_encoder)r'   Zvocabr+   r7   r,   	get_vocab   s    zSiglipTokenizer.get_vocabF)token_ids_0token_ids_1already_has_special_tokensr   c                    sZ   |rt  j||ddS |du r2dgt| dg S dgt| dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r@   rA   rB   Nr      )r%   get_special_tokens_masklen)r'   r@   rA   rB   r)   r+   r,   rD      s    z'SiglipTokenizer.get_special_tokens_mask)	token_idsr   c                 C   sB   t |dkr2|d | jkr2td| j d |S || jg S dS )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.N)rE   eos_token_idwarningswarnr   )r'   rF   r+   r+   r,   _add_eos_if_not_present   s    z'SiglipTokenizer._add_eos_if_not_present)r@   rA   r   c                 C   s<   | j g}|du r"t|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        Nr   )rH   rE   )r'   r@   rA   Zeosr+   r+   r,   $create_token_type_ids_from_sequences   s    z4SiglipTokenizer.create_token_type_ids_from_sequencesc                 C   s,   |  |}|du r|S |  |}|| S dS )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)rK   )r'   r@   rA   r+   r+   r,    build_inputs_with_special_tokens   s
    

z0SiglipTokenizer.build_inputs_with_special_tokensc                 C   s   | j  }d |d< |S )Nr$   )__dict__copy)r'   stater+   r+   r,   __getstate__   s    
zSiglipTokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjf i | j| _| j| j d S )Nr   )rN   hasattrr   r.   r/   r$   Loadr   )r'   dr+   r+   r,   __setstate__  s
    
zSiglipTokenizer.__setstate__)textr   c                 C   s   | tddtjS )N )	translater"   	maketransstringpunctuation)r'   rV   r+   r+   r,   remove_punctuation  s    z"SiglipTokenizer.remove_punctuationkeep_punctuation_exact_stringc                   sH   |r$|  fdd||D }n
 |}tdd|}| }|S )a  Returns canonicalized `text` (puncuation removed).

        Args:
            text (`str`):
                String to be canonicalized.
            keep_punctuation_exact_string (`str`, *optional*):
                If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
                (but will still remove '{' and '}' that appear separately).
        c                 3   s   | ]}  |V  qd S r6   )r\   )r9   partr7   r+   r,   	<genexpr>  s   z4SiglipTokenizer.canonicalize_text.<locals>.<genexpr>z\s+ )joinsplitr\   resubstrip)r'   rV   r^   r+   r7   r,   canonicalize_text  s    


z!SiglipTokenizer.canonicalize_textr
   c                    sV   t  jt|td fi |}t|dkrR|d tkrR|d | jv rR|dd }|S )z8
        Converts a string to a list of tokens.
        ra   rC   r   N)r%   tokenizeSPIECE_UNDERLINEreplacerE   all_special_tokens)r'   rV   Zadd_special_tokensr(   tokensr)   r+   r,   rh   (  s     &zSiglipTokenizer.tokenizec                 C   s   t | jt| jS r6   )rE   r$   encoder"   r   r7   r+   r+   r,   unk_token_length2  s    z SiglipTokenizer.unk_token_lengthc                 K   sT   | j |dd}| jj|td}| jj| j| td}t|| jkrP|| jd S |S )u*  
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE.

        For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.

        Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        Nr]   )Zout_type)rg   r$   rm   r"   r   rE   rn   )r'   rV   r(   rl   r+   r+   r,   	_tokenize7  s    zSiglipTokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r$   Zpiece_to_id)r'   tokenr+   r+   r,   _convert_token_to_idL  s    z$SiglipTokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r$   Z	IdToPiece)r'   indexrp   r+   r+   r,   _convert_id_to_tokenQ  s    z$SiglipTokenizer._convert_id_to_tokenc                 C   sp   g }d}d}|D ]F}|| j v rH|s*|d7 }|| j|| 7 }d}g }q|| d}q|| j|7 }| S )z:Converts a sequence of tokens (string) in a single string.rW   Fra   T)rk   r$   decodeappendrf   )r'   rl   Zcurrent_sub_tokensZ
out_stringZprev_is_specialrp   r+   r+   r,   convert_tokens_to_stringV  s    

z(SiglipTokenizer.convert_tokens_to_string)save_directoryfilename_prefixr   c                 C   s   t j|s"td| d d S t j||r6|d ndtd  }t j| jt j|krzt j	| jrzt
| j| nLt j	| jst|d$}| j }|| W d    n1 s0    Y  |fS )NzVocabulary path (z) should be a directory-rW   r   wb)ospathisdirloggererrorrb   VOCAB_FILES_NAMESabspathr   isfiler   r0   r$   Zserialized_model_protowrite)r'   rw   rx   Zout_vocab_filefiZcontent_spiece_modelr+   r+   r,   save_vocabularyj  s    (
(zSiglipTokenizer.save_vocabulary)r   r   r   NNr   T)NF)N)N)F)N)$__name__
__module____qualname____doc__r   Zvocab_files_namesZmodel_input_namesr   dictr"   r   r&   r#   propertyr8   r?   listintboolrD   rK   rL   rM   rQ   rU   r\   rg   rh   rn   ro   rq   rs   rv   tupler   __classcell__r+   r+   r)   r,   r   -   s\   (       1
   


r   ) r   r{   rd   rZ   rI   shutilr   typingr   r   r   r   r.   Zconvert_slow_tokenizerr   Ztokenization_utilsr   Ztokenization_utils_baser	   r
   utilsr   r   Zutils.import_utilsr   Z
get_loggerr   r~   r   ri   r   __all__r+   r+   r+   r,   <module>   s,   
  P