a
    hp                     @   sf  d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( e rddl)m*Z* ndZ*e+e,Z-ee.e/e	e. e	e. f f dde r0dndffde rDdnde rRdndffdde rhdndffdde r~dndffdde rdndffd de rd!ndffd"de rdndffd#d$e rd%nde rd&ndffd'd(de rdndffd)e rd*nddffd+d,d-e r*d.nde r8d/ndffd0d1e rNd2ndffd3d4de rfd5ndffd6d7d8de rdndffd9d:e rd;ndffd<de rd=ndffd>d?e rd@ndffdAde rdndffdBdCe rdDnde rdEndffdFdGe rdnde r dndffdHde r6dndffdId?e rLd@ndffdJde rbdndffdKde rxdndffdLdMe rdNnde rdOndffdPdQe rdRndffdSde rd!ndffdTde rd!ndffdUde rdndffdVdWe r
dXndffdYdZe r d[ndffd\e r4d]nde rBd^ndffd_d`dadbd?e r^d@ndffdcd:e rtd;ndffdddee rdfndffdge rdhnde rdindffdje rdnde rdndffdke rdnde rdndffdle rdnde rdndffdme r&dnde r4dndffdndoe rJdnde rXdndffdpdqe rndrndffdsdte rdundffdvdwe rdxndffdyd:e rd;ndffdzde rdndffd{de rdndffd|de rdndffd}e rd~nddffdde rd:nde r,d;ndffdde rBd5ndffdde rXdndffde rldnddffddde rdndffddde rdndffde rdnde rdndffde rdnde rdndffde rdnde rdndffde rdnde r&dndffde r:dnde rHdndffde r\dnde rjdndffdde rdndffdde rd5ndffdde rd5ndffdde rd5ndffdde rd5ndffdde rd5ndffde rdnddffdd:e rd;ndffdd:e r0d;ndffdd:e rFd;ndffdde r\dndffddde rtd5ndffdd:e rd;ndffddddddde rdndffdde rdndffdde rd5ndffdde rdndffddd?e 	rd@ndffdde 	rdndffdde 	r0dndffdde 	rFdndffdd:e 	r\d;ndffdd:e 	rrd;ndffddWe 	rdXndffde 	rdnde 	rdndffdde 	rdndffde 	rdnde 	rdndffdde 	rdnde 
rdndffdde 
rd5ndffdde 
r2dndffdde 
rHdndffdde 
r^dndffdde 
rtdndffdde 
rdndffdde 
rdndffde 
rdnde 
rdndffde 
rdnde 
rdndffde 
rdnde rdndffdde rdndffdde r2dndffdde rHdndffdde r^dndffdde rtdndffde rdnde rdndffddde rdndffde rdnddffdde rdndffdde rdndffde rdnddffde rdnde r(dndffde r<dnde rJdndffdd?e r`d@ndffdde rvdndffdde rdndffdde rd:nde rd;ndffde rdne rdnde re sdndffde rdne rdnde re sdndffdde r2dndffde rFdnddffdde r^dndffdde rtdndffdde rd5ndffdde rd5ndffdde rd5ndffdd e rАdndffdde rdndffdd?e r d@ndffde rdnde r(dndffdde r@dndffdde rXdndffd	d
e rtdndffddde rd5ndffdde rdndffde rdnde rАdndffde rdnde rdndffde rdnde rdndffdde r4dndffdde rLdndffdde rddndffdde r|dndffdde rdndffdde rdndffdd:e rd;ndffdde rdndffdde rdndffdde rdndffd e r&d1nde r4d2ndffd!e rJd1nde rXd2ndffd"d#e rrdnde rdndffd$dQe rdRndffd%de rdndffd&de rdndffd'd(de rdndffd)de rdne rd5ndffd*e r d+nddffd,d-de r>dndffd.dWe rVdXndffd/dWe rndXndffd0dWe rdXndffd1dWe rdXndffd2dWe rdXndffd3dWe rdXndffd4dWe rdXndffd5dWe rdXndffd6d7d8e rd9ndffd:e r4dnde rBdndffd;e rZd<nde rjd=ndffd>e rd?nde rd@ndffdAdBe rdCndffdDd?e rd@ndffdEd?e rd@ndffdFdGdHe rdIndffdJde rdndffdKe r.dLnde r>dMndffdNe rVdLnde rfdMndffdOe r|dnde rdndffdPe rdQnddffdRe rdnde rdndffdSde rd5ndffdTe rdUnddffdVdWe rdXnddffdYdZd[e r8d\ndffd]de rPdndffd^d:e rhd;ndffd_e r~dnde rdndffd`e rdnde rdndffdae rdnde rdndffdbdcdddede rdndffdfe rdgnde r dhndffdie r6dnde rDdndffdjde r\dndffdkde rtdndffdlde rdndffdmde rdndffdndoe rdne rdnde re sdndffdpdqdrdsdtdue rdvndffdwde r$dndffdxe r<dynde rLdzndffd{d|e rhd}nddffd~e rdnde rdndffde rdnde rdndffde rʐdnde rڐdndffdde rdndffde rdnde rdndffde r,dnde r:dndffde rPdnde r^dndffde rtdnde rdndffgZ0e"e$e0Z1dd e$2 D Z3e.e
e4e df dddZ5de
e.ej6e. f e	e
e.ej6e. f  e7e	e7 e	e8e.e.f  e	e
e7e.f  e	e. e7e.e8e.ef d
ddZ9G dd dZ:ddgZ;dS (  zAuto Tokenizer class.    N)OrderedDict)AnyOptionalUnion)is_mistral_common_available   )PretrainedConfig)get_class_from_dynamic_moduleresolve_trust_remote_code)load_gguf_checkpoint)PreTrainedTokenizer)TOKENIZER_CONFIG_FILE)cached_fileextract_commit_hashis_g2p_en_availableis_sentencepiece_availableis_tokenizers_availablelogging   )EncoderDecoderConfig   )_LazyAutoMapping)CONFIG_MAPPING_NAMES
AutoConfigconfig_class_to_model_typemodel_type_to_module_name!replace_list_option_in_docstrings)PreTrainedTokenizerFastZaimv2ZCLIPTokenizerZCLIPTokenizerFastZalbertZAlbertTokenizerZAlbertTokenizerFastalignZBertTokenizerZBertTokenizerFastZarceeZLlamaTokenizerZLlamaTokenizerFastZariaZ
aya_visionZCohereTokenizerFastZbark)Zbart)ZBartTokenizerZBartTokenizerFastZbarthezZBarthezTokenizerZBarthezTokenizerFast)Zbartpho)ZBartphoTokenizerNZbertzbert-generationZBertGenerationTokenizer)zbert-japanese)ZBertJapaneseTokenizerN)Zbertweet)ZBertweetTokenizerNZbig_birdZBigBirdTokenizerZBigBirdTokenizerFastZbigbird_pegasusZPegasusTokenizerZPegasusTokenizerFast)Zbiogpt)ZBioGptTokenizerNZbitnetr   )Z
blenderbot)ZBlenderbotTokenizerZBlenderbotTokenizerFast)zblenderbot-small)ZBlenderbotSmallTokenizerNZblipzblip-2GPT2TokenizerZGPT2TokenizerFastZbloomZBloomTokenizerFastZbridgetowerZRobertaTokenizerZRobertaTokenizerFastZbros)Zbyt5)ZByT5TokenizerNZ	camembertZCamembertTokenizerZCamembertTokenizerFast)Zcanine)ZCanineTokenizerNZ	chameleonZchinese_clipclapZclipZclipseg)Zclvp)ZClvpTokenizerNZ
code_llamaZCodeLlamaTokenizerZCodeLlamaTokenizerFastZcodegenZCodeGenTokenizerZCodeGenTokenizerFastZcohereZcohere2ZcolpaliZcolqwen2ZQwen2TokenizerZQwen2TokenizerFastZconvbertZConvBertTokenizerZConvBertTokenizerFastZcpmZCpmTokenizerZCpmTokenizerFast)Zcpmant)ZCpmAntTokenizerN)Zctrl)ZCTRLTokenizerN)zdata2vec-audioZWav2Vec2CTCTokenizerNzdata2vec-textZdbrxZdebertaZDebertaTokenizerZDebertaTokenizerFastz
deberta-v2ZDebertaV2TokenizerZDebertaV2TokenizerFastZdeepseek_v2Zdeepseek_v3Zdeepseek_vlZdeepseek_vl_hybrid)Zdia)ZDiaTokenizerNZ	diffllamaZ
distilbertZDistilBertTokenizerZDistilBertTokenizerFastZdprZDPRQuestionEncoderTokenizerZDPRQuestionEncoderTokenizerFastZelectraZElectraTokenizerZElectraTokenizerFastZemu3ZernieZernie4_5Zernie4_5_moeZernie_mZErnieMTokenizer)Zesm)ZEsmTokenizerNZexaone4ZfalconZfalcon_mambaZGPTNeoXTokenizerFastZfastspeech2_conformerZFastSpeech2ConformerTokenizer)Zflaubert)ZFlaubertTokenizerNZfnetZFNetTokenizerZFNetTokenizerFast)Zfsmt)ZFSMTTokenizerNZfunnelZFunnelTokenizerZFunnelTokenizerFastZgemmaZGemmaTokenizerZGemmaTokenizerFastZgemma2Zgemma3Zgemma3_textZgemma3nZgemma3n_textgitZglmZglm4Zglm4_moeZglm4vZ	glm4v_moezgpt-sw3ZGPTSw3TokenizerZgpt2Zgpt_bigcodeZgpt_neoZgpt_neox)Zgpt_neox_japanese)ZGPTNeoXJapaneseTokenizerNZgpt_ossZgptj)zgptsan-japanese)ZGPTSanJapaneseTokenizerN)Zgraniter   N)Z
granitemoer#   )Zgranitemoehybridr#   )Zgranitemoesharedr#   zgrounding-dinoZgroupvitZheliumZherbertZHerbertTokenizerZHerbertTokenizerFast)Zhubertr!   ZibertZideficsZidefics2Zidefics3ZinstructblipZinstructblipvideoZinternvlZjambaZjanusZjetmoe)Zjukebox)ZJukeboxTokenizerNzkosmos-2ZXLMRobertaTokenizerZXLMRobertaTokenizerFastz
kosmos-2.5ZlayoutlmZLayoutLMTokenizerZLayoutLMTokenizerFastZ
layoutlmv2ZLayoutLMv2TokenizerZLayoutLMv2TokenizerFastZ
layoutlmv3ZLayoutLMv3TokenizerZLayoutLMv3TokenizerFastZ	layoutxlmZLayoutXLMTokenizerZLayoutXLMTokenizerFastZledZLEDTokenizerZLEDTokenizerFastZliltllamaZllama4Zllama4_textZllavaZ
llava_nextZllava_next_videoZllava_onevisionZ
longformerZLongformerTokenizerZLongformerTokenizerFastZlongt5ZT5TokenizerZT5TokenizerFast)Zluke)ZLukeTokenizerNZlxmertZLxmertTokenizerZLxmertTokenizerFastZm2m_100ZM2M100TokenizerZmambaZmamba2ZmarianZMarianTokenizerZmbartZMBartTokenizerZMBartTokenizerFastZmbart50ZMBart50TokenizerZMBart50TokenizerFastmegazmegatron-bertZ
metaclip_2)zmgp-str)ZMgpstrTokenizerNZminimaxmistralMistralCommonTokenizermixtralZmllamaZmlukeZMLukeTokenizerzmm-grounding-dinoZ
mobilebertZMobileBertTokenizerZMobileBertTokenizerFastZ
modernbertZ	moonshineZmoshiZmpnetZMPNetTokenizerZMPNetTokenizerFastZmptZmraZmt5ZMT5TokenizerZMT5TokenizerFastZmusicgenZmusicgen_melodyZmvpZMvpTokenizerZMvpTokenizerFast)Zmyt5)ZMyT5TokenizerNZnemotronZnezhaZnllbZNllbTokenizerZNllbTokenizerFastznllb-moeZnystromformerZolmoZolmo2Zolmoezomdet-turboZ	oneformerz
openai-gptZOpenAIGPTTokenizerZOpenAIGPTTokenizerFastoptZowlv2ZowlvitZ	paligemmaZpegasusZ	pegasus_x)Z	perceiver)ZPerceiverTokenizerNZ	persimmonphiZphi3Zphimoe)Zphobert)ZPhobertTokenizerNZ
pix2structZpixtralZplbartZPLBartTokenizer)Z
prophetnet)ZProphetNetTokenizerNZqdqbertZqwen2Zqwen2_5_omniZ
qwen2_5_vlZqwen2_audioZ	qwen2_moeZqwen2_vlZqwen3Z	qwen3_moe)Zrag)ZRagTokenizerNrealmZRealmTokenizerZRealmTokenizerFastZrecurrent_gemmaZreformerZReformerTokenizerZReformerTokenizerFastZrembertZRemBertTokenizerZRemBertTokenizerFastZ	retribertZRetriBertTokenizerZRetriBertTokenizerFastZrobertazroberta-prelayernorm)Zroc_bert)ZRoCBertTokenizerNZroformerZRoFormerTokenizerZRoFormerTokenizerFastZrwkvZseamless_m4tZSeamlessM4TTokenizerZSeamlessM4TTokenizerFastZseamless_m4t_v2Zshieldgemma2ZsiglipZSiglipTokenizerZsiglip2Zsmollm3Zspeech_to_textZSpeech2TextTokenizer)Zspeech_to_text_2)ZSpeech2Text2TokenizerNZspeecht5ZSpeechT5Tokenizer)Zsplinter)ZSplinterTokenizerZSplinterTokenizerFastZsqueezebertZSqueezeBertTokenizerZSqueezeBertTokenizerFastZstablelmZ
starcoder2Zswitch_transformersZt5Zt5gemma)Ztapas)ZTapasTokenizerN)Ztapex)ZTapexTokenizerN)z
transfo-xl)ZTransfoXLTokenizerNZtvpZudopZUdopTokenizerZUdopTokenizerFastZumt5Zvideo_llavaZviltZvipllavaZvisual_bert)Zvits)ZVitsTokenizerNZvoxtral)Zwav2vec2r!   )zwav2vec2-bertr!   )zwav2vec2-conformerr!   )Zwav2vec2_phoneme)ZWav2Vec2PhonemeCTCTokenizerNZwhisperZWhisperTokenizerZWhisperTokenizerFastZxclipZxglmZXGLMTokenizerZXGLMTokenizerFast)Zxlm)ZXLMTokenizerNzxlm-prophetnetZXLMProphetNetTokenizerzxlm-robertazxlm-roberta-xlZxlnetZXLNetTokenizerZXLNetTokenizerFastZxlstmZxmodZyosoZzambaZzamba2c                 C   s   i | ]\}}||qS  r,   ).0kvr,   r,   f/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/auto/tokenization_auto.py
<dictcomp>      r1   )
class_namereturnc              	   C   s   | dkrt S t D ]p\}}| |v rt|}|dv rJ| dkrJtdd}ntd| d}zt|| W   S  ty   Y qY q0 qtj	
 D ]*}|D ] }t|dd | kr|    S qqtd}t|| rt|| S d S )	Nr   )r&   r(   r'   z.tokenization_mistral_commonZtransformers.ztransformers.models__name__)r   TOKENIZER_MAPPING_NAMESitemsr   	importlibimport_modulegetattrAttributeErrorTOKENIZER_MAPPING_extra_contentvalueshasattr)r3   module_nameZ
tokenizersmodule	tokenizerZmain_moduler,   r,   r0   tokenizer_class_from_name  s(    


rD   F )
pretrained_model_name_or_path	cache_dirforce_downloadresume_downloadproxiestokenrevisionlocal_files_only	subfolderr4   c	                 K   s   |	 dd}
|
dur4tdt |dur0td|
}|	d}t| t||||||||ddd|d}|du rxt	d i S t
||}t|d	d
}t|}W d   n1 s0    Y  ||d< |S )a  
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force to (re-)download the configuration files and override the cached versions if they
            exist.
        resume_download:
            Deprecated and ignored. All downloads are now resumed by default when possible.
            Will be removed in v5 of Transformers.
        proxies (`dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `hf auth login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```use_auth_tokenNrThe `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.V`token` and `use_auth_token` are both specified. Please set only the argument `token`._commit_hashF)rG   rH   rI   rJ   rK   rL   rM   rN   Z _raise_exceptions_for_gated_repoZ%_raise_exceptions_for_missing_entriesZ'_raise_exceptions_for_connection_errorsrR   z\Could not locate the tokenizer configuration file, will try to use the model config instead.zutf-8)encoding)popwarningswarnFutureWarning
ValueErrorgetr   r   loggerinfor   openjsonload)rF   rG   rH   rI   rJ   rK   rL   rM   rN   kwargsrO   Zcommit_hashZresolved_config_filereaderresultr,   r,   r0   get_tokenizer_config&  sD    I


(rb   c                   @   s:   e Zd ZdZdd Zeeedd Ze	d
dd	Z
dS )AutoTokenizera  
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    c                 C   s   t dd S )Nz}AutoTokenizer is designed to be instantiated using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.)OSError)selfr,   r,   r0   __init__  s    zAutoTokenizer.__init__c                 O   s  | dd}|dur>tdt |ddur6td||d< | dd}d|d< | d	d}| d
d}| dd}|d}	|dur4d}
t|d}|du rtd| dddd tD  d|\}}|r|durt|}
n
t	
d |
du rt|}
|
du rtd| d|
j|g|R i |S t|fi |}d|v rZ|d |d< |d}d}d|v rt|d ttfr|d }n|d dd}|du r.t|ts|	rt||	fi |}t|ddd }tjf i |}ntj|fd|i|}|j}t|dr.d|jv r.|jd }|du}t|tv pj|duojt|dupjt|d du}|r|r|d dur|d }n|d }d|v r|dd }nd}t|||||}|r|rt||fi |}
| d d}|
  |
j|g|R d|i|S |durd}
|rF|dsF| d}t|}
|
du r\|}t|}
|
du rvtd| d!|
j|g|R i |S t|trt|j t|j!urt	
d"|j!j" d#|j j" d$ |j!}t#t|j$}|durVtt| \}}|r,|s|du r,|j|g|R i |S |durN|j|g|R i |S td%td&|j" d'dd(d tD  ddS ))a]  
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*)
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download the model weights and configuration files and override the
                cached versions if they exist.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
                a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
                is returned instead.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)
        ```rO   NrP   rK   rQ   configTZ
_from_autouse_fasttokenizer_typetrust_remote_code	gguf_filezPassed `tokenizer_type` z3 does not exist. `tokenizer_type` should be one of z, c                 s   s   | ]
}|V  qd S Nr,   r-   cr,   r,   r0   	<genexpr>  r2   z0AutoTokenizer.from_pretrained.<locals>.<genexpr>r5   zt`use_fast` is set to `True` but the tokenizer class does not have a fast version.  Falling back to the slow version.zTokenizer class z is not currently imported.rR   tokenizer_classauto_maprc   F)Zreturn_tensorsZFastr   r   z--Zcode_revisionz- does not exist or is not currently imported.z The encoder model config class: z3 is different from the decoder model config class: z. It is not recommended to use the `AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder specific tokenizer classes.zzThis tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer.z!Unrecognized configuration class z8 to build an AutoTokenizer.
Model type should be one of c                 s   s   | ]}|j V  qd S rl   )r6   rm   r,   r,   r0   ro     r2   )%rT   rU   rV   rW   rY   rX   r7   joinrD   rZ   warningfrom_pretrainedrb   
isinstancetuplelistr   r   r   r   Z	for_modelrp   r@   rq   typer=   splitr
   r	   Zregister_for_auto_classendswithr   decoderencoder	__class__r   r6   )clsrF   inputsr_   rO   rg   rh   ri   rj   rk   rp   Ztokenizer_class_tupleZtokenizer_class_nameZtokenizer_fast_class_nameZtokenizer_configZconfig_tokenizer_classZtokenizer_auto_mapZ	gguf_pathZconfig_dictZhas_remote_codeZhas_local_codeZ	class_refZupstream_repo_Ztokenizer_class_candidateZ
model_typeZtokenizer_class_pyZtokenizer_class_fastr,   r,   r0   rt     s    M






















zAutoTokenizer.from_pretrainedNFc                 C   s   |du r|du rt d|dur2t|tr2t d|durLt|trLt d|dur|durt|tr|j|krt d|j d| d| tjv rt|  \}}|du r|}|du r|}tj| ||f|d dS )	a  
        Register a new tokenizer in this mapping.


        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PretrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
        NzKYou need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_classz:You passed a fast tokenizer in the `slow_tokenizer_class`.z:You passed a slow tokenizer in the `fast_tokenizer_class`.zThe fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not consistent with the slow tokenizer class you passed (fast tokenizer has z and you passed z!. Fix one of those so they match!)exist_ok)rX   
issubclassr   r   slow_tokenizer_classr=   r>   register)Zconfig_classr   Zfast_tokenizer_classr   Zexisting_slowZexisting_fastr,   r,   r0   r     s8    
zAutoTokenizer.register)NNF)r6   
__module____qualname____doc__rf   classmethodr   r7   rt   staticmethodr   r,   r,   r,   r0   rc     s    crc   r=   )NFNNNNFrE   )<r   r9   r]   osrU   collectionsr   typingr   r   r   Ztransformers.utils.import_utilsr   Zconfiguration_utilsr   Zdynamic_module_utilsr	   r
   Zmodeling_gguf_pytorch_utilsr   Ztokenization_utilsr   Ztokenization_utils_baser   utilsr   r   r   r   r   r   Zencoder_decoderr   Zauto_factoryr   Zconfiguration_autor   r   r   r   r   Ztokenization_utils_fastr   Z
get_loggerr6   rZ   strrv   r7   r=   r8   ZCONFIG_TO_TYPErx   rD   PathLikebooldictrb   rc   __all__r,   r,   r,   r0   <module>   s   	

			          M
"        
o  !