a
    hi                  	   @   s6  d Z ddlZddlZddlZddlmZ ddlmZ ddlZddl	m
Z
 ddlmZ eeZdd	d
Zdd ZG dd de
ZdZdZededdddddf	Zedde ejejB ejB ZedZeeejejB ejB ZedZd-ddZd.d"d#Z G d$d% d%Z!d&d' Z"d(d) Z#d/d+d,Z$dgZ%dS )0z!Tokenization classes for BERTweet    N)copyfile)Optional   )PreTrainedTokenizer)loggingz	vocab.txtz	bpe.codes)
vocab_filemerges_filec                 C   s>   t  }| d }| dd D ]}|||f |}qt |}|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairsZ	prev_charchar r   n/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/bertweet/tokenization_bertweet.py	get_pairs&   s    r   c                       s  e Zd ZdZeZd* fdd		Zd+ee e	ee  ee dddZ
d,ee e	ee  eee d fddZd-ee e	ee  ee dddZedd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd.ee	e ee d%d&d'Zd(d) Z  ZS )/BertweetTokenizera	  
    Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        normalization (`bool`, *optional*, defaults to `False`):
            Whether or not to apply a normalization preprocess.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    F<s></s><unk><pad><mask>c                    sT  zddl m} || _W n" ty8   td d | _Y n0 || _|| _i | _d| jt	|< d| jt	|	< d| jt	|< d| jt	|< | 
| dd | j D | _t|d	d
&}| dd d }W d    n1 s0    Y  dd |D }tt|tt|| _i | _|| _t | _ddd| _t jf |||||||	|
d| d S )Nr   )demojizezsemoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0r	      r   c                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>       z.BertweetTokenizer.__init__.<locals>.<dictcomp>utf-8encoding
c                 S   s    g | ]}t | d d qS )Nr#   )tuplesplit)r   merger   r   r   
<listcomp>   r   z.BertweetTokenizer.__init__.<locals>.<listcomp>'z...)u   ’u   …)normalization	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_token)emojir   	demojizerImportErrorloggerwarningr   r   encoderstradd_from_fileitemsdecoderopenreadr%   dictziprangelen	bpe_rankscacher)   TweetTokenizertweetPreprocessorspecial_punctssuper__init__)selfr   r   r)   r*   r+   r,   r-   r.   r/   r0   kwargsr   Zmerges_handleZmerges	__class__r   r   rG   k   sJ    

4	zBertweetTokenizer.__init__N)token_ids_0token_ids_1returnc                 C   sD   |du r| j g| | jg S | j g}| jg}|| | | | | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERTweet sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)cls_token_idsep_token_id)rH   rL   rM   clssepr   r   r    build_inputs_with_special_tokens   s
    z2BertweetTokenizer.build_inputs_with_special_tokens)rL   rM   already_has_special_tokensrN   c                    sh   |rt  j||ddS |du r8dgdgt|  dg S dgdgt|  ddg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rL   rM   rT   Nr	   r   )rF   get_special_tokens_maskr@   )rH   rL   rM   rT   rJ   r   r   rU      s    z)BertweetTokenizer.get_special_tokens_maskc                 C   sP   | j g}| jg}|du r.t|| | dg S t|| | | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        Nr   )rP   rO   r@   )rH   rL   rM   rR   rQ   r   r   r   $create_token_type_ids_from_sequences   s
    z6BertweetTokenizer.create_token_type_ids_from_sequencesc                 C   s
   t | jS N)r@   r6   rH   r   r   r   
vocab_size   s    zBertweetTokenizer.vocab_sizec                 C   s   t | jfi | jS rW   )r=   r6   Zadded_tokens_encoderrX   r   r   r   	get_vocab   s    zBertweetTokenizer.get_vocabc           
         s  | j v r j | S t|}tt|d d |d d g }t|}|sN|S t| fddd}| jvrpql|\}}g }d}|t|k rBz|||}	W n* ty   |	||d   Y qBY n0 |	|||	  |	}|| |kr*|t|d k r*||d  |kr*|
||  |d7 }q|
||  |d7 }qt|}|}t|dkrbqlqNt|}qNd	|}|d d
 }| j |< |S )Nr#   z</w>c                    s    j | tdS )Ninf)rA   getfloat)pairrX   r   r   <lambda>  r   z'BertweetTokenizer.bpe.<locals>.<lambda>)keyr   r	   r   @@ )rB   r$   listr   minrA   r@   index
ValueErrorextendappendjoin)
rH   tokenr   r   ZbigramfirstsecondZnew_wordijr   rX   r   bpe   sF    

"
2




zBertweetTokenizer.bpec                 C   sH   | j r| |}g }td|}|D ]}|t| |d q$|S )zTokenize a string.z\S+\n? )r)   normalizeTweetrefindallrg   rc   ro   r%   )rH   textZsplit_tokenswordsrj   r   r   r   	_tokenize(  s    
zBertweetTokenizer._tokenizec                    s    j D ]}|| j | }q j|}d fdd|D }|ddddddd	d
dd}|dddddddddddd}|dddddddd }d| S )!z'
        Normalize a raw Tweet
        rp   c                    s   g | ]}  |qS r   )normalizeToken)r   rj   rX   r   r   r'   ;  r   z4BertweetTokenizer.normalizeTweet.<locals>.<listcomp>zcannot zcan not zn't z n't zn 't zca n'tzcan'tzai n'tzain'tz'm z 'm z're z 're z's z 's z'll z 'll z'd z 'd z've z 've z p . m .z  p.m.z p . m z p.m z a . m .z a.m.z a . m z a.m )rE   replacerD   tokenizeri   r%   )rH   ZtweetpuncttokensZ	normTweetr   rX   r   rq   3  sF    
	z BertweetTokenizer.normalizeTweetc                 C   sp   |  }|drdS |ds*|dr.dS t|dkrh|| jv rN| j| S | jdurb| |S |S n|S dS )z-
        Normalize tokens in a Tweet
        @z@USERhttpZwwwZHTTPURLr	   N)lower
startswithr@   rE   r2   )rH   rj   Zlowercased_tokenr   r   r   rw   U  s    




z BertweetTokenizer.normalizeTokenc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)r6   r\   r.   )rH   rj   r   r   r   _convert_token_to_idh  s    z&BertweetTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)r:   r\   r.   )rH   re   r   r   r   _convert_id_to_tokenl  s    z&BertweetTokenizer._convert_id_to_tokenc                 C   s   d |dd }|S )z:Converts a sequence of tokens (string) in a single string.rp   ra    )ri   rx   strip)rH   r{   Z
out_stringr   r   r   convert_tokens_to_stringp  s    z*BertweetTokenizer.convert_tokens_to_string)save_directoryfilename_prefixrN   c                 C   s  t j|s"td| d d S t j||r6|d ndtd  }t j||rX|d ndtd  }t j| jt j|krt j	| jrt
| j| nLt j	| jst|d$}| j }|| W d    n1 s0    Y  t j| jt j|krt
| j| ||fS )NzVocabulary path (z) should be a directory-r   r   r   wb)ospathisdirr4   errorri   VOCAB_FILES_NAMESabspathr   isfiler   r;   Zsp_modelZserialized_model_protowriter   )rH   r   r   Zout_vocab_fileZout_merge_filefiZcontent_spiece_modelr   r   r   save_vocabularyu  s$    (
(z!BertweetTokenizer.save_vocabularyc           	   
   C   s   t |trz<t|ddd}| | W d   n1 s:0    Y  W nH tyn } z|W Y d}~n*d}~0  ty   td| dY n0 dS | }|D ]B}| }|	d}|dkrt
d	|d| }t| j| j|< qdS )
zi
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        rr   r    NzIncorrect encoding detected in z, please rebuild the datasetrp   r#   z5Incorrect dictionary format, expected '<token> <cnt>')
isinstancer7   r;   r8   FileNotFoundErrorUnicodeError	Exception	readlinesr   rfindrf   r@   r6   )	rH   ffdZfnfelinesZlineTmplineidxr   r   r   r   r8     s"    
,
zBertweetTokenizer.add_from_file)Fr   r   r   r   r   r   r   )N)NF)N)N)__name__
__module____qualname____doc__r   Zvocab_files_namesrG   rc   intr   rS   boolrU   rV   propertyrY   rZ   ro   rv   rq   rw   r   r   r   r7   r$   r   r8   __classcell__r   r   rJ   r   r   6   sL   2        =   
,"r   ac  
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )u  			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
    (?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
a	  
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )z	<[^>\s]+>z[\-]+>|<[\-]+z(?:@[\w_]+)z(?:\#+[\w_]+[\w\'_\-]*[\w_]+)z#[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]a  
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    z(%s)|z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);strictc                 C   s&   |d u rd}t | tr"| ||S | S )Nr   )r   bytesdecode)rt   r!   errorsr   r   r   _str_to_unicode\  s
    
r   r   Tr   c                    s     fdd}t |t| |S )u  
    Remove entities from text by converting them to their corresponding unicode character.

    Args:
        text:
            A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
        keep (list):
            List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
            `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
        remove_illegal (bool):
            If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
            kept "as is".

    Returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

    Examples:

    ```python
    >>> from nltk.tokenize.casual import _replace_html_entities

    >>> _replace_html_entities(b"Price: &pound;100")
    'Price: \xa3100'

    >>> print(_replace_html_entities(b"Price: &pound;100"))
    Price: £100
    ```c              	      s   |  d}|  dr|zN|  dr,t|d}n
t|d}d|  krJdkr`n nt|fdW S W q tyx   d }Y q0 n | v r|  d	S tjj|}|d urz
t	|W S  tt
fy   Y n0 rd
S |  d	S )Nr   r	   r      
         cp1252r   r   )groupr   r   r   rf   htmlentitiesname2codepointr\   chrOverflowError)matchZentity_bodynumberkeepremove_illegalr   r   _convert_entity  s&    





z/_replace_html_entities.<locals>._convert_entity)ENT_REsubr   )rt   r   r   r!   r   r   r   r   _replace_html_entitiesd  s    r   c                   @   s"   e Zd ZdZd	ddZdd ZdS )
rC   a  
    Examples:

    ```python
    >>> # Tokenizer for tweets.
    >>> from nltk.tokenize import TweetTokenizer

    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    >>> # Examples using *strip_handles* and *reduce_len parameters*:
    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
    >>> tknzr.tokenize(s1)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    ```TFc                 C   s   || _ || _|| _d S rW   preserve_case
reduce_lenstrip_handles)rH   r   r   r   r   r   r   rG     s    zTweetTokenizer.__init__c                 C   sR   t |}| jrt|}| jr$t|}td|}t|}| j	sNdd |D }|S )z
        Args:
            text: str

        Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
        `preserve_case=False`
        \1\1\1c                 S   s"   g | ]}t |r|n| qS r   )EMOTICON_REsearchr~   )r   xr   r   r   r'     r   z+TweetTokenizer.tokenize.<locals>.<listcomp>)
r   r   remove_handlesr   reduce_lengtheningHANG_REr   WORD_RErs   r   )rH   rt   Z	safe_textru   r   r   r   ry     s    	
zTweetTokenizer.tokenizeN)TFF)r   r   r   r   rG   ry   r   r   r   r   rC     s   
rC   c                 C   s   t d}|d| S )za
    Replace repeated character sequences of length 3 or greater with sequences of length 3.
    z	(.)\1{2,}r   regexcompiler   rt   patternr   r   r   r     s    
r   c                 C   s   t d}|d| S )z4
    Remove Twitter username handles from text.
    zv(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)rp   r   r   r   r   r   r     s    r   Fc                 C   s   t |||d| S )z:
    Convenience function for wrapping the tokenizer.
    r   )rC   ry   )rt   r   r   r   r   r   r   casual_tokenize  s    r   )Nr   )r   Tr   )TFF)&r   r   r   rr   shutilr   typingr   r   Ztokenization_utilsr   utilsr   Z
get_loggerr   r4   r   r   r   Z	EMOTICONSZURLSZREGEXPSr   ri   VERBOSEIUNICODEr   r   r   r   r   r   rC   r   r   r   __all__r   r   r   r   <module>   sP   
   (.0$



A8
