a
    h!9                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlmZmZ ejeZejedZeeZe ZW d   n1 s0    Y  G dd dZe	e Ze	ee  ZG dd	 d	e
ZG d
d dZG dd dZG dd dZ efe	e! e!dddZ"dS )    N)Template)AnyCallableDictList
NamedTupleOptionalTuple)Encoding	Tokenizerzvisualizer-styles.cssc                   @   s8   e Zd ZU eed< eed< eed< eeedddZdS )
Annotationstartendlabelr   r   r   c                 C   s   || _ || _|| _d S Nr   )selfr   r   r    r   W/var/www/html/assistant/venv/lib/python3.9/site-packages/tokenizers/tools/visualizer.py__init__   s    zAnnotation.__init__N)__name__
__module____qualname__int__annotations__strr   r   r   r   r   r      s   
r   c                   @   s&   e Zd ZU ee ed< ee ed< dS )CharStateKeytoken_ixanno_ixN)r   r   r   r   r   r   r   r   r   r   r      s   
r   c                   @   sH   e Zd ZU ee ed< dd Zedd Zedd Z	e
dd	d
ZdS )	CharStatechar_ixc                 C   s   || _ d | _g | _d S r   )r    r   tokens)r   r    r   r   r   r   '   s    zCharState.__init__c                 C   s   t | jdkr| jd S d S )Nr   lenr!   r   r   r   r   r   -   s    zCharState.token_ixc                 C   s   t | jdkS )zJ
        BPE tokenizers can output more than one token for a char
           r"   r$   r   r   r   is_multitoken1   s    zCharState.is_multitoken)returnc                 C   s   t | j| jdS )N)r   r   )r   r   r   r$   r   r   r   partition_key8   s    zCharState.partition_keyN)r   r   r   r   r   r   r   propertyr   r&   r   r(   r   r   r   r   r   $   s   


r   c                   @   s   e Zd ZdS )AlignedN)r   r   r   r   r   r   r   r*   ?   s   r*   c                   @   s   e Zd ZdZejdejdZdee	e
eegef  dddZg dfeee
e	 e
e d	d
dZeeeeef dddZeee eedddZeeeeedddZeeeedddZeeeeee dddZdS )EncodingVisualizera  
    Build an EncodingVisualizer

    Args:

         tokenizer (:class:`~tokenizers.Tokenizer`):
            A tokenizer instance

         default_to_notebook (:obj:`bool`):
            Whether to render html output in a notebook by default

         annotation_converter (:obj:`Callable`, `optional`):
            An optional (lambda) function that takes an annotation in any format and returns
            an Annotation object
    z(.{1})?(unk|oov)(.{1})?)flagsTN)	tokenizerdefault_to_notebookannotation_converterc                 C   sJ   |r4zddl m}m} W n ty2   tdY n0 || _|| _|| _d S )Nr   HTMLdisplayzWe couldn't import IPython utils for html display.
                        Are you running in a notebook?
                        You can also pass `default_to_notebook=False` to get back raw HTML
                    )IPython.core.displayr1   r2   ImportError	Exceptionr-   r.   annotation_coverter)r   r-   r.   r/   r1   r2   r   r   r   r   V   s    
zEncodingVisualizer.__init__)textannotationsr.   r'   c           	      C   s   | j }|dur|}|rFzddlm}m} W n tyD   tdY n0 | jdur`tt| j|}| j	
|}t|||}|r||| n|S dS )a  
        Build a visualization of the given text

        Args:
            text (:obj:`str`):
                The text to tokenize

            annotations (:obj:`List[Annotation]`, `optional`):
                An optional list of annotations of the text. The can either be an annotation class
                or anything else if you instantiated the visualizer with a converter function

            default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
                If True, will render the html in a notebook. Otherwise returns an html string.

        Returns:
            The HTML string if default_to_notebook is False, otherwise (default) returns None and
            renders the HTML in the notebook

        Nr   r0   zeWe couldn't import IPython utils for html display.
                    Are you running in a notebook?)r.   r3   r1   r2   r4   r5   r6   listmapr-   encoder+   _EncodingVisualizer__make_html)	r   r7   r8   r.   Zfinal_default_to_notebookr1   r2   encodinghtmlr   r   r   __call__l   s"    

zEncodingVisualizer.__call__)r8   r'   c           	      C   s   t | dkri S ttdd | }t |}td| }|dk rBd}d}d}d}i }t|D ](}d	| d
| d| d||< ||7 }qZ|S )a  
        Generates a color palette for all the labels in a given set of annotations

        Args:
          annotations (:obj:`Annotation`):
            A list of annotations

        Returns:
            :obj:`dict`: A dictionary mapping labels to colors in HSL format
        r   c                 S   s   | j S r   )r   )xr   r   r   <lambda>       z;EncodingVisualizer.calculate_label_colors.<locals>.<lambda>          @   
   zhsl(,z%,z%))r#   setr:   r   sorted)	r8   labelsZ
num_labelsZh_stepslhcolorsr   r   r   r   calculate_label_colors   s    
z)EncodingVisualizer.calculate_label_colors)consecutive_chars_listr7   r=   c                 C   s.  | d }|j du r*|j|j }d| dS | d }|j }|j d }||| }g }	i }
|jdur|	d |jrz|	d |jd	 r|	d
 n
|	d tj|j|j dur|	d |j|j |
d< n
|	d dd|	 d}d}|
	 D ]\}}|d| d| d7 }qd| d| d| dS )a  
        Converts a list of "consecutive chars" into a single HTML element.
        Chars are consecutive if they fall under the same word, token and annotation.
        The CharState class is a named tuple with a "partition_key" method that makes it easy to
        compare if two chars are consecutive.

        Args:
            consecutive_chars_list (:obj:`List[CharState]`):
                A list of CharStates that have been grouped together

            text (:obj:`str`):
                The original text being processed

            encoding (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`str`: The HTML span for a set of consecutive chars
        r   Nz(<span class="special-token" data-stoken=z></span>r%   tokenzmulti-token   z	odd-tokenz
even-tokenzspecial-tokenZstokz	non-tokenzclass=" " z data-z="z<span z ></span>)
r    r!   r   appendr&   r+   unk_token_regexsearchjoinitems)rQ   r7   r=   firstZstokenlastr   r   Z	span_textZcss_classesZ
data_itemscssdatakeyvalr   r   r   consecutive_chars_to_html   s4    








z,EncodingVisualizer.consecutive_chars_to_html)r7   r=   r8   r'   c                 C   sX  t | ||}|d g}|d j}g }t |}|d j}|d urp|| }	|	j}
||
 }|d| d|
 d |dd  D ]}|j}||kr|t j|| |d |g}|d ur|d |d ur|| }	|	j}
||
 }|d| d|
 d |}| |d  kr|| q||t j|| |d |g}q||t j|| |d t|}|S )Nr   z&<span class="annotation" style="color:z" data-label="z">r%   )r7   r=   rX   )	r+   %_EncodingVisualizer__make_char_statesr   rP   r   rY   rd   r(   HTMLBody)r7   r=   r8   char_statesZcurrent_consecutive_charsZprev_anno_ixspansZlabel_colors_dictZcur_anno_ixannor   colorcsresr   r   r   Z__make_html   sb    




zEncodingVisualizer.__make_html)r7   r8   r'   c                 C   s@   dgt |  }t|D ]$\}}t|j|jD ]}|||< q,q|S )a  
        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`AnnotationList`):
                A (possibly empty) list of annotations

        Returns:
            A list of  length len(text) whose entry at index i is None if there is no annotation on
            character i or k, the index of the annotation that covers index i where k is with
            respect to the list of annotations
        N)r#   	enumerateranger   r   )r7   r8   annotation_mapr   air   r   r   Z__make_anno_map<  s
    z"EncodingVisualizer.__make_anno_mapc                 C   s   t | |}dd tt| D }t|jD ]B\}}||}|dur,|\}}	t||	D ]}
||
 j| qXq,t|D ]\}}||| _qx|S )a  
        For each character in the original text, we emit a tuple representing it's "state":

            * which token_ix it corresponds to
            * which word_ix it corresponds to
            * which annotation_ix it corresponds to

        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`List[Annotation]`):
                A (possibly empty) list of annotations

            encoding: (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
            it's state is
        c                 S   s   g | ]}t |qS r   )r   ).0r    r   r   r   
<listcomp>j  rB   z9EncodingVisualizer.__make_char_states.<locals>.<listcomp>N)	r+   "_EncodingVisualizer__make_anno_maprn   r#   rm   r!   Ztoken_to_charsrY   r   )r7   r=   r8   ro   rg   r   rS   offsetsr   r   rq   r    r   r   r   r   Z__make_char_statesQ  s    
z%EncodingVisualizer.__make_char_states)TN)r   r   r   __doc__recompile
IGNORECASErZ   r   boolr   r   r   r   r   r   AnnotationListr?   staticmethodr   rP   r   r   r
   rd   r<   PartialIntListrt   re   r   r   r   r   r+   C   s<     -CAr+   )childrenr'   c                 C   s   d | }d| d| dS )a[  
    Generates the full html with css from a list of html spans

    Args:
        children (:obj:`List[str]`):
            A list of strings, assumed to be html elements

        css_styles (:obj:`str`, `optional`):
            Optional alternative implementation of the css

    Returns:
        :obj:`str`: An HTML string with style markup
    rW   z?
    <html>
        <head>
            <style>
                zs
            </style>
        </head>
        <body>
            <div class="tokenized-text" dir=auto>
            z4
            </div>
        </body>
    </html>
    )r\   )r~   Z
css_stylesZchildren_textr   r   r   rf   w  s    
	rf   )#	itertoolsosrw   stringr   typingr   r   r   r   r   r   r	   Z
tokenizersr
   r   pathdirname__file__r\   Zcss_filenameopenfreadr`   r   r{   r   r}   r   r   r*   r+   r   rf   r   r   r   r   <module>   s&   $
&  6