a
    h	                     @  s\   d dl mZ d dlZd dlZd dlZd dlZd dlmZ ddlm	Z	mZ G dd deZ
dS )    )annotationsN)Iterable   )ENGLISH_STOP_WORDSWordTokenizerc                   @  sp   e Zd ZdZg edfddddddZdd	 Zdd
ddZdddddZddddZ	e
ddddZdS )WhitespaceTokenizerz
    Simple and fast white-space tokenizer. Splits sentence based on white spaces.
    Punctuation are stripped from tokens.
    FzIterable[str]boolvocab
stop_wordsdo_lower_casec                 C  s   t || _|| _| | d S N)setr   r   	set_vocab)selfr
   r   r    r   v/var/www/html/assistant/venv/lib/python3.9/site-packages/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py__init__   s    
zWhitespaceTokenizer.__init__c                 C  s   | j S r   r
   )r   r   r   r   	get_vocab   s    zWhitespaceTokenizer.get_vocabr   c                 C  s$   || _ tdd t|D | _d S )Nc                 S  s   g | ]\}}||fqS r   r   ).0idxwordr   r   r   
<listcomp>       z1WhitespaceTokenizer.set_vocab.<locals>.<listcomp>)r
   collectionsOrderedDict	enumerateword2idx)r   r
   r   r   r   r      s    zWhitespaceTokenizer.set_vocabstrz	list[int])textreturnc                 K  s   | j r| }| }g }|D ]}|| jv r0qn|| jv rL|| j|  q|tj}|| jv rfqn(t	|dkr|| jv r|| j|  q| }|| jv rqq|| jv r|| j|  qq|S )Nr   )
r   lowersplitr   r   appendstripstringpunctuationlen)r   r    kwargstokensZtokens_filteredtokenr   r   r   tokenize    s.    




zWhitespaceTokenizer.tokenize)output_pathc                 C  s^   t tj|dd6}tt| j t| j	| j
d| W d    n1 sP0    Y  d S )Nwhitespacetokenizer_config.jsonwr	   )openospathjoinjsondumplistr   keysr   r   )r   r-   ZfOutr   r   r   save>   s    zWhitespaceTokenizer.save)
input_pathc                 C  sJ   t tj| d}t|}W d    n1 s20    Y  tf i |S )Nr.   )r0   r1   r2   r3   r4   loadr   )r9   ZfInconfigr   r   r   r:   I   s    (zWhitespaceTokenizer.loadN)__name__
__module____qualname____doc__r   r   r   r   r,   r8   staticmethodr:   r   r   r   r   r      s   r   )
__future__r   r   r4   r1   r&   collections.abcr   r   r   r   r   r   r   r   <module>   s   