from __future__ import annotations

import collections
import json
import logging
import os
import string
from collections.abc import Iterable

from transformers.utils.import_utils import NLTK_IMPORT_ERROR, is_nltk_available

from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer

logger = logging.getLogger(__name__)


class PhraseTokenizer(WordTokenizer):
    """Tokenizes the text with respect to existing phrases in the vocab.

    This tokenizer respects phrases that are in the vocab. Phrases are joined with 'ngram_separator'; for example,
    in the Google News word2vec file, ngrams are separated with a _, like New_York. These phrases are detected in the text and merged into one special token. (New York is the ... => [New_York, is, the])
    F_   zIterable[str]boolstrintvocab
stop_wordsdo_lower_casengram_separatormax_ngram_lengthc                 C  sB   t  stt| jjt|| _|| _|| _	|| _
| | d S N)r   ImportErrorr   format	__class____name__setr   r   r   r   	set_vocab)selfr   r   r   r   r    r   r/var/www/html/assistant/venv/lib/python3.9/site-packages/sentence_transformers/models/tokenizer/PhraseTokenizer.py__init__   s    
zPhraseTokenizer.__init__c                 C  s   | j S r   r   )r   r   r   r   	get_vocab)   s    zPhraseTokenizer.get_vocabr    c                 C  s   || _ tdd t|D | _t | _t | _|D ]Z}| jd ur4| j|v r4|	| jd }| j| j |vr4|| j
kr4| j| | j| q4t|dkrtd| j  tdt| j  d S )Nc                 S  s   g | ]\}}||fqS r   r   ).0idxwordr   r   r   
<listcomp>.       z-PhraseTokenizer.set_vocab.<locals>.<listcomp>r   r   z(PhraseTokenizer - Phrase ngram lengths: zPhraseTokenizer - Num phrases: )r   collectionsOrderedDict	enumerateword2idxr   ngram_lookupngram_lengthsr   countr   addlenloggerinfo)r   r   r$   Zngram_countr   r   r   r   ,   s    zPhraseTokenizer.set_vocabz	list[int])textreturnc           
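    # Illustration (not part of the original source): with the default separator
    # "_" and max_ngram_length=5, a vocab such as
    #   ["new", "york", "New_York", "machine_learning_model"]
    # would yield ngram_lookup == {"New_York", "machine_learning_model"}
    # and ngram_lengths == {2, 3}.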
    def tokenize(self, text: str, **kwargs) -> list[int]:
        from nltk import word_tokenize

        tokens = word_tokenize(text, preserve_line=True)

        # Phrase detection: greedily merge the longest ngrams first, keeping
        # a case-sensitive match over a lowercased one
        for ngram_len in sorted(self.ngram_lengths, reverse=True):
            idx = 0
            while idx <= len(tokens) - ngram_len:
                ngram = self.ngram_separator.join(tokens[idx : idx + ngram_len])
                if ngram in self.ngram_lookup:
                    tokens[idx : idx + ngram_len] = [ngram]
                elif ngram.lower() in self.ngram_lookup:
                    tokens[idx : idx + ngram_len] = [ngram.lower()]
                idx += 1

        # Map tokens to indices and filter stop words; try the raw token first,
        # then the lowercased token, then the token stripped of punctuation
        tokens_filtered = []
        for token in tokens:
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            token = token.lower()
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            token = token.strip(string.punctuation)
            if token in self.stop_words:
                continue
            elif len(token) > 0 and token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

        return tokens_filtered
zPhraseTokenizer.tokenize)output_pathc              	   C  sf   t tj|dd>}tt| j t| j	| j
| j| jd| W d    n1 sX0    Y  d S )Nphrasetokenizer_config.jsonwr   )openospathr7   jsondumplistr*   keysr   r   r   r   )r   rA   ZfOutr   r   r   saveh   s    zPhraseTokenizer.save)
    @staticmethod
    def load(input_path: str):
        with open(os.path.join(input_path, "phrasetokenizer_config.json")) as fIn:
            config = json.load(fIn)

        return PhraseTokenizer(**config)
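
# Usage sketch (illustrative, not part of the original module; assumes NLTK is
# installed and that the phrase "New_York" is in the vocab). Stop words are
# disabled here so that "is" survives filtering:
#
#   tokenizer = PhraseTokenizer(vocab=["New_York", "is", "big"], stop_words=[])
#   tokenizer.tokenize("New York is big")
#   # -> [0, 1, 2], the vocab indices of ["New_York", "is", "big"]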