from __future__ import annotations

import logging
from typing import Literal

import torch
from torch import Tensor

from sentence_transformers.models.InputModule import InputModule

from .tokenizer import WhitespaceTokenizer

logger = logging.getLogger(__name__)


class BoW(InputModule):
    """Implements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    """

    save_in_root: bool = False
    config_keys: list[str] = ["vocab", "word_weights", "unknown_word_weight", "cumulative_term_frequency"]

    def __init__(
        self,
        vocab: list[str],
        word_weights: dict[str, float] = {},
        unknown_word_weight: float = 1,
        cumulative_term_frequency: bool = True,
    ) -> None:
        super().__init__()
        vocab = list(dict.fromkeys(vocab))  # Remove duplicates while preserving order
        self.vocab = vocab
        self.word_weights = word_weights
        self.unknown_word_weight = unknown_word_weight
        self.cumulative_term_frequency = cumulative_term_frequency

        # Map each vocab index to its weight; words without an entry in word_weights
        # (exact or lowercased match) fall back to unknown_word_weight.
        self.weights = []
        num_unknown_words = 0
        for word in vocab:
            weight = unknown_word_weight
            if word in word_weights:
                weight = word_weights[word]
            elif word.lower() in word_weights:
                weight = word_weights[word.lower()]
            else:
                num_unknown_words += 1
            self.weights.append(weight)

        logger.info(
            f"{num_unknown_words} out of {len(vocab)} words without a weighting value. "
            f"Set weight to {unknown_word_weight}"
        )

        self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False)
        self.sentence_embedding_dimension = len(vocab)

    def forward(self, features: dict[str, Tensor]) -> dict[str, Tensor]:
        # Nothing to do here: the sentence embedding is already built in get_sentence_features
        return features

    def tokenize(self, texts: list[str], **kwargs) -> dict[Literal["sentence_embedding"], torch.Tensor]:
        tokenized = [self.tokenizer.tokenize(text, **kwargs) for text in texts]
        return self.get_sentence_features(tokenized)

    def get_sentence_embedding_dimension(self) -> int:
        return self.sentence_embedding_dimension

    def get_sentence_features(
        self, tokenized_texts: list[list[int]], pad_seq_length: int = 0
    ) -> dict[Literal["sentence_embedding"], torch.Tensor]:
        # Build one vocab-sized vector per text: each token index contributes its weight,
        # either accumulated per occurrence (cumulative term frequency) or set once.
        vectors = []
        for tokens in tokenized_texts:
            vector = torch.zeros(self.get_sentence_embedding_dimension(), dtype=torch.float32)
            for token in tokens:
                if self.cumulative_term_frequency:
                    vector[token] += self.weights[token]
                else:
                    vector[token] = self.weights[token]
            vectors.append(vector)

        return {"sentence_embedding": torch.stack(vectors)}

    def save(self, output_path: str, *args, safe_serialization: bool = True, **kwargs) -> None:
        self.save_config(output_path)

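# --- Usage sketch (illustrative, not part of the original module) -------------
# A minimal example of plugging BoW into a SentenceTransformer pipeline, assuming
# the standard `SentenceTransformer(modules=[...])` constructor. The vocabulary,
# weights, and sentences below are made up for demonstration.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    vocab = ["the", "quick", "brown", "fox", "jumps"]
    bow = BoW(vocab=vocab, word_weights={"the": 0.1}, unknown_word_weight=1.0)
    model = SentenceTransformer(modules=[bow])

    # Each embedding has one dimension per vocab entry; repeated tokens add up
    # because cumulative_term_frequency defaults to True.
    embeddings = model.encode(["the quick brown fox", "the fox jumps"])
    print(embeddings.shape)  # expected: (2, 5)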