from __future__ import annotations

import gzip
import logging
import os

from transformers import PreTrainedTokenizerBase

try:
    from typing import Self
except ImportError:
    from typing_extensions import Self

import numpy as np
import torch
from torch import nn
from tqdm import tqdm

from sentence_transformers.models.Module import Module
from sentence_transformers.util import fullname, http_get, import_from_string

from .tokenizer import TransformersTokenizerWrapper, WhitespaceTokenizer, WordTokenizer

logger = logging.getLogger(__name__)
   @  s   e Zd ZU g dZded< dZded< d*d	d
ddddZdd ZddddZddddZ	d+dd
dddZ
dd Zed,dddd d d
d!d"d#d$Zedd%e dfdd
dd&d'd(d)ZdS )-WordEmbeddingstokenizer_classupdate_embeddingsmax_seq_lengthz	list[str]config_keyszwordembedding_config.jsonstrconfig_file_nameF@B z'WordTokenizer | PreTrainedTokenizerBaseboolint)	tokenizerr   r   c                 C  s   t j|  t|tr t|}nt|ts2tdt|trFt	
|}t|t	jr\t|}| \}}|| _t ||| _| jd|i || jj_|| _|| _|| _d S )Nz>tokenizer must be a WordTokenizer or a HuggingFace tokenizer. weight)r   r   __init__
isinstancer   r   r   
ValueErrorlistnpasarrayZndarraytorchZ
from_numpysizeembeddings_dimensionZ	Embedding	emb_layerZload_state_dictr   Zrequires_gradr   r   r   )selfr   embedding_weightsr   r   Znum_embeddingsr$    r(   g/var/www/html/assistant/venv/lib/python3.9/site-packages/sentence_transformers/models/WordEmbeddings.pyr      s"    






zWordEmbeddings.__init__c                 C  s,   |  |d }d }||||d d |S )N	input_idsattention_mask)token_embeddingsZcls_token_embeddingsr+   )r%   update)r&   featuresr,   Z
    def tokenize(self, texts: list[str], **kwargs):
        tokenized_texts = [self.tokenizer.tokenize(text, **kwargs) for text in texts]
        sentence_lengths = [len(tokens) for tokens in tokenized_texts]
        max_len = max(sentence_lengths)

        input_ids = []
        attention_masks = []
        for tokens in tokenized_texts:
            # Right-pad every sentence with id 0 up to the longest sentence in the batch
            padding = [0] * (max_len - len(tokens))
            input_ids.append(tokens + padding)
            attention_masks.append([1] * len(tokens) + padding)

        output = {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_masks, dtype=torch.long),
            "sentence_lengths": torch.tensor(sentence_lengths, dtype=torch.long),
        }
        return output

    def get_word_embedding_dimension(self) -> int:
        return self.embeddings_dimension

    def save(self, output_path: str, safe_serialization: bool = True) -> None:
        self.save_config(output_path)
        self.save_torch_weights(output_path, safe_serialization=safe_serialization)
        self.tokenizer.save(output_path)
zWordEmbeddings.savec                 C  s   t | j| j| jdS )Nr   )r   r   r   r   rB   r(   r(   r)   get_config_dictb   s    zWordEmbeddings.get_config_dict Nzbool | str | Nonez
str | Noner   )model_name_or_path	subfoldertokencache_folderrevisionlocal_files_onlyrA   c                 K  s   |||||d}| j f d|i|}	t|	d}
| jf d|i|}|
|}| jf d|i|}| f ||d d|	}|S )N)rJ   rK   rL   rM   rN   rI   r   zemb_layer.weight)r   r'   )Zload_configr
   popZload_dir_pathloadZload_torch_weights)clsrI   rJ   rK   rL   rM   rN   r5   Z
hub_kwargsconfigr   Ztokenizer_local_pathr   weightsmodelr(   r(   r)   rP   i   s    
zWordEmbeddings.load z
int | None)embeddings_file_pathr   item_separatormax_vocab_sizec                 C  s  t d|  tj|s\t | d d|v s<d|v rJtd| d| }t|| d }g }g }	|drtj	|dd	d
n
t	|d	d
}
t
|
ddd}|D ]}| |}|st|dkrq|d }|d u rt|d }|d |	t| t|d |krt d qtdd |dd  D }|	| || |d ur|dkrt||kr qpqt|	}	|| | ||	|dW  d    S 1 s0    Y  d S )NzRead in embeddings file z, does not exist, try to download from server/\zEmbeddings file not found: zAhttps://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/z.gzrtutf8)encodingzLoad Word EmbeddingsZ
Embeddings)descunit   r   r   ZPADDING_TOKENz\ERROR: A line in the embeddings file had more or less  dimensions than expected. Skip token.c                 S  s   g | ]}t |qS r(   )float)r2   numr(   r(   r)   r6      r7   z1WordEmbeddings.from_text_file.<locals>.<listcomp>)r   r'   r   )loggerinfoospathexistsr   r	   endswithgzipopenr   rstripsplitr8   r<   r    Zzeroserrorarrayr!   Z	set_vocab)rQ   rV   r   rW   r   rX   urlr$   ZvocabZ
embeddingsZfIniteratorlinerl   wordZvectorr(   r(   r)   from_text_file   sP    	







zWordEmbeddings.from_text_file)Fr   )T)rH   NNNF)__name__
__module____qualname__r   __annotations__r   r   r/   r1   rC   rF   rG   classmethodrP   r   rs   r(   r(   r(   r)   r      s.   
       r   )
__future__r   ri   loggingre   Ztransformersr   typingr   ImportErrorZtyping_extensionsnumpyr    r"   r   r   Z#sentence_transformers.models.Moduler   Zsentence_transformers.utilr   r	   r
   r   r   r   r   	getLoggerrt   rc   r   r(   r(   r(   r)   <module>   s"   

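# Minimal usage sketch (illustrative only, not part of the library): build a toy
# vocabulary with a random embedding matrix and run a forward pass. In practice,
# pretrained vectors are usually loaded via WordEmbeddings.from_text_file(...) or
# restored with WordEmbeddings.load(...); the vocabulary, the 50-dimensional random
# weights, and the example sentences below are arbitrary placeholders.
if __name__ == "__main__":
    vocab = ["PADDING_TOKEN", "hello", "world", "sentence", "embeddings"]
    weights = np.random.RandomState(0).randn(len(vocab), 50).astype(np.float32)

    tokenizer = WhitespaceTokenizer(vocab)
    model = WordEmbeddings(tokenizer=tokenizer, embedding_weights=weights)

    features = model.tokenize(["hello world", "sentence embeddings hello"])
    output = model(features)
    # token_embeddings has shape (batch_size, longest_sentence_in_batch, embedding_dim)
    print(output["token_embeddings"].shape)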