from __future__ import annotations

import inspect
import logging
import math
import os
from pathlib import Path
from typing import Any

try:
    from typing import Self
except ImportError:
    from typing_extensions import Self

import numpy as np
import torch
from safetensors.torch import save_file as save_safetensors_file
from tokenizers import Tokenizer
from torch import nn
from transformers import PreTrainedTokenizerFast

from sentence_transformers.models.InputModule import InputModule
from sentence_transformers.util import get_device_name

logger = logging.getLogger(__name__)


class StaticEmbedding(InputModule):
    def __init__(
        self,
        tokenizer: Tokenizer | PreTrainedTokenizerFast,
        embedding_weights: np.ndarray | torch.Tensor | None = None,
        embedding_dim: int | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Initializes the StaticEmbedding model given a tokenizer. The model is a simple embedding bag model that
        takes the mean of trained per-token embeddings to compute text embeddings.

        Args:
            tokenizer (Tokenizer | PreTrainedTokenizerFast): The tokenizer to be used. Must be a fast tokenizer
                from ``transformers`` or ``tokenizers``.
            embedding_weights (np.ndarray | torch.Tensor | None, optional): Pre-trained embedding weights.
                Defaults to None.
            embedding_dim (int | None, optional): Dimension of the embeddings. Required if embedding_weights
                is not provided. Defaults to None.

        .. tip::

            Due to the extremely efficient nature of this module architecture, the overhead for moving inputs to the
            GPU can be larger than the actual computation time. Therefore, consider using a CPU device for inference
            and training.
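
            For instance (a sketch; ``device`` is the standard ``SentenceTransformer`` argument)::

                model = SentenceTransformer(modules=[static_embedding], device="cpu")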

        Example::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.models import StaticEmbedding
            from tokenizers import Tokenizer

            # Pre-distilled embeddings:
            static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
            # or distill your own embeddings:
            static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cuda")
            # or start with randomized embeddings:
            tokenizer = Tokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
            static_embedding = StaticEmbedding(tokenizer, embedding_dim=512)

            model = SentenceTransformer(modules=[static_embedding])

            embeddings = model.encode(["What are Pandas?", "The giant panda, also known as the panda bear or simply the panda, is a bear native to south central China."])
            similarity = model.similarity(embeddings[0], embeddings[1])
            # tensor([[0.8093]]) (If you use potion-base-8M)
            # tensor([[0.6234]]) (If you use the distillation method)
            # tensor([[-0.0693]]) (For example, if you use randomized embeddings)

        Raises:
            ValueError: If the tokenizer is not a fast tokenizer.
            ValueError: If neither `embedding_weights` nor `embedding_dim` is provided.
        """
        super().__init__()

        if isinstance(tokenizer, PreTrainedTokenizerFast):
            tokenizer = tokenizer._tokenizer
        elif not isinstance(tokenizer, Tokenizer):
            raise ValueError(
                "The tokenizer must be fast (i.e. Rust-backed) to use this class. "
                "Use Tokenizer.from_pretrained() from `tokenizers` to load a fast tokenizer."
            )

        if embedding_weights is not None:
            if isinstance(embedding_weights, np.ndarray):
                embedding_weights = torch.from_numpy(embedding_weights)

            self.embedding = nn.EmbeddingBag.from_pretrained(embedding_weights, freeze=False)
        elif embedding_dim is not None:
            self.embedding = nn.EmbeddingBag(tokenizer.get_vocab_size(), embedding_dim)
        else:
            raise ValueError("Either `embedding_weights` or `embedding_dim` must be provided.")

        self.num_embeddings = self.embedding.num_embeddings
        self.embedding_dim = self.embedding.embedding_dim

        self.tokenizer: Tokenizer = tokenizer
        self.tokenizer.no_padding()

        self.base_model = kwargs.get("base_model")

    def tokenize(self, texts: list[str], **kwargs) -> dict[str, torch.Tensor]:
        encodings = self.tokenizer.encode_batch(texts, add_special_tokens=False)
        encodings_ids = [encoding.ids for encoding in encodings]

        # `offsets` marks where each text's token ids start in the flattened `input_ids`,
        # as expected by `nn.EmbeddingBag`.
        offsets = torch.from_numpy(np.cumsum([0] + [len(token_ids) for token_ids in encodings_ids[:-1]]))
        input_ids = torch.tensor([token_id for token_ids in encodings_ids for token_id in token_ids], dtype=torch.long)
        return {"input_ids": input_ids, "offsets": offsets}

    def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
        features["sentence_embedding"] = self.embedding(features["input_ids"], features["offsets"])
        return features

    @property
    def max_seq_length(self) -> int:
        # This module imposes no sequence length limit.
        return math.inf

    def get_sentence_embedding_dimension(self) -> int:
        return self.embedding_dim

    def save(self, output_path: str, *args, safe_serialization: bool = True, **kwargs) -> None:
        if safe_serialization:
            save_safetensors_file(self.state_dict(), os.path.join(output_path, "model.safetensors"))
        else:
            torch.save(self.state_dict(), os.path.join(output_path, "pytorch_model.bin"))

        self.tokenizer.save(str(Path(output_path) / "tokenizer.json"))

    @classmethod
    def load(
        cls,
        model_name_or_path: str,
        subfolder: str = "",
        token: bool | str | None = None,
        cache_folder: str | None = None,
        revision: str | None = None,
        local_files_only: bool = False,
        **kwargs,
    ) -> Self:
        hub_kwargs = {
            "subfolder": subfolder,
            "token": token,
            "cache_folder": cache_folder,
            "revision": revision,
            "local_files_only": local_files_only,
        }
        tokenizer_path = cls.load_file_path(model_name_or_path, filename="tokenizer.json", **hub_kwargs)
        tokenizer = Tokenizer.from_file(tokenizer_path)

        weights = cls.load_torch_weights(model_name_or_path=model_name_or_path, **hub_kwargs)
        try:
            weights = weights["embedding.weight"]
        except KeyError:
            weights = weights["embeddings"]

        return cls(tokenizer, embedding_weights=weights)

    @classmethod
    def from_distillation(
        cls,
        model_name: str,
        vocabulary: list[str] | None = None,
        device: str | None = None,
        pca_dims: int | None = 256,
        apply_zipf: bool = True,
        sif_coefficient: float | None = 1e-4,
        token_remove_pattern: str | None = r"\[unused\d+\]",
        quantize_to: str = "float32",
        use_subword: bool = True,
        **kwargs: Any,
    ) -> StaticEmbedding:
        """
        Creates a StaticEmbedding instance from a distillation process using the `model2vec` package.

        Args:
            model_name (str): The name of the model to distill.
            vocabulary (list[str] | None, optional): A list of vocabulary words to use. Defaults to None.
            device (str): The device to run the distillation on (e.g., 'cpu', 'cuda'). If not specified,
                the strongest device is automatically detected. Defaults to None.
            pca_dims (int | None, optional): The number of dimensions for PCA reduction. Defaults to 256.
            apply_zipf (bool): Whether to apply Zipf's law during distillation. Defaults to True.
            sif_coefficient (float | None, optional): The coefficient for SIF weighting. Defaults to 1e-4.
            token_remove_pattern (str | None, optional): A regex pattern to remove tokens from the vocabulary.
                Defaults to r"\[unused\d+\]".
            quantize_to (str): The data type to quantize the weights to. Defaults to 'float32'.
            use_subword (bool): Whether to use subword tokenization. Defaults to True.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the distilled model's
                tokenizer and embedding weights.

        Raises:
            ImportError: If the `model2vec` package is not installed.
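
        Example:

            A minimal sketch, mirroring the class-level example; the model name is illustrative
            and the optional `model2vec[distill]` dependency must be installed::

                from sentence_transformers import SentenceTransformer
                from sentence_transformers.models import StaticEmbedding

                static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cpu")
                model = SentenceTransformer(modules=[static_embedding])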
        """
        try:
            from model2vec.distill import distill
        except ImportError:
            raise ImportError(
                "To use this method, please install the `model2vec` package: `pip install model2vec[distill]`"
            )

        if device is None:
            device = get_device_name()

        # Only forward the arguments that the installed `model2vec` version supports.
        distill_signature = inspect.signature(distill)
        distill_kwargs = set(distill_signature.parameters.keys())
        kwargs = {
            "vocabulary": vocabulary,
            "device": device,
            "pca_dims": pca_dims,
            "apply_zipf": apply_zipf,
            "use_subword": use_subword,
            "quantize_to": quantize_to,
            "sif_coefficient": sif_coefficient,
            "token_remove_pattern": token_remove_pattern,
        }
        if leftovers := set(kwargs) - distill_kwargs:
            logger.warning(
                f"Your version of `model2vec` does not support the {', '.join(map(repr, leftovers))} arguments "
                "for the `distill` method. Consider updating `model2vec` to take advantage of these arguments."
            )
            kwargs = {key: value for key, value in kwargs.items() if key in distill_kwargs}

        static_model = distill(model_name, **kwargs)
        if isinstance(static_model.embedding, np.ndarray):
            embedding_weights = torch.from_numpy(static_model.embedding)
        else:
            embedding_weights = static_model.embedding.weight
        tokenizer: Tokenizer = static_model.tokenizer
        return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_name)

    @classmethod
    def from_model2vec(cls, model_id_or_path: str) -> StaticEmbedding:
        """
        Create a StaticEmbedding instance from a model2vec model. This method loads a pre-trained model2vec model
        and extracts the embedding weights and tokenizer to create a StaticEmbedding instance.

        Args:
            model_id_or_path (str): The identifier or path to the pre-trained model2vec model.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the tokenizer and embedding weights
                from the model2vec model.

        Raises:
            ImportError: If the `model2vec` package is not installed.
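
        Example:

            A minimal sketch; "minishlab/potion-base-8M" is the pre-distilled model referenced in the
            class-level example, and the `model2vec` package must be installed::

                from sentence_transformers import SentenceTransformer
                from sentence_transformers.models import StaticEmbedding

                static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
                model = SentenceTransformer(modules=[static_embedding])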
        """
        try:
            from model2vec import StaticModel
        except ImportError:
            raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`")

        static_model = StaticModel.from_pretrained(model_id_or_path)
        if isinstance(static_model.embedding, np.ndarray):
            embedding_weights = torch.from_numpy(static_model.embedding)
        else:
            embedding_weights = static_model.embedding.weight
        tokenizer: Tokenizer = static_model.tokenizer
        return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_id_or_path)