a
    hC                     @  s@  d dl mZ d dlZd dlZd dlZd dlmZmZ d dlZ	d dl
Z
d dl
mZ d dlmZ ddlmZ ddlmZ eeZerd d	lmZ d
dddddedddf
ddddddddddddddddZddddefddddddddddZdd d!d"Zddd#efdddddddd$d%d&Zd/d)d*dddd+d,d-d.ZdS )0    )annotationsN)TYPE_CHECKINGCallable)Tensor)tqdm   )cos_sim)normalize_embeddings)SentenceTransformerF    i  i i  d   r
   z	list[str]boolintz"Callable[[Tensor, Tensor], Tensor]z
int | Nonez
str | Nonezlist[list[float | int]])model	sentencesshow_progress_bar
batch_sizequery_chunk_sizecorpus_chunk_size	max_pairstop_kscore_functiontruncate_dimprompt_namepromptreturnc              	   C  s,   | j |||d|	|
|d}t||||||dS )a@	  
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.

    Args:
        model (SentenceTransformer): SentenceTransformer model for embedding computation
        sentences (List[str]): A list of strings (texts or sentences)
        show_progress_bar (bool, optional): Plotting of a progress bar. Defaults to False.
        batch_size (int, optional): Number of texts that are encoded simultaneously by the model. Defaults to 32.
        query_chunk_size (int, optional): Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time). Defaults to 5000.
        corpus_chunk_size (int, optional): Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time). Defaults to 100000.
        max_pairs (int, optional): Maximal number of text pairs returned. Defaults to 500000.
        top_k (int, optional): For each sentence, we retrieve up to top_k other sentences. Defaults to 100.
        score_function (Callable[[Tensor, Tensor], Tensor], optional): Function for computing scores. By default, cosine similarity. Defaults to cos_sim.
        truncate_dim (int, optional): The dimension to truncate sentence embeddings to. If None, uses the model's ones. Defaults to None.
        prompt_name (Optional[str], optional): The name of a predefined prompt to use when encoding the sentence.
            It must match a key in the model `prompts` dictionary, which can be set during model initialization
            or loaded from the model configuration.

            Ignored if `prompt` is provided. Defaults to None.

        prompt (Optional[str], optional): A raw prompt string to prepend directly to the input sentence during encoding.

            For instance, `prompt="query: "` transforms the sentence "What is the capital of France?" into:
            "query: What is the capital of France?". Use this to override the prompt logic entirely and supply your own prefix.
            This takes precedence over `prompt_name`. Defaults to None.

    Returns:
        List[List[Union[float, int]]]: Returns a list of triplets with the format [score, id1, id2]
    T)r   r   Zconvert_to_tensorr   r   r   )r   r   r   r   r   )encodeparaphrase_mining_embeddings)r   r   r   r   r   r   r   r   r   r   r   r   
embeddings r   `/var/www/html/assistant/venv/lib/python3.9/site-packages/sentence_transformers/util/retrieval.pyparaphrase_mining   s"    .
r!   r   )r   r   r   r   r   r   r   c              	   C  s  |d7 }t  }d}d}tdt| |D ]}	tdt| |D ]}
|| |
|
|  | |	|	|  }tj|t|t|d dddd\}}|  }|  }tt|D ]z}t	|| D ]h\}}|
| }|	| }||kr|| | |kr|
|| | ||f |d7 }||kr| }|d }qqq>q(t }g }| s| \}}}t||g\}}||kr8||f|vr8|||f ||||g q8t|dd dd	}|S )
a  
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.

    Args:
        embeddings (Tensor): A tensor with the embeddings
        query_chunk_size (int): Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
        corpus_chunk_size (int): Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
        max_pairs (int): Maximal number of text pairs returned.
        top_k (int): For each sentence, we retrieve up to top_k other sentences
        score_function (Callable[[Tensor, Tensor], Tensor]): Function for computing scores. By default, cosine similarity.

    Returns:
        List[List[Union[float, int]]]: Returns a list of triplets with the format [score, id1, id2]
    r   r   TFdimlargestsortedc                 S  s   | d S )Nr   r   xr   r   r    <lambda>       z.paraphrase_mining_embeddings.<locals>.<lambda>keyreverse)queuePriorityQueuerangelentorchtopkmincputolist	enumerateputgetsetemptyr&   addappend)r   r   r   r   r   r   pairsZ	min_scoreZ	num_addedcorpus_start_idxquery_start_idxZscoresZscores_top_k_valuesZscores_top_k_idx	query_itrZ	top_k_idxZ
corpus_itrijentryZadded_pairsZ
pairs_listscoreZsorted_iZsorted_jr   r   r    r   X   sF    

r   z"list[list[dict[str, int | float]]])r   c                  O  s   t | i |S )z8This function is deprecated. Use semantic_search instead)semantic_search)argskwargsr   r   r    information_retrieval   s    rI   
   )query_embeddingscorpus_embeddingsr   r   r   r   r   c              	   C  s  t | tjtjfrt| } nt | tr2t| } t| j	dkrJ| 
d} t |tjtjfrht|}nt |tr|t|}|j| jkr| |j} dd tt| D }tdt| |D ]n}t|| t| }| jrtj||| jd}	| d|	}
n| || }
tdt||D ]}t|| t|}|jrTtj|||jd}	|d|	}n||| }||
|}tj|t|t|d dddd\}}|  }|  }tt|D ]p}t|| || D ]V\}}|| }|| }t|| |k r
t|| ||f nt|| ||f qʐqqqtt|D ]Z}tt|| D ](}|| | \}}||d	|| |< qLt|| d
d dd||< q8|S )a3  
    This function performs by default a cosine similarity search between a list of query embeddings  and a list of corpus embeddings.
    It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries.

    Args:
        query_embeddings (:class:`~torch.Tensor`): A 2 dimensional tensor with the query embeddings. Can be a sparse tensor.
        corpus_embeddings (:class:`~torch.Tensor`): A 2 dimensional tensor with the corpus embeddings. Can be a sparse tensor.
        query_chunk_size (int, optional): Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory. Defaults to 100.
        corpus_chunk_size (int, optional): Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory. Defaults to 500000.
        top_k (int, optional): Retrieve top k matching entries. Defaults to 10.
        score_function (Callable[[:class:`~torch.Tensor`, :class:`~torch.Tensor`], :class:`~torch.Tensor`], optional): Function for computing scores. By default, cosine similarity.

    Returns:
        List[List[Dict[str, Union[int, float]]]]: A list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores.
    r   r   c                 S  s   g | ]}g qS r   r   ).0_r   r   r    
<listcomp>   r*   z#semantic_search.<locals>.<listcomp>deviceTFr#   )	corpus_idrE   c                 S  s   | d S )NrE   r   r'   r   r   r    r)      r*   z!semantic_search.<locals>.<lambda>r+   )
isinstancenpZndarrayZgenericr2   Z
from_numpyliststackr1   shapeZ	unsqueezerQ   tor0   r4   Z	is_sparseZarangeZindex_selectr3   r5   r6   zipheapqheappushheappushpopr&   )rK   rL   r   r   r   r   Zqueries_result_listr@   Zquery_end_idxindicesZquery_chunkr?   Zcorpus_end_idxZcorpus_chunk
cos_scoresZcos_scores_top_k_valuesZcos_scores_top_k_idxrA   Zsub_corpus_idrE   rR   Zquery_idZdoc_itrr   r   r    rF      s\    






"rF         ?   ztorch.Tensor | np.ndarrayfloatzlist[list[int]])r   	thresholdmin_community_sizer   r   r   c                 C  sR  t | tjst| } tj|| jd}t| } g }t|t| }ttd| dt| }t	t
dt| |d| dD ]N}| |||  | j }| jjdv r||k}	|	d}
|
|k}| sqt|
| }
|| }|
 }|j|d	d
\}}t|
|D ]\}}||d|   qqt|j|d	d
\}}t
t|D ]}|| d |kr6|| j|d	d
\}}|d |kr|t| k rtd| t| }|| j|d	d
\}}qb||||k   q6qtt|dd d	d}g }t }t|D ]P\}}g }|D ]}||vr|| qt||kr|| || qt|dd d	d}|S )a  
    Function for Fast Community Detection.

    Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold).
    Returns only communities that are larger than min_community_size. The communities are returned
    in decreasing order. The first element in each list is the central point in the community.

    Args:
        embeddings (torch.Tensor or numpy.ndarray): The input embeddings.
        threshold (float): The threshold for determining if two embeddings are close. Defaults to 0.75.
        min_community_size (int): The minimum size of a community to be considered. Defaults to 10.
        batch_size (int): The batch size for computing cosine similarity scores. Defaults to 1024.
        show_progress_bar (bool): Whether to show a progress bar during computation. Defaults to False.

    Returns:
        List[List[int]]: A list of communities, where each community is represented as a list of indices.
    rP      2   r   zFinding clusters)descdisable)cudaZnpur   T)kr%   Nr"   c                 S  s   t | S Nr1   r'   r   r   r    r)   R  r*   z%community_detection.<locals>.<lambda>r+   c                 S  s   t | S rj   rk   r'   r   r   r    r)   b  r*   )rS   r2   r   tensorrQ   r	   r4   r1   maxr   r0   Ttypesumanyr3   rY   r=   r6   r&   r:   r7   update)r   rb   rc   r   r   Zextracted_communitiesZsort_max_sizeZ	start_idxr^   Zthreshold_maskZrow_wise_countZlarge_enough_maskri   rN   Ztop_k_indicescountr]   Ztop_k_valuesrB   Ztop_val_largeZtop_idx_largeZunique_communitiesZextracted_idsZ
cluster_idZ	communityZnon_overlapped_communityidxr   r   r    community_detection  sX    



ru   )r_   rJ   r`   F)
__future__r   rZ   loggingr.   typingr   r   numpyrT   r2   r   Ztqdm.autonotebookr   Z
similarityr   rl   r	   	getLogger__name__loggerZ)sentence_transformers.SentenceTransformerr
   r!   r   rI   rF   ru   r   r   r   r    <module>   sR   
(DI]    