from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING, Any, Callable, Literal

import numpy as np
from torch import Tensor
from tqdm import tqdm

from sentence_transformers.evaluation.InformationRetrievalEvaluator import InformationRetrievalEvaluator
from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.util import is_datasets_available

if TYPE_CHECKING:
    from sentence_transformers.SentenceTransformer import SentenceTransformer

logger = logging.getLogger(__name__)

DatasetNameType = Literal[
    "climatefever", "dbpedia", "fever", "fiqa2018", "hotpotqa", "msmarco", "nfcorpus",
    "nq", "quoraretrieval", "scidocs", "arguana", "scifact", "touche2020",
]

dataset_name_to_id = {
    "climatefever": "zeta-alpha-ai/NanoClimateFEVER",
    "dbpedia": "zeta-alpha-ai/NanoDBPedia",
    "fever": "zeta-alpha-ai/NanoFEVER",
    "fiqa2018": "zeta-alpha-ai/NanoFiQA2018",
    "hotpotqa": "zeta-alpha-ai/NanoHotpotQA",
    "msmarco": "zeta-alpha-ai/NanoMSMARCO",
    "nfcorpus": "zeta-alpha-ai/NanoNFCorpus",
    "nq": "zeta-alpha-ai/NanoNQ",
    "quoraretrieval": "zeta-alpha-ai/NanoQuoraRetrieval",
    "scidocs": "zeta-alpha-ai/NanoSCIDOCS",
    "arguana": "zeta-alpha-ai/NanoArguAna",
    "scifact": "zeta-alpha-ai/NanoSciFact",
    "touche2020": "zeta-alpha-ai/NanoTouche2020",
}

dataset_name_to_human_readable = {
    "climatefever": "ClimateFEVER",
    "dbpedia": "DBPedia",
    "fever": "FEVER",
    "fiqa2018": "FiQA2018",
    "hotpotqa": "HotpotQA",
    "msmarco": "MSMARCO",
    "nfcorpus": "NFCorpus",
    "nq": "NQ",
    "quoraretrieval": "QuoraRetrieval",
    "scidocs": "SCIDOCS",
    "arguana": "ArguAna",
    "scifact": "SciFact",
    "touche2020": "Touche2020",
}


class NanoBEIREvaluator(SentenceEvaluator):
    """
    This class evaluates the performance of a SentenceTransformer Model on the NanoBEIR collection of Information Retrieval datasets.

    The collection is a set of datasets based on the BEIR collection, but with a significantly smaller size, so it can
    be used for quickly evaluating the retrieval performance of a model before committing to a full evaluation.
    The datasets are available on Hugging Face in the `NanoBEIR collection <https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6>`_.
    This evaluator will return the same metrics as the InformationRetrievalEvaluator (i.e., MRR, nDCG, Recall@k), for each dataset and on average.

    Args:
        dataset_names (List[str]): The names of the datasets to evaluate on. Defaults to all datasets.
        mrr_at_k (List[int]): A list of integers representing the values of k for MRR calculation. Defaults to [10].
        ndcg_at_k (List[int]): A list of integers representing the values of k for NDCG calculation. Defaults to [10].
        accuracy_at_k (List[int]): A list of integers representing the values of k for accuracy calculation. Defaults to [1, 3, 5, 10].
        precision_recall_at_k (List[int]): A list of integers representing the values of k for precision and recall calculation. Defaults to [1, 3, 5, 10].
        map_at_k (List[int]): A list of integers representing the values of k for MAP calculation. Defaults to [100].
        show_progress_bar (bool): Whether to show a progress bar during evaluation. Defaults to False.
        batch_size (int): The batch size for evaluation. Defaults to 32.
        write_csv (bool): Whether to write the evaluation results to a CSV file. Defaults to True.
        truncate_dim (int, optional): The dimension to truncate the embeddings to. Defaults to None.
        score_functions (Dict[str, Callable[[Tensor, Tensor], Tensor]], optional): A dictionary mapping score function names to score functions. Defaults to None, in which case the similarity function of the evaluated model is used (i.e. ``{model.similarity_fn_name: model.similarity}``).
        main_score_function (Union[str, SimilarityFunction], optional): The main score function to use for evaluation. Defaults to None.
        aggregate_fn (Callable[[list[float]], float]): The function to aggregate the scores. Defaults to np.mean.
        aggregate_key (str): The key to use for the aggregated score. Defaults to "mean".
        query_prompts (str | dict[str, str], optional): The prompts to add to the queries. If a string, will add the same prompt to all queries. If a dict, expects that all datasets in dataset_names are keys.
        corpus_prompts (str | dict[str, str], optional): The prompts to add to the corpus passages. If a string, will add the same prompt to all corpus passages. If a dict, expects that all datasets in dataset_names are keys.
        write_predictions (bool): Whether to write the predictions to a JSONL file. Defaults to False.
            This can be useful for downstream evaluation, as the file can be used as input to the :class:`~sentence_transformers.sparse_encoder.evaluation.ReciprocalRankFusionEvaluator`, which accepts precomputed predictions.

    Example:
        ::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.evaluation import NanoBEIREvaluator

            model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

            datasets = ["QuoraRetrieval", "MSMARCO"]
            query_prompts = {
                "QuoraRetrieval": "Instruct: Given a question, retrieve questions that are semantically equivalent to the given question\nQuery: ",
                "MSMARCO": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
            }

            evaluator = NanoBEIREvaluator(
                dataset_names=datasets,
                query_prompts=query_prompts,
            )

            results = evaluator(model)
            '''
            NanoBEIR Evaluation of the model on ['QuoraRetrieval', 'MSMARCO'] dataset:
            Evaluating NanoQuoraRetrieval
            Information Retrieval Evaluation of the model on the NanoQuoraRetrieval dataset:
            Queries: 50
            Corpus: 5046

            Score-Function: cosine
            Accuracy@1: 92.00%
            Accuracy@3: 98.00%
            Accuracy@5: 100.00%
            Accuracy@10: 100.00%
            Precision@1: 92.00%
            Precision@3: 40.67%
            Precision@5: 26.00%
            Precision@10: 14.00%
            Recall@1: 81.73%
            Recall@3: 94.20%
            Recall@5: 97.93%
            Recall@10: 100.00%
            MRR@10: 0.9540
            NDCG@10: 0.9597
            MAP@100: 0.9395

            Evaluating NanoMSMARCO
            Information Retrieval Evaluation of the model on the NanoMSMARCO dataset:
            Queries: 50
            Corpus: 5043

            Score-Function: cosine
            Accuracy@1: 40.00%
            Accuracy@3: 74.00%
            Accuracy@5: 78.00%
            Accuracy@10: 88.00%
            Precision@1: 40.00%
            Precision@3: 24.67%
            Precision@5: 15.60%
            Precision@10: 8.80%
            Recall@1: 40.00%
            Recall@3: 74.00%
            Recall@5: 78.00%
            Recall@10: 88.00%
            MRR@10: 0.5849
            NDCG@10: 0.6572
            MAP@100: 0.5892
            Average Queries: 50.0
            Average Corpus: 5044.5

            Aggregated for Score Function: cosine
            Accuracy@1: 66.00%
            Accuracy@3: 86.00%
            Accuracy@5: 89.00%
            Accuracy@10: 94.00%
            Precision@1: 66.00%
            Recall@1: 60.87%
            Precision@3: 32.67%
            Recall@3: 84.10%
            Precision@5: 20.80%
            Recall@5: 87.97%
            Precision@10: 11.40%
            Recall@10: 94.00%
            MRR@10: 0.7694
            NDCG@10: 0.8085
            '''
            print(evaluator.primary_metric)
            # => "NanoBEIR_mean_cosine_ndcg@10"
            print(results[evaluator.primary_metric])
            # => 0.8084508771660436
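
    A second, minimal sketch showing evaluation with ``truncate_dim``; the model name and the
    dimension below are illustrative assumptions rather than part of the example above::

        from sentence_transformers import SentenceTransformer
        from sentence_transformers.evaluation import NanoBEIREvaluator

        model = SentenceTransformer("all-MiniLM-L6-v2")

        evaluator = NanoBEIREvaluator(
            dataset_names=["MSMARCO"],
            truncate_dim=32,  # evaluate on embeddings truncated to their first 32 dimensions
        )
        results = evaluator(model)
        # The evaluator name gains a "_{truncate_dim}" suffix, so the primary metric becomes,
        # e.g., "NanoBEIR_mean_32_cosine_ndcg@10".
        print(evaluator.primary_metric)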
    """

    information_retrieval_class = InformationRetrievalEvaluator

    def __init__(
        self,
        dataset_names: list[DatasetNameType] | None = None,
        mrr_at_k: list[int] = [10],
        ndcg_at_k: list[int] = [10],
        accuracy_at_k: list[int] = [1, 3, 5, 10],
        precision_recall_at_k: list[int] = [1, 3, 5, 10],
        map_at_k: list[int] = [100],
        show_progress_bar: bool = False,
        batch_size: int = 32,
        write_csv: bool = True,
        truncate_dim: int | None = None,
        score_functions: dict[str, Callable[[Tensor, Tensor], Tensor]] | None = None,
        main_score_function: str | SimilarityFunction | None = None,
        aggregate_fn: Callable[[list[float]], float] = np.mean,
        aggregate_key: str = "mean",
        query_prompts: str | dict[str, str] | None = None,
        corpus_prompts: str | dict[str, str] | None = None,
        write_predictions: bool = False,
    ):
        super().__init__()
        if dataset_names is None:
            dataset_names = list(dataset_name_to_id.keys())
        self.dataset_names = dataset_names
        self.mrr_at_k = mrr_at_k
        self.ndcg_at_k = ndcg_at_k
        self.accuracy_at_k = accuracy_at_k
        self.precision_recall_at_k = precision_recall_at_k
        self.map_at_k = map_at_k
        self.show_progress_bar = show_progress_bar
        self.batch_size = batch_size
        self.write_csv = write_csv
        self.score_functions = score_functions
        self.score_function_names = sorted(self.score_functions.keys()) if score_functions else []
        self.main_score_function = main_score_function
        self.truncate_dim = truncate_dim
        self.aggregate_fn = aggregate_fn
        self.aggregate_key = aggregate_key
        self.query_prompts = query_prompts
        self.corpus_prompts = corpus_prompts
        self.write_predictions = write_predictions

        self.name = f"NanoBEIR_{self.aggregate_key}"
        if self.truncate_dim:
            self.name += f"_{self.truncate_dim}"

        self._validate_dataset_names()
        self._validate_prompts()

        ir_evaluator_kwargs = {
            "mrr_at_k": mrr_at_k,
            "ndcg_at_k": ndcg_at_k,
            "accuracy_at_k": accuracy_at_k,
            "precision_recall_at_k": precision_recall_at_k,
            "map_at_k": map_at_k,
            "show_progress_bar": show_progress_bar,
            "batch_size": batch_size,
            "write_csv": write_csv,
            "truncate_dim": truncate_dim,
            "score_functions": score_functions,
            "main_score_function": main_score_function,
            "write_predictions": write_predictions,
        }
        self.evaluators = [
            self._load_dataset(dataset_name, **ir_evaluator_kwargs)
            for dataset_name in tqdm(self.dataset_names, desc="Loading NanoBEIR datasets", leave=False)
        ]

        self.csv_file = f"NanoBEIR_evaluation_{aggregate_key}_results.csv"
        self.csv_headers = ["epoch", "steps"]
        self._append_csv_headers(self.score_function_names)

    def _append_csv_headers(self, similarity_fn_names: list[str]) -> None:
        for score_name in similarity_fn_names:
            for k in self.accuracy_at_k:
                self.csv_headers.append(f"{score_name}-Accuracy@{k}")
            for k in self.precision_recall_at_k:
                self.csv_headers.append(f"{score_name}-Precision@{k}")
                self.csv_headers.append(f"{score_name}-Recall@{k}")
            for k in self.mrr_at_k:
                self.csv_headers.append(f"{score_name}-MRR@{k}")
            for k in self.ndcg_at_k:
                self.csv_headers.append(f"{score_name}-NDCG@{k}")
            for k in self.map_at_k:
                self.csv_headers.append(f"{score_name}-MAP@{k}")

    def __call__(
        self,
        model: SentenceTransformer,
        output_path: str | None = None,
        epoch: int = -1,
        steps: int = -1,
        *args,
        **kwargs,
    ) -> dict[str, float]:
        per_metric_results = {}
        per_dataset_results = {}
        if epoch != -1:
            out_txt = f" after epoch {epoch}" if steps == -1 else f" in epoch {epoch} after {steps} steps"
        else:
            out_txt = ""
        if self.truncate_dim is not None:
            out_txt += f" (truncated to {self.truncate_dim})"
        logger.info(f"NanoBEIR Evaluation of the model on {self.dataset_names} dataset{out_txt}:")

        # If no score functions were given, fall back to the similarity function of the evaluated model.
        if self.score_functions is None:
            self.score_functions = {model.similarity_fn_name: model.similarity}
            self.score_function_names = [model.similarity_fn_name]
            self._append_csv_headers(self.score_function_names)

        for evaluator in tqdm(self.evaluators, desc="Evaluating datasets", disable=not self.show_progress_bar):
            logger.info(f"Evaluating {evaluator.name}")
            evaluation = evaluator(model, output_path, epoch, steps)
            # Per-dataset keys look like "<evaluator name>_<metric>"; the evaluator name itself may
            # contain underscores (e.g. when truncate_dim is set), so count them before splitting.
            num_underscores_in_name = evaluator.name.count("_")
            for full_key, metric_value in evaluation.items():
                splits = full_key.split("_", maxsplit=num_underscores_in_name + 1)
                metric = splits[-1]
                if metric not in per_metric_results:
                    per_metric_results[metric] = []
                per_dataset_results[full_key] = metric_value
                per_metric_results[metric].append(metric_value)

        # Aggregate each metric over all datasets (np.mean by default).
        agg_results = {}
        for metric in per_metric_results:
            agg_results[metric] = self.aggregate_fn(per_metric_results[metric])

        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                fOut = open(csv_path, mode="w", encoding="utf-8")
                fOut.write(",".join(self.csv_headers))
                fOut.write("\n")
            else:
                fOut = open(csv_path, mode="a", encoding="utf-8")

            output_data = [epoch, steps]
            for name in self.score_function_names:
                for k in self.accuracy_at_k:
                    output_data.append(agg_results[f"{name}_accuracy@{k}"])
                for k in self.precision_recall_at_k:
                    output_data.append(agg_results[f"{name}_precision@{k}"])
                    output_data.append(agg_results[f"{name}_recall@{k}"])
                for k in self.mrr_at_k:
                    output_data.append(agg_results[f"{name}_mrr@{k}"])
                for k in self.ndcg_at_k:
                    output_data.append(agg_results[f"{name}_ndcg@{k}"])
                for k in self.map_at_k:
                    output_data.append(agg_results[f"{name}_map@{k}"])
            fOut.write(",".join(map(str, output_data)))
            fOut.write("\n")
            fOut.close()

        if not self.primary_metric:
            if self.main_score_function is None:
                score_function = max(
                    [(name, agg_results[f"{name}_ndcg@{max(self.ndcg_at_k)}"]) for name in self.score_function_names],
                    key=lambda x: x[1],
                )[0]
                self.primary_metric = f"{score_function}_ndcg@{max(self.ndcg_at_k)}"
            else:
                self.primary_metric = f"{self.main_score_function.value}_ndcg@{max(self.ndcg_at_k)}"

        avg_queries = np.mean([len(evaluator.queries) for evaluator in self.evaluators])
        avg_corpus = np.mean([len(evaluator.corpus) for evaluator in self.evaluators])
        logger.info(f"Average Queries: {avg_queries}")
        logger.info(f"Average Corpus: {avg_corpus}\n")

        for name in self.score_function_names:
            logger.info(f"Aggregated for Score Function: {name}")
            for k in self.accuracy_at_k:
                logger.info("Accuracy@{}: {:.2f}%".format(k, agg_results[f"{name}_accuracy@{k}"] * 100))
            for k in self.precision_recall_at_k:
                logger.info("Precision@{}: {:.2f}%".format(k, agg_results[f"{name}_precision@{k}"] * 100))
                logger.info("Recall@{}: {:.2f}%".format(k, agg_results[f"{name}_recall@{k}"] * 100))
            for k in self.mrr_at_k:
                logger.info("MRR@{}: {:.4f}".format(k, agg_results[f"{name}_mrr@{k}"]))
            for k in self.ndcg_at_k:
                logger.info("NDCG@{}: {:.4f}".format(k, agg_results[f"{name}_ndcg@{k}"]))
            for k in self.map_at_k:
                logger.info("MAP@{}: {:.4f}".format(k, agg_results[f"{name}_map@{k}"]))

        agg_results = self.prefix_name_to_metrics(agg_results, self.name)
        self.store_metrics_in_model_card_data(model, agg_results, epoch, steps)

        per_dataset_results.update(agg_results)
        return per_dataset_results

    def _get_human_readable_name(self, dataset_name: DatasetNameType) -> str:
        human_readable_name = f"Nano{dataset_name_to_human_readable[dataset_name.lower()]}"
        if self.truncate_dim is not None:
            human_readable_name += f"_{self.truncate_dim}"
        return human_readable_name

    def _load_dataset(self, dataset_name: DatasetNameType, **ir_evaluator_kwargs) -> InformationRetrievalEvaluator:
        if not is_datasets_available():
            raise ValueError(
                "datasets is not available. Please install it to use the NanoBEIREvaluator via `pip install datasets`."
            )
        from datasets import load_dataset

        dataset_path = dataset_name_to_id[dataset_name.lower()]
        corpus = load_dataset(dataset_path, "corpus", split="train")
        queries = load_dataset(dataset_path, "queries", split="train")
        qrels = load_dataset(dataset_path, "qrels", split="train")
        corpus_dict = {sample["_id"]: sample["text"] for sample in corpus if len(sample["text"]) > 0}
        queries_dict = {sample["_id"]: sample["text"] for sample in queries if len(sample["text"]) > 0}
        qrels_dict = {}
        for sample in qrels:
            if sample["query-id"] not in qrels_dict:
                qrels_dict[sample["query-id"]] = set()
            qrels_dict[sample["query-id"]].add(sample["corpus-id"])

        if self.query_prompts is not None:
            ir_evaluator_kwargs["query_prompt"] = self.query_prompts.get(dataset_name)
        if self.corpus_prompts is not None:
            ir_evaluator_kwargs["corpus_prompt"] = self.corpus_prompts.get(dataset_name)
        human_readable_name = self._get_human_readable_name(dataset_name)
        return self.information_retrieval_class(
            queries=queries_dict,
            corpus=corpus_dict,
            relevant_docs=qrels_dict,
            name=human_readable_name,
            **ir_evaluator_kwargs,
        )

    def _validate_dataset_names(self):
        if len(self.dataset_names) == 0:
            raise ValueError("dataset_names cannot be empty. Use None to evaluate on all datasets.")
        if missing_datasets := [
            dataset_name for dataset_name in self.dataset_names if dataset_name.lower() not in dataset_name_to_id
        ]:
            raise ValueError(
                f"Dataset(s) {missing_datasets} not found in the NanoBEIR collection. "
                f"Valid dataset names are: {list(dataset_name_to_id.keys())}"
            )

    def _validate_prompts(self):
        error_msg = ""
        if self.query_prompts is not None:
            if isinstance(self.query_prompts, str):
                self.query_prompts = {dataset_name: self.query_prompts for dataset_name in self.dataset_names}
            elif missing_query_prompts := [
                dataset_name for dataset_name in self.dataset_names if dataset_name not in self.query_prompts
            ]:
                error_msg += f"The following datasets are missing query prompts: {missing_query_prompts}\n"

        if self.corpus_prompts is not None:
            if isinstance(self.corpus_prompts, str):
                self.corpus_prompts = {dataset_name: self.corpus_prompts for dataset_name in self.dataset_names}
            elif missing_corpus_prompts := [
                dataset_name for dataset_name in self.dataset_names if dataset_name not in self.corpus_prompts
            ]:
                error_msg += f"The following datasets are missing corpus prompts: {missing_corpus_prompts}\n"

        if error_msg:
            raise ValueError(error_msg.strip())

    def store_metrics_in_model_card_data(self, *args, **kwargs) -> None:
        # Only store aggregated metrics when at least one dataset was evaluated.
        if len(self.dataset_names) > 0:
            super().store_metrics_in_model_card_data(*args, **kwargs)

    def get_config_dict(self) -> dict[str, Any]:
        config_dict = {"dataset_names": self.dataset_names}
        config_dict_candidate_keys = ["truncate_dim", "query_prompts", "corpus_prompts"]
        for key in config_dict_candidate_keys:
            if getattr(self, key, None) is not None:
                config_dict[key] = getattr(self, key)
        return config_dict