a
    h                      @  sj   d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ eeZG dd deZdS )	a@  
This file contains deprecated code that can only be used with the old `model.fit`-style Sentence Transformers v2.X training.
It exists for backwards compatibility with the `model.old_fit` method, but will be removed in a future version.

Nowadays, with Sentence Transformers v3+, it is recommended to use the `SentenceTransformerTrainer` class to train models.
See https://www.sbert.net/docs/sentence_transformer/training_overview.html for more information.

Instead, you should create a `datasets` `Dataset` for training: https://huggingface.co/docs/datasets/create_dataset
    )annotationsN)Dataset)SentenceTransformer)InputExamplec                   @  s|   e Zd ZdZd ddddddd	Zd!ddddddddZd"dddddddZdd Zdd Zdd Z	dd Z
dd ZdS )#ParallelSentencesDatasetu  
    This dataset reader can be used to read-in parallel sentences, i.e., it reads in a file with tab-seperated sentences with the same
    sentence in different languages. For example, the file can look like this (EN	DE	ES):
    hello world     hallo welt  hola mundo
    second sentence zweiter satz    segunda oración

    The sentence in the first column will be mapped to a sentence embedding using the given the embedder. For example,
    embedder is a mono-lingual sentence embedding method for English. The sentences in the other languages will also be
    mapped to this English sentence embedding.

    When getting a sample from the dataset, we get one sentence with the according sentence embedding for this sentence.

    teacher_model can be any class that implement an encode function. The encode function gets a list of sentences and
    returns a list of sentence embeddings
       Tr   intbool)student_modelteacher_model
batch_sizeuse_embedding_cachec                 C  sL   || _ || _g | _g | _g | _g | _g | _g | _|| _|| _	i | _
d| _dS )a+  
        Parallel sentences dataset reader to train student model given a teacher model

        Args:
            student_model (SentenceTransformer): The student sentence embedding model that should be trained.
            teacher_model (SentenceTransformer): The teacher model that provides the sentence embeddings for the first column in the dataset file.
            batch_size (int, optional): The batch size for training. Defaults to 8.
            use_embedding_cache (bool, optional): Whether to use an embedding cache. Defaults to True.
        r   N)r
   r   datasetsdatasets_iteratorZdatasets_tokenizeddataset_indicesZcopy_dataset_indicescacher   r   embedding_cachenum_sentences)selfr
   r   r   r    r   s/var/www/html/assistant/venv/lib/python3.9/site-packages/sentence_transformers/datasets/ParallelSentencesDataset.py__init__*   s    z!ParallelSentencesDataset.__init__d   N   strNone)filepathweightmax_sentencesmax_sentence_lengthreturnc           
      C  s   t d|  g }|dr,tj|dddn
t|dd}d}|D ]h}| d}	|dur||dkr|td	d
 |	D |kr|qB||	 |d7 }|durB|dkrB||krB qqBW d   n1 s0    Y  | j	||||d dS )a  
        Reads in a tab-seperated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column

        Args:
            filepath (str): Filepath to the file.
            weight (int, optional): If more than one dataset is loaded with load_data, specifies the frequency at which data should be sampled from this dataset. Defaults to 100.
            max_sentences (int, optional): Maximum number of lines to be read from the filepath. Defaults to None.
            max_sentence_length (int, optional): Skip the example if one of the sentences has more characters than max_sentence_length. Defaults to 128.

        Returns:
            None
        zLoad z.gzrtutf8)encodingr   	Nc                 S  s   g | ]}t |qS r   len.0sentr   r   r   
<listcomp>e       z6ParallelSentencesDataset.load_data.<locals>.<listcomp>   )r   r   r   )
loggerinfoendswithgzipopenstripsplitmaxappendadd_dataset)
r   r   r   r   r   parallel_sentencesZfIncountline	sentencesr   r   r   	load_dataG   s0    

$z"ParallelSentencesDataset.load_datazlist[list[str]])r7   r   r   r   c           	        s   i  |D ]~}|d ur4|dkr4t dd |D |kr4q|d }| vrNt  |< |D ]} | | qR|d ur|dkrt |kr qqt dkrd S |  jt fdd D 7  _t| j}| jt 	  | j
d | j|g|  d S )Nr   c                 S  s   g | ]}t |qS r   r%   r'   r   r   r   r*   }   r+   z8ParallelSentencesDataset.add_dataset.<locals>.<listcomp>c                   s   g | ]}t  | qS r   r%   r'   Zsentences_mapr   r   r*      r+   )r4   setaddr&   r   sumr   r5   listitemsr   r   extend)	r   r7   r   r   r   r:   Zsource_sentencer)   Z
dataset_idr   r<   r   r6   q   s.    
 
z$ParallelSentencesDataset.add_datasetc           	      C  s   g }g }| j D ]&}| |\}}|| || q| |}t||D ](\}}|D ]}| jt|g|d qVqJt| j d S )N)Ztextslabel)	r   
next_entryr5   get_embeddingszipr   r   randomshuffle)	r   Zsource_sentences_listZtarget_sentences_listdata_idxZsrc_sentenceZtrg_sentencesZsrc_embeddingsZsrc_embeddingZtrg_sentencer   r   r   generate_data   s    


z&ParallelSentencesDataset.generate_datac                 C  sd   | j | | j|  \}}| j|  d7  < | j| t| j | kr\d| j|< t| j |  ||fS )Nr,   r   )r   r   r&   rG   rH   )r   rI   sourceZtarget_sentencesr   r   r   rD      s    
z#ParallelSentencesDataset.next_entryc                   s    j s jj| jdddS g }|D ]}| jvr$|| q$t|dkr~ jj| jddd}t||D ]\}}| j|< qj fdd|D S )NFT)r   Zshow_progress_barZconvert_to_numpyr   c                   s   g | ]} j | qS r   )r   r'   r   r   r   r*      r+   z;ParallelSentencesDataset.get_embeddings.<locals>.<listcomp>)r   r   encoder   r   r5   r&   rF   )r   r:   Znew_sentencesr)   Znew_embeddingsZ	embeddingr   rL   r   rE      s    


z'ParallelSentencesDataset.get_embeddingsc                 C  s   | j S )N)r   rL   r   r   r   __len__   s    z ParallelSentencesDataset.__len__c                 C  s    t | jdkr|   | j S )Nr   )r&   r   rJ   pop)r   idxr   r   r   __getitem__   s    z$ParallelSentencesDataset.__getitem__)r   T)r   Nr   )r   Nr   )__name__
__module____qualname____doc__r   r;   r6   rJ   rD   rE   rN   rQ   r   r   r   r   r      s      -   $
r   )rU   
__future__r   r0   loggingrG   Ztorch.utils.datar   Zsentence_transformersr   Zsentence_transformers.readersr   	getLoggerrR   r-   r   r   r   r   r   <module>   s   

