"""
This file contains deprecated code that can only be used with the old `model.fit`-style Sentence Transformers v2.X training.
It exists for backwards compatibility with the `model.old_fit` method, but will be removed in a future version.

Nowadays, with Sentence Transformers v3+, it is recommended to use the `SentenceTransformerTrainer` class to train models.
See https://www.sbert.net/docs/sentence_transformer/training_overview.html for more information.

See this script for more details on how to use the new training API:
https://github.com/UKPLab/sentence-transformers/blob/master/examples/sentence_transformer/unsupervised_learning/TSDAE/train_stsb_tsdae.py
"""

from __future__ import annotations

import numpy as np
from torch.utils.data import Dataset
from transformers.utils.import_utils import NLTK_IMPORT_ERROR, is_nltk_available

from sentence_transformers.readers.InputExample import InputExample


class DenoisingAutoEncoderDataset(Dataset):
    """
    The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence]
    It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the
    sentence without noise.

    Args:
        sentences: A list of sentences
        noise_fn: A noise function: Given a string, it returns a string
            with noise, e.g. deleted words
    """

    def __init__(self, sentences: list[str], noise_fn=lambda s: DenoisingAutoEncoderDataset.delete(s)):
        if not is_nltk_available():
            raise ImportError(NLTK_IMPORT_ERROR.format(self.__class__.__name__))

        self.sentences = sentences
        self.noise_fn = noise_fn

    def __getitem__(self, item):
        sent = self.sentences[item]
        return InputExample(texts=[self.noise_fn(sent), sent])

    def __len__(self):
        return len(self.sentences)

    # Deletion noise: tokenize, randomly drop roughly `del_ratio` of the words,
    # and detokenize what remains.
    @staticmethod
    def delete(text, del_ratio=0.6):
        from nltk import word_tokenize
        from nltk.tokenize.treebank import TreebankWordDetokenizer

        words = word_tokenize(text)
        n = len(words)
        if n == 0:
            return text

        keep_or_not = np.random.rand(n) > del_ratio
        if sum(keep_or_not) == 0:
            keep_or_not[np.random.choice(n)] = True  # guarantee that at least one word remains
        words_processed = TreebankWordDetokenizer().detokenize(np.array(words)[keep_or_not])
        return words_processed
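

# A minimal, hedged usage sketch (not part of the upstream module): it shows how this
# dataset is typically paired with DenoisingAutoEncoderLoss in the legacy `model.fit`-style
# training loop that the module docstring refers to (kept as `model.old_fit` in v3+).
# The model name and the example sentences below are placeholders.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    from sentence_transformers import SentenceTransformer, losses

    train_sentences = ["A first unlabeled sentence.", "A second unlabeled sentence."]
    model = SentenceTransformer("bert-base-uncased")

    # Each item becomes InputExample(texts=[noisy_sentence, original_sentence]).
    train_dataset = DenoisingAutoEncoderDataset(train_sentences)
    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    train_loss = losses.DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)

    model.old_fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)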