a
    he                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	m
Z
 d dlZd dlmZ d dlmZ ddlmZ dd	lmZmZ d
dlmZmZmZ d
dlmZ eeZeG dd dZG dd deZG dd deZ dS )    N)	dataclassfield)Enum)OptionalUnion)FileLock)Dataset   )PreTrainedTokenizerBase)check_torch_load_is_safelogging   )!glue_convert_examples_to_featuresglue_output_modesglue_processors)InputFeaturesc                   @   s   e Zd ZU dZeddde  idZe	e
d< eddidZe	e
d< ed	dd
idZee
d< edddidZee
d< dd ZdS )GlueDataTrainingArgumentsz
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
    line.
    helpz"The name of the task to train on: z, )metadata	task_namezUThe input data dir. Should contain the .tsv files (or other data files) for the task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.)defaultr   max_seq_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachec                 C   s   | j  | _ d S N)r   lowerself r   [/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/data/datasets/glue.py__post_init__=   s    z'GlueDataTrainingArguments.__post_init__N)__name__
__module____qualname____doc__r   joinr   keysr   str__annotations__r   r   intr   boolr!   r   r   r   r    r   #   s   
$	r   c                   @   s   e Zd ZdZdZdZdS )SplittraindevtestN)r"   r#   r$   r-   r.   r/   r   r   r   r    r,   A   s   r,   c                   @   s|   e Zd ZU dZeed< eed< ee ed< de	j
dfeeee eee	f ee dddZd	d
 ZedddZdd ZdS )GlueDatasetzH
    This will be superseded by a framework-agnostic approach soon.
    argsoutput_modefeaturesN)r1   	tokenizerlimit_lengthmode	cache_dirc                 C   s$  t dt || _t|j  | _t|j | _t	|t
r^zt| }W n ty\   tdY n0 tj|d urp|n|jd|j d|jj d|j d|j }| j }|jdv r|jjdv r|d |d  |d< |d< || _|d	 }t|  tj|rB|jsBt }	t  tj|d
d| _t d| dt |	  nt d|j  |tj!krp| j"|j}
n*|tj#kr| j$|j}
n| j%|j}
|d ur|
d | }
t&|
||j|| jd| _t }	t'| j| t d| dt |	 dd W d    n1 s0    Y  d S )Nu  This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.pyzmode is not a valid split nameZcached__)Zmnlizmnli-mm)ZRobertaTokenizerZRobertaTokenizerFastZXLMRobertaTokenizerZBartTokenizerZBartTokenizerFastr      z.lockT)Zweights_onlyz"Loading features from cached file z [took %.3f s]z'Creating features from dataset file at )
max_length
label_listr2   z!Saving features into cached file z [took z.3fz s])(warningswarnFutureWarningr1   r   r   	processorr   r2   
isinstancer(   r,   KeyErrorospathr&   r   value	__class__r"   r   
get_labelsr;   r   existsr   timer   torchloadr3   loggerinfor.   Zget_dev_examplesr/   Zget_test_examplesZget_train_examplesr   save)r   r1   r4   r5   r6   r7   Zcached_features_filer;   Z	lock_pathstartZexamplesr   r   r    __init__P   sb    
$

zGlueDataset.__init__c                 C   s
   t | jS r   )lenr3   r   r   r   r    __len__   s    zGlueDataset.__len__)returnc                 C   s
   | j | S r   )r3   )r   ir   r   r    __getitem__   s    zGlueDataset.__getitem__c                 C   s   | j S r   )r;   r   r   r   r    rF      s    zGlueDataset.get_labels)r"   r#   r$   r%   r   r)   r(   listr   r,   r-   r
   r   r*   r   rO   rQ   rT   rF   r   r   r   r    r0   G   s"   

Kr0   )!rB   rH   r<   dataclassesr   r   enumr   typingr   r   rI   filelockr   Ztorch.utils.datar   Ztokenization_utils_baser
   utilsr   r   Zprocessors.gluer   r   r   Zprocessors.utilsr   Z
get_loggerr"   rK   r   r,   r0   r   r   r   r    <module>   s"   
