a
    hO$                     @   s   d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZ eeZee Z e!dd e D Z"eG dd dZ#G dd deZ$G dd deZ%dS )    N)	dataclassfield)Enum)OptionalUnion)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)check_torch_load_is_safelogging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc                 c   s   | ]}|j V  qd S N)
model_type).0Zconf r   \/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/data/datasets/squad.py	<genexpr>"       r   c                   @   s<  e Zd ZU dZedddde idZee	d< edddidZ
ee	d	< ed
ddidZee	d< ed
ddidZee	d< edddidZee	d< edddidZee	d< edddidZee	d< edddidZee	d< edddidZee	d< edddidZee	d< eddd idZee	d!< ed"dd#idZee	d$< dS )%SquadDataTrainingArgumentszb
    Arguments pertaining to what data we are going to input our model for training and eval.
    Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads)__name__
__module____qualname____doc__r   joinMODEL_TYPESr   str__annotations__r   r    intr!   r#   r%   r&   boolr'   r(   floatr*   r+   r-   r   r   r   r   r   %   s`   
				r   c                   @   s   e Zd ZdZdZdS )SplittraindevN)r.   r/   r0   r:   r;   r   r   r   r   r9   h   s   r9   c                	   @   s   e Zd ZU dZeed< ee ed< eed< e	ed< dej
dddfeeee eeef ee	 ee ee d	d
dZdd Zeeejf dddZdS )SquadDatasetzH
    This will be superseded by a framework-agnostic approach soon.
    argsfeaturesmodeis_language_sensitiveNFpt)r=   	tokenizerlimit_lengthr?   r@   	cache_dirdataset_formatc                 C   s$  || _ || _|jrt nt | _t|trRzt| }W n t	yP   t	dY n0 || _
|jrbdnd}tj|d urx|n|jd|j d|jj d|j d| }	|	d }
t|
Z tj|	r^|js^t }t  tj|	dd| _| jd	 | _| jd
d | _| jdd | _t d|	 dt |  | jd u sJ| jd u r t!d|	 d n|tj"kr|| j#|j| _n| j$|j| _t%| j||j|j&|j'|tj(k|j)|d\| _| _t }t*| j| j| jd|	 t d|	 dt | dd W d    n1 s0    Y  d S )Nzmode is not a valid split nameZv2Zv1Zcached__z.lockT)Zweights_onlyr>   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rH   rB   r    r!   r#   Zis_trainingr-   Zreturn_dataset)r>   rG   rH   z!Saving features into cached file z [took z.3fz s])+r=   r@   r'   r   r   	processor
isinstancer4   r9   KeyErrorr?   ospathr2   r   value	__class__r.   r    r   existsr&   timer   torchloadZold_featuresr>   getrG   rH   loggerinfowarningr;   Zget_dev_examplesZget_train_examplesr   r!   r#   r:   r-   save)selfr=   rB   rC   r?   r@   rD   rE   Zversion_tagZcached_features_fileZ	lock_pathstartr   r   r   __init__w   sf    

"
zSquadDataset.__init__c                 C   s
   t | jS r   )lenr>   )rY   r   r   r   __len__   s    zSquadDataset.__len__)returnc                 C   s6  | j | }tj|jtjd}tj|jtjd}tj|jtjd}tj|jtjd}tj|jtj	d}tj|j
tj	d}|||d}	| jjdv r|	d= | jjdv r|	||d | jjr|	d|i | jr|	dtj|jtjd| jj i | jtjkr2tj|jtjd}
tj|jtjd}|	|
|d	 |	S )
N)Zdtype)	input_idsattention_masktoken_type_ids)xlmZrobertaZ
distilbertZ	camembertra   )Zxlnetrb   )	cls_indexp_maskis_impossibleZlangs)start_positionsend_positions)r>   rR   Ztensorr_   longr`   ra   rc   rd   r8   re   r=   r   updater'   r@   ZonesshapeZint64r+   r?   r9   r:   Zstart_positionZend_position)rY   ifeaturer_   r`   ra   rc   rd   re   inputsrf   rg   r   r   r   __getitem__   s0    
$zSquadDataset.__getitem__)r.   r/   r0   r1   r   r5   listr   r9   r7   r:   r   r   r6   r   r4   r[   r]   dictrR   ZTensorrn   r   r   r   r   r<   m   s*   

Lr<   )&rL   rQ   dataclassesr   r   enumr   typingr   r   rR   filelockr   Ztorch.utils.datar   Zmodels.auto.modeling_autor
   Ztokenization_utilsr   utilsr   r   Zprocessors.squadr   r   r   r   Z
get_loggerr.   rU   ro   keysZMODEL_CONFIG_CLASSEStupler3   r   r9   r<   r   r   r   r   <module>   s$   
B