a
    h\                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlZd dl	m
Z
 d dlmZ ddlmZ ddlmZ eeZdZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZdS )    N)Optional)FileLock)Dataset   )PreTrainedTokenizer)loggingu   This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: {0}c                   @   sB   e Zd ZdZdeeeee dddZdd Z	e
jd	d
dZdS )TextDatasetH
    This will be superseded by a framework-agnostic approach soon.
    FN)	tokenizer	file_path
block_size	cache_dirc              
   C   s0  t tdt tj|du r2td| d||j	dd }tj
|\}}tj|d urd|n|d|jj d| d| }|d }	t|	 tj|r|st }
t|d	}t|| _W d    n1 s0    Y  td
| dt |
  ntd|  g | _t|dd}| }W d    n1 sJ0    Y  |||}tdt|| d |D ]$}| j|||||   q|t }
t|d$}tj| j|tjd W d    n1 s0    Y  td| dt |
 dd W d    n1 s"0    Y  d S )Nchttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.pyFInput file path 
 not foundpairZ
cached_lm__.lockrb"Loading features from cached file  [took %.3f s]'Creating features from dataset file at utf-8encodingr      wbprotocol!Saving features into cached file  [took .3f s]) warningswarnDEPRECATION_WARNINGformatFutureWarningospathisfile
ValueErrornum_special_tokens_to_addsplitjoin	__class____name__r   existstimeopenpickleloadexamplesloggerinforeadconvert_tokens_to_idstokenizerangelenappend build_inputs_with_special_tokensdumpHIGHEST_PROTOCOL)selfr
   r   r   overwrite_cacher   	directoryfilenamecached_features_file	lock_pathstarthandleftextZtokenized_texti rN   h/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/data/datasets/language_modeling.py__init__-   sL    *(4zTextDataset.__init__c                 C   s
   t | jS Nr>   r7   rC   rN   rN   rO   __len__j   s    zTextDataset.__len__returnc                 C   s   t j| j| t jdS )NZdtype)torchtensorr7   longrC   rM   rN   rN   rO   __getitem__m   s    zTextDataset.__getitem__)FN)r1   
__module____qualname____doc__r   strintr   rP   rT   rX   ZTensorr\   rN   rN   rN   rO   r   (   s   	  =r   c                   @   sB   e Zd ZdZeeedddZdd Ze	ee
jf ddd	Zd
S )LineByLineTextDatasetr	   r
   r   r   c                 C   s   t tdt tj|du r2td| dt	
d|  t|dd&}dd	 |  D }W d    n1 sz0    Y  ||d
d
|d}|d | _dd	 | jD | _d S )Nr   Fr   r   r   r   r   c                 S   s$   g | ]}t |d kr| s|qS r   )r>   isspace.0linerN   rN   rO   
<listcomp>       z2LineByLineTextDataset.__init__.<locals>.<listcomp>TZadd_special_tokensZ
truncation
max_length	input_idsc                 S   s    g | ]}d t j|t jdiqS rm   rW   rX   rY   rZ   rg   erN   rN   rO   ri      rj   )r$   r%   r&   r'   r(   r)   r*   r+   r,   r8   r9   r4   r:   
splitlinesr7   )rC   r
   r   r   rK   linesbatch_encodingrN   rN   rO   rP   v   s    4
zLineByLineTextDataset.__init__c                 C   s
   t | jS rQ   rR   rS   rN   rN   rO   rT      s    zLineByLineTextDataset.__len__rU   c                 C   s
   | j | S rQ   r7   r[   rN   rN   rO   r\      s    z!LineByLineTextDataset.__getitem__Nr1   r]   r^   r_   r   r`   ra   rP   rT   dictrX   rY   r\   rN   rN   rN   rO   rb   q   s   rb   c                   @   sD   e Zd ZdZeeeedddZdd Ze	ee
jf ddd	Zd
S )LineByLineWithRefDatasetr	   )r
   r   r   ref_pathc              
   C   s  t tdt tj|du r2td| dtj|du rRtd| dt	
d|  t	
d|  t|dd	}| }W d    n1 s0    Y  d
d |D }t|dd	&}dd |  D }W d    n1 s0    Y  t|t|kr0td| dt| d| dt| ||dd|d}|d | _dd | jD | _t| j}	t|	D ]$}
tj||
 tjd| j|
 d< qnd S )Nzghttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.pyFr   r   zRef file path r   zUse ref segment results at r   r   c                 S   s(   g | ] }t |d kr| s| qS rd   )r>   re   striprf   rN   rN   rO   ri      rj   z5LineByLineWithRefDataset.__init__.<locals>.<listcomp>c                 S   s*   g | ]"}t |d kr| st|qS rd   )r>   re   jsonloadsrf   rN   rN   rO   ri      rj   zDLength of Input file should be equal to Ref file. But the length of z is z while length of Trk   rm   c                 S   s    g | ]}d t j|t jdiqS rn   ro   rp   rN   rN   rO   ri      rj   rW   Zchinese_ref)r$   r%   r&   r'   r(   r)   r*   r+   r,   r8   r9   r4   	readlinesr:   rr   r>   r7   r=   rX   rY   rZ   )rC   r
   r   r   ry   rK   datarefrt   nrM   rN   rN   rO   rP      s>    &4

z!LineByLineWithRefDataset.__init__c                 C   s
   t | jS rQ   rR   rS   rN   rN   rO   rT      s    z LineByLineWithRefDataset.__len__rU   c                 C   s
   | j | S rQ   ru   r[   rN   rN   rO   r\      s    z$LineByLineWithRefDataset.__getitem__Nrv   rN   rN   rN   rO   rx      s   $rx   c                   @   sL   e Zd ZdZeeedddZdddZdd	 Z	e
eejf d
ddZdS )LineByLineWithSOPTextDatasetzY
    Dataset for sentence order prediction task, prepare sentence pairs for SOP task
    )r
   file_dirr   c              	      s:  t tdt tj|du r0t| dt	
d|  g | _t|D ]}tj||}tj|du rt| dd}t|dd}| }g }	|D ]f}
d|
v rd	}qd
|
v rd} fdd|	dd  D }| || }| j| g }	q|r|	|
 qW d    qP1 s 0    Y  qPt	
d d S )Nr   Fz is not a directoryz.Creating features from dataset file folder at z is not a filer   r   z<doc id=Tz</doc>c                    s0   g | ](}t |d kr| s  |qS rd   )r>   re   r;   r<   rf   r
   rN   rO   ri      s   z9LineByLineWithSOPTextDataset.__init__.<locals>.<listcomp>r   zDataset parse finished.)r$   r%   r&   r'   r(   r)   r*   isdirr,   r8   r9   r7   listdirr/   r+   r4   r}   create_examples_from_documentextendr?   )rC   r
   r   r   	file_namer   Zarticle_openrK   Zoriginal_linesZarticle_linesrh   documentr7   rN   r   rO   rP      s@    

.z%LineByLineWithSOPTextDataset.__init__皙?c                 C   s  ||j dd }|}t |k r,td|}g }g }d}	d}
|
t|k r||
 }|s`|
d7 }
q<|| |	t|7 }	|
t|d ks|	|kr|rd}t|dkrtdt|d }g }t|D ]}|||  qg }t|t|D ]}|||  qt|dks<t|dkrq<t dk r:d}|| }}nd}dd	 }|||| t|dksttd
t| dt|dkstdt| d|||}|	||}t
j|t
jdt
j|t
jdt
j|rdndt
jdd}|| g }d}	|
d7 }
q<|S )'Creates examples for a single document.Tr      r   r         ?Fc                 S   sh   t | t | }||krqdt | t |kr.| n|}t |dksFtdt dk rZ|d= q |  q dS )z;Truncates a pair of sequences to a maximum sequence length.r   z8Sequence length to be truncated must be no less than oner   r   N)r>   r,   randompop)tokens_atokens_bmax_num_tokenstotal_lengthZtrunc_tokensrN   rN   rO   truncate_seq_pair-  s    zULineByLineWithSOPTextDataset.create_examples_from_document.<locals>.truncate_seq_pairLength of sequence a is  which must be no less than 1Length of sequence b is rW   )rm   token_type_idsZsentence_order_label)r-   r   randintr>   r?   r=   r   r,   r@   $create_token_type_ids_from_sequencesrX   rY   rZ   )rC   r   r   r
   Zshort_seq_probr   target_seq_lengthr7   current_chunkcurrent_lengthrM   segmenta_endr   jr   Zis_nextr   rm   r   examplerN   rN   rO   r      sb    	


z:LineByLineWithSOPTextDataset.create_examples_from_documentc                 C   s
   t | jS rQ   rR   rS   rN   rN   rO   rT   S  s    z$LineByLineWithSOPTextDataset.__len__rU   c                 C   s
   | j | S rQ   ru   r[   rN   rN   rO   r\   V  s    z(LineByLineWithSOPTextDataset.__getitem__N)r   )r1   r]   r^   r_   r   r`   ra   rP   r   rT   rw   rX   rY   r\   rN   rN   rN   rO   r      s
   )
cr   c                   @   sN   e Zd ZdZdeeedddZeee  eedd	d
Z	dd Z
dd ZdS )$TextDatasetForNextSentencePredictionr	   Fr   r   rc   c              	   C   sz  t tdt tj|s.td| d|| _	|| _
tj|\}}tj|d|jj d| d| }	|| _|	d }
t|
 tj|	r|st }t|	d}t|| _W d    n1 s0    Y  td|	 d	t |  n`td
|  g g| _t|dd~}| }|s.q| }|s\t| jd dkr\| jg  ||}||}|r| jd | qW d    n1 s0    Y  tdt| j d g | _t | jD ]\}}| !||| qt }t|	d$}tj"| j|tj#d W d    n1 s*0    Y  td|	 dt | dd W d    n1 sl0    Y  d S )Nr   r   r   Zcached_nsp_r   r   r   r   r   r   r   r   r   zCreating examples from z documents.r   r   r    r!   r"   r#   )$r$   r%   r&   r'   r(   r)   r*   r+   r,   short_seq_probabilitynsp_probabilityr.   r/   r0   r1   r
   r   r2   r3   r4   r5   r6   r7   r8   r9   	documentsreadlinerz   r>   r?   r<   r;   	enumerater   rA   rB   )rC   r
   r   r   rD   r   r   rE   rF   rG   rH   rI   rJ   rK   rh   tokens	doc_indexr   rN   rN   rO   rP   _  s`    	*

44z-TextDatasetForNextSentencePrediction.__init__)r   r   r   c                 C   s  || j jdd }|}t | jk r0td|}g }d}d}|t|k r~|| }	||	 |t|	7 }|t|d ks||krt|rld}
t|dkrtdt|d }
g }t|
D ]}|||  qg }t|dkst | j	k rd}|t| }tdD ],}tdt| j
d }||kr q2q| j
| }tdt|d }t|t|D ](}|||  t||kr^ qq^t||
 }||8 }n(d}t|
t|D ]}|||  qt|dkstdt| d	t|dks
td
t| d	| j ||}| j ||}tj|tjdtj|tjdtj|rPdndtjdd}| j| g }d}|d7 }q<dS )r   Tr   r   r   r   
   Fr   r   r   rW   )rm   r   Znext_sentence_labelN)r
   r-   r   r   r   r>   r?   r=   r   r   r   r,   r@   r   rX   rY   rZ   r7   )rC   r   r   r   r   r   r   r   rM   r   r   r   r   r   Zis_random_nextZtarget_b_lengthr   Zrandom_document_indexZrandom_documentZrandom_startZnum_unused_segmentsrm   r   r   rN   rN   rO   r     sh    	





zBTextDatasetForNextSentencePrediction.create_examples_from_documentc                 C   s
   t | jS rQ   rR   rS   rN   rN   rO   rT     s    z,TextDatasetForNextSentencePrediction.__len__c                 C   s
   | j | S rQ   ru   r[   rN   rN   rO   r\     s    z0TextDatasetForNextSentencePrediction.__getitem__N)Fr   r   )r1   r]   r^   r_   r   r`   ra   rP   listr   rT   r\   rN   rN   rN   rO   r   Z  s   	   UZr   )r{   r)   r5   r   r3   r$   typingr   rX   filelockr   Ztorch.utils.datar   Ztokenization_utilsr   utilsr   Z
get_loggerr1   r8   r&   r   rb   rx   r   r   rN   rN   rN   rO   <module>   s(   
I!0 