
    gh                        d dl Z d dlmZ d dlmZ d Zd Zd Zd Ze	dk    rH e
d	d
d          5 Ze                    d           ddd           n# 1 swxY w Y    ed	          \  ZZer ede            n ede            	  e            Ze                    d           e                    d            ed          \  ZZer ede            n ede            n # e$ rZ ede            Y dZ[ndZ[ww xY wdZe j                            e          r0 ee          \  ZZer ede            dS  ede            dS  ede d           dS dS )    N)	PdfReader)Documentc                     t          | dd          5 }|                                cd d d            S # 1 swxY w Y   d S )Nrutf-8encoding)openread)filepathfs     "/home/ubuntu/src/data_ingestion.pyread_text_filer      s    	hg	.	.	. !vvxx                 s   488c                     d}t          | d          5 }t          |          }|j        D ]}||                                pdz  }	 d d d            n# 1 swxY w Y   |S )N rb)r
   r   pagesextract_text)r   textr   readerpages        r   read_pdf_filer   
   s    D	h		 .1L 	. 	.DD%%''-2-DD	.. . . . . . . . . . . . . . . Ks   3AAAc                     t          |           }g }|j        D ]}|                    |j                   d                    |          S )N
)r   
paragraphsappendr   join)r   documentr   	paragraphs       r   read_docx_filer       sM    !!HD( $ $	IN####99T??    c                 B   t           j                            |           sdS t           j                            |           d                                         }|dk    rt          |           }n.|dk    rt          |           }n|dk    rt          |           }ndS |d fS )N)NzFile not found   z.txtz.pdfz.docx)NzUnsupported file type)ospathexistssplitextlowerr   r   r    )r   file_extensioncontents      r   ingest_datar+      s    7>>(## &%%W%%h//288::N - **	6	! -))	7	" - **,,D=r!   __main__zdata/test.txtwr   r   z&Este es un archivo de texto de prueba.zTXT Content:
zError reading TXT: z"Este es un archivo DOCX de prueba.zdata/test.docxzDOCX Content:
zError reading DOCX: z(Could not create/read DOCX for testing: zdata/test.pdfzPDF Content:
zError reading PDF: zSkipping PDF test: z2 not found. Please place a PDF file there to test.)r$   PyPDF2r   docxr   r   r   r    r+   __name__r
   r   writetxt_contenterrorprintdocadd_paragraphsavedocx_content	Exceptionepdf_pathr%   r&   pdf_content r!   r   <module>r>      s   				                    " z -b 
osW	5	5	5 :	8999: : : : : : : : : : : : : : : %_55K -,{,,----+E++,,,
>hjj>???!""")k*:;;e 	2E2L223333E000111 > > ><<<========> H	w~~h b([22U 	1E0;0011111E///00000`H```aaaaa[-b -bs*   AAAA!C/ /D4DD