a
    gì®h°  ã                
   @   sŠ  d dl Z d dlmZ d dlmZ dd„ Zdd„ Zdd	„ Zd
d„ Ze	dkr†e
ddddZe d¡ W d  ƒ n1 sx0    Y  edƒ\ZZer¢ede› ƒ nede› ƒ zLeƒ Ze d¡ e d¡ edƒ\ZZerìede› ƒ nede› ƒ W n4 ey0 Z zede› ƒ W Y dZ[n
dZ[0 0 dZe j e¡rveeƒ\ZZerfede› ƒ nede› ƒ nede› dƒ dS )é    N)Ú	PdfReader)ÚDocumentc                 C   s:   t | ddd}| ¡ W  d   ƒ S 1 s,0    Y  d S )NÚrúutf-8©Úencoding)ÚopenÚread)ÚfilepathÚf© r   ú3/var/www/html/moodle/api/manu/src/data_ingestion.pyÚread_text_file   s    r   c                 C   sV   d}t | dƒ4}t|ƒ}|jD ]}|| ¡ p.d7 }qW d   ƒ n1 sH0    Y  |S )NÚ Úrb)r   r   ZpagesZextract_text)r
   Útextr   ÚreaderZpager   r   r   Úread_pdf_file
   s    
0r   c                 C   s.   t | ƒ}g }|jD ]}| |j¡ qd |¡S )NÚ
)r   Z
paragraphsÚappendr   Újoin)r
   Zdocumentr   Z	paragraphr   r   r   Úread_docx_file   s
    
r   c                 C   sf   t j | ¡sdS t j | ¡d  ¡ }|dkr6t| ƒ}n(|dkrHt| ƒ}n|dkrZt| ƒ}ndS |d fS )N)NzFile not foundé   z.txtz.pdfz.docx)NzUnsupported file type)ÚosÚpathÚexistsÚsplitextÚlowerr   r   r   )r
   Zfile_extensionÚcontentr   r   r   Úingest_data   s    


r   Ú__main__zdata/test.txtÚwr   r   z&Este es un archivo de texto de prueba.zTXT Content:
zError reading TXT: z"Este es un archivo DOCX de prueba.zdata/test.docxzDOCX Content:
zError reading DOCX: z(Could not create/read DOCX for testing: zdata/test.pdfzPDF Content:
zError reading PDF: zSkipping PDF test: z2 not found. Please place a PDF file there to test.)r   ZPyPDF2r   Zdocxr   r   r   r   r   Ú__name__r   r   ÚwriteZtxt_contentÚerrorÚprintÚdocZadd_paragraphÚsaveZdocx_contentÚ	ExceptionÚeZpdf_pathr   r   Zpdf_contentr   r   r   r   Ú<module>   s<   
(

$