
import os
from PyPDF2 import PdfReader
from docx import Document

def read_text_file(filepath):
    """Return the entire contents of a UTF-8 text file as a string.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order
    mark (U+FEFF, commonly written by some Windows editors) is stripped
    instead of leaking into the returned text. Files without a BOM decode
    exactly as they would with plain ``utf-8``.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        return f.read()

def read_pdf_file(filepath):
    """Extract the text of every page of a PDF and return it as one string.

    Page texts are concatenated in page order with no separator between
    them. A page whose ``extract_text()`` returns a falsy value (``None``
    or an empty string) contributes nothing to the result.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages; ``""`` if there are no pages
        or no page yields any text.

    Raises:
        OSError: If the file cannot be opened. Errors raised by
            ``PdfReader`` while parsing are not caught here and propagate
            to the caller.
    """
    with open(filepath, 'rb') as f:
        reader = PdfReader(f)
        # Build the result with a single join rather than growing a string
        # with += per page, which can be quadratic on large documents.
        # Extraction happens inside the ``with`` block because the reader
        # is backed by the open file handle.
        return "".join(page.extract_text() or "" for page in reader.pages)

def read_docx_file(filepath):
    """Return the text of a DOCX file with one paragraph per line.

    Only the paragraphs exposed by ``Document(filepath).paragraphs`` are
    read; their ``text`` values are joined with a single newline, in
    document order.

    Args:
        filepath: Path of the DOCX file to read.

    Returns:
        The newline-joined paragraph texts; ``""`` when the document has
        no paragraphs.
    """
    doc = Document(filepath)
    return '\n'.join(para.text for para in doc.paragraphs)

def ingest_data(filepath):
    """Read a supported document and return its text.

    The reader is chosen from the file extension (case-insensitive):
    ``.txt``, ``.pdf`` or ``.docx``.

    Args:
        filepath: Path of the file to ingest.

    Returns:
        A ``(content, error)`` tuple. On success ``content`` is the
        extracted text (possibly an empty string) and ``error`` is
        ``None``. On failure ``content`` is ``None`` and ``error`` is
        either ``"File not found"`` (the path does not exist or is not a
        regular file) or ``"Unsupported file type"``. Callers should test
        ``error is None`` rather than the truthiness of ``content``.

    Raises:
        Exceptions raised by the individual readers (for example I/O or
        decoding errors) are not caught here and propagate to the caller.
    """
    # isfile() rather than exists(): a directory whose name happens to end
    # in a supported extension would otherwise reach open() and raise
    # instead of producing the documented error tuple.
    if not os.path.isfile(filepath):
        return None, "File not found"

    file_extension = os.path.splitext(filepath)[1].lower()

    if file_extension == '.txt':
        return read_text_file(filepath), None
    if file_extension == '.pdf':
        return read_pdf_file(filepath), None
    if file_extension == '.docx':
        return read_docx_file(filepath), None

    return None, "Unsupported file type"

if __name__ == '__main__':
    # Manual smoke test: exercises ingest_data() on one sample file per
    # supported format and prints either the extracted text or the error.

    # The sample files live under 'data/'. Create the directory first so
    # that the writes below do not fail when it does not exist yet.
    os.makedirs('data', exist_ok=True)

    with open('data/test.txt', 'w', encoding='utf-8') as f:
        f.write('Este es un archivo de texto de prueba.')

    # Test TXT
    # Success is signalled by ``error is None``; checking the truthiness of
    # the content would misreport an empty file as a failure.
    txt_content, error = ingest_data('data/test.txt')
    if error is None:
        print(f"TXT Content:\n{txt_content}")
    else:
        print(f"Error reading TXT: {error}")

    # Test DOCX: a real one-paragraph document is generated with
    # python-docx and read back. Any failure while creating or reading it
    # is reported instead of aborting the remaining checks.
    try:
        doc = Document()
        doc.add_paragraph('Este es un archivo DOCX de prueba.')
        doc.save('data/test.docx')
        docx_content, error = ingest_data('data/test.docx')
        if error is None:
            print(f"DOCX Content:\n{docx_content}")
        else:
            print(f"Error reading DOCX: {error}")
    except Exception as e:
        print(f"Could not create/read DOCX for testing: {e}")

    # Test PDF: no PDF is generated here, so this check only runs when a
    # real file has been placed at data/test.pdf; otherwise it is skipped.
    pdf_path = 'data/test.pdf'
    if os.path.exists(pdf_path):
        pdf_content, error = ingest_data(pdf_path)
        if error is None:
            print(f"PDF Content:\n{pdf_content}")
        else:
            print(f"Error reading PDF: {error}")
    else:
        print(f"Skipping PDF test: {pdf_path} not found. Please place a PDF file there to test.")




