# ingest.py — Downloads Moodle course content and builds a FAISS index for RAG
# Requirements: pip install faiss-cpu sentence-transformers pymupdf requests python-dotenv

import os
import json
import re
from pathlib import Path
from typing import List, Dict, Any, Tuple

import requests
import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# ── config ────────────────────────────────────────────────────────────────────
BASE_DIR = Path(__file__).resolve().parent
load_dotenv(BASE_DIR / ".env")

MOODLE_URL = os.getenv("MOODLE_URL", "https://lms.ed-consulting.ao").rstrip("/")
MOODLE_TOKEN = os.getenv("MOODLE_TOKEN", "")

DATA_DIR = BASE_DIR / "data"
DATA_DIR.mkdir(exist_ok=True)
INDEX_DIR = BASE_DIR / "indexes"
INDEX_DIR.mkdir(exist_ok=True)

EMB_MODEL_NAME = os.getenv("EMB_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
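
# Expected .env contents (placeholder values; adjust to your site):
#   MOODLE_URL=https://lms.ed-consulting.ao
#   MOODLE_TOKEN=<web service token for the REST protocol>
#   EMB_MODEL_NAME=sentence-transformers/all-MiniLM-L6-v2   # optional override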

# ── Moodle utilities ──────────────────────────────────────────────────────────
def call_moodle(wsfunction: str, params: Dict[str, Any]) -> Any:
    """Chama Moodle REST e devolve JSON, lançando erro em caso de exceção do Moodle."""
    url = f"{MOODLE_URL}/webservice/rest/server.php"
    payload = {
        "wstoken": MOODLE_TOKEN,
        "wsfunction": wsfunction,
        "moodlewsrestformat": "json",
        **params,
    }
    r = requests.post(url, data=payload, timeout=90)
    r.raise_for_status()
    try:
        out = r.json()
    except ValueError:
        raise RuntimeError("Non-JSON response from Moodle (check the service's permissions/functions).")

    if isinstance(out, dict) and "exception" in out:
        raise RuntimeError(f"Moodle exception: {out.get('errorcode')}: {out.get('message')}")
    return out
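
# Example call (hypothetical; assumes the token's service includes
# core_webservice_get_site_info):
#   info = call_moodle("core_webservice_get_site_info", {})
#   print(info["sitename"], info["release"])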

def collect_file_urls(courseid: int) -> List[Tuple[str, str]]:
    """
    Varre o curso e devolve [(fileurl_com_token, titulo)].
    Usa excludecontents=false para incluir ficheiros/links dentro dos módulos.
    """
    print(f"[info] a recolher URLs do curso {courseid} …")
    options = {
        "options[0][name]": "excludemodules",  "options[0][value]": "false",
        "options[1][name]": "excludecontents", "options[1][value]": "false",
    }
    contents = call_moodle("core_course_get_contents", {"courseid": courseid, **options})
    urls: List[Tuple[str, str]] = []

    for section in contents:
        for mod in section.get("modules", []) or []:
            for c in mod.get("contents", []) or []:
                url = c.get("fileurl")
                if not url:
                    continue
                # append token/forcedownload when needed
                if "token=" not in url:
                    sep = "&" if "?" in url else "?"
                    url = f"{url}{sep}token={MOODLE_TOKEN}&forcedownload=1"
                title = c.get("filename") or mod.get("name") or section.get("name") or "file"
                urls.append((url, title))

    print(f"[info] encontrados {len(urls)} ficheiros/links")
    return urls
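
# Shape of the returned list (illustrative values):
#   [("https://…/webservice/pluginfile.php/…?token=…", "syllabus.pdf"), …]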

# ── file processing ───────────────────────────────────────────────────────────
def pdf_to_text(data: bytes) -> str:
    """Extracts text from a PDF using PyMuPDF."""
    parts: List[str] = []
    with fitz.open(stream=data, filetype="pdf") as doc:  # context manager closes the document
        for page in doc:
            parts.append(page.get_text("text"))
    return "\n".join(parts)

def chunk_text(text: str, max_chars: int = 1200, overlap: int = 150) -> List[str]:
    """Splits text into overlapping chunks for better retrieval."""
    if overlap >= max_chars:
        raise ValueError("overlap must be smaller than max_chars")  # guarantees forward progress
    text = re.sub(r"[ \t]+\n", "\n", text).strip()
    chunks: List[str] = []
    i = 0
    n = len(text)
    while i < n:
        j = min(n, i + max_chars)
        chunks.append(text[i:j])
        if j == n:
            break
        i = j - overlap
    return [c for c in chunks if c.strip()]
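
# Quick illustration with toy values:
#   chunk_text("a" * 25, max_chars=10, overlap=3)
# yields four chunks starting at offsets 0, 7, 14 and 21
# (each step advances by max_chars - overlap characters).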

# ── indexing pipeline ─────────────────────────────────────────────────────────
def build_index(courseid: int) -> None:
    """Baixa ficheiros do curso, cria chunks, gera embeddings e salva FAISS + meta."""
    if not MOODLE_TOKEN:
        raise RuntimeError("MOODLE_TOKEN não definido no .env")

    model = SentenceTransformer(EMB_MODEL_NAME)

    file_urls = collect_file_urls(courseid)
    corpus: List[Dict[str, Any]] = []

    for url, title in file_urls:
        try:
            print(f"[download] {title} ← {url}")
            r = requests.get(url, timeout=180)
            r.raise_for_status()
            ctype = r.headers.get("Content-Type", "").lower()

            text = ""
            if "pdf" in ctype or url.lower().endswith(".pdf"):
                text = pdf_to_text(r.content)
            else:
                # fallback: keep the title + link as text (for web pages, etc.)
                text = f"{title}\n{url}"

            for ch in chunk_text(text):
                corpus.append({"text": ch, "source": title, "url": url})

        except Exception as e:
            print(f"[erro] falha ao processar {title}: {e}")

    if not corpus:
        raise RuntimeError(
            "Nothing to index. Check that the course has PDFs/files with a fileurl "
            "and that the service allows 'core_course_get_contents'."
        )

    print(f"[info] a gerar embeddings de {len(corpus)} chunks com {EMB_MODEL_NAME} …")
    embs = model.encode(
        [c["text"] for c in corpus],
        batch_size=32,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    embs = np.asarray(embs, dtype="float32")  # FAISS expects float32
    dim = int(embs.shape[1])
    index = faiss.IndexFlatIP(dim)  # inner product == cosine on normalized vectors
    index.add(embs)

    index_path = INDEX_DIR / f"course_{courseid}.faiss"
    meta_path = INDEX_DIR / f"course_{courseid}.meta.json"
    faiss.write_index(index, str(index_path))
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(corpus, f, ensure_ascii=False)

    print(f"[ok] índice criado: {index_path}  (chunks: {len(corpus)})")
    print(f"[ok] metadados: {meta_path}")

# ── CLI ───────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print("Uso: python3 ingest.py <courseid>")
        sys.exit(1)
    try:
        cid = int(sys.argv[1])
    except ValueError:
        print("Usage: python3 ingest.py <courseid>  (courseid must be an integer)")
        sys.exit(1)
    build_index(cid)
