breakpilot-lehrer/backend-lehrer/abitur_docs_api.py

"""
Abitur Document Store API - Verwaltung von Abitur-Aufgaben und Erwartungshorizonten.

Unterstützt:
- Bundesland-spezifische Dokumente
- Fach, Jahr, Niveau (eA/gA), Aufgabennummer
- KI-basierte Dokumentenerkennung
- RAG-Integration mit Vector Store

Dateinamen-Schema (NiBiS Niedersachsen):
- 2025_Deutsch_eA_I.pdf - Aufgabe
- 2025_Deutsch_eA_I_EWH.pdf - Erwartungshorizont
"""

import logging
import uuid
import os
import zipfile
import tempfile
from datetime import datetime
from typing import List, Optional, Dict, Any
from pathlib import Path

from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
from fastapi.responses import FileResponse

from abitur_docs_models import (
    Bundesland, Fach, Niveau, DokumentTyp, VerarbeitungsStatus,
    DokumentCreate, DokumentUpdate, DokumentResponse, ImportResult,
    RecognitionResult, AbiturDokument,
    FACH_LABELS, DOKUMENT_TYP_LABELS,
    # Backwards-compatibility re-exports
    AbiturFach, Anforderungsniveau, DocumentMetadata, AbiturDokumentCompat,
)
from abitur_docs_recognition import parse_nibis_filename, to_dokument_response

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# All endpoints in this module are mounted under the /abitur-docs prefix.
router = APIRouter(
    prefix="/abitur-docs",
    tags=["abitur-docs"],
)

# Storage directory for uploaded/imported files.
# The directory is created at import time if it does not exist yet.
DOCS_DIR = Path("/tmp/abitur-docs")
DOCS_DIR.mkdir(parents=True, exist_ok=True)

# In-memory storage: maps document id -> AbiturDokument.
# This is a plain dict, so its contents live only as long as the process.
_dokumente: Dict[str, AbiturDokument] = {}

# Backwards-compatibility alias; refers to the very same dict object as
# `_dokumente`, so mutations through either name are visible through both.
documents_db = _dokumente


# ============================================================================
# Private helper (thin local wrapper around abitur_docs_recognition.to_dokument_response)
# ============================================================================

def _to_dokument_response(doc: AbiturDokument) -> DokumentResponse:
    """Convert a stored document into its API response model.

    Delegates entirely to ``to_dokument_response`` from
    ``abitur_docs_recognition``.
    """
    response = to_dokument_response(doc)
    return response


# ============================================================================
# API Endpoints - Dokumente
# ============================================================================

@router.post("/upload", response_model=DokumentResponse)
async def upload_dokument(
    file: UploadFile = File(...),
    bundesland: Optional[Bundesland] = Form(None),
    fach: Optional[Fach] = Form(None),
    jahr: Optional[int] = Form(None),
    niveau: Optional[Niveau] = Form(None),
    typ: Optional[DokumentTyp] = Form(None),
    aufgaben_nummer: Optional[str] = Form(None)
):
    """Store a single uploaded document and register its metadata.

    Metadata is resolved per field in this order: the explicit form value,
    then whatever ``parse_nibis_filename`` recognised from the filename,
    then a fixed default (Niedersachsen, current year, eA, Aufgabe).
    ``fach`` and ``aufgaben_nummer`` have no default.

    Raises:
        HTTPException: 400 when the upload has no filename or when no
            subject (Fach) was supplied or recognised.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="Kein Dateiname")

    recognition = parse_nibis_filename(file.filename)

    # Resolve every metadata field: form value > recognised value > default.
    resolved = {
        "bundesland": bundesland or recognition.bundesland or Bundesland.NIEDERSACHSEN,
        "fach": fach or recognition.fach,
        "jahr": jahr or recognition.jahr or datetime.now().year,
        "niveau": niveau or recognition.niveau or Niveau.EA,
        "typ": typ or recognition.typ or DokumentTyp.AUFGABE,
        "aufgaben_nummer": aufgaben_nummer or recognition.aufgaben_nummer,
    }

    if not resolved["fach"]:
        raise HTTPException(status_code=400, detail="Fach konnte nicht erkannt werden")

    # The stored name is "<uuid><original suffix>", never the client-supplied name.
    doc_id = str(uuid.uuid4())
    safe_filename = f"{doc_id}{Path(file.filename).suffix}"
    file_path = DOCS_DIR / safe_filename

    content = await file.read()
    file_path.write_bytes(content)

    if recognition.success:
        initial_status = VerarbeitungsStatus.RECOGNIZED
    else:
        initial_status = VerarbeitungsStatus.PENDING

    now = datetime.utcnow()
    dokument = AbiturDokument(
        id=doc_id,
        dateiname=safe_filename,
        original_dateiname=file.filename,
        status=initial_status,
        confidence=recognition.confidence,
        file_path=str(file_path),
        file_size=len(content),
        indexed=False,
        vector_ids=[],
        created_at=now,
        updated_at=now,
        **resolved,
    )
    _dokumente[doc_id] = dokument
    logger.info(f"Uploaded document {doc_id}: {file.filename}")
    return _to_dokument_response(dokument)


def _is_importable_pdf(entry_name: str) -> bool:
    """Return True when a ZIP entry name should be imported as a PDF."""
    lowered = entry_name.lower()
    if not lowered.endswith(".pdf"):
        return False
    if "__MACOSX" in entry_name:
        return False
    # Skip hidden entries: either the whole path or just the file's own name
    # starts with a dot (e.g. AppleDouble files such as "folder/._x.pdf").
    if entry_name.startswith(".") or Path(entry_name).name.startswith("."):
        return False
    return "thumbs.db" not in lowered


def _remove_if_exists(path) -> None:
    """Delete *path*, treating an already-missing file as success."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


@router.post("/import-zip", response_model=ImportResult)
async def import_zip(
    file: UploadFile = File(...),
    bundesland: Bundesland = Form(Bundesland.NIEDERSACHSEN),
    background_tasks: BackgroundTasks = None
):
    """Import every PDF contained in an uploaded ZIP archive.

    Each PDF entry is run through ``parse_nibis_filename``; entries whose
    subject (Fach) cannot be recognised are counted as errors and skipped.
    Recognised entries are written to ``DOCS_DIR`` under a fresh UUID name
    and registered in the in-memory store. A failure on one entry is logged
    and counted, and the import continues with the next entry.

    Args:
        file: The uploaded archive; its name must end in ``.zip``
            (case-insensitive).
        bundesland: Federal state assigned to every imported document.
        background_tasks: Accepted for interface compatibility; not used
            by this function.

    Returns:
        ImportResult with the number of PDF entries seen, the number
        imported, the number of errors and the imported documents.

    Raises:
        HTTPException: 400 when the upload is not named ``*.zip`` or is not
            a readable ZIP archive.
    """
    if not file.filename or not file.filename.lower().endswith(".zip"):
        raise HTTPException(status_code=400, detail="ZIP-Datei erforderlich")

    documents = []
    total = 0
    recognized = 0
    errors = 0
    tmp_path = None

    # NOTE(review): the archive and every entry are read fully into memory
    # with no size limit — consider capping sizes for untrusted uploads.
    try:
        # The temp file is created inside the try so that it is removed even
        # when reading or writing the upload fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        try:
            archive = zipfile.ZipFile(tmp_path, 'r')
        except zipfile.BadZipFile as exc:
            # A corrupt upload is a client error, not a server error.
            raise HTTPException(status_code=400, detail="Ungültige ZIP-Datei") from exc

        with archive as zip_ref:
            for zip_info in zip_ref.infolist():
                if not _is_importable_pdf(zip_info.filename):
                    continue

                total += 1
                file_path = None
                try:
                    basename = Path(zip_info.filename).name
                    recognition = parse_nibis_filename(basename)
                    if not recognition.fach:
                        errors += 1
                        logger.warning(f"Konnte Fach nicht erkennen: {basename}")
                        continue

                    doc_id = str(uuid.uuid4())
                    safe_filename = f"{doc_id}{Path(basename).suffix}"
                    file_path = DOCS_DIR / safe_filename

                    # Pass the ZipInfo itself so the exact entry is opened.
                    with zip_ref.open(zip_info) as source:
                        file_content = source.read()
                    with open(file_path, "wb") as target:
                        target.write(file_content)

                    now = datetime.utcnow()
                    dokument = AbiturDokument(
                        id=doc_id, dateiname=safe_filename, original_dateiname=basename,
                        bundesland=bundesland, fach=recognition.fach,
                        jahr=recognition.jahr or datetime.now().year,
                        niveau=recognition.niveau or Niveau.EA,
                        typ=recognition.typ or DokumentTyp.AUFGABE,
                        aufgaben_nummer=recognition.aufgaben_nummer,
                        status=VerarbeitungsStatus.RECOGNIZED, confidence=recognition.confidence,
                        file_path=str(file_path), file_size=len(file_content),
                        indexed=False, vector_ids=[], created_at=now, updated_at=now
                    )
                    # Build the response before registering, so a failure here
                    # never leaves a registered document counted as an error.
                    response = _to_dokument_response(dokument)
                    _dokumente[doc_id] = dokument
                    documents.append(response)
                    recognized += 1
                except Exception as e:
                    # Best effort: one broken entry must not abort the import.
                    errors += 1
                    logger.exception(f"Fehler bei {zip_info.filename}: {e}")
                    # Registration is the last step, so any file written for
                    # this entry is an orphan at this point.
                    if file_path is not None:
                        _remove_if_exists(file_path)
    finally:
        if tmp_path is not None:
            _remove_if_exists(tmp_path)

    logger.info(f"ZIP-Import: {recognized}/{total} erkannt, {errors} Fehler")
    return ImportResult(total_files=total, recognized=recognized, errors=errors, documents=documents)


@router.get("/", response_model=List[DokumentResponse])
async def list_dokumente(
    bundesland: Optional[Bundesland] = None, fach: Optional[Fach] = None,
    jahr: Optional[int] = None, niveau: Optional[Niveau] = None,
    typ: Optional[DokumentTyp] = None, status: Optional[VerarbeitungsStatus] = None,
    indexed: Optional[bool] = None
):
    """List stored documents, optionally narrowed by exact-match filters.

    Every parameter is an optional filter on the document attribute of the
    same name; ``None`` means "do not filter on this attribute". All
    supplied filters must match.

    Returns:
        The matching documents, sorted descending by
        (jahr, fach value, niveau value).
    """
    requested = {
        "bundesland": bundesland,
        "fach": fach,
        "jahr": jahr,
        "niveau": niveau,
        "typ": typ,
        "status": status,
        "indexed": indexed,
    }
    # Compare against None rather than truthiness so that a supplied but
    # falsy value (e.g. jahr=0 or indexed=False) is still applied.
    active_filters = [
        (name, wanted) for name, wanted in requested.items() if wanted is not None
    ]

    docs = [
        d for d in _dokumente.values()
        if all(getattr(d, name) == wanted for name, wanted in active_filters)
    ]
    docs.sort(key=lambda x: (x.jahr, x.fach.value, x.niveau.value), reverse=True)
    return [_to_dokument_response(d) for d in docs]


@router.get("/{doc_id}", response_model=DokumentResponse)
async def get_dokument(doc_id: str):
    """Return the document stored under ``doc_id``.

    Raises:
        HTTPException: 404 when no such document is registered.
    """
    found = _dokumente.get(doc_id)
    if found:
        return _to_dokument_response(found)
    raise HTTPException(status_code=404, detail="Dokument nicht gefunden")


@router.put("/{doc_id}", response_model=DokumentResponse)
async def update_dokument(doc_id: str, data: DokumentUpdate):
    """Apply a partial metadata update to a stored document.

    Only fields of ``data`` that are not ``None`` are copied onto the
    document; ``updated_at`` is refreshed in every case.

    Raises:
        HTTPException: 404 when no such document is registered.
    """
    doc = _dokumente.get(doc_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    updatable_fields = (
        "bundesland",
        "fach",
        "jahr",
        "niveau",
        "typ",
        "aufgaben_nummer",
        "status",
    )
    for field_name in updatable_fields:
        new_value = getattr(data, field_name)
        if new_value is not None:
            setattr(doc, field_name, new_value)

    doc.updated_at = datetime.utcnow()
    return _to_dokument_response(doc)


@router.post("/{doc_id}/confirm", response_model=DokumentResponse)
async def confirm_dokument(doc_id: str):
    """Mark a document's metadata as confirmed.

    Sets the status to CONFIRMED regardless of the previous status and
    refreshes ``updated_at``.

    Raises:
        HTTPException: 404 when no such document is registered.
    """
    target = _dokumente.get(doc_id)
    if target:
        target.status = VerarbeitungsStatus.CONFIRMED
        target.updated_at = datetime.utcnow()
        return _to_dokument_response(target)
    raise HTTPException(status_code=404, detail="Dokument nicht gefunden")


@router.post("/{doc_id}/index", response_model=DokumentResponse)
async def index_dokument(doc_id: str):
    """Mark a document as indexed (demo implementation).

    No vector store is contacted here: three placeholder vector ids of the
    form ``vec_<doc_id>_<n>`` are assigned and the status becomes INDEXED.

    Raises:
        HTTPException: 404 when no such document is registered; 400 when
            the document is neither CONFIRMED nor RECOGNIZED.
    """
    doc = _dokumente.get(doc_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    indexable_states = [VerarbeitungsStatus.CONFIRMED, VerarbeitungsStatus.RECOGNIZED]
    if doc.status not in indexable_states:
        raise HTTPException(status_code=400, detail="Dokument muss erst bestätigt werden")

    placeholder_ids = []
    for i in range(3):
        placeholder_ids.append(f"vec_{doc_id}_{i}")

    doc.indexed = True
    doc.vector_ids = placeholder_ids
    doc.status = VerarbeitungsStatus.INDEXED
    doc.updated_at = datetime.utcnow()
    logger.info(f"Document {doc_id} indexed (demo)")
    return _to_dokument_response(doc)


@router.delete("/{doc_id}")
async def delete_dokument(doc_id: str):
    """Delete a document's file and its entry in the in-memory store.

    A file that is already missing on disk is not an error; the store
    entry is removed either way.

    Returns:
        ``{"status": "deleted", "id": doc_id}``

    Raises:
        HTTPException: 404 when no such document is registered.
    """
    doc = _dokumente.get(doc_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    # Attempt the removal directly instead of checking for existence first:
    # the file could disappear between the check and the removal.
    try:
        os.remove(doc.file_path)
    except FileNotFoundError:
        pass

    # pop() with a default tolerates the entry having been removed meanwhile.
    _dokumente.pop(doc_id, None)
    return {"status": "deleted", "id": doc_id}


@router.get("/{doc_id}/download")
async def download_dokument(doc_id: str):
    """Serve a stored document's file under its original filename.

    The response is always sent with media type ``application/pdf``.

    Raises:
        HTTPException: 404 when the document is not registered, or when it
            is registered but its file no longer exists on disk.
    """
    doc = _dokumente.get(doc_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    stored_path = doc.file_path
    if os.path.exists(stored_path):
        return FileResponse(
            stored_path,
            filename=doc.original_dateiname,
            media_type="application/pdf",
        )
    raise HTTPException(status_code=404, detail="Datei nicht gefunden")


@router.post("/recognize", response_model=RecognitionResult)
async def recognize_filename(filename: str):
    """Run the filename parser on ``filename`` and return its result.

    Nothing is stored; this only exposes ``parse_nibis_filename``.
    """
    recognition = parse_nibis_filename(filename)
    return recognition


@router.post("/bulk-confirm")
async def bulk_confirm(doc_ids: List[str]):
    """Confirm several documents in one request.

    Only documents that exist and are currently RECOGNIZED are changed;
    unknown ids and documents in any other status are silently skipped.

    Returns:
        ``{"confirmed": <number changed>, "total": <number of ids given>}``
    """
    confirmed = 0
    for current_id in doc_ids:
        entry = _dokumente.get(current_id)
        if not entry:
            continue
        if entry.status == VerarbeitungsStatus.RECOGNIZED:
            entry.status = VerarbeitungsStatus.CONFIRMED
            entry.updated_at = datetime.utcnow()
            confirmed += 1
    return {"confirmed": confirmed, "total": len(doc_ids)}


@router.post("/bulk-index")
async def bulk_index(doc_ids: List[str]):
    """Mark several documents as indexed in one request (demo implementation).

    Only documents that exist and are CONFIRMED or RECOGNIZED are changed;
    each receives three placeholder vector ids ``vec_<doc_id>_<n>``.
    Unknown ids and documents in any other status are silently skipped.

    Returns:
        ``{"indexed": <number changed>, "total": <number of ids given>}``
    """
    indexable_states = [VerarbeitungsStatus.CONFIRMED, VerarbeitungsStatus.RECOGNIZED]
    indexed = 0
    for current_id in doc_ids:
        entry = _dokumente.get(current_id)
        if not entry:
            continue
        if entry.status in indexable_states:
            entry.indexed = True
            entry.vector_ids = [f"vec_{current_id}_{i}" for i in range(3)]
            entry.status = VerarbeitungsStatus.INDEXED
            entry.updated_at = datetime.utcnow()
            indexed += 1
    return {"indexed": indexed, "total": len(doc_ids)}


@router.get("/stats/overview")
async def get_stats_overview():
    """Return aggregate counts over all stored documents.

    Returns:
        A dict with the total number of documents, the number indexed, the
        number in PENDING status, and per-value counts grouped by
        bundesland, fach, jahr and status.
    """
    all_docs = list(_dokumente.values())

    by_bundesland: Dict[str, int] = {}
    by_fach: Dict[str, int] = {}
    by_jahr: Dict[int, int] = {}
    by_status: Dict[str, int] = {}
    indexed_count = 0
    pending_count = 0

    for entry in all_docs:
        # One (tally, key) pair per grouping; bump each tally by one.
        groupings = (
            (by_bundesland, entry.bundesland.value),
            (by_fach, entry.fach.value),
            (by_jahr, entry.jahr),
            (by_status, entry.status.value),
        )
        for tally, key in groupings:
            tally[key] = tally.get(key, 0) + 1

        if entry.indexed:
            indexed_count += 1
        if entry.status == VerarbeitungsStatus.PENDING:
            pending_count += 1

    return {
        "total": len(all_docs),
        "indexed": indexed_count,
        "pending": pending_count,
        "by_bundesland": by_bundesland,
        "by_fach": by_fach,
        "by_jahr": by_jahr,
        "by_status": by_status,
    }


@router.get("/search", response_model=List[DokumentResponse])
async def search_dokumente(
    bundesland: Bundesland, fach: Fach, jahr: Optional[int] = None,
    niveau: Optional[Niveau] = None, nur_indexed: bool = True
):
    """Search documents for exam correction, pairing tasks with their EWH.

    Documents are filtered by ``bundesland`` and ``fach`` (required) and
    optionally by ``jahr``, ``niveau`` and indexed state. The result lists
    every Aufgabe, each immediately followed by the first
    Erwartungshorizont with the same jahr, niveau and aufgaben_nummer (if
    any), then every Erwartungshorizont not yet listed, then all documents
    of any other type.
    """
    # NOTE(review): this route is declared after GET "/{doc_id}" in this
    # module, so a GET for ".../search" can be matched by that handler
    # first — confirm the registration order.
    docs = [d for d in _dokumente.values() if d.bundesland == bundesland and d.fach == fach]
    # Compare against None so a supplied but falsy value is still applied.
    if jahr is not None:
        docs = [d for d in docs if d.jahr == jahr]
    if niveau is not None:
        docs = [d for d in docs if d.niveau == niveau]
    if nur_indexed:
        docs = [d for d in docs if d.indexed]

    # Split once by type instead of scanning the list three times.
    aufgaben = []
    ewh = []
    andere = []
    for d in docs:
        if d.typ == DokumentTyp.AUFGABE:
            aufgaben.append(d)
        elif d.typ == DokumentTyp.ERWARTUNGSHORIZONT:
            ewh.append(d)
        else:
            andere.append(d)

    result = []
    # Ids of EWH documents already placed next to their Aufgabe; an id set
    # replaces converting each EWH twice and scanning ``result`` linearly.
    emitted_ewh_ids = set()
    for aufgabe in aufgaben:
        result.append(_to_dokument_response(aufgabe))
        matching_ewh = next(
            (e for e in ewh if e.jahr == aufgabe.jahr and e.niveau == aufgabe.niveau
             and e.aufgaben_nummer == aufgabe.aufgaben_nummer), None
        )
        if matching_ewh is not None:
            result.append(_to_dokument_response(matching_ewh))
            emitted_ewh_ids.add(matching_ewh.id)

    # Append the EWH documents that had no matching Aufgabe.
    for e in ewh:
        if e.id not in emitted_ewh_ids:
            result.append(_to_dokument_response(e))

    result.extend(_to_dokument_response(a) for a in andere)
    return result


@router.get("/enums/bundeslaender")
async def get_bundeslaender():
    """Return every Bundesland as a value/label pair.

    The label is derived from the enum value: underscores become spaces
    and the result is title-cased.
    """
    options = []
    for land in Bundesland:
        readable = land.value.replace("_", " ").title()
        options.append({"value": land.value, "label": readable})
    return options


@router.get("/enums/faecher")
async def get_faecher():
    """Return every Fach as a value/label pair.

    Labels come from ``FACH_LABELS``; a subject without an entry there
    falls back to its raw enum value.
    """
    options = []
    for subject in Fach:
        label = FACH_LABELS.get(subject, subject.value)
        options.append({"value": subject.value, "label": label})
    return options


@router.get("/enums/niveaus")
async def get_niveaus():
    """Return the two requirement levels as value/label pairs.

    The list is fixed in this function rather than derived from an enum.
    """
    level_labels = (
        ("eA", "eA (erhöhtes Anforderungsniveau)"),
        ("gA", "gA (grundlegendes Anforderungsniveau)"),
    )
    return [{"value": value, "label": label} for value, label in level_labels]


@router.get("/enums/typen")
async def get_typen():
    """Return every DokumentTyp as a value/label pair.

    Labels come from ``DOKUMENT_TYP_LABELS``; a type without an entry
    there falls back to its raw enum value.
    """
    options = []
    for doc_type in DokumentTyp:
        label = DOKUMENT_TYP_LABELS.get(doc_type, doc_type.value)
        options.append({"value": doc_type.value, "label": label})
    return options