# breakpilot-lehrer/backend-lehrer/abitur_docs_api.py

"""
Abitur Document Store API - Verwaltung von Abitur-Aufgaben und Erwartungshorizonten.

Unterstützt:
- Bundesland-spezifische Dokumente
- Fach, Jahr, Niveau (eA/gA), Aufgabennummer
- KI-basierte Dokumentenerkennung
- RAG-Integration mit Vector Store

Dateinamen-Schema (NiBiS Niedersachsen):
- 2025_Deutsch_eA_I.pdf - Aufgabe
- 2025_Deutsch_eA_I_EWH.pdf - Erwartungshorizont
"""

import logging
import uuid
import os
import re
import zipfile
import tempfile
from datetime import datetime
from typing import List, Dict, Any, Optional
from enum import Enum
from pathlib import Path
from dataclasses import dataclass

from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
from fastapi.responses import FileResponse
from pydantic import BaseModel, Field

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Router for every endpoint in this module; all paths are served under /abitur-docs.
router = APIRouter(
    prefix="/abitur-docs",
    tags=["abitur-docs"],
)

# Storage directory for uploaded and imported files. It is created at import
# time (including missing parents) if it does not exist yet.
# NOTE(review): the directory lives under /tmp — presumably the files are not
# expected to survive a reboot/container restart; confirm this is intended.
DOCS_DIR = Path("/tmp/abitur-docs")
DOCS_DIR.mkdir(parents=True, exist_ok=True)


# ============================================================================
# Enums
# ============================================================================

class Bundesland(str, Enum):
    """German federal states (Bundesländer) with a centralized Abitur exam."""
    NIEDERSACHSEN = "niedersachsen"
    BAYERN = "bayern"
    BADEN_WUERTTEMBERG = "baden_wuerttemberg"
    NORDRHEIN_WESTFALEN = "nordrhein_westfalen"
    HESSEN = "hessen"
    SACHSEN = "sachsen"
    THUERINGEN = "thueringen"
    BERLIN = "berlin"
    HAMBURG = "hamburg"
    SCHLESWIG_HOLSTEIN = "schleswig_holstein"
    BREMEN = "bremen"
    BRANDENBURG = "brandenburg"
    MECKLENBURG_VORPOMMERN = "mecklenburg_vorpommern"
    SACHSEN_ANHALT = "sachsen_anhalt"
    RHEINLAND_PFALZ = "rheinland_pfalz"
    SAARLAND = "saarland"


class Fach(str, Enum):
    """Abitur subjects. The values are lowercase ASCII identifiers."""
    DEUTSCH = "deutsch"
    ENGLISCH = "englisch"
    MATHEMATIK = "mathematik"
    BIOLOGIE = "biologie"
    CHEMIE = "chemie"
    PHYSIK = "physik"
    GESCHICHTE = "geschichte"
    ERDKUNDE = "erdkunde"
    POLITIK_WIRTSCHAFT = "politik_wirtschaft"
    FRANZOESISCH = "franzoesisch"
    SPANISCH = "spanisch"
    LATEIN = "latein"
    GRIECHISCH = "griechisch"
    KUNST = "kunst"
    MUSIK = "musik"
    SPORT = "sport"
    INFORMATIK = "informatik"
    EV_RELIGION = "ev_religion"
    KATH_RELIGION = "kath_religion"
    WERTE_NORMEN = "werte_normen"
    BRC = "brc"  # Business administration with accounting (Betriebswirtschaft mit Rechnungswesen)
    BVW = "bvw"  # Economics (Volkswirtschaft)
    ERNAEHRUNG = "ernaehrung"
    MECHATRONIK = "mechatronik"
    GESUNDHEIT_PFLEGE = "gesundheit_pflege"
    PAEDAGOGIK_PSYCHOLOGIE = "paedagogik_psychologie"


class Niveau(str, Enum):
    """Requirement level of an exam."""
    EA = "eA"  # Advanced requirement level ("erhöhtes Anforderungsniveau", Leistungskurs)
    GA = "gA"  # Basic requirement level ("grundlegendes Anforderungsniveau", Grundkurs)


class DokumentTyp(str, Enum):
    """Kind of document stored for an exam."""
    AUFGABE = "aufgabe"
    ERWARTUNGSHORIZONT = "erwartungshorizont"
    DECKBLATT = "deckblatt"
    MATERIAL = "material"
    HOERVERSTEHEN = "hoerverstehen"  # Listening comprehension (language subjects)
    SPRACHMITTLUNG = "sprachmittlung"  # Mediation (language subjects)
    BEWERTUNGSBOGEN = "bewertungsbogen"


class VerarbeitungsStatus(str, Enum):
    """Processing status of a document."""
    PENDING = "pending"
    PROCESSING = "processing"
    RECOGNIZED = "recognized"  # Metadata was recognized automatically
    CONFIRMED = "confirmed"  # Metadata was confirmed by a developer
    INDEXED = "indexed"  # Document is in the vector store
    ERROR = "error"


# ============================================================================
# Fach-Mapping für Dateinamen
# ============================================================================

# Maps lowercase subject spellings and abbreviations, as they appear in
# filenames, to Fach members. parse_nibis_filename iterates this dict in
# insertion order, tests each key as a substring of the separator-free filename
# and stops at the first hit, which is why several keys map to the same subject.
FACH_NAME_MAPPING = {
    "deutsch": Fach.DEUTSCH,
    "englisch": Fach.ENGLISCH,
    "mathe": Fach.MATHEMATIK,
    "mathematik": Fach.MATHEMATIK,
    "biologie": Fach.BIOLOGIE,
    "bio": Fach.BIOLOGIE,
    "chemie": Fach.CHEMIE,
    "physik": Fach.PHYSIK,
    "geschichte": Fach.GESCHICHTE,
    "erdkunde": Fach.ERDKUNDE,
    "geographie": Fach.ERDKUNDE,
    "politikwirtschaft": Fach.POLITIK_WIRTSCHAFT,
    "politik": Fach.POLITIK_WIRTSCHAFT,
    "franzoesisch": Fach.FRANZOESISCH,
    "franz": Fach.FRANZOESISCH,
    "spanisch": Fach.SPANISCH,
    "latein": Fach.LATEIN,
    "griechisch": Fach.GRIECHISCH,
    "kunst": Fach.KUNST,
    "musik": Fach.MUSIK,
    "sport": Fach.SPORT,
    "informatik": Fach.INFORMATIK,
    "evreligion": Fach.EV_RELIGION,
    "kathreligion": Fach.KATH_RELIGION,
    "wertenormen": Fach.WERTE_NORMEN,
    "brc": Fach.BRC,
    "bvw": Fach.BVW,
    "ernaehrung": Fach.ERNAEHRUNG,
    "mecha": Fach.MECHATRONIK,
    "mechatronik": Fach.MECHATRONIK,
    "technikmecha": Fach.MECHATRONIK,
    "gespfl": Fach.GESUNDHEIT_PFLEGE,
    "paedpsych": Fach.PAEDAGOGIK_PSYCHOLOGIE,
}


# ============================================================================
# Pydantic Models
# ============================================================================

class DokumentCreate(BaseModel):
    """Payload for creating a document manually; all metadata is given explicitly."""
    bundesland: Bundesland
    fach: Fach
    jahr: int = Field(ge=2000, le=2100)  # Exam year, restricted to 2000..2100
    niveau: Niveau
    typ: DokumentTyp
    aufgaben_nummer: Optional[str] = None  # I, II, III, 1, 2, etc.


class DokumentUpdate(BaseModel):
    """Partial update of recognized metadata.

    Every field is optional; update_dokument only applies fields that are not None.
    """
    bundesland: Optional[Bundesland] = None
    fach: Optional[Fach] = None
    jahr: Optional[int] = None  # Unlike DokumentCreate.jahr, no range constraint here
    niveau: Optional[Niveau] = None
    typ: Optional[DokumentTyp] = None
    aufgaben_nummer: Optional[str] = None
    status: Optional[VerarbeitungsStatus] = None


class DokumentResponse(BaseModel):
    """API representation of a document; field-for-field mirror of AbiturDokument."""
    id: str
    dateiname: str  # Name of the stored file (document ID plus extension)
    original_dateiname: str  # Filename as supplied by the upload / ZIP entry
    bundesland: Bundesland
    fach: Fach
    jahr: int
    niveau: Niveau
    typ: DokumentTyp
    aufgaben_nummer: Optional[str]
    status: VerarbeitungsStatus
    confidence: float  # Recognition confidence
    file_path: str
    file_size: int  # Size of the stored content in bytes
    indexed: bool
    vector_ids: List[str]
    created_at: datetime
    updated_at: datetime


class ImportResult(BaseModel):
    """Result of a ZIP import."""
    total_files: int  # Number of PDF entries that were considered
    recognized: int  # Entries that were stored as documents
    errors: int  # Entries skipped (no subject recognized) or failed with an exception
    documents: List[DokumentResponse]


class RecognitionResult(BaseModel):
    """Outcome of recognizing document metadata from a filename."""
    success: bool
    bundesland: Optional[Bundesland]
    fach: Optional[Fach]
    jahr: Optional[int]
    niveau: Optional[Niveau]
    typ: Optional[DokumentTyp]
    aufgaben_nummer: Optional[str]
    confidence: float
    raw_filename: str
    suggestions: List[Dict[str, Any]]

    @property
    def extracted(self) -> Dict[str, Any]:
        """Backwards-compatible view: the truthy recognized fields as a plain dict.

        Enum-typed fields are reported by their ``.value``; ``jahr`` and
        ``aufgaben_nummer`` are reported as stored.
        """
        plain_keys = ("jahr", "aufgaben_nummer")
        candidates = (
            ("bundesland", self.bundesland),
            ("fach", self.fach),
            ("jahr", self.jahr),
            ("niveau", self.niveau),
            ("typ", self.typ),
            ("aufgaben_nummer", self.aufgaben_nummer),
        )
        return {
            key: (found if key in plain_keys else found.value)
            for key, found in candidates
            if found
        }

    @property
    def method(self) -> str:
        """Backwards-compatible name of the recognition method (always the same)."""
        return "filename_pattern"


# ============================================================================
# Internal Data Classes
# ============================================================================

@dataclass
class AbiturDokument:
    """Internal (mutable) document record held in the in-memory store."""
    id: str
    dateiname: str  # Name of the stored file (document ID plus extension)
    original_dateiname: str  # Filename as supplied by the upload / ZIP entry
    bundesland: Bundesland
    fach: Fach
    jahr: int
    niveau: Niveau
    typ: DokumentTyp
    aufgaben_nummer: Optional[str]
    status: VerarbeitungsStatus
    confidence: float  # Confidence reported by the filename recognition
    file_path: str  # Path of the stored file as a string
    file_size: int  # Size of the stored content in bytes
    indexed: bool
    vector_ids: List[str]
    created_at: datetime
    updated_at: datetime


# ============================================================================
# In-Memory Storage
# ============================================================================

# Process-local registry of all known documents, keyed by document ID.
# It is a plain dict; the visible code never writes it to persistent storage.
_dokumente: Dict[str, AbiturDokument] = {}


# ============================================================================
# Helper Functions - Dokumentenerkennung
# ============================================================================

def parse_nibis_filename(filename: str) -> RecognitionResult:
    """
    Recognize document metadata from a NiBiS-style filename.

    Examples:
    - 2025_Deutsch_eA_I.pdf
    - 2025_Deutsch_eA_I_EWH.pdf
    - 2025_Biologie_gA_1.pdf
    - 2025_Englisch_eA_HV.pdf (listening comprehension)

    Args:
        filename: File name to analyze. Only the stem of the last path
            component is used, compared case-insensitively.

    Returns:
        A RecognitionResult. ``success`` is True only when both the subject
        and the year were recognized. ``confidence`` is the sum of the
        per-field scores, capped at 1.0. ``bundesland`` is always
        Niedersachsen, and ``typ`` falls back to AUFGABE when no type marker
        is found.
    """
    result = RecognitionResult(
        success=False,
        bundesland=Bundesland.NIEDERSACHSEN,  # NiBiS = Niedersachsen
        fach=None,
        jahr=None,
        niveau=None,
        typ=None,
        aufgaben_nummer=None,
        confidence=0.0,
        raw_filename=filename,
        suggestions=[]
    )

    # Normalize: drop directory and extension, compare in lowercase
    name = Path(filename).stem.lower()

    # Year: four digits at the very beginning
    jahr_match = re.match(r'^(\d{4})', name)
    if jahr_match:
        result.jahr = int(jahr_match.group(1))
        result.confidence += 0.2

    # Subject: first mapping key contained in the separator-free name.
    # The separator-free form does not change per key, so build it once.
    compact_name = name.replace("_", "").replace("-", "")
    for fach_key, fach_enum in FACH_NAME_MAPPING.items():
        if fach_key in compact_name:
            result.fach = fach_enum
            result.confidence += 0.3
            break

    # Requirement level (eA/gA)
    if "_ea" in name or "ea_" in name:
        result.niveau = Niveau.EA
        result.confidence += 0.2
    elif "_ga" in name or "ga_" in name:
        result.niveau = Niveau.GA
        result.confidence += 0.2

    # Document type
    if "_ewh" in name:
        result.typ = DokumentTyp.ERWARTUNGSHORIZONT
        result.confidence += 0.2
    elif "_hv" in name or "hoerverstehen" in name:
        result.typ = DokumentTyp.HOERVERSTEHEN
        result.confidence += 0.15
    elif (
        "_sm" in name
        # "_me" must not be the start of a longer word: a plain substring test
        # also hits "_mecha"/"_mechatronik" and would file Mechatronik tasks
        # as Sprachmittlung. [^\W\d_] matches any letter.
        or re.search(r'_me(?![^\W\d_])', name)
        or "sprachmittlung" in name
    ):
        result.typ = DokumentTyp.SPRACHMITTLUNG
        result.confidence += 0.15
    elif "deckblatt" in name:
        result.typ = DokumentTyp.DECKBLATT
        result.confidence += 0.15
    elif "material" in name:
        result.typ = DokumentTyp.MATERIAL
        result.confidence += 0.15
    elif "bewertung" in name:
        result.typ = DokumentTyp.BEWERTUNGSBOGEN
        result.confidence += 0.15
    else:
        result.typ = DokumentTyp.AUFGABE
        result.confidence += 0.1

    # Task number: Roman numeral (i/v/x) or 1-4 with optional a/b/c suffix,
    # as a whole "_"-delimited token; reported in upper case
    aufgabe_match = re.search(r'_([ivx]+|[1-4][abc]?)(?:_|\.pdf|$)', name, re.IGNORECASE)
    if aufgabe_match:
        result.aufgaben_nummer = aufgabe_match.group(1).upper()
        result.confidence += 0.1

    # Success requires at least subject and year
    if result.fach and result.jahr:
        result.success = True

    # Cap confidence at 1.0 (the float sum can slightly exceed it)
    result.confidence = min(result.confidence, 1.0)

    return result


def _to_dokument_response(doc: AbiturDokument) -> DokumentResponse:
    """Build the API response model from an internal document record.

    Every response field is copied from the attribute of the same name.
    """
    field_names = (
        "id",
        "dateiname",
        "original_dateiname",
        "bundesland",
        "fach",
        "jahr",
        "niveau",
        "typ",
        "aufgaben_nummer",
        "status",
        "confidence",
        "file_path",
        "file_size",
        "indexed",
        "vector_ids",
        "created_at",
        "updated_at",
    )
    payload = {field: getattr(doc, field) for field in field_names}
    return DokumentResponse(**payload)


# ============================================================================
# API Endpoints - Dokumente
# ============================================================================

@router.post("/upload", response_model=DokumentResponse)
async def upload_dokument(
    file: UploadFile = File(...),
    bundesland: Optional[Bundesland] = Form(None),
    fach: Optional[Fach] = Form(None),
    jahr: Optional[int] = Form(None),
    niveau: Optional[Niveau] = Form(None),
    typ: Optional[DokumentTyp] = Form(None),
    aufgaben_nummer: Optional[str] = Form(None)
):
    """
    Upload a single document.

    Metadata may be supplied as form fields; anything not supplied is taken
    from the filename recognition, then from a default (Niedersachsen,
    current year, eA, Aufgabe). The subject has no default: if it is neither
    supplied nor recognized, the request fails with HTTP 400.
    """
    original_name = file.filename
    if not original_name:
        raise HTTPException(status_code=400, detail="Kein Dateiname")

    # Derive metadata from the filename
    erkannt = parse_nibis_filename(original_name)

    # The subject is mandatory, so settle it before anything touches the disk
    resolved_fach = fach or erkannt.fach
    if not resolved_fach:
        raise HTTPException(status_code=400, detail="Fach konnte nicht erkannt werden")

    # The stored name is the fresh document ID plus the original extension
    doc_id = str(uuid.uuid4())
    stored_name = f"{doc_id}{Path(original_name).suffix}"
    target = DOCS_DIR / stored_name

    payload = await file.read()
    with open(target, "wb") as out:
        out.write(payload)

    if erkannt.success:
        initial_status = VerarbeitungsStatus.RECOGNIZED
    else:
        initial_status = VerarbeitungsStatus.PENDING

    timestamp = datetime.utcnow()

    # Explicit form values win over recognized values, which win over defaults
    dokument = AbiturDokument(
        id=doc_id,
        dateiname=stored_name,
        original_dateiname=original_name,
        bundesland=bundesland or erkannt.bundesland or Bundesland.NIEDERSACHSEN,
        fach=resolved_fach,
        jahr=jahr or erkannt.jahr or datetime.now().year,
        niveau=niveau or erkannt.niveau or Niveau.EA,
        typ=typ or erkannt.typ or DokumentTyp.AUFGABE,
        aufgaben_nummer=aufgaben_nummer or erkannt.aufgaben_nummer,
        status=initial_status,
        confidence=erkannt.confidence,
        file_path=str(target),
        file_size=len(payload),
        indexed=False,
        vector_ids=[],
        created_at=timestamp,
        updated_at=timestamp
    )

    _dokumente[doc_id] = dokument
    logger.info(f"Uploaded document {doc_id}: {file.filename}")

    return _to_dokument_response(dokument)


@router.post("/import-zip", response_model=ImportResult)
async def import_zip(
    file: UploadFile = File(...),
    bundesland: Bundesland = Form(Bundesland.NIEDERSACHSEN),
    background_tasks: BackgroundTasks = None
):
    """
    Import all PDFs contained in an uploaded ZIP archive.

    Metadata is recognized from each entry's base filename. Entries whose
    subject cannot be recognized are counted as errors and not stored.

    Args:
        file: The uploaded ZIP archive.
        bundesland: Federal state assigned to every imported document.
        background_tasks: Currently unused; kept so the signature stays stable.

    Returns:
        ImportResult with the number of PDF entries considered, stored and
        failed, plus the stored documents.

    Raises:
        HTTPException: 400 if the upload has no ".zip" filename or its
            content is not a readable ZIP archive.
    """
    # Accept the extension case-insensitively (".ZIP" is just as valid)
    if not file.filename or not file.filename.lower().endswith(".zip"):
        raise HTTPException(status_code=400, detail="ZIP-Datei erforderlich")

    documents = []
    total = 0
    recognized = 0
    errors = 0
    tmp_path = None

    try:
        # Spool the upload to disk. delete=False because the archive is
        # reopened by path below; the finally clause removes it again, even
        # when reading the upload fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        # A corrupt or non-ZIP upload is a client error, not a server error
        try:
            zip_ref = zipfile.ZipFile(tmp_path, 'r')
        except zipfile.BadZipFile as exc:
            raise HTTPException(status_code=400, detail="Ungültige ZIP-Datei") from exc

        with zip_ref:
            for zip_info in zip_ref.infolist():
                # PDFs only
                if not zip_info.filename.lower().endswith(".pdf"):
                    continue

                basename = Path(zip_info.filename).name

                # Skip macOS metadata and hidden files. Checking the base name
                # as well catches entries such as "folder/._file.pdf".
                if (
                    "__MACOSX" in zip_info.filename
                    or zip_info.filename.startswith(".")
                    or basename.startswith(".")
                ):
                    continue

                # Skip Thumbs.db
                if "thumbs.db" in zip_info.filename.lower():
                    continue

                total += 1

                try:
                    # Recognize metadata from the base filename
                    recognition = parse_nibis_filename(basename)

                    if not recognition.fach:
                        errors += 1
                        logger.warning(f"Konnte Fach nicht erkennen: {basename}")
                        continue

                    # Extract and store under "<document id><extension>"
                    doc_id = str(uuid.uuid4())
                    file_ext = Path(basename).suffix
                    safe_filename = f"{doc_id}{file_ext}"
                    file_path = DOCS_DIR / safe_filename

                    # Open by ZipInfo so duplicate entry names resolve to this entry.
                    # NOTE(review): the entry is read fully into memory with no
                    # size limit — consider a cap if archives are untrusted.
                    with zip_ref.open(zip_info) as source:
                        file_content = source.read()
                        with open(file_path, "wb") as target:
                            target.write(file_content)

                    now = datetime.utcnow()

                    dokument = AbiturDokument(
                        id=doc_id,
                        dateiname=safe_filename,
                        original_dateiname=basename,
                        bundesland=bundesland,
                        fach=recognition.fach,
                        jahr=recognition.jahr or datetime.now().year,
                        niveau=recognition.niveau or Niveau.EA,
                        typ=recognition.typ or DokumentTyp.AUFGABE,
                        aufgaben_nummer=recognition.aufgaben_nummer,
                        status=VerarbeitungsStatus.RECOGNIZED,
                        confidence=recognition.confidence,
                        file_path=str(file_path),
                        file_size=len(file_content),
                        indexed=False,
                        vector_ids=[],
                        created_at=now,
                        updated_at=now
                    )

                    _dokumente[doc_id] = dokument
                    documents.append(_to_dokument_response(dokument))
                    recognized += 1

                except Exception as e:
                    # Best effort: one broken entry must not abort the whole import
                    errors += 1
                    logger.error(f"Fehler bei {zip_info.filename}: {e}")

    finally:
        # Remove the temporary ZIP (if it was created at all)
        if tmp_path is not None:
            os.unlink(tmp_path)

    logger.info(f"ZIP-Import: {recognized}/{total} erkannt, {errors} Fehler")

    return ImportResult(
        total_files=total,
        recognized=recognized,
        errors=errors,
        documents=documents
    )


@router.get("/", response_model=List[DokumentResponse])
async def list_dokumente(
    bundesland: Optional[Bundesland] = None,
    fach: Optional[Fach] = None,
    jahr: Optional[int] = None,
    niveau: Optional[Niveau] = None,
    typ: Optional[DokumentTyp] = None,
    status: Optional[VerarbeitungsStatus] = None,
    indexed: Optional[bool] = None
):
    """List documents, optionally filtered.

    Results are ordered descending by (year, subject value, level value).
    """
    # (attribute name, requested value) pairs; a falsy value disables the filter
    equality_filters = (
        ("bundesland", bundesland),
        ("fach", fach),
        ("jahr", jahr),
        ("niveau", niveau),
        ("typ", typ),
        ("status", status),
    )

    selected = list(_dokumente.values())
    for attribute, wanted in equality_filters:
        if wanted:
            selected = [doc for doc in selected if getattr(doc, attribute) == wanted]

    # False is a meaningful filter value here, so only None disables it
    if indexed is not None:
        selected = [doc for doc in selected if doc.indexed == indexed]

    ordered = sorted(
        selected,
        key=lambda doc: (doc.jahr, doc.fach.value, doc.niveau.value),
        reverse=True,
    )
    return [_to_dokument_response(doc) for doc in ordered]


@router.get("/{doc_id}", response_model=DokumentResponse)
async def get_dokument(doc_id: str):
    """Return a single document; responds with HTTP 404 for an unknown ID."""
    found = _dokumente.get(doc_id)
    if found is None:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")
    return _to_dokument_response(found)


@router.put("/{doc_id}", response_model=DokumentResponse)
async def update_dokument(doc_id: str, data: DokumentUpdate):
    """Update document metadata (developer correction after recognition).

    Only fields of ``data`` that are not None are applied. ``updated_at`` is
    refreshed in every case. Responds with HTTP 404 for an unknown ID.
    """
    target = _dokumente.get(doc_id)
    if target is None:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    updatable = (
        "bundesland",
        "fach",
        "jahr",
        "niveau",
        "typ",
        "aufgaben_nummer",
        "status",
    )
    for field_name in updatable:
        new_value = getattr(data, field_name)
        if new_value is not None:
            setattr(target, field_name, new_value)

    target.updated_at = datetime.utcnow()

    return _to_dokument_response(target)


@router.post("/{doc_id}/confirm", response_model=DokumentResponse)
async def confirm_dokument(doc_id: str):
    """Mark a document's metadata as confirmed, whatever its current status.

    Responds with HTTP 404 for an unknown ID.
    """
    target = _dokumente.get(doc_id)
    if target is None:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    target.updated_at = datetime.utcnow()
    target.status = VerarbeitungsStatus.CONFIRMED

    return _to_dokument_response(target)


@router.post("/{doc_id}/index", response_model=DokumentResponse)
async def index_dokument(doc_id: str):
    """Index a document in the vector store (currently simulated).

    Responds with HTTP 404 for an unknown ID and HTTP 400 unless the document
    is in status RECOGNIZED or CONFIRMED.
    """
    target = _dokumente.get(doc_id)
    if target is None:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    indexable_states = (VerarbeitungsStatus.CONFIRMED, VerarbeitungsStatus.RECOGNIZED)
    if target.status not in indexable_states:
        raise HTTPException(status_code=400, detail="Dokument muss erst bestätigt werden")

    # TODO: vector store integration
    # 1. Read the PDF and extract its text
    # 2. Split the text into chunks
    # 3. Generate embeddings
    # 4. Store them in the vector store together with the metadata

    # Demo: pretend the document was indexed as three vectors
    demo_vector_ids = [f"vec_{doc_id}_{n}" for n in range(3)]
    target.vector_ids = demo_vector_ids
    target.indexed = True
    target.status = VerarbeitungsStatus.INDEXED
    target.updated_at = datetime.utcnow()

    logger.info(f"Document {doc_id} indexed (demo)")

    return _to_dokument_response(target)


@router.delete("/{doc_id}")
async def delete_dokument(doc_id: str):
    """Delete a document: remove its stored file (if present) and its record.

    Responds with HTTP 404 for an unknown ID.
    """
    if doc_id not in _dokumente:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    stored_path = _dokumente[doc_id].file_path

    # Remove the file first; a missing file is not an error
    file_present = os.path.exists(stored_path)
    if file_present:
        os.remove(stored_path)

    # TODO: also remove the document from the vector store

    _dokumente.pop(doc_id)

    return {"status": "deleted", "id": doc_id}


@router.get("/{doc_id}/download")
async def download_dokument(doc_id: str):
    """Send the stored file under its original filename, declared as PDF.

    Responds with HTTP 404 if the ID is unknown or the file is missing on disk.
    """
    record = _dokumente.get(doc_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    stored_path = record.file_path
    if not os.path.exists(stored_path):
        raise HTTPException(status_code=404, detail="Datei nicht gefunden")

    response = FileResponse(
        stored_path,
        filename=record.original_dateiname,
        media_type="application/pdf"
    )
    return response


# ============================================================================
# API Endpoints - Erkennung
# ============================================================================

@router.post("/recognize", response_model=RecognitionResult)
async def recognize_filename(filename: str):
    """Recognize metadata from a filename without storing anything."""
    recognition = parse_nibis_filename(filename)
    return recognition


@router.post("/bulk-confirm")
async def bulk_confirm(doc_ids: List[str]):
    """Confirm several documents at once.

    Only documents that exist and are in status RECOGNIZED are changed.
    Returns how many were confirmed and how many IDs were submitted.
    """
    confirmed = 0
    for current_id in doc_ids:
        record = _dokumente.get(current_id)
        # Unknown IDs and documents in any other status are skipped silently
        if not record or record.status != VerarbeitungsStatus.RECOGNIZED:
            continue
        record.status = VerarbeitungsStatus.CONFIRMED
        record.updated_at = datetime.utcnow()
        confirmed += 1

    return {"confirmed": confirmed, "total": len(doc_ids)}


@router.post("/bulk-index")
async def bulk_index(doc_ids: List[str]):
    """Index several documents at once (currently simulated).

    Only documents that exist and are in status CONFIRMED or RECOGNIZED are
    changed. Returns how many were indexed and how many IDs were submitted.
    """
    indexable_states = (VerarbeitungsStatus.CONFIRMED, VerarbeitungsStatus.RECOGNIZED)
    indexed = 0
    for current_id in doc_ids:
        record = _dokumente.get(current_id)
        if not record or record.status not in indexable_states:
            continue
        # Demo indexing: three placeholder vector IDs per document
        record.indexed = True
        record.vector_ids = [f"vec_{current_id}_{n}" for n in range(3)]
        record.status = VerarbeitungsStatus.INDEXED
        record.updated_at = datetime.utcnow()
        indexed += 1

    return {"indexed": indexed, "total": len(doc_ids)}


# ============================================================================
# API Endpoints - Statistiken
# ============================================================================

@router.get("/stats/overview")
async def get_stats_overview():
    """Return totals plus per-state, per-subject, per-year and per-status counts."""
    docs = list(_dokumente.values())

    # One counter dict per grouping, in the order they appear in the response
    tallies = {
        "by_bundesland": {},
        "by_fach": {},
        "by_jahr": {},
        "by_status": {},
    }

    for doc in docs:
        group_keys = (
            ("by_bundesland", doc.bundesland.value),
            ("by_fach", doc.fach.value),
            ("by_jahr", doc.jahr),
            ("by_status", doc.status.value),
        )
        for grouping, key in group_keys:
            counter = tallies[grouping]
            counter[key] = counter.get(key, 0) + 1

    indexed_count = len([d for d in docs if d.indexed])
    pending_count = len([d for d in docs if d.status == VerarbeitungsStatus.PENDING])

    overview = {
        "total": len(docs),
        "indexed": indexed_count,
        "pending": pending_count,
    }
    overview.update(tallies)
    return overview


# ============================================================================
# API Endpoints - Suche (für Klausur-Korrektur)
# ============================================================================

@router.get("/search", response_model=List[DokumentResponse])
async def search_dokumente(
    bundesland: Bundesland,
    fach: Fach,
    jahr: Optional[int] = None,
    niveau: Optional[Niveau] = None,
    nur_indexed: bool = True
):
    """
    Search documents for exam correction.

    Federal state and subject are mandatory filters; year and level are
    optional. By default only indexed documents are returned.

    Ordering: each task (Aufgabe) is followed by the first expectation
    document (Erwartungshorizont) with the same year, level and task number,
    if there is one. Expectation documents that were not paired follow, then
    all documents of other types.
    """
    # NOTE(review): this file registers GET "/{doc_id}" before this route.
    # FastAPI matches routes in registration order, so a request for
    # ".../search" is presumably answered by get_dokument (404) instead of
    # this handler — confirm, and register this route before "/{doc_id}".

    # Mandatory filters
    docs = [
        d for d in _dokumente.values()
        if d.bundesland == bundesland and d.fach == fach
    ]

    # Optional filters
    if jahr:
        docs = [d for d in docs if d.jahr == jahr]
    if niveau:
        docs = [d for d in docs if d.niveau == niveau]
    if nur_indexed:
        docs = [d for d in docs if d.indexed]

    # Split by type: tasks, expectation documents, everything else
    aufgaben = [d for d in docs if d.typ == DokumentTyp.AUFGABE]
    ewh = [d for d in docs if d.typ == DokumentTyp.ERWARTUNGSHORIZONT]
    andere = [d for d in docs if d.typ not in [DokumentTyp.AUFGABE, DokumentTyp.ERWARTUNGSHORIZONT]]

    result = []
    # IDs of expectation documents already emitted next to a task. An ID set
    # replaces rebuilding a response model per document and comparing it
    # against the whole result list.
    paired_ewh_ids = set()

    for aufgabe in aufgaben:
        result.append(_to_dokument_response(aufgabe))
        # Find the matching expectation document
        matching_ewh = next(
            (e for e in ewh
             if e.jahr == aufgabe.jahr
             and e.niveau == aufgabe.niveau
             and e.aufgaben_nummer == aufgabe.aufgaben_nummer),
            None
        )
        if matching_ewh:
            result.append(_to_dokument_response(matching_ewh))
            paired_ewh_ids.add(matching_ewh.id)

    # Remaining expectation documents, then all other types
    result.extend(_to_dokument_response(e) for e in ewh if e.id not in paired_ewh_ids)
    result.extend(_to_dokument_response(a) for a in andere)

    return result


# ============================================================================
# Enums Endpoint (für Frontend)
# ============================================================================

@router.get("/enums/bundeslaender")
async def get_bundeslaender():
    """Return all federal states as value/label pairs.

    The label is the value with underscores replaced by spaces, title-cased.
    """
    entries = []
    for land in Bundesland:
        readable = land.value.replace("_", " ").title()
        entries.append({"value": land.value, "label": readable})
    return entries


@router.get("/enums/faecher")
async def get_faecher():
    """Return all subjects as value/label pairs.

    A subject without a display label falls back to its raw value.
    """
    display_names = {
        Fach.DEUTSCH: "Deutsch",
        Fach.ENGLISCH: "Englisch",
        Fach.MATHEMATIK: "Mathematik",
        Fach.BIOLOGIE: "Biologie",
        Fach.CHEMIE: "Chemie",
        Fach.PHYSIK: "Physik",
        Fach.GESCHICHTE: "Geschichte",
        Fach.ERDKUNDE: "Erdkunde",
        Fach.POLITIK_WIRTSCHAFT: "Politik-Wirtschaft",
        Fach.FRANZOESISCH: "Französisch",
        Fach.SPANISCH: "Spanisch",
        Fach.LATEIN: "Latein",
        Fach.GRIECHISCH: "Griechisch",
        Fach.KUNST: "Kunst",
        Fach.MUSIK: "Musik",
        Fach.SPORT: "Sport",
        Fach.INFORMATIK: "Informatik",
        Fach.EV_RELIGION: "Ev. Religion",
        Fach.KATH_RELIGION: "Kath. Religion",
        Fach.WERTE_NORMEN: "Werte und Normen",
        Fach.BRC: "BRC (Betriebswirtschaft)",
        Fach.BVW: "BVW (Volkswirtschaft)",
        Fach.ERNAEHRUNG: "Ernährung",
        Fach.MECHATRONIK: "Mechatronik",
        Fach.GESUNDHEIT_PFLEGE: "Gesundheit-Pflege",
        Fach.PAEDAGOGIK_PSYCHOLOGIE: "Pädagogik-Psychologie",
    }

    entries = []
    for subject in Fach:
        if subject in display_names:
            label = display_names[subject]
        else:
            label = subject.value
        entries.append({"value": subject.value, "label": label})
    return entries


@router.get("/enums/niveaus")
async def get_niveaus():
    """Return both requirement levels as value/label pairs."""
    labelled_levels = (
        ("eA", "eA (erhöhtes Anforderungsniveau)"),
        ("gA", "gA (grundlegendes Anforderungsniveau)"),
    )
    return [{"value": value, "label": label} for value, label in labelled_levels]


@router.get("/enums/typen")
async def get_typen():
    """Return all document types as value/label pairs.

    A type without a display label falls back to its raw value.
    """
    display_names = {
        DokumentTyp.AUFGABE: "Aufgabe",
        DokumentTyp.ERWARTUNGSHORIZONT: "Erwartungshorizont",
        DokumentTyp.DECKBLATT: "Deckblatt",
        DokumentTyp.MATERIAL: "Material",
        DokumentTyp.HOERVERSTEHEN: "Hörverstehen",
        DokumentTyp.SPRACHMITTLUNG: "Sprachmittlung",
        DokumentTyp.BEWERTUNGSBOGEN: "Bewertungsbogen",
    }

    entries = []
    for doc_type in DokumentTyp:
        if doc_type in display_names:
            label = display_names[doc_type]
        else:
            label = doc_type.value
        entries.append({"value": doc_type.value, "label": label})
    return entries


# ============================================================================
# Backwards-compatibility aliases (used by tests)
# ============================================================================
# Older names for the subject and level enums; both refer to the same classes.
AbiturFach = Fach
Anforderungsniveau = Niveau
# Same dict object as _dokumente, so changes through either name are shared.
documents_db = _dokumente


class DocumentMetadata(BaseModel):
    """Backwards-compatible metadata model for tests.

    All fields are optional and typed as plain str/int rather than as the
    enums used by the main models.
    """
    jahr: Optional[int] = None
    bundesland: Optional[str] = None
    fach: Optional[str] = None
    niveau: Optional[str] = None
    dokument_typ: Optional[str] = None
    aufgaben_nummer: Optional[str] = None


# Backwards-compatible AbiturDokument for tests (different from internal dataclass)
class AbiturDokumentCompat(BaseModel):
    """Backwards-compatible AbiturDokument model for tests.

    Unlike the internal AbiturDokument dataclass, the metadata is nested in a
    DocumentMetadata object and the recognition result can be attached.
    """
    id: str
    filename: str
    file_path: str
    metadata: DocumentMetadata
    status: VerarbeitungsStatus
    recognition_result: Optional[RecognitionResult] = None
    created_at: datetime
    updated_at: datetime

    # NOTE(review): nested ``Config`` is the Pydantic v1 configuration style —
    # confirm which Pydantic major version this project pins.
    class Config:
        arbitrary_types_allowed = True