fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
956
backend/abitur_docs_api.py
Normal file
956
backend/abitur_docs_api.py
Normal file
@@ -0,0 +1,956 @@
|
||||
"""
|
||||
Abitur Document Store API - Verwaltung von Abitur-Aufgaben und Erwartungshorizonten.
|
||||
|
||||
Unterstützt:
|
||||
- Bundesland-spezifische Dokumente
|
||||
- Fach, Jahr, Niveau (eA/gA), Aufgabennummer
|
||||
- KI-basierte Dokumentenerkennung
|
||||
- RAG-Integration mit Vector Store
|
||||
|
||||
Dateinamen-Schema (NiBiS Niedersachsen):
|
||||
- 2025_Deutsch_eA_I.pdf - Aufgabe
|
||||
- 2025_Deutsch_eA_I_EWH.pdf - Erwartungshorizont
|
||||
"""
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any, Optional
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
|
||||
from fastapi.responses import FileResponse
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Every endpoint in this module is mounted below /abitur-docs.
router = APIRouter(prefix="/abitur-docs", tags=["abitur-docs"])
|
||||
|
||||
# Storage directory for uploaded files.
# Defaults to the historical /tmp location; set ABITUR_DOCS_DIR to point the
# service at a different directory without touching the code.
DOCS_DIR = Path(os.environ.get("ABITUR_DOCS_DIR", "/tmp/abitur-docs"))
DOCS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Enums
|
||||
# ============================================================================
|
||||
|
||||
class Bundesland(str, Enum):
    """German federal states (states with a central Abitur)."""

    # Member order is significant: iteration order is what the
    # /enums/bundeslaender endpoint returns.
    NIEDERSACHSEN = "niedersachsen"
    BAYERN = "bayern"
    BADEN_WUERTTEMBERG = "baden_wuerttemberg"
    NORDRHEIN_WESTFALEN = "nordrhein_westfalen"
    HESSEN = "hessen"
    SACHSEN = "sachsen"
    THUERINGEN = "thueringen"
    BERLIN = "berlin"
    HAMBURG = "hamburg"
    SCHLESWIG_HOLSTEIN = "schleswig_holstein"
    BREMEN = "bremen"
    BRANDENBURG = "brandenburg"
    MECKLENBURG_VORPOMMERN = "mecklenburg_vorpommern"
    SACHSEN_ANHALT = "sachsen_anhalt"
    RHEINLAND_PFALZ = "rheinland_pfalz"
    SAARLAND = "saarland"
|
||||
|
||||
|
||||
class Fach(str, Enum):
    """Abitur subjects."""

    # Languages, mathematics and sciences
    DEUTSCH = "deutsch"
    ENGLISCH = "englisch"
    MATHEMATIK = "mathematik"
    BIOLOGIE = "biologie"
    CHEMIE = "chemie"
    PHYSIK = "physik"
    # Social sciences
    GESCHICHTE = "geschichte"
    ERDKUNDE = "erdkunde"
    POLITIK_WIRTSCHAFT = "politik_wirtschaft"
    # Further languages
    FRANZOESISCH = "franzoesisch"
    SPANISCH = "spanisch"
    LATEIN = "latein"
    GRIECHISCH = "griechisch"
    # Arts, sport, computer science
    KUNST = "kunst"
    MUSIK = "musik"
    SPORT = "sport"
    INFORMATIK = "informatik"
    # Religion / ethics
    EV_RELIGION = "ev_religion"
    KATH_RELIGION = "kath_religion"
    WERTE_NORMEN = "werte_normen"
    # Vocational subjects
    BRC = "brc"  # business administration with accounting
    BVW = "bvw"  # economics
    ERNAEHRUNG = "ernaehrung"
    MECHATRONIK = "mechatronik"
    GESUNDHEIT_PFLEGE = "gesundheit_pflege"
    PAEDAGOGIK_PSYCHOLOGIE = "paedagogik_psychologie"
|
||||
|
||||
|
||||
class Niveau(str, Enum):
    """Requirement level of an exam."""

    EA = "eA"  # elevated requirement level (advanced course)
    GA = "gA"  # basic requirement level (basic course)
|
||||
|
||||
|
||||
class DokumentTyp(str, Enum):
    """Kind of document stored in the archive."""

    AUFGABE = "aufgabe"
    ERWARTUNGSHORIZONT = "erwartungshorizont"
    DECKBLATT = "deckblatt"
    MATERIAL = "material"
    HOERVERSTEHEN = "hoerverstehen"  # language subjects only
    SPRACHMITTLUNG = "sprachmittlung"  # language subjects only
    BEWERTUNGSBOGEN = "bewertungsbogen"
|
||||
|
||||
|
||||
class VerarbeitungsStatus(str, Enum):
    """Processing state of a document."""

    PENDING = "pending"
    PROCESSING = "processing"
    RECOGNIZED = "recognized"  # metadata was recognised automatically
    CONFIRMED = "confirmed"  # a developer confirmed the metadata
    INDEXED = "indexed"  # stored in the vector store
    ERROR = "error"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Fach-Mapping für Dateinamen
|
||||
# ============================================================================
|
||||
|
||||
# Lower-case file-name fragments (separators removed) -> subject.
# Insertion order is kept exactly as before because the file-name parser
# takes the first key that occurs in the name.
FACH_NAME_MAPPING = {
    "deutsch": Fach.DEUTSCH,
    "englisch": Fach.ENGLISCH,
    "mathe": Fach.MATHEMATIK,
    "mathematik": Fach.MATHEMATIK,
    "biologie": Fach.BIOLOGIE,
    "bio": Fach.BIOLOGIE,
    "chemie": Fach.CHEMIE,
    "physik": Fach.PHYSIK,
    "geschichte": Fach.GESCHICHTE,
    "erdkunde": Fach.ERDKUNDE,
    "geographie": Fach.ERDKUNDE,
    "politikwirtschaft": Fach.POLITIK_WIRTSCHAFT,
    "politik": Fach.POLITIK_WIRTSCHAFT,
    "franzoesisch": Fach.FRANZOESISCH,
    "franz": Fach.FRANZOESISCH,
    "spanisch": Fach.SPANISCH,
    "latein": Fach.LATEIN,
    "griechisch": Fach.GRIECHISCH,
    "kunst": Fach.KUNST,
    "musik": Fach.MUSIK,
    "sport": Fach.SPORT,
    "informatik": Fach.INFORMATIK,
    "evreligion": Fach.EV_RELIGION,
    "kathreligion": Fach.KATH_RELIGION,
    "wertenormen": Fach.WERTE_NORMEN,
    "brc": Fach.BRC,
    "bvw": Fach.BVW,
    "ernaehrung": Fach.ERNAEHRUNG,
    "mecha": Fach.MECHATRONIK,
    "mechatronik": Fach.MECHATRONIK,
    "technikmecha": Fach.MECHATRONIK,
    "gespfl": Fach.GESUNDHEIT_PFLEGE,
    "paedpsych": Fach.PAEDAGOGIK_PSYCHOLOGIE,
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Pydantic Models
|
||||
# ============================================================================
|
||||
|
||||
class DokumentCreate(BaseModel):
    """Payload for creating a document with manually supplied metadata."""

    bundesland: Bundesland
    fach: Fach
    jahr: int = Field(ge=2000, le=2100)
    niveau: Niveau
    typ: DokumentTyp
    # Task number, e.g. I, II, III, 1, 2, ...
    aufgaben_nummer: Optional[str] = None
|
||||
|
||||
|
||||
class DokumentUpdate(BaseModel):
    """Partial update of recognised metadata; fields left at None are not changed."""

    bundesland: Optional[Bundesland] = None
    fach: Optional[Fach] = None
    jahr: Optional[int] = None
    niveau: Optional[Niveau] = None
    typ: Optional[DokumentTyp] = None
    aufgaben_nummer: Optional[str] = None
    status: Optional[VerarbeitungsStatus] = None
|
||||
|
||||
|
||||
class DokumentResponse(BaseModel):
    """API representation of a stored document."""

    # Identity and file names
    id: str
    dateiname: str
    original_dateiname: str

    # Metadata
    bundesland: Bundesland
    fach: Fach
    jahr: int
    niveau: Niveau
    typ: DokumentTyp
    aufgaben_nummer: Optional[str]

    # Processing state
    status: VerarbeitungsStatus
    confidence: float  # recognition confidence

    # Storage and indexing
    file_path: str
    file_size: int
    indexed: bool
    vector_ids: List[str]

    # Timestamps
    created_at: datetime
    updated_at: datetime
|
||||
|
||||
|
||||
class ImportResult(BaseModel):
    """Summary of a ZIP import."""

    total_files: int  # PDF entries considered
    recognized: int  # entries stored as documents
    errors: int  # entries that could not be imported
    documents: List[DokumentResponse]
|
||||
|
||||
|
||||
class RecognitionResult(BaseModel):
    """Outcome of the metadata recognition for one file name."""

    success: bool
    bundesland: Optional[Bundesland]
    fach: Optional[Fach]
    jahr: Optional[int]
    niveau: Optional[Niveau]
    typ: Optional[DokumentTyp]
    aufgaben_nummer: Optional[str]
    confidence: float
    raw_filename: str
    suggestions: List[Dict[str, Any]]

    @property
    def extracted(self) -> Dict[str, Any]:
        """Backwards-compatible view: the recognised (truthy) values as a plain dict."""
        kandidaten = (
            ("bundesland", self.bundesland.value if self.bundesland else None),
            ("fach", self.fach.value if self.fach else None),
            ("jahr", self.jahr),
            ("niveau", self.niveau.value if self.niveau else None),
            ("typ", self.typ.value if self.typ else None),
            ("aufgaben_nummer", self.aufgaben_nummer),
        )
        # Falsy entries (None, 0, "") are left out of the dict.
        return {schluessel: wert for schluessel, wert in kandidaten if wert}

    @property
    def method(self) -> str:
        """Backwards-compatible name of the recognition method (constant)."""
        return "filename_pattern"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Internal Data Classes
|
||||
# ============================================================================
|
||||
|
||||
@dataclass
class AbiturDokument:
    """Internal record of a stored document (mirrors DokumentResponse field by field)."""

    # Identity and file names
    id: str
    dateiname: str
    original_dateiname: str

    # Metadata
    bundesland: Bundesland
    fach: Fach
    jahr: int
    niveau: Niveau
    typ: DokumentTyp
    aufgaben_nummer: Optional[str]

    # Processing state
    status: VerarbeitungsStatus
    confidence: float

    # Storage and indexing
    file_path: str
    file_size: int
    indexed: bool
    vector_ids: List[str]

    # Timestamps
    created_at: datetime
    updated_at: datetime
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# In-Memory Storage
|
||||
# ============================================================================
|
||||
|
||||
# Process-local store: document id -> document record.
_dokumente: Dict[str, AbiturDokument] = dict()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Helper Functions - Dokumentenerkennung
|
||||
# ============================================================================
|
||||
|
||||
def parse_nibis_filename(filename: str) -> RecognitionResult:
    """
    Recognise document metadata from a NiBiS-style file name.

    Examples:
    - 2025_Deutsch_eA_I.pdf
    - 2025_Deutsch_eA_I_EWH.pdf
    - 2025_Biologie_gA_1.pdf
    - 2025_Englisch_eA_HV.pdf (listening comprehension)

    Args:
        filename: File name to analyse; only the stem (name without the
            last extension) is inspected, case-insensitively.

    Returns:
        A RecognitionResult. ``success`` is True when both a subject and a
        year were found. ``confidence`` is the sum of the per-field scores,
        capped at 1.0. ``typ`` always has a value (falls back to AUFGABE).
    """
    result = RecognitionResult(
        success=False,
        bundesland=Bundesland.NIEDERSACHSEN,  # NiBiS = Niedersachsen
        fach=None,
        jahr=None,
        niveau=None,
        typ=None,
        aufgaben_nummer=None,
        confidence=0.0,
        raw_filename=filename,
        suggestions=[]
    )

    # Normalise: drop directory part and extension, compare lower-case.
    name = Path(filename).stem.lower()

    # Year: four digits at the very beginning.
    jahr_match = re.match(r'^(\d{4})', name)
    if jahr_match:
        result.jahr = int(jahr_match.group(1))
        result.confidence += 0.2

    # Subject: first mapping key contained in the name with separators
    # removed (computed once instead of once per key).
    compact_name = name.replace("_", "").replace("-", "")
    for fach_key, fach_enum in FACH_NAME_MAPPING.items():
        if fach_key in compact_name:
            result.fach = fach_enum
            result.confidence += 0.3
            break

    # Requirement level (eA/gA). "_ea_" implies "_ea", so two checks suffice.
    if "_ea" in name or "ea_" in name:
        result.niveau = Niveau.EA
        result.confidence += 0.2
    elif "_ga" in name or "ga_" in name:
        result.niveau = Niveau.GA
        result.confidence += 0.2

    # Document type.
    # The two-letter codes "sm"/"me" must not be followed by another letter;
    # a plain "_me" substring test also hits "_mechatronik"/"_mecha" and
    # classified every Mechatronik file as Sprachmittlung.
    if "_ewh" in name:
        result.typ = DokumentTyp.ERWARTUNGSHORIZONT
        result.confidence += 0.2
    elif "_hv" in name or "hoerverstehen" in name:
        result.typ = DokumentTyp.HOERVERSTEHEN
        result.confidence += 0.15
    elif re.search(r'_(?:sm|me)(?![^\W\d_])', name) or "sprachmittlung" in name:
        result.typ = DokumentTyp.SPRACHMITTLUNG
        result.confidence += 0.15
    elif "deckblatt" in name:
        result.typ = DokumentTyp.DECKBLATT
        result.confidence += 0.15
    elif "material" in name:
        result.typ = DokumentTyp.MATERIAL
        result.confidence += 0.15
    elif "bewertung" in name:
        result.typ = DokumentTyp.BEWERTUNGSBOGEN
        result.confidence += 0.15
    else:
        result.typ = DokumentTyp.AUFGABE
        result.confidence += 0.1

    # Task number: roman (i, v, x) or arabic 1-4 with optional a/b/c suffix,
    # delimited by underscores or the end of the name.
    aufgabe_match = re.search(r'_([ivx]+|[1-4][abc]?)(?:_|\.pdf|$)', name, re.IGNORECASE)
    if aufgabe_match:
        result.aufgaben_nummer = aufgabe_match.group(1).upper()
        result.confidence += 0.1

    # Success requires at least subject and year.
    if result.fach and result.jahr:
        result.success = True

    # Cap the accumulated confidence at 1.0.
    result.confidence = min(result.confidence, 1.0)

    return result
|
||||
|
||||
|
||||
def _to_dokument_response(doc: AbiturDokument) -> DokumentResponse:
    """Convert an internal document record into its API response model."""
    # Both types share these field names; copy them one by one, in order.
    feldnamen = (
        "id",
        "dateiname",
        "original_dateiname",
        "bundesland",
        "fach",
        "jahr",
        "niveau",
        "typ",
        "aufgaben_nummer",
        "status",
        "confidence",
        "file_path",
        "file_size",
        "indexed",
        "vector_ids",
        "created_at",
        "updated_at",
    )
    werte = {feld: getattr(doc, feld) for feld in feldnamen}
    return DokumentResponse(**werte)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# API Endpoints - Dokumente
|
||||
# ============================================================================
|
||||
|
||||
@router.post("/upload", response_model=DokumentResponse)
async def upload_dokument(
    file: UploadFile = File(...),
    bundesland: Optional[Bundesland] = Form(None),
    fach: Optional[Fach] = Form(None),
    jahr: Optional[int] = Form(None),
    niveau: Optional[Niveau] = Form(None),
    typ: Optional[DokumentTyp] = Form(None),
    aufgaben_nummer: Optional[str] = Form(None)
):
    """
    Upload a single document.

    Metadata may be passed explicitly as form fields; anything omitted is
    taken from the file-name recognition, then from fixed defaults
    (Niedersachsen, current year, eA, Aufgabe).

    Raises:
        HTTPException 400: no file name, or the subject is neither given
            nor recognisable from the file name.
    """
    original_name = file.filename
    if not original_name:
        raise HTTPException(status_code=400, detail="Kein Dateiname")

    erkennung = parse_nibis_filename(original_name)

    # The subject is the only field without a default.
    final_fach = fach or erkennung.fach
    if not final_fach:
        raise HTTPException(status_code=400, detail="Fach konnte nicht erkannt werden")

    # Manual value > recognised value > default.
    final_bundesland = bundesland or erkennung.bundesland or Bundesland.NIEDERSACHSEN
    final_jahr = jahr or erkennung.jahr or datetime.now().year
    final_niveau = niveau or erkennung.niveau or Niveau.EA
    final_typ = typ or erkennung.typ or DokumentTyp.AUFGABE
    final_aufgabe = aufgaben_nummer or erkennung.aufgaben_nummer

    if erkennung.success:
        status = VerarbeitungsStatus.RECOGNIZED
    else:
        status = VerarbeitungsStatus.PENDING

    # Store under a generated name; only the original extension is kept.
    doc_id = str(uuid.uuid4())
    safe_filename = f"{doc_id}{Path(original_name).suffix}"
    file_path = DOCS_DIR / safe_filename

    content = await file.read()
    file_path.write_bytes(content)

    zeitstempel = datetime.utcnow()
    dokument = AbiturDokument(
        id=doc_id,
        dateiname=safe_filename,
        original_dateiname=original_name,
        bundesland=final_bundesland,
        fach=final_fach,
        jahr=final_jahr,
        niveau=final_niveau,
        typ=final_typ,
        aufgaben_nummer=final_aufgabe,
        status=status,
        confidence=erkennung.confidence,
        file_path=str(file_path),
        file_size=len(content),
        indexed=False,
        vector_ids=[],
        created_at=zeitstempel,
        updated_at=zeitstempel,
    )
    _dokumente[doc_id] = dokument

    logger.info(f"Uploaded document {doc_id}: {file.filename}")
    return _to_dokument_response(dokument)
|
||||
|
||||
|
||||
@router.post("/import-zip", response_model=ImportResult)
async def import_zip(
    file: UploadFile = File(...),
    bundesland: Bundesland = Form(Bundesland.NIEDERSACHSEN),
    background_tasks: BackgroundTasks = None
):
    """
    Import every PDF contained in an uploaded ZIP archive.

    Metadata is recognised from each entry's base name. Entries whose
    subject cannot be recognised, or that fail while being stored, are
    counted as errors and skipped.

    Args:
        file: The uploaded archive; its name must end in ".zip"
            (case-insensitive).
        bundesland: Federal state assigned to every imported document.
        background_tasks: Currently unused; kept for interface compatibility.

    Returns:
        ImportResult with the number of PDF entries considered, the number
        stored, the number of errors and the stored documents.

    Raises:
        HTTPException 400: the upload is not named *.zip or is not a
            readable ZIP archive.
    """
    # Case-insensitive, consistent with the ".pdf" check for the entries.
    if not file.filename or not file.filename.lower().endswith(".zip"):
        raise HTTPException(status_code=400, detail="ZIP-Datei erforderlich")

    # Spool the upload to a temporary file; removed in the finally below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name

    documents = []
    total = 0
    recognized = 0
    errors = 0

    try:
        try:
            zip_ref = zipfile.ZipFile(tmp_path, 'r')
        except zipfile.BadZipFile as exc:
            # A corrupt upload is a client error, not a server failure.
            raise HTTPException(status_code=400, detail="Ungültige ZIP-Datei") from exc

        with zip_ref:
            for zip_info in zip_ref.infolist():
                entry_name = zip_info.filename

                # PDFs only
                if not entry_name.lower().endswith(".pdf"):
                    continue

                # Skip macOS metadata and hidden files/directories at any
                # depth (e.g. "folder/._x.pdf"), not only at the archive root.
                if "__MACOSX" in entry_name or any(
                    part.startswith(".") for part in Path(entry_name).parts
                ):
                    continue

                # Skip Thumbs.db artefacts
                if "thumbs.db" in entry_name.lower():
                    continue

                total += 1

                try:
                    # Recognise metadata from the base name
                    basename = Path(entry_name).name
                    recognition = parse_nibis_filename(basename)

                    if not recognition.fach:
                        errors += 1
                        logger.warning(f"Konnte Fach nicht erkennen: {basename}")
                        continue

                    # Extract and store under a generated name
                    doc_id = str(uuid.uuid4())
                    file_ext = Path(basename).suffix
                    safe_filename = f"{doc_id}{file_ext}"
                    file_path = DOCS_DIR / safe_filename

                    with zip_ref.open(zip_info) as source:
                        file_content = source.read()
                    with open(file_path, "wb") as target:
                        target.write(file_content)

                    now = datetime.utcnow()

                    dokument = AbiturDokument(
                        id=doc_id,
                        dateiname=safe_filename,
                        original_dateiname=basename,
                        bundesland=bundesland,
                        fach=recognition.fach,
                        jahr=recognition.jahr or datetime.now().year,
                        niveau=recognition.niveau or Niveau.EA,
                        typ=recognition.typ or DokumentTyp.AUFGABE,
                        aufgaben_nummer=recognition.aufgaben_nummer,
                        status=VerarbeitungsStatus.RECOGNIZED,
                        confidence=recognition.confidence,
                        file_path=str(file_path),
                        file_size=len(file_content),
                        indexed=False,
                        vector_ids=[],
                        created_at=now,
                        updated_at=now
                    )

                    _dokumente[doc_id] = dokument
                    documents.append(_to_dokument_response(dokument))
                    recognized += 1

                except Exception as e:
                    # Best effort per entry: one broken file must not abort
                    # the whole import.
                    errors += 1
                    logger.error(f"Fehler bei {zip_info.filename}: {e}")

    finally:
        # Remove the temporary ZIP in every case, including the 400 above.
        os.unlink(tmp_path)

    logger.info(f"ZIP-Import: {recognized}/{total} erkannt, {errors} Fehler")

    return ImportResult(
        total_files=total,
        recognized=recognized,
        errors=errors,
        documents=documents
    )
|
||||
|
||||
|
||||
@router.get("/", response_model=List[DokumentResponse])
async def list_dokumente(
    bundesland: Optional[Bundesland] = None,
    fach: Optional[Fach] = None,
    jahr: Optional[int] = None,
    niveau: Optional[Niveau] = None,
    typ: Optional[DokumentTyp] = None,
    status: Optional[VerarbeitungsStatus] = None,
    indexed: Optional[bool] = None
):
    """List documents, optionally filtered; sorted descending by year, subject, level."""

    def _passt(d: AbiturDokument) -> bool:
        # A filter only applies when its value is truthy; `indexed` is the
        # exception because False is a meaningful filter value.
        if bundesland and not d.bundesland == bundesland:
            return False
        if fach and not d.fach == fach:
            return False
        if jahr and not d.jahr == jahr:
            return False
        if niveau and not d.niveau == niveau:
            return False
        if typ and not d.typ == typ:
            return False
        if status and not d.status == status:
            return False
        if indexed is not None and not d.indexed == indexed:
            return False
        return True

    treffer = [d for d in _dokumente.values() if _passt(d)]
    treffer.sort(key=lambda x: (x.jahr, x.fach.value, x.niveau.value), reverse=True)
    return list(map(_to_dokument_response, treffer))
|
||||
|
||||
|
||||
@router.get("/{doc_id}", response_model=DokumentResponse)
async def get_dokument(doc_id: str):
    """Return a single document; responds with 404 when the id is unknown."""
    gefunden = _dokumente.get(doc_id)
    if gefunden:
        return _to_dokument_response(gefunden)
    raise HTTPException(status_code=404, detail="Dokument nicht gefunden")
|
||||
|
||||
|
||||
@router.put("/{doc_id}", response_model=DokumentResponse)
async def update_dokument(doc_id: str, data: DokumentUpdate):
    """
    Update document metadata (e.g. correcting the automatic recognition).

    Only fields that are not None in ``data`` are written. Responds with
    404 when the id is unknown.
    """
    doc = _dokumente.get(doc_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    aenderbare_felder = (
        "bundesland",
        "fach",
        "jahr",
        "niveau",
        "typ",
        "aufgaben_nummer",
        "status",
    )
    for feld in aenderbare_felder:
        neuer_wert = getattr(data, feld)
        if neuer_wert is not None:
            setattr(doc, feld, neuer_wert)

    doc.updated_at = datetime.utcnow()
    return _to_dokument_response(doc)
|
||||
|
||||
|
||||
@router.post("/{doc_id}/confirm", response_model=DokumentResponse)
async def confirm_dokument(doc_id: str):
    """Mark a document's metadata as confirmed; 404 when the id is unknown."""
    doc = _dokumente.get(doc_id)
    if doc:
        doc.status = VerarbeitungsStatus.CONFIRMED
        doc.updated_at = datetime.utcnow()
        return _to_dokument_response(doc)
    raise HTTPException(status_code=404, detail="Dokument nicht gefunden")
|
||||
|
||||
|
||||
@router.post("/{doc_id}/index", response_model=DokumentResponse)
async def index_dokument(doc_id: str):
    """
    Mark a document as indexed in the vector store (demo implementation).

    Responds with 404 for an unknown id and with 400 unless the document
    is in state CONFIRMED or RECOGNIZED.
    """
    doc = _dokumente.get(doc_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    indexierbar = (VerarbeitungsStatus.CONFIRMED, VerarbeitungsStatus.RECOGNIZED)
    if doc.status not in indexierbar:
        raise HTTPException(status_code=400, detail="Dokument muss erst bestätigt werden")

    # TODO: vector store integration
    # 1. Read the PDF and extract its text
    # 2. Split the text into chunks
    # 3. Generate embeddings
    # 4. Store them with metadata in the vector store

    # Demo: simulate indexing with three placeholder vector ids.
    demo_ids = []
    for i in range(3):
        demo_ids.append(f"vec_{doc_id}_{i}")

    doc.indexed = True
    doc.vector_ids = demo_ids
    doc.status = VerarbeitungsStatus.INDEXED
    doc.updated_at = datetime.utcnow()

    logger.info(f"Document {doc_id} indexed (demo)")
    return _to_dokument_response(doc)
|
||||
|
||||
|
||||
@router.delete("/{doc_id}")
async def delete_dokument(doc_id: str):
    """
    Delete a document record and its stored file.

    Returns:
        ``{"status": "deleted", "id": doc_id}``.

    Raises:
        HTTPException 404: the id is unknown.
    """
    doc = _dokumente.get(doc_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    # Remove the file. Attempt the removal directly instead of checking for
    # existence first: the file may vanish between check and removal (e.g.
    # a /tmp cleaner), which must not turn the request into a 500.
    try:
        os.remove(doc.file_path)
    except FileNotFoundError:
        pass  # already gone - nothing left to clean up

    # TODO: remove from vector store

    del _dokumente[doc_id]

    return {"status": "deleted", "id": doc_id}
|
||||
|
||||
|
||||
@router.get("/{doc_id}/download")
async def download_dokument(doc_id: str):
    """
    Send the stored file under its original file name.

    Responds with 404 when the id is unknown or the file is missing on disk.
    """
    doc = _dokumente.get(doc_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    pfad = doc.file_path
    if os.path.exists(pfad):
        return FileResponse(pfad, filename=doc.original_dateiname, media_type="application/pdf")
    raise HTTPException(status_code=404, detail="Datei nicht gefunden")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# API Endpoints - Erkennung
|
||||
# ============================================================================
|
||||
|
||||
@router.post("/recognize", response_model=RecognitionResult)
async def recognize_filename(filename: str):
    """Run the file-name recognition on ``filename`` without storing anything."""
    erkennung = parse_nibis_filename(filename)
    return erkennung
|
||||
|
||||
|
||||
@router.post("/bulk-confirm")
async def bulk_confirm(doc_ids: List[str]):
    """
    Confirm several documents at once.

    Only known documents in state RECOGNIZED are changed; all other ids are
    skipped silently. Returns the number confirmed and the number requested.
    """
    confirmed = 0
    for doc_id in doc_ids:
        doc = _dokumente.get(doc_id)
        if not doc or doc.status != VerarbeitungsStatus.RECOGNIZED:
            continue
        doc.status = VerarbeitungsStatus.CONFIRMED
        doc.updated_at = datetime.utcnow()
        confirmed += 1

    return {"confirmed": confirmed, "total": len(doc_ids)}
|
||||
|
||||
|
||||
@router.post("/bulk-index")
async def bulk_index(doc_ids: List[str]):
    """
    Index several documents at once (demo implementation).

    Only known documents in state CONFIRMED or RECOGNIZED are changed; all
    other ids are skipped silently. Returns the number indexed and the
    number requested.
    """
    indexierbar = [VerarbeitungsStatus.CONFIRMED, VerarbeitungsStatus.RECOGNIZED]
    indexed = 0
    for doc_id in doc_ids:
        doc = _dokumente.get(doc_id)
        if not doc or doc.status not in indexierbar:
            continue
        # Demo indexing with three placeholder vector ids.
        doc.indexed = True
        doc.vector_ids = [f"vec_{doc_id}_{i}" for i in range(3)]
        doc.status = VerarbeitungsStatus.INDEXED
        doc.updated_at = datetime.utcnow()
        indexed += 1

    return {"indexed": indexed, "total": len(doc_ids)}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# API Endpoints - Statistiken
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/stats/overview")
async def get_stats_overview():
    """Return totals plus per-state, per-subject, per-year and per-status counts."""

    def _zaehle(schluessel):
        # Frequency table that keeps first-seen order of the keys.
        haeufigkeit = {}
        for s in schluessel:
            haeufigkeit[s] = haeufigkeit.get(s, 0) + 1
        return haeufigkeit

    docs = list(_dokumente.values())

    anzahl_indexed = len([d for d in docs if d.indexed])
    anzahl_pending = len([d for d in docs if d.status == VerarbeitungsStatus.PENDING])

    return {
        "total": len(docs),
        "indexed": anzahl_indexed,
        "pending": anzahl_pending,
        "by_bundesland": _zaehle(d.bundesland.value for d in docs),
        "by_fach": _zaehle(d.fach.value for d in docs),
        "by_jahr": _zaehle(d.jahr for d in docs),
        "by_status": _zaehle(d.status.value for d in docs),
    }
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# API Endpoints - Suche (für Klausur-Korrektur)
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/search", response_model=List[DokumentResponse])
async def search_dokumente(
    bundesland: Bundesland,
    fach: Fach,
    jahr: Optional[int] = None,
    niveau: Optional[Niveau] = None,
    nur_indexed: bool = True
):
    """
    Search documents for exam correction.

    Args:
        bundesland: Required filter.
        fach: Required filter.
        jahr: Optional filter (ignored when falsy).
        niveau: Optional filter.
        nur_indexed: When True (default) only indexed documents are returned.

    Returns:
        Each task (Aufgabe) immediately followed by its expectation horizon
        (same year, level and task number) if one exists, then the remaining
        expectation horizons, then all other document types. Every document
        appears at most once.
    """
    # Required filters
    docs = [
        d for d in _dokumente.values()
        if d.bundesland == bundesland and d.fach == fach
    ]

    # Optional filters
    if jahr:
        docs = [d for d in docs if d.jahr == jahr]
    if niveau:
        docs = [d for d in docs if d.niveau == niveau]
    if nur_indexed:
        docs = [d for d in docs if d.indexed]

    aufgaben = [d for d in docs if d.typ == DokumentTyp.AUFGABE]
    ewh = [d for d in docs if d.typ == DokumentTyp.ERWARTUNGSHORIZONT]
    andere = [d for d in docs if d.typ not in [DokumentTyp.AUFGABE, DokumentTyp.ERWARTUNGSHORIZONT]]

    result = []
    # Ids of expectation horizons already placed next to a task. Tracking
    # ids avoids emitting the same horizon for several tasks and replaces
    # the former "response in result" scan, which rebuilt and compared
    # whole response models for every candidate.
    paired_ewh_ids = set()

    for aufgabe in aufgaben:
        result.append(_to_dokument_response(aufgabe))
        # First not-yet-used horizon that belongs to this task
        matching_ewh = next(
            (e for e in ewh
             if e.id not in paired_ewh_ids
             and e.jahr == aufgabe.jahr
             and e.niveau == aufgabe.niveau
             and e.aufgaben_nummer == aufgabe.aufgaben_nummer),
            None
        )
        if matching_ewh:
            paired_ewh_ids.add(matching_ewh.id)
            result.append(_to_dokument_response(matching_ewh))

    # Remaining horizons, then everything else
    result.extend(_to_dokument_response(e) for e in ewh if e.id not in paired_ewh_ids)
    result.extend(_to_dokument_response(a) for a in andere)

    return result


# Routes are matched in registration order. GET "/{doc_id}" is registered
# earlier in this module and matches any single path segment, so a request
# for "/search" would be answered by get_dokument (404) and never reach
# search_dokumente. Move the route that was just registered to the front.
# NOTE(review): the cleaner fix is to define this endpoint above
# get_dokument; this line can be dropped once the definition is moved.
router.routes.insert(0, router.routes.pop())
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Enums Endpoint (für Frontend)
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/enums/bundeslaender")
async def get_bundeslaender():
    """
    Return all federal states as ``{"value", "label"}`` entries.

    Labels are the proper German names. States whose name cannot be derived
    from the enum value (umlauts, hyphens) are spelled out explicitly; the
    rest fall back to the title-cased value.
    """
    labels = {
        Bundesland.BADEN_WUERTTEMBERG: "Baden-Württemberg",
        Bundesland.NORDRHEIN_WESTFALEN: "Nordrhein-Westfalen",
        Bundesland.THUERINGEN: "Thüringen",
        Bundesland.SCHLESWIG_HOLSTEIN: "Schleswig-Holstein",
        Bundesland.MECKLENBURG_VORPOMMERN: "Mecklenburg-Vorpommern",
        Bundesland.SACHSEN_ANHALT: "Sachsen-Anhalt",
        Bundesland.RHEINLAND_PFALZ: "Rheinland-Pfalz",
    }
    return [
        {"value": b.value, "label": labels.get(b, b.value.replace("_", " ").title())}
        for b in Bundesland
    ]
|
||||
|
||||
|
||||
@router.get("/enums/faecher")
async def get_faecher():
    """Return all subjects as ``{"value", "label"}`` entries, in enum order."""
    anzeige_namen = {
        Fach.DEUTSCH: "Deutsch",
        Fach.ENGLISCH: "Englisch",
        Fach.MATHEMATIK: "Mathematik",
        Fach.BIOLOGIE: "Biologie",
        Fach.CHEMIE: "Chemie",
        Fach.PHYSIK: "Physik",
        Fach.GESCHICHTE: "Geschichte",
        Fach.ERDKUNDE: "Erdkunde",
        Fach.POLITIK_WIRTSCHAFT: "Politik-Wirtschaft",
        Fach.FRANZOESISCH: "Französisch",
        Fach.SPANISCH: "Spanisch",
        Fach.LATEIN: "Latein",
        Fach.GRIECHISCH: "Griechisch",
        Fach.KUNST: "Kunst",
        Fach.MUSIK: "Musik",
        Fach.SPORT: "Sport",
        Fach.INFORMATIK: "Informatik",
        Fach.EV_RELIGION: "Ev. Religion",
        Fach.KATH_RELIGION: "Kath. Religion",
        Fach.WERTE_NORMEN: "Werte und Normen",
        Fach.BRC: "BRC (Betriebswirtschaft)",
        Fach.BVW: "BVW (Volkswirtschaft)",
        Fach.ERNAEHRUNG: "Ernährung",
        Fach.MECHATRONIK: "Mechatronik",
        Fach.GESUNDHEIT_PFLEGE: "Gesundheit-Pflege",
        Fach.PAEDAGOGIK_PSYCHOLOGIE: "Pädagogik-Psychologie",
    }
    eintraege = []
    for f in Fach:
        # Fall back to the raw value for members without a display name.
        eintraege.append({"value": f.value, "label": anzeige_namen.get(f, f.value)})
    return eintraege
|
||||
|
||||
|
||||
@router.get("/enums/niveaus")
async def get_niveaus():
    """Return both requirement levels as ``{"value", "label"}`` entries."""
    paare = (
        ("eA", "eA (erhöhtes Anforderungsniveau)"),
        ("gA", "gA (grundlegendes Anforderungsniveau)"),
    )
    return [{"value": wert, "label": label} for wert, label in paare]
|
||||
|
||||
|
||||
@router.get("/enums/typen")
async def get_typen():
    """Return all document types as ``{"value", "label"}`` entries, in enum order."""
    anzeige_namen = {
        DokumentTyp.AUFGABE: "Aufgabe",
        DokumentTyp.ERWARTUNGSHORIZONT: "Erwartungshorizont",
        DokumentTyp.DECKBLATT: "Deckblatt",
        DokumentTyp.MATERIAL: "Material",
        DokumentTyp.HOERVERSTEHEN: "Hörverstehen",
        DokumentTyp.SPRACHMITTLUNG: "Sprachmittlung",
        DokumentTyp.BEWERTUNGSBOGEN: "Bewertungsbogen",
    }
    eintraege = []
    for t in DokumentTyp:
        # Fall back to the raw value for members without a display name.
        eintraege.append({"value": t.value, "label": anzeige_namen.get(t, t.value)})
    return eintraege
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Backwards-compatibility aliases (used by tests)
|
||||
# ============================================================================
|
||||
# Old names kept as aliases; they refer to the very same objects.
AbiturFach = Fach
Anforderungsniveau = Niveau
documents_db = _dokumente  # same dict object as the in-memory store
|
||||
|
||||
|
||||
class DocumentMetadata(BaseModel):
    """Backwards-compatible metadata model (used by tests); all fields optional, plain types."""

    jahr: Optional[int] = None
    bundesland: Optional[str] = None
    fach: Optional[str] = None
    niveau: Optional[str] = None
    dokument_typ: Optional[str] = None
    aufgaben_nummer: Optional[str] = None
|
||||
|
||||
|
||||
# Backwards-compatible AbiturDokument for tests (different from internal dataclass)
|
||||
class AbiturDokumentCompat(BaseModel):
    """Backwards-compatible document model (used by tests); distinct from the internal dataclass."""

    id: str
    filename: str
    file_path: str
    metadata: DocumentMetadata
    status: VerarbeitungsStatus
    recognition_result: Optional[RecognitionResult] = None
    created_at: datetime
    updated_at: datetime

    class Config:
        # Permit field types pydantic has no built-in validator for.
        arbitrary_types_allowed = True
|
||||
Reference in New Issue
Block a user