backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
414 lines
15 KiB
Python
414 lines
15 KiB
Python
"""
|
|
Abitur Document Store API - Verwaltung von Abitur-Aufgaben und Erwartungshorizonten.
|
|
|
|
Unterstützt:
|
|
- Bundesland-spezifische Dokumente
|
|
- Fach, Jahr, Niveau (eA/gA), Aufgabennummer
|
|
- KI-basierte Dokumentenerkennung
|
|
- RAG-Integration mit Vector Store
|
|
|
|
Dateinamen-Schema (NiBiS Niedersachsen):
|
|
- 2025_Deutsch_eA_I.pdf - Aufgabe
|
|
- 2025_Deutsch_eA_I_EWH.pdf - Erwartungshorizont
|
|
"""
|
|
|
|
import logging
|
|
import uuid
|
|
import os
|
|
import zipfile
|
|
import tempfile
|
|
from datetime import datetime
|
|
from typing import List, Optional, Dict, Any
|
|
from pathlib import Path
|
|
|
|
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
|
|
from fastapi.responses import FileResponse
|
|
|
|
from abitur_docs_models import (
|
|
Bundesland, Fach, Niveau, DokumentTyp, VerarbeitungsStatus,
|
|
DokumentCreate, DokumentUpdate, DokumentResponse, ImportResult,
|
|
RecognitionResult, AbiturDokument,
|
|
FACH_LABELS, DOKUMENT_TYP_LABELS,
|
|
# Backwards-compatibility re-exports
|
|
AbiturFach, Anforderungsniveau, DocumentMetadata, AbiturDokumentCompat,
|
|
)
|
|
from abitur_docs_recognition import parse_nibis_filename, to_dokument_response
|
|
|
|
logger = logging.getLogger(__name__)

router = APIRouter(
    prefix="/abitur-docs",
    tags=["abitur-docs"],
)

# Storage directory for uploaded PDF files.
# NOTE(review): /tmp is volatile — stored documents are lost on reboot;
# confirm this is intentional (demo/ephemeral setup).
DOCS_DIR = Path("/tmp/abitur-docs")
DOCS_DIR.mkdir(parents=True, exist_ok=True)  # side effect at import time: creates the directory

# In-Memory Storage: document store keyed by document id (uuid4 string).
# Not persisted and not synchronized — suitable for a single-process demo only.
_dokumente: Dict[str, AbiturDokument] = {}

# Backwards-compatibility alias for older callers that import `documents_db`.
documents_db = _dokumente
|
|
|
|
|
|
# ============================================================================
# Private helper (kept as a module-local alias for backwards compatibility;
# despite the older comment, it no longer references _dokumente)
# ============================================================================

def _to_dokument_response(doc: AbiturDokument) -> DokumentResponse:
    """Convert an AbiturDokument into its API response model.

    Thin passthrough to abitur_docs_recognition.to_dokument_response.
    """
    return to_dokument_response(doc)
|
|
|
|
|
|
# ============================================================================
|
|
# API Endpoints - Dokumente
|
|
# ============================================================================
|
|
|
|
@router.post("/upload", response_model=DokumentResponse)
async def upload_dokument(
    file: UploadFile = File(...),
    bundesland: Optional[Bundesland] = Form(None),
    fach: Optional[Fach] = Form(None),
    jahr: Optional[int] = Form(None),
    niveau: Optional[Niveau] = Form(None),
    typ: Optional[DokumentTyp] = Form(None),
    aufgaben_nummer: Optional[str] = Form(None)
):
    """Lädt ein einzelnes Dokument hoch."""
    # Filename is required: metadata recognition is driven by it.
    if not file.filename:
        raise HTTPException(status_code=400, detail="Kein Dateiname")

    # Derive metadata from the NiBiS filename scheme; explicitly supplied
    # form fields always take precedence over recognized values.
    recognition = parse_nibis_filename(file.filename)

    meta_bundesland = bundesland or recognition.bundesland or Bundesland.NIEDERSACHSEN
    meta_fach = fach or recognition.fach
    meta_jahr = jahr or recognition.jahr or datetime.now().year
    meta_niveau = niveau or recognition.niveau or Niveau.EA
    meta_typ = typ or recognition.typ or DokumentTyp.AUFGABE
    meta_aufgabe = aufgaben_nummer or recognition.aufgaben_nummer

    # The subject is the only field without a fallback default.
    if not meta_fach:
        raise HTTPException(status_code=400, detail="Fach konnte nicht erkannt werden")

    # Persist under a uuid-based name to avoid collisions and unsafe names.
    doc_id = str(uuid.uuid4())
    stored_name = f"{doc_id}{Path(file.filename).suffix}"
    target_path = DOCS_DIR / stored_name

    payload = await file.read()
    target_path.write_bytes(payload)

    timestamp = datetime.utcnow()
    initial_status = (
        VerarbeitungsStatus.RECOGNIZED if recognition.success else VerarbeitungsStatus.PENDING
    )
    dokument = AbiturDokument(
        id=doc_id,
        dateiname=stored_name,
        original_dateiname=file.filename,
        bundesland=meta_bundesland,
        fach=meta_fach,
        jahr=meta_jahr,
        niveau=meta_niveau,
        typ=meta_typ,
        aufgaben_nummer=meta_aufgabe,
        status=initial_status,
        confidence=recognition.confidence,
        file_path=str(target_path),
        file_size=len(payload),
        indexed=False,
        vector_ids=[],
        created_at=timestamp,
        updated_at=timestamp,
    )
    _dokumente[doc_id] = dokument
    logger.info(f"Uploaded document {doc_id}: {file.filename}")
    return _to_dokument_response(dokument)
|
|
|
|
|
|
@router.post("/import-zip", response_model=ImportResult)
async def import_zip(
    file: UploadFile = File(...),
    bundesland: Bundesland = Form(Bundesland.NIEDERSACHSEN),
    background_tasks: BackgroundTasks = None  # currently unused; kept for interface compatibility
):
    """Importiert alle PDFs aus einer ZIP-Datei."""
    if not file.filename or not file.filename.endswith(".zip"):
        raise HTTPException(status_code=400, detail="ZIP-Datei erforderlich")

    # Spool the upload to a temp file so zipfile can seek; removed in finally.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name

    documents: List[DokumentResponse] = []
    total = 0
    recognized = 0
    errors = 0

    try:
        with zipfile.ZipFile(tmp_path, 'r') as zip_ref:
            for zip_info in zip_ref.infolist():
                name = zip_info.filename
                basename = Path(name).name
                if not name.lower().endswith(".pdf"):
                    continue
                # Skip macOS metadata and hidden files. BUGFIX: also check the
                # basename, so dotfiles inside subdirectories (e.g. AppleDouble
                # "sub/._scan.pdf") are skipped — the old check only caught
                # names starting with "." at the archive root.
                if "__MACOSX" in name or name.startswith(".") or basename.startswith("."):
                    continue
                if "thumbs.db" in name.lower():
                    continue

                total += 1
                try:
                    recognition = parse_nibis_filename(basename)
                    if not recognition.fach:
                        errors += 1
                        logger.warning(f"Konnte Fach nicht erkennen: {basename}")
                        continue

                    # Store under a uuid-based name (same scheme as /upload).
                    doc_id = str(uuid.uuid4())
                    safe_filename = f"{doc_id}{Path(basename).suffix}"
                    file_path = DOCS_DIR / safe_filename

                    with zip_ref.open(zip_info.filename) as source:
                        file_content = source.read()
                    with open(file_path, "wb") as target:
                        target.write(file_content)

                    now = datetime.utcnow()
                    dokument = AbiturDokument(
                        id=doc_id, dateiname=safe_filename, original_dateiname=basename,
                        bundesland=bundesland, fach=recognition.fach,
                        jahr=recognition.jahr or datetime.now().year,
                        niveau=recognition.niveau or Niveau.EA,
                        typ=recognition.typ or DokumentTyp.AUFGABE,
                        aufgaben_nummer=recognition.aufgaben_nummer,
                        status=VerarbeitungsStatus.RECOGNIZED, confidence=recognition.confidence,
                        file_path=str(file_path), file_size=len(file_content),
                        indexed=False, vector_ids=[], created_at=now, updated_at=now
                    )
                    _dokumente[doc_id] = dokument
                    documents.append(_to_dokument_response(dokument))
                    recognized += 1
                except Exception as e:
                    # Per-entry failure must not abort the whole import.
                    errors += 1
                    logger.error(f"Fehler bei {zip_info.filename}: {e}")
    except zipfile.BadZipFile:
        # BUGFIX: a corrupt/non-zip payload previously escaped as a 500.
        raise HTTPException(status_code=400, detail="Ungültige ZIP-Datei")
    finally:
        os.unlink(tmp_path)

    logger.info(f"ZIP-Import: {recognized}/{total} erkannt, {errors} Fehler")
    return ImportResult(total_files=total, recognized=recognized, errors=errors, documents=documents)
|
|
|
|
|
|
@router.get("/", response_model=List[DokumentResponse])
async def list_dokumente(
    bundesland: Optional[Bundesland] = None, fach: Optional[Fach] = None,
    jahr: Optional[int] = None, niveau: Optional[Niveau] = None,
    typ: Optional[DokumentTyp] = None, status: Optional[VerarbeitungsStatus] = None,
    indexed: Optional[bool] = None
):
    """Listet Dokumente mit optionalen Filtern."""
    def matches(d: AbiturDokument) -> bool:
        # Each filter applies only when the corresponding query param is set.
        if bundesland and d.bundesland != bundesland:
            return False
        if fach and d.fach != fach:
            return False
        if jahr and d.jahr != jahr:
            return False
        if niveau and d.niveau != niveau:
            return False
        if typ and d.typ != typ:
            return False
        if status and d.status != status:
            return False
        if indexed is not None and d.indexed != indexed:
            return False
        return True

    selected = [d for d in _dokumente.values() if matches(d)]
    # Newest first: descending by (Jahr, Fach, Niveau).
    selected.sort(key=lambda d: (d.jahr, d.fach.value, d.niveau.value), reverse=True)
    return [_to_dokument_response(d) for d in selected]
|
|
|
|
|
|
@router.get("/{doc_id}", response_model=DokumentResponse)
async def get_dokument(doc_id: str):
    """Ruft ein Dokument ab."""
    # EAFP: look up directly, translate a miss into a 404.
    try:
        doc = _dokumente[doc_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")
    return _to_dokument_response(doc)
|
|
|
|
|
|
@router.put("/{doc_id}", response_model=DokumentResponse)
async def update_dokument(doc_id: str, data: DokumentUpdate):
    """Aktualisiert Dokument-Metadaten."""
    doc = _dokumente.get(doc_id)
    if doc is None:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    # Copy every field that was explicitly provided (None means "unchanged").
    updatable = ("bundesland", "fach", "jahr", "niveau", "typ", "aufgaben_nummer", "status")
    for field_name in updatable:
        value = getattr(data, field_name)
        if value is not None:
            setattr(doc, field_name, value)

    doc.updated_at = datetime.utcnow()
    return _to_dokument_response(doc)
|
|
|
|
|
|
@router.post("/{doc_id}/confirm", response_model=DokumentResponse)
async def confirm_dokument(doc_id: str):
    """Bestätigt erkannte Metadaten."""
    try:
        doc = _dokumente[doc_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")
    # Mark the recognized metadata as user-confirmed.
    doc.status = VerarbeitungsStatus.CONFIRMED
    doc.updated_at = datetime.utcnow()
    return _to_dokument_response(doc)
|
|
|
|
|
|
@router.post("/{doc_id}/index", response_model=DokumentResponse)
async def index_dokument(doc_id: str):
    """Indiziert Dokument im Vector Store."""
    doc = _dokumente.get(doc_id)
    if doc is None:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")

    # Only confirmed (or freshly recognized) documents may be indexed.
    indexable = (VerarbeitungsStatus.CONFIRMED, VerarbeitungsStatus.RECOGNIZED)
    if doc.status not in indexable:
        raise HTTPException(status_code=400, detail="Dokument muss erst bestätigt werden")

    # Demo implementation: fabricates three vector ids instead of calling a
    # real vector store.
    doc.indexed = True
    doc.vector_ids = [f"vec_{doc_id}_{i}" for i in range(3)]
    doc.status = VerarbeitungsStatus.INDEXED
    doc.updated_at = datetime.utcnow()
    logger.info(f"Document {doc_id} indexed (demo)")
    return _to_dokument_response(doc)
|
|
|
|
|
|
@router.delete("/{doc_id}")
async def delete_dokument(doc_id: str):
    """Löscht ein Dokument."""
    doc = _dokumente.get(doc_id)
    if not doc:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")
    # Remove the stored file. unlink(missing_ok=True) is atomic with respect
    # to the file's existence — fixes the exists()/remove() TOCTOU race of the
    # previous implementation (a concurrent delete could raise FileNotFoundError).
    Path(doc.file_path).unlink(missing_ok=True)
    del _dokumente[doc_id]
    return {"status": "deleted", "id": doc_id}
|
|
|
|
|
|
@router.get("/{doc_id}/download")
async def download_dokument(doc_id: str):
    """Lädt Dokument herunter."""
    try:
        doc = _dokumente[doc_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Dokument nicht gefunden")
    # The metadata entry can outlive the file (e.g. external cleanup of /tmp).
    if not Path(doc.file_path).exists():
        raise HTTPException(status_code=404, detail="Datei nicht gefunden")
    return FileResponse(doc.file_path, filename=doc.original_dateiname, media_type="application/pdf")
|
|
|
|
|
|
@router.post("/recognize", response_model=RecognitionResult)
async def recognize_filename(filename: str):
    """Erkennt Metadaten aus einem Dateinamen."""
    # Thin passthrough to the NiBiS filename parser; touches no stored state,
    # so it can be used as a dry-run preview before uploading.
    return parse_nibis_filename(filename)
|
|
|
|
|
|
@router.post("/bulk-confirm")
async def bulk_confirm(doc_ids: List[str]):
    """Bestätigt mehrere Dokumente auf einmal."""
    confirmed = 0
    for doc in (_dokumente.get(doc_id) for doc_id in doc_ids):
        # Unknown ids and documents not in RECOGNIZED state are skipped silently.
        if doc is None or doc.status != VerarbeitungsStatus.RECOGNIZED:
            continue
        doc.status = VerarbeitungsStatus.CONFIRMED
        doc.updated_at = datetime.utcnow()
        confirmed += 1
    return {"confirmed": confirmed, "total": len(doc_ids)}
|
|
|
|
|
|
@router.post("/bulk-index")
async def bulk_index(doc_ids: List[str]):
    """Indiziert mehrere Dokumente auf einmal."""
    indexable = (VerarbeitungsStatus.CONFIRMED, VerarbeitungsStatus.RECOGNIZED)
    indexed = 0
    for doc_id in doc_ids:
        doc = _dokumente.get(doc_id)
        # Unknown ids and non-indexable states are skipped silently.
        if doc is None or doc.status not in indexable:
            continue
        # Demo indexing: fabricate three vector ids per document.
        doc.indexed = True
        doc.vector_ids = [f"vec_{doc_id}_{i}" for i in range(3)]
        doc.status = VerarbeitungsStatus.INDEXED
        doc.updated_at = datetime.utcnow()
        indexed += 1
    return {"indexed": indexed, "total": len(doc_ids)}
|
|
|
|
|
|
@router.get("/stats/overview")
async def get_stats_overview():
    """Gibt Übersicht über alle Dokumente.

    Returns total/indexed/pending counts plus per-Bundesland, per-Fach,
    per-Jahr and per-Status histograms.
    """
    from collections import Counter  # local import: only this endpoint needs it

    docs = list(_dokumente.values())
    # Counter replaces the four hand-rolled `d.get(k, 0) + 1` tallies; converted
    # back to plain dicts so the JSON payload shape is unchanged (insertion
    # order — first occurrence — is preserved either way).
    by_bundesland = dict(Counter(d.bundesland.value for d in docs))
    by_fach = dict(Counter(d.fach.value for d in docs))
    by_jahr = dict(Counter(d.jahr for d in docs))
    by_status = dict(Counter(d.status.value for d in docs))
    return {
        "total": len(docs), "indexed": sum(1 for d in docs if d.indexed),
        "pending": sum(1 for d in docs if d.status == VerarbeitungsStatus.PENDING),
        "by_bundesland": by_bundesland, "by_fach": by_fach, "by_jahr": by_jahr, "by_status": by_status
    }
|
|
|
|
|
|
@router.get("/search", response_model=List[DokumentResponse])
async def search_dokumente(
    bundesland: Bundesland, fach: Fach, jahr: Optional[int] = None,
    niveau: Optional[Niveau] = None, nur_indexed: bool = True
):
    """Sucht Dokumente für Klausur-Korrektur."""
    # NOTE(review): this route is registered AFTER GET "/{doc_id}", so FastAPI
    # matches "/search" as doc_id="search" and this handler is unreachable.
    # Fixing that requires moving this registration above get_dokument — it
    # cannot be done inside this function alone.
    docs = [d for d in _dokumente.values() if d.bundesland == bundesland and d.fach == fach]
    if jahr:
        docs = [d for d in docs if d.jahr == jahr]
    if niveau:
        docs = [d for d in docs if d.niveau == niveau]
    if nur_indexed:
        docs = [d for d in docs if d.indexed]

    aufgaben = [d for d in docs if d.typ == DokumentTyp.AUFGABE]
    ewh = [d for d in docs if d.typ == DokumentTyp.ERWARTUNGSHORIZONT]
    andere = [d for d in docs if d.typ not in [DokumentTyp.AUFGABE, DokumentTyp.ERWARTUNGSHORIZONT]]

    result: List[DokumentResponse] = []
    paired_ewh_ids: set = set()

    # Each Aufgabe is followed directly by its matching Erwartungshorizont
    # (same Jahr, Niveau and Aufgabennummer), if one exists.
    for aufgabe in aufgaben:
        result.append(_to_dokument_response(aufgabe))
        matching_ewh = next(
            (e for e in ewh if e.jahr == aufgabe.jahr and e.niveau == aufgabe.niveau
             and e.aufgaben_nummer == aufgabe.aufgaben_nummer), None
        )
        if matching_ewh:
            result.append(_to_dokument_response(matching_ewh))
            paired_ewh_ids.add(matching_ewh.id)

    # Append the remaining, unpaired Erwartungshorizonte. Tracking document ids
    # replaces the old O(n^2) `response not in result` scan, which also rebuilt
    # the response model for every membership test.
    for e in ewh:
        if e.id not in paired_ewh_ids:
            paired_ewh_ids.add(e.id)
            result.append(_to_dokument_response(e))

    for a in andere:
        result.append(_to_dokument_response(a))
    return result
|
|
|
|
|
|
@router.get("/enums/bundeslaender")
async def get_bundeslaender():
    """Gibt alle Bundesländer zurück."""
    entries = []
    for land in Bundesland:
        # Human-readable label, e.g. "nordrhein_westfalen" -> "Nordrhein Westfalen".
        label = land.value.replace("_", " ").title()
        entries.append({"value": land.value, "label": label})
    return entries
|
|
|
|
|
|
@router.get("/enums/faecher")
async def get_faecher():
    """Gibt alle Fächer zurück."""
    entries = []
    for fach in Fach:
        # Fall back to the raw enum value when no display label is configured.
        entries.append({"value": fach.value, "label": FACH_LABELS.get(fach, fach.value)})
    return entries
|
|
|
|
|
|
@router.get("/enums/niveaus")
async def get_niveaus():
    """Gibt alle Niveaus zurück."""
    # Static list: the two Abitur demand levels (eA/gA).
    levels = [
        ("eA", "eA (erhöhtes Anforderungsniveau)"),
        ("gA", "gA (grundlegendes Anforderungsniveau)"),
    ]
    return [{"value": value, "label": label} for value, label in levels]
|
|
|
|
|
|
@router.get("/enums/typen")
async def get_typen():
    """Gibt alle Dokumenttypen zurück."""
    typen = []
    for typ in DokumentTyp:
        # Fall back to the raw enum value when no display label is configured.
        typen.append({"value": typ.value, "label": DOKUMENT_TYP_LABELS.get(typ, typ.value)})
    return typen
|