[split-required] Split 500-1000 LOC files across all services

backend-lehrer (5 files):
- alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3)
- teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3)
- mail/mail_db.py (987 → 6)

klausur-service (5 files):
- legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4)
- ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2)
- KorrekturPage.tsx (956 → 6)

website (5 pages):
- mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7)
- ocr-labeling (946 → 7), audit-workspace (871 → 4)

studio-v2 (5 files + 1 deleted):
- page.tsx (946 → 5), MessagesContext.tsx (925 → 4)
- korrektur (914 → 6), worksheet-cleanup (899 → 6)
- useVocabWorksheet.ts (888 → 3)
- Deleted dead page-original.tsx (934 LOC)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 23:35:37 +02:00
parent 6811264756
commit b6983ab1dc
99 changed files with 13484 additions and 16106 deletions
--- a/backend-lehrer/abitur_docs_recognition.py
+++ b/backend-lehrer/abitur_docs_recognition.py
@@ -0,0 +1,124 @@
+"""
+Abitur Document Store - Dateinamen-Erkennung und Helfer.
+
+Erkennt Metadaten aus NiBiS-Dateinamen (Niedersachsen).
+"""
+
+import re
+from typing import Dict, Any
+from pathlib import Path
+
+from abitur_docs_models import (
+    Bundesland, Fach, Niveau, DokumentTyp, VerarbeitungsStatus,
+    RecognitionResult, AbiturDokument, DokumentResponse,
+    FACH_NAME_MAPPING,
+)
+
+
+def parse_nibis_filename(filename: str) -> RecognitionResult:
+    """
+    Recognize metadata from a NiBiS file name (Lower Saxony).
+
+    The last suffix is dropped and the name is lower-cased before any
+    matching, so detection does not depend on the case of the input.
+
+    Examples:
+    - 2025_Deutsch_eA_I.pdf
+    - 2025_Deutsch_eA_I_EWH.pdf
+    - 2025_Biologie_gA_1.pdf
+    - 2025_Englisch_eA_HV.pdf (listening comprehension)
+
+    Args:
+        filename: File name or path; only the stem of the final path
+            component is inspected.
+
+    Returns:
+        A RecognitionResult whose ``bundesland`` is always NIEDERSACHSEN
+        and whose ``raw_filename`` is the unmodified input. ``success`` is
+        True only when both subject and year were found. ``typ`` falls
+        back to AUFGABE when no other type marker matches. ``confidence``
+        is the sum of the per-field weights, capped at 1.0 and rounded to
+        two decimals.
+    """
+    result = RecognitionResult(
+        success=False,
+        bundesland=Bundesland.NIEDERSACHSEN,
+        fach=None,
+        jahr=None,
+        niveau=None,
+        typ=None,
+        aufgaben_nummer=None,
+        confidence=0.0,
+        raw_filename=filename,
+        suggestions=[]
+    )
+
+    # Normalize: strip directory and extension, compare in lower case.
+    name = Path(filename).stem.lower()
+
+    # Accumulated locally and written to the result once at the end, so the
+    # stored value can be cleaned of float noise in a single place.
+    confidence = 0.0
+
+    # Year: four digits at the very start of the name.
+    jahr_match = re.match(r'^(\d{4})', name)
+    if jahr_match:
+        result.jahr = int(jahr_match.group(1))
+        confidence += 0.2
+
+    # Subject: mapping keys are searched in the name with "_" and "-"
+    # removed. The stripped name is loop-invariant, so build it once.
+    # The first key (in mapping order) found in the name wins.
+    compact_name = name.replace("_", "").replace("-", "")
+    for fach_key, fach_enum in FACH_NAME_MAPPING.items():
+        if fach_key in compact_name:
+            result.fach = fach_enum
+            confidence += 0.3
+            break
+
+    # Level (eA/gA). A separate "_ea_" / "_ga_" test is unnecessary because
+    # any name containing it also contains "_ea" / "_ga".
+    if "_ea" in name or "ea_" in name:
+        result.niveau = Niveau.EA
+        confidence += 0.2
+    elif "_ga" in name or "ga_" in name:
+        result.niveau = Niveau.GA
+        confidence += 0.2
+
+    # Document type: first matching marker wins; plain task is the fallback
+    # and contributes the lowest weight.
+    if "_ewh" in name:
+        result.typ = DokumentTyp.ERWARTUNGSHORIZONT
+        confidence += 0.2
+    elif "_hv" in name or "hoerverstehen" in name:
+        result.typ = DokumentTyp.HOERVERSTEHEN
+        confidence += 0.15
+    elif "_sm" in name or "_me" in name or "sprachmittlung" in name:
+        result.typ = DokumentTyp.SPRACHMITTLUNG
+        confidence += 0.15
+    elif "deckblatt" in name:
+        result.typ = DokumentTyp.DECKBLATT
+        confidence += 0.15
+    elif "material" in name:
+        result.typ = DokumentTyp.MATERIAL
+        confidence += 0.15
+    elif "bewertung" in name:
+        result.typ = DokumentTyp.BEWERTUNGSBOGEN
+        confidence += 0.15
+    else:
+        result.typ = DokumentTyp.AUFGABE
+        confidence += 0.1
+
+    # Task number: a roman numeral (i/v/x) or 1-4 with optional a/b/c,
+    # standing as its own "_"-delimited token. Stored upper-cased.
+    aufgabe_match = re.search(r'_([ivx]+|[1-4][abc]?)(?:_|\.pdf|$)', name, re.IGNORECASE)
+    if aufgabe_match:
+        result.aufgaben_nummer = aufgabe_match.group(1).upper()
+        confidence += 0.1
+
+    # Success requires at least subject and year.
+    if result.fach and result.jahr:
+        result.success = True
+
+    # Cap at 1.0 and round: summing binary floats can land just below the
+    # intended value (e.g. 0.8999999999999999 instead of 0.9), which would
+    # break threshold comparisons on the caller side.
+    result.confidence = round(min(confidence, 1.0), 2)
+
+    return result
+
+
+def to_dokument_response(doc: AbiturDokument) -> DokumentResponse:
+    """
+    Convert an internal document into its response model.
+
+    Each response field is read from the attribute of the same name on
+    ``doc``; no value is transformed on the way.
+    """
+    # Field names shared by the internal document and the response model,
+    # in the order they are read from ``doc``.
+    copied_fields = (
+        "id",
+        "dateiname",
+        "original_dateiname",
+        "bundesland",
+        "fach",
+        "jahr",
+        "niveau",
+        "typ",
+        "aufgaben_nummer",
+        "status",
+        "confidence",
+        "file_path",
+        "file_size",
+        "indexed",
+        "vector_ids",
+        "created_at",
+        "updated_at",
+    )
+    payload = {field: getattr(doc, field) for field in copied_fields}
+    return DokumentResponse(**payload)