[split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -10,12 +10,11 @@ Unterstützt:
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
import hashlib
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from typing import List, Dict, Optional
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import datetime
|
||||
import asyncio
|
||||
@@ -23,6 +22,7 @@ import asyncio
|
||||
# Local imports
|
||||
from eh_pipeline import chunk_text, generate_embeddings, extract_text_from_pdf, get_vector_size, EMBEDDING_BACKEND
|
||||
from qdrant_service import QdrantService
|
||||
from nibis_parsers import parse_filename_old_format, parse_filename_new_format
|
||||
|
||||
# Configuration
# Root directory containing the za-download archives. Overridable via the
# DOCS_BASE_PATH environment variable; falls back to the original hard-coded
# path so existing setups keep working unchanged.
DOCS_BASE_PATH = Path(os.environ.get("DOCS_BASE_PATH", "/Users/benjaminadmin/projekte/breakpilot-pwa/docs"))
|
||||
@@ -87,15 +87,6 @@ SUBJECT_MAPPING = {
|
||||
"gespfl": "Gesundheit-Pflege",
|
||||
}
|
||||
|
||||
# Niveau mapping: normalizes the level token parsed from filenames
# (case-insensitive match) to the canonical display form stored in metadata.
NIVEAU_MAPPING = {
    "ea": "eA",  # advanced level ("erhöhtes Anforderungsniveau")
    "ga": "gA",  # basic level ("grundlegendes Anforderungsniveau")
    "neuga": "gA (neu einsetzend)",  # basic level, newly started subject
    "neuea": "eA (neu einsetzend)",  # advanced level, newly started subject
}
|
||||
|
||||
|
||||
def compute_file_hash(file_path: Path) -> str:
|
||||
"""Berechnet SHA-256 Hash einer Datei."""
|
||||
sha256 = hashlib.sha256()
|
||||
@@ -135,103 +126,6 @@ def extract_zip_files(base_path: Path) -> List[Path]:
|
||||
return extracted
|
||||
|
||||
|
||||
def parse_filename_old_format(filename: str, file_path: Path) -> Optional[Dict]:
    """Parse the legacy (2016/2017) filename convention.

    Expected shapes:
    - {year}{subject}{level}Lehrer/{year}{subject}{level}A{no}L.pdf
    - Example: 2016DeutschEALehrer/2016DeutschEAA1L.pdf

    Returns a metadata dict, or None when the filename does not match.
    """
    # Pattern for teacher ("Lehrer") files.
    legacy_re = r"(\d{4})([A-Za-zäöüÄÖÜ]+)(EA|GA|NeuGA|NeuEA)(?:Lehrer)?.*?(?:A(\d+)|Aufg(\d+))?L?\.pdf$"
    m = re.search(legacy_re, filename, re.IGNORECASE)
    if m is None:
        return None

    task_raw = m.group(4) or m.group(5)
    path_lc = str(file_path).lower()

    # Teacher documents (EWH) live in a "Lehrer" folder or end with "L.pdf".
    is_teacher_doc = "lehrer" in path_lc or filename.endswith("L.pdf")

    # Variant markers (Tech, Wirt, CAS, GTR, ...) are encoded in the path;
    # first match in declaration order wins, as in the original loop.
    variant = next(
        (v for v in ("Tech", "Wirt", "CAS", "GTR", "Pflicht", "BG", "mitExp", "ohneExp")
         if v.lower() in path_lc),
        None,
    )

    level = m.group(3).upper()
    return {
        "year": int(m.group(1)),
        "subject": m.group(2).lower(),
        "niveau": NIVEAU_MAPPING.get(level.lower(), level),
        "task_number": int(task_raw) if task_raw else None,
        "doc_type": "EWH" if is_teacher_doc else "Aufgabe",
        "variant": variant,
    }
||||
|
||||
|
||||
def parse_filename_new_format(filename: str, file_path: Path) -> Optional[Dict]:
    """Parse the new (2024/2025) filename convention.

    Expected shapes:
    - {year}_{subject}_{level}_{no}_EWH.pdf
    - Example: 2025_Deutsch_eA_I_EWH.pdf

    Returns a metadata dict, or None when the filename does not match.
    """
    # Pattern for new-style files; task id may be a Roman or Arabic numeral.
    pattern = r"(\d{4})_([A-Za-zäöüÄÖÜ]+)(?:BG)?_(eA|gA)(?:_([IVX\d]+))?(?:_(.+))?\.pdf$"

    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None

    year = int(match.group(1))
    subject_raw = match.group(2).lower()
    niveau = match.group(3)
    task_id = match.group(4)
    suffix = match.group(5) or ""

    # Task number from Roman numerals. The regex matches case-insensitively,
    # so normalize to upper case before the lookup (fixes lowercase "i"/"ii"
    # silently producing task_num=None).
    task_num = None
    if task_id:
        roman_map = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}
        task_num = roman_map.get(task_id.upper()) or (int(task_id) if task_id.isdigit() else None)

    # Teacher document: "EWH" marker anywhere in the name, any case.
    # (The original's `"EWH" in filename` disjunct was redundant.)
    is_ewh = "ewh" in filename.lower()

    # Special document types encoded in the trailing suffix; first match wins.
    doc_type = "EWH" if is_ewh else "Aufgabe"
    if "Material" in suffix:
        doc_type = "Material"
    elif "GBU" in suffix:
        doc_type = "GBU"
    elif "Ergebnis" in suffix:
        doc_type = "Ergebnis"
    elif "Bewertungsbogen" in suffix:
        doc_type = "Bewertungsbogen"
    elif "HV" in suffix:
        doc_type = "Hörverstehen"
    elif "ME" in suffix:
        doc_type = "Mediation"

    # BG variant from the filename; an explicit "mitExp" path segment wins.
    variant = "BG" if "BG" in filename else None
    if "mitExp" in str(file_path):
        variant = "mitExp"

    return {
        "year": year,
        "subject": subject_raw,
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": task_num,
        "doc_type": doc_type,
        "variant": variant,
    }
|
||||
|
||||
|
||||
def discover_documents(base_path: Path, ewh_only: bool = True) -> List[NiBiSDocument]:
|
||||
"""
|
||||
Findet alle relevanten Dokumente in den za-download Verzeichnissen.
|
||||
|
||||
Reference in New Issue
Block a user