[split-required] Split final 43 files (500-668 LOC) to complete refactoring

klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 09:41:42 +02:00
parent 451365a312
commit bd4b956e3c
113 changed files with 13790 additions and 14148 deletions

View File

@@ -10,12 +10,11 @@ Unterstützt:
"""
import os
import re
import zipfile
import hashlib
import json
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from typing import List, Dict, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
import asyncio
@@ -23,6 +22,7 @@ import asyncio
# Local imports
from eh_pipeline import chunk_text, generate_embeddings, extract_text_from_pdf, get_vector_size, EMBEDDING_BACKEND
from qdrant_service import QdrantService
from nibis_parsers import parse_filename_old_format, parse_filename_new_format
# Configuration
DOCS_BASE_PATH = Path("/Users/benjaminadmin/projekte/breakpilot-pwa/docs")
@@ -87,15 +87,6 @@ SUBJECT_MAPPING = {
"gespfl": "Gesundheit-Pflege",
}
# Niveau mapping: raw niveau token from the filename (lower-cased) -> display label.
NIVEAU_MAPPING = {
    "ea": "eA",  # erhöhtes Anforderungsniveau (advanced requirement level)
    "ga": "gA",  # grundlegendes Anforderungsniveau (basic requirement level)
    "neuga": "gA (neu einsetzend)",  # basic level, newly started subject
    "neuea": "eA (neu einsetzend)",  # advanced level, newly started subject
}
def compute_file_hash(file_path: Path) -> str:
"""Berechnet SHA-256 Hash einer Datei."""
sha256 = hashlib.sha256()
@@ -135,103 +126,6 @@ def extract_zip_files(base_path: Path) -> List[Path]:
return extracted
def parse_filename_old_format(filename: str, file_path: Path) -> Optional[Dict]:
    """
    Parse the old naming convention (2016, 2017):
    - {Jahr}{Fach}{Niveau}Lehrer/{Jahr}{Fach}{Niveau}A{Nr}L.pdf
    - Example: 2016DeutschEALehrer/2016DeutschEAA1L.pdf

    Returns a metadata dict (year, subject, niveau, task_number, doc_type,
    variant) or None when the filename does not match the convention.
    """
    # Pattern for teacher files.
    # NOTE: the greedy subject group absorbs the leading "Neu" of
    # "NeuGA"/"NeuEA" before the niveau alternation is tried, so those two
    # alternatives can never match; the swallowed "neu" prefix is recovered
    # after matching (see below).
    pattern = r"(\d{4})([A-Za-zäöüÄÖÜ]+)(EA|GA|NeuGA|NeuEA)(?:Lehrer)?.*?(?:A(\d+)|Aufg(\d+))?L?\.pdf$"
    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None
    year = int(match.group(1))
    subject_raw = match.group(2).lower()
    niveau = match.group(3).upper()
    task_num = match.group(4) or match.group(5)
    # Bug fix: "2016MatheNeuGAA1L.pdf" used to parse as subject "matheneu"
    # with niveau "GA". Strip the swallowed "neu" suffix and restore the
    # neu-einsetzend niveau so NIVEAU_MAPPING resolves it correctly.
    # (Assumes no real subject name ends in "neu" — TODO confirm against
    # the SUBJECT_MAPPING keys.)
    if subject_raw.endswith("neu") and niveau in ("EA", "GA"):
        subject_raw = subject_raw[: -len("neu")]
        niveau = "NEU" + niveau
    # A teacher document (EWH) lives under a .../Lehrer/ directory or ends
    # with "L.pdf".
    is_ewh = "lehrer" in str(file_path).lower() or filename.endswith("L.pdf")
    # Extract variant (Tech, Wirt, CAS, GTR, ...) from anywhere in the path;
    # first match in declaration order wins.
    variant = None
    variant_patterns = ["Tech", "Wirt", "CAS", "GTR", "Pflicht", "BG", "mitExp", "ohneExp"]
    for v in variant_patterns:
        if v.lower() in str(file_path).lower():
            variant = v
            break
    return {
        "year": year,
        "subject": subject_raw,
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": int(task_num) if task_num else None,
        "doc_type": "EWH" if is_ewh else "Aufgabe",
        "variant": variant,
    }
def parse_filename_new_format(filename: str, file_path: Path) -> Optional[Dict]:
    """
    Parse the new naming convention (2024, 2025):
    - {Jahr}_{Fach}_{niveau}_{Nr}_EWH.pdf
    - Example: 2025_Deutsch_eA_I_EWH.pdf

    Returns a metadata dict (year, subject, niveau, task_number, doc_type,
    variant) or None when the filename does not match the convention.
    """
    m = re.search(
        r"(\d{4})_([A-Za-zäöüÄÖÜ]+)(?:BG)?_(eA|gA)(?:_([IVX\d]+))?(?:_(.+))?\.pdf$",
        filename,
        re.IGNORECASE,
    )
    if m is None:
        return None

    year_str, subject, level, raw_task, trailer = m.groups()
    trailer = trailer or ""

    # Task number: roman numerals I..V, otherwise plain decimal digits.
    number = None
    if raw_task:
        romans = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}
        if raw_task in romans:
            number = romans[raw_task]
        elif raw_task.isdigit():
            number = int(raw_task)

    # Base document type: EWH marker anywhere in the filename, else Aufgabe.
    kind = "EWH" if "ewh" in filename.lower() else "Aufgabe"
    # Special document types override the base type, first matching token
    # in the trailing suffix wins (same precedence as the original chain).
    for token, label in (
        ("Material", "Material"),
        ("GBU", "GBU"),
        ("Ergebnis", "Ergebnis"),
        ("Bewertungsbogen", "Bewertungsbogen"),
        ("HV", "Hörverstehen"),
        ("ME", "Mediation"),
    ):
        if token in trailer:
            kind = label
            break

    # BG variant from the filename; a "mitExp" path component takes priority.
    variant = None
    if "BG" in filename:
        variant = "BG"
    if "mitExp" in str(file_path):
        variant = "mitExp"

    return {
        "year": int(year_str),
        "subject": subject.lower(),
        "niveau": NIVEAU_MAPPING.get(level.lower(), level),
        "task_number": number,
        "doc_type": kind,
        "variant": variant,
    }
def discover_documents(base_path: Path, ewh_only: bool = True) -> List[NiBiSDocument]:
"""
Findet alle relevanten Dokumente in den za-download Verzeichnissen.