Files
breakpilot-lehrer/klausur-service/backend/nibis_parsers.py
Benjamin Admin bd4b956e3c [split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:41:42 +02:00

114 lines
3.4 KiB
Python

"""
NiBiS Filename Parsers
Parses old and new naming conventions for NiBiS Abitur documents.
"""
import re
from typing import Dict, Optional
# Requirement-level lookup: lower-cased level token -> display label.
NIVEAU_MAPPING: Dict[str, str] = dict(
    ea="eA",  # elevated requirement level ("erhoehtes Anforderungsniveau")
    ga="gA",  # basic requirement level ("grundlegendes Anforderungsniveau")
    neuga="gA (neu einsetzend)",
    neuea="eA (neu einsetzend)",
)
def parse_filename_old_format(filename: str, file_path) -> Optional[Dict]:
    """
    Parse the old NiBiS naming convention (2016, 2017).

    Layout:  {year}{subject}{level}Lehrer/{year}{subject}{level}A{no}L.pdf
    Example: 2016DeutschEALehrer/2016DeutschEAA1L.pdf

    Args:
        filename: File name to parse; matched case-insensitively.
        file_path: Location of the file. Only its ``str()`` form is inspected,
            for the "lehrer" marker and for variant keywords.

    Returns:
        A dict with the keys ``year``, ``subject``, ``niveau``,
        ``task_number``, ``doc_type`` and ``variant``, or ``None`` when the
        file name does not follow the old convention.
    """
    # The subject group is greedy, so it would swallow a leading "Neu" and
    # leave only "EA"/"GA" for the level group, making NeuGA/NeuEA unreachable.
    # The negative lookbehind rejects a bare EA/GA directly after "Neu" and
    # forces the regex to backtrack onto the NeuGA/NeuEA alternatives.
    pattern = (
        r"(\d{4})([A-Za-z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc]+)"
        r"(NeuGA|NeuEA|(?<!Neu)EA|(?<!Neu)GA)"
        r"(?:Lehrer)?.*?(?:A(\d+)|Aufg(\d+))?L?\.pdf$"
    )
    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None

    year = int(match.group(1))
    subject_raw = match.group(2).lower()
    niveau = match.group(3).upper()
    # The task number comes from either the "A<n>" or the "Aufg<n>" spelling.
    task_num = match.group(4) or match.group(5)

    # Lower-case the path once; it is reused for every keyword test below.
    path_lower = str(file_path).lower()

    # Teacher documents (EWH) are recognised by "lehrer" anywhere in the path
    # or by the "L.pdf" file name ending (this ending is case-sensitive).
    is_ewh = "lehrer" in path_lower or filename.endswith("L.pdf")

    # Variant (Tech, Wirt, CAS, GTR, ...): first keyword, in list order, that
    # occurs anywhere in the path (case-insensitive substring test).
    variant_patterns = ["Tech", "Wirt", "CAS", "GTR", "Pflicht", "BG", "mitExp", "ohneExp"]
    variant = next((v for v in variant_patterns if v.lower() in path_lower), None)

    return {
        "year": year,
        "subject": subject_raw,
        # Unknown level tokens fall back to the upper-cased raw token.
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": int(task_num) if task_num else None,
        "doc_type": "EWH" if is_ewh else "Aufgabe",
        "variant": variant,
    }
def parse_filename_new_format(filename: str, file_path) -> Optional[Dict]:
    """
    Parse the new NiBiS naming convention (2024, 2025).

    Layout:  {year}_{subject}_{level}_{no}_EWH.pdf
    Example: 2025_Deutsch_eA_I_EWH.pdf

    Args:
        filename: File name to parse; matched case-insensitively.
        file_path: Location of the file. Only its ``str()`` form is inspected,
            for the "mitExp" marker.

    Returns:
        A dict with the keys ``year``, ``subject``, ``niveau``,
        ``task_number``, ``doc_type`` and ``variant``, or ``None`` when the
        file name does not follow the new convention.
    """
    # The subject group is lazy so that a trailing "BG" marker is consumed by
    # the optional (?:BG)? group instead of ending up inside the subject.
    pattern = (
        r"(\d{4})_([A-Za-z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc]+?)(?:BG)?"
        r"_(eA|gA)(?:_([IVX\d]+))?(?:_(.+))?\.pdf$"
    )
    match = re.search(pattern, filename, re.IGNORECASE)
    if not match:
        return None

    year = int(match.group(1))
    subject_raw = match.group(2).lower()
    niveau = match.group(3)
    task_id = match.group(4)
    suffix = match.group(5) or ""

    # Task number: Roman numeral (I-X) or plain decimal digits. The regex is
    # case-insensitive, so the Roman lookup is normalised to upper case.
    task_num = None
    if task_id:
        roman_map = {
            "I": 1, "II": 2, "III": 3, "IV": 4, "V": 5,
            "VI": 6, "VII": 7, "VIII": 8, "IX": 9, "X": 10,
        }
        task_num = roman_map.get(task_id.upper())
        if task_num is None and task_id.isdigit():
            task_num = int(task_id)

    # Document type: EWH marker anywhere in the file name, else a plain task.
    doc_type = "EWH" if "ewh" in filename.lower() else "Aufgabe"

    # A special keyword in the suffix overrides the default. The first match
    # in this order wins; the keyword test is case-sensitive.
    suffix_doc_types = (
        ("Material", "Material"),
        ("GBU", "GBU"),
        ("Ergebnis", "Ergebnis"),
        ("Bewertungsbogen", "Bewertungsbogen"),
        ("HV", "Hoerverstehen"),
        ("ME", "Mediation"),
    )
    for marker, label in suffix_doc_types:
        if marker in suffix:
            doc_type = label
            break

    # Variant: "BG" in the file name, overridden by "mitExp" in the path.
    variant = "BG" if "BG" in filename else None
    if "mitExp" in str(file_path):
        variant = "mitExp"

    return {
        "year": year,
        "subject": subject_raw,
        # Unknown level tokens fall back to the raw token as matched.
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": task_num,
        "doc_type": doc_type,
        "variant": variant,
    }