breakpilot-lehrer/klausur-service/backend/nibis_parsers.py

"""
NiBiS Filename Parsers

Parses old and new naming conventions for NiBiS Abitur documents.
"""

import re
from typing import Dict, Optional

# Level mapping: lower-cased level token taken from a filename -> display label.
# The parsers in this module look tokens up via NIVEAU_MAPPING.get(token.lower(), token).
NIVEAU_MAPPING: Dict[str, str] = {
    "ea": "eA",  # "erhoehtes Anforderungsniveau" (advanced requirement level)
    "ga": "gA",  # "grundlegendes Anforderungsniveau" (basic requirement level)
    "neuga": "gA (neu einsetzend)",  # gA, "neu einsetzend" (newly commencing)
    "neuea": "eA (neu einsetzend)",  # eA, "neu einsetzend" (newly commencing)
}


def parse_filename_old_format(filename: str, file_path) -> Optional[Dict]:
    """
    Parse the old NiBiS naming convention (2016, 2017).

    Layout:  {year}{subject}{level}Lehrer/{year}{subject}{level}A{no}L.pdf
    Example: 2016DeutschEALehrer/2016DeutschEAA1L.pdf

    Args:
        filename: Name of the PDF file.
        file_path: Location of the file; only ``str(file_path)`` is inspected
            (for the "lehrer" marker and for the variant markers).

    Returns:
        Dict with the keys ``year``, ``subject``, ``niveau``, ``task_number``,
        ``doc_type`` and ``variant``, or ``None`` if ``filename`` does not
        match the old convention.
    """
    # The subject group is greedy and "Neu" consists of letters only, so
    # without the (?<!Neu) guard the subject swallows "Neu" and the level is
    # always reported as plain EA/GA -- NeuGA/NeuEA could never match.
    pattern = (
        r"(\d{4})"
        r"([A-Za-z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc]+)"
        r"(?<!Neu)(NeuGA|NeuEA|EA|GA)"
        r"(?:Lehrer)?.*?(?:A(\d+)|Aufg(\d+))?L?\.pdf$"
    )

    match = re.search(pattern, filename, re.IGNORECASE)
    if match is None:
        return None

    year_text, subject_text, level_text, short_no, long_no = match.groups()
    niveau = level_text.upper()
    # The task number is captured either by the "A<n>" or the "Aufg<n>" form.
    task_num = short_no or long_no

    path_lower = str(file_path).lower()

    # Teacher documents (EWH) live in a "...Lehrer" folder or end in "L.pdf".
    is_ewh = "lehrer" in path_lower or filename.endswith("L.pdf")

    # Variant (Tech, Wirt, CAS, GTR, ...): first marker found in the path wins.
    variant_markers = ("Tech", "Wirt", "CAS", "GTR", "Pflicht", "BG", "mitExp", "ohneExp")
    variant = next(
        (marker for marker in variant_markers if marker.lower() in path_lower),
        None,
    )

    return {
        "year": int(year_text),
        "subject": subject_text.lower(),
        # Unknown level tokens fall back to the upper-cased raw token.
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": int(task_num) if task_num else None,
        "doc_type": "EWH" if is_ewh else "Aufgabe",
        "variant": variant,
    }


def parse_filename_new_format(filename: str, file_path) -> Optional[Dict]:
    """
    Parse the new NiBiS naming convention (2024, 2025).

    Layout:  {year}_{subject}_{level}_{no}_EWH.pdf
    Example: 2025_Deutsch_eA_I_EWH.pdf

    Args:
        filename: Name of the PDF file.
        file_path: Location of the file; only ``str(file_path)`` is inspected
            (for the "mitExp" marker).

    Returns:
        Dict with the keys ``year``, ``subject``, ``niveau``, ``task_number``,
        ``doc_type`` and ``variant``, or ``None`` if ``filename`` does not
        match the new convention.
    """
    # The subject group is lazy so that a trailing "BG" is consumed by the
    # optional BG group instead of ending up inside the subject. BG is matched
    # case-sensitively, in line with the "BG" variant check further down.
    pattern = (
        r"(\d{4})_"
        r"([A-Za-z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc]+?)(?-i:BG)?"
        r"_(eA|gA)(?:_([IVX\d]+))?(?:_(.+))?\.pdf$"
    )

    match = re.search(pattern, filename, re.IGNORECASE)
    if match is None:
        return None

    year_text, subject_text, niveau, task_id, suffix = match.groups()
    suffix = suffix or ""

    # Task number: roman numeral (any letter case, I..X) or plain digits.
    task_num = None
    if task_id:
        roman_map = {
            "I": 1, "II": 2, "III": 3, "IV": 4, "V": 5,
            "VI": 6, "VII": 7, "VIII": 8, "IX": 9, "X": 10,
        }
        task_num = roman_map.get(task_id.upper())
        if task_num is None and task_id.isdigit():
            task_num = int(task_id)

    # Document type: EWH/Aufgabe by default; the first suffix marker that
    # matches (checked in this order) overrides it.
    doc_type = "EWH" if "ewh" in filename.lower() else "Aufgabe"
    suffix_doc_types = (
        ("Material", "Material"),
        ("GBU", "GBU"),
        ("Ergebnis", "Ergebnis"),
        ("Bewertungsbogen", "Bewertungsbogen"),
        ("HV", "Hoerverstehen"),
        ("ME", "Mediation"),
    )
    for marker, special_type in suffix_doc_types:
        if marker in suffix:
            doc_type = special_type
            break

    # Variant: "BG" from the filename, overridden by "mitExp" in the path.
    variant = "BG" if "BG" in filename else None
    if "mitExp" in str(file_path):
        variant = "mitExp"

    return {
        "year": int(year_text),
        "subject": subject_text.lower(),
        # Unknown level tokens fall back to the raw token as matched.
        "niveau": NIVEAU_MAPPING.get(niveau.lower(), niveau),
        "task_number": task_num,
        "doc_type": doc_type,
        "variant": variant,
    }