From 0c5f1fd7a4cc3521c7ddcfb8942d855d86e8e892 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 21 Jun 2026 18:31:45 +0200 Subject: [PATCH] feat(rag): regulation_short Casing-Normalisierung am Ingest-Rand Der Re-Ingest leitet regulation_short z.T. via title()-Casing aus Dateinamen ab ('dsgvo'->'Dsgvo', 'osha otm'->'Osha Otm') -> falsche Akronyme im Payload UND im article_label ('Art. 37 Dsgvo'). NEU: normalize_regulation_short() in legal_metadata, token-basiert mit kuratiertem Akronym-Set -> nur gelistete Akronyme werden gross, legitimes Mixed-Case (GeschGehG, MuSchG, GoBD, MiCA, eIDAS, EuGH) bleibt unberuehrt. Angewandt am Ingest-Rand in documents.py (greift fuer Payload-Feld + display_name). +13 Tests gruen. Bestandsdaten brauchen separaten einmaligen Qdrant-Patch. Co-Authored-By: Claude Opus 4.7 --- rag-service/api/documents.py | 14 +++++++++- rag-service/legal_metadata.py | 34 ++++++++++++++++++++++++ rag-service/tests/test_legal_metadata.py | 24 +++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/rag-service/api/documents.py b/rag-service/api/documents.py index 48f5915..dddfd62 100644 --- a/rag-service/api/documents.py +++ b/rag-service/api/documents.py @@ -10,7 +10,12 @@ from embedding_client import embedding_client from html_utils import decode_html_bytes, looks_like_html, strip_html from minio_client_wrapper import minio_wrapper from qdrant_client_wrapper import qdrant_wrapper -from legal_metadata import build_legal_fields, compute_chunk_hash, deterministic_point_id +from legal_metadata import ( + build_legal_fields, + compute_chunk_hash, + deterministic_point_id, + normalize_regulation_short, +) logger = logging.getLogger("rag-service.api.documents") @@ -155,6 +160,13 @@ async def upload_document( except json.JSONDecodeError: logger.warning("Invalid metadata_json, ignoring") + # Casing-Normalisierung am Ingest-Rand: title-caste Akronyme korrigieren + # ('Dsgvo'->'DSGVO'), damit Payload-Feld UND article_label sauber sind. + if extra_metadata.get("regulation_short"): + extra_metadata["regulation_short"] = normalize_regulation_short( + extra_metadata["regulation_short"] + ) + # --- Build payloads (rag_reingest_spec.md §2/§3: zitierfaehige Legal-Metadaten) --- reg_code = ( extra_metadata.get("regulation_code") diff --git a/rag-service/legal_metadata.py b/rag-service/legal_metadata.py index 66ec408..4165932 100644 --- a/rag-service/legal_metadata.py +++ b/rag-service/legal_metadata.py @@ -25,6 +25,40 @@ _SUB_RE = re.compile(r"(lit\.\s*[a-z]+|Satz\s*\d+|Nr\.\s*\d+)", re.IGNORECASE) # Zitiert wird das Aktenzeichen (display_name / regulation_short). _RULING_TYPES = {"urteil", "ruling", "court_decision", "beschluss"} +# Akronyme, deren KORREKTE Schreibweise GROSS ist. Der Ingest leitet regulation_short +# z.T. via title()-Casing aus Dateinamen ab ("dsgvo" -> "Dsgvo", "osha otm" -> "Osha Otm"). +# Token-basiert: NUR Tokens, deren Upper-Form hier steht, werden gross geschrieben — damit +# bleiben legitime Mixed-Case-Kuerzel (GeschGehG, MuSchG, GoBD, MiCA, eIDAS, EuGH) unberuehrt. +# Bewusst NICHT enthalten: EUGH/BVERFG/BVERWG (korrekt sind EuGH/BVerfG/BVerwG). +# Konservativ + erweiterbar. +_REG_SHORT_ACRONYMS = { + "DSGVO", "GDPR", "DSA", "DMA", "DORA", "AMLR", "GPSR", "DPF", "IFRS", "DSM", "MDR", + "CRA", "NIS2", "EU", "US", "PPE", "ICS", "SCADA", "OSHA", "OTM", "NIST", "NISTIR", + "ENISA", "EDPB", "EDPS", "DSK", "BFDI", "OWASP", "API", "GPAI", "EUCC", "ECCG", + "CISA", "CVSS", "CVD", "SRP", "PF", "SCHUFA", "SLSA", "BGH", "OGH", "BAG", +} +# Ganz-Wert-Overrides (Token-Logik kann diese nicht herleiten). +_REG_SHORT_OVERRIDES = { + "Dataact": "Data Act", + "Ecommerce": "E-Commerce", +} + + +def normalize_regulation_short(name: str) -> str: + """Korrigiert title-caste Akronyme im druckbaren reg-short ('Dsgvo'->'DSGVO', + 'Osha Otm ...'->'OSHA OTM ...', 'EU Mdr'->'EU MDR') und erhaelt legitimes Mixed-Case + (GeschGehG, MuSchG, MiCA, eIDAS, EuGH). Nur Tokens in _REG_SHORT_ACRONYMS werden + gross geschrieben; alles andere bleibt unveraendert.""" + s = (name or "").strip() + if not s: + return s + if s in _REG_SHORT_OVERRIDES: + return _REG_SHORT_OVERRIDES[s] + return " ".join( + tok.upper() if tok.upper() in _REG_SHORT_ACRONYMS else tok + for tok in s.split() + ) + def detect_citation_style(section: str) -> str: """'§ 38' -> 'paragraph' (DE-Gesetze); 'Artikel/Art. 13' -> 'article' (EU-VO).""" diff --git a/rag-service/tests/test_legal_metadata.py b/rag-service/tests/test_legal_metadata.py index d797223..01d0396 100644 --- a/rag-service/tests/test_legal_metadata.py +++ b/rag-service/tests/test_legal_metadata.py @@ -113,3 +113,27 @@ def test_build_legal_fields_ruling_uses_aktenzeichen_not_paragraph(): assert out["article"] == "" assert out["article_label"] == "BAG, 1 ABR 22/21" assert out["regulation_code"] == "BAG_1_ABR_22_21" + + +def test_normalize_regulation_short(): + # title-caste Akronyme -> korrekt GROSS (Token-basiert) + assert L.normalize_regulation_short("Dsgvo") == "DSGVO" + assert L.normalize_regulation_short("Gpsr") == "GPSR" + assert L.normalize_regulation_short("Ifrs") == "IFRS" + assert L.normalize_regulation_short("EU Mdr") == "EU MDR" + assert L.normalize_regulation_short("OWASP Api TOP10") == "OWASP API TOP10" + assert ( + L.normalize_regulation_short("Osha Otm Section-4-safety-hazards Chapter-1") + == "OSHA OTM Section-4-safety-hazards Chapter-1" + ) + # legitimes Mixed-Case bleibt unberuehrt (NICHT in der Akronym-Liste) + assert L.normalize_regulation_short("GeschGehG") == "GeschGehG" + assert L.normalize_regulation_short("MuSchG") == "MuSchG" + assert L.normalize_regulation_short("GoBD") == "GoBD" + assert L.normalize_regulation_short("MiCA") == "MiCA" + assert L.normalize_regulation_short("eIDAS") == "eIDAS" + assert L.normalize_regulation_short("EuGH C-252/21 (Meta)") == "EuGH C-252/21 (Meta)" + # Ganz-Wert-Overrides + Leerfaelle + assert L.normalize_regulation_short("Dataact") == "Data Act" + assert L.normalize_regulation_short("") == "" + assert L.normalize_regulation_short(None) == ""