feat(rag): regulation_short Casing-Normalisierung am Ingest-Rand
Der Re-Ingest leitet regulation_short z.T. via title()-Casing aus Dateinamen ab
('dsgvo'->'Dsgvo', 'osha otm'->'Osha Otm') -> falsche Akronyme im Payload UND im
article_label ('Art. 37 Dsgvo'). NEU: normalize_regulation_short() in legal_metadata,
token-basiert mit kuratiertem Akronym-Set -> nur gelistete Akronyme werden gross,
legitimes Mixed-Case (GeschGehG, MuSchG, GoBD, MiCA, eIDAS, EuGH) bleibt unberuehrt.
Angewandt am Ingest-Rand in documents.py (greift fuer Payload-Feld + display_name).
+13 Tests gruen. Bestandsdaten brauchen separaten einmaligen Qdrant-Patch.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -10,7 +10,12 @@ from embedding_client import embedding_client
|
|||||||
from html_utils import decode_html_bytes, looks_like_html, strip_html
|
from html_utils import decode_html_bytes, looks_like_html, strip_html
|
||||||
from minio_client_wrapper import minio_wrapper
|
from minio_client_wrapper import minio_wrapper
|
||||||
from qdrant_client_wrapper import qdrant_wrapper
|
from qdrant_client_wrapper import qdrant_wrapper
|
||||||
from legal_metadata import build_legal_fields, compute_chunk_hash, deterministic_point_id
|
from legal_metadata import (
|
||||||
|
build_legal_fields,
|
||||||
|
compute_chunk_hash,
|
||||||
|
deterministic_point_id,
|
||||||
|
normalize_regulation_short,
|
||||||
|
)
|
||||||
|
|
||||||
logger = logging.getLogger("rag-service.api.documents")
|
logger = logging.getLogger("rag-service.api.documents")
|
||||||
|
|
||||||
@@ -155,6 +160,13 @@ async def upload_document(
|
|||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
logger.warning("Invalid metadata_json, ignoring")
|
logger.warning("Invalid metadata_json, ignoring")
|
||||||
|
|
||||||
|
# Casing-Normalisierung am Ingest-Rand: title-caste Akronyme korrigieren
|
||||||
|
# ('Dsgvo'->'DSGVO'), damit Payload-Feld UND article_label sauber sind.
|
||||||
|
if extra_metadata.get("regulation_short"):
|
||||||
|
extra_metadata["regulation_short"] = normalize_regulation_short(
|
||||||
|
extra_metadata["regulation_short"]
|
||||||
|
)
|
||||||
|
|
||||||
# --- Build payloads (rag_reingest_spec.md §2/§3: zitierfaehige Legal-Metadaten) ---
|
# --- Build payloads (rag_reingest_spec.md §2/§3: zitierfaehige Legal-Metadaten) ---
|
||||||
reg_code = (
|
reg_code = (
|
||||||
extra_metadata.get("regulation_code")
|
extra_metadata.get("regulation_code")
|
||||||
|
|||||||
@@ -25,6 +25,40 @@ _SUB_RE = re.compile(r"(lit\.\s*[a-z]+|Satz\s*\d+|Nr\.\s*\d+)", re.IGNORECASE)
|
|||||||
# Zitiert wird das Aktenzeichen (display_name / regulation_short).
|
# Zitiert wird das Aktenzeichen (display_name / regulation_short).
|
||||||
_RULING_TYPES = {"urteil", "ruling", "court_decision", "beschluss"}
|
_RULING_TYPES = {"urteil", "ruling", "court_decision", "beschluss"}
|
||||||
|
|
||||||
|
# Acronyms whose CORRECT spelling is all upper case. The ingest partly derives
# regulation_short from file names via title() casing ("dsgvo" -> "Dsgvo",
# "osha otm" -> "Osha Otm").
# Token-based: ONLY tokens whose upper-cased form is listed here get upper-cased, so
# legitimate mixed-case short names (GeschGehG, MuSchG, GoBD, MiCA, eIDAS, EuGH) stay
# untouched.
# Deliberately NOT included: EUGH/BVERFG/BVERWG (correct are EuGH/BVerfG/BVerwG).
# Conservative + extensible.
_REG_SHORT_ACRONYMS = {
    "DSGVO", "GDPR", "DSA", "DMA", "DORA", "AMLR", "GPSR", "DPF", "IFRS", "DSM", "MDR",
    "CRA", "NIS2", "EU", "US", "PPE", "ICS", "SCADA", "OSHA", "OTM", "NIST", "NISTIR",
    "ENISA", "EDPB", "EDPS", "DSK", "BFDI", "OWASP", "API", "GPAI", "EUCC", "ECCG",
    "CISA", "CVSS", "CVD", "SRP", "PF", "SCHUFA", "SLSA", "BGH", "OGH", "BAG",
}
# Whole-value overrides (the token logic cannot derive these).
_REG_SHORT_OVERRIDES = {
    "Dataact": "Data Act",
    "Ecommerce": "E-Commerce",
}
# Core of a token: from its first to its last letter/digit. Leading and trailing
# punctuation ("(Eu)", "Dsgvo,") is excluded so it cannot hide an acronym.
_REG_SHORT_TOKEN_CORE_RE = re.compile(r"[^\W_](?:.*[^\W_])?")


def _upper_if_acronym(tok: str) -> str:
    """Upper-case the core of *tok* if it is a listed acronym, else return *tok* as is."""
    match = _REG_SHORT_TOKEN_CORE_RE.search(tok)
    if match is None:
        # Token consists of punctuation only; nothing to normalise.
        return tok
    core_upper = match.group().upper()
    if core_upper not in _REG_SHORT_ACRONYMS:
        return tok
    # Re-attach the surrounding punctuation exactly as it was.
    return tok[:match.start()] + core_upper + tok[match.end():]


def normalize_regulation_short(name: str) -> str:
    """Fix title-cased acronyms in the printable regulation short name.

    Examples: 'Dsgvo' -> 'DSGVO', 'Osha Otm ...' -> 'OSHA OTM ...',
    'EU Mdr' -> 'EU MDR', 'Dsgvo (Eu)' -> 'DSGVO (EU)'. Legitimate mixed case
    (GeschGehG, MuSchG, MiCA, eIDAS, EuGH) is preserved: only tokens whose
    upper-cased form is in _REG_SHORT_ACRONYMS are changed.

    Args:
        name: The raw short name. ``None`` and other falsy values yield ``""``.
            Truthy non-string values (e.g. a number coming from JSON metadata)
            are converted with ``str()`` instead of raising.

    Returns:
        The normalised short name, stripped, with whitespace runs collapsed to
        single spaces. An exact match in _REG_SHORT_OVERRIDES wins over the
        token logic.
    """
    if not name:
        return ""
    # Metadata arrives as parsed JSON, so the value is not guaranteed to be a str.
    s = (name if isinstance(name, str) else str(name)).strip()
    if not s:
        return s
    if s in _REG_SHORT_OVERRIDES:
        return _REG_SHORT_OVERRIDES[s]
    return " ".join(_upper_if_acronym(tok) for tok in s.split())
|
||||||
|
|
||||||
|
|
||||||
def detect_citation_style(section: str) -> str:
|
def detect_citation_style(section: str) -> str:
|
||||||
"""'§ 38' -> 'paragraph' (DE-Gesetze); 'Artikel/Art. 13' -> 'article' (EU-VO)."""
|
"""'§ 38' -> 'paragraph' (DE-Gesetze); 'Artikel/Art. 13' -> 'article' (EU-VO)."""
|
||||||
|
|||||||
@@ -113,3 +113,27 @@ def test_build_legal_fields_ruling_uses_aktenzeichen_not_paragraph():
|
|||||||
assert out["article"] == ""
|
assert out["article"] == ""
|
||||||
assert out["article_label"] == "BAG, 1 ABR 22/21"
|
assert out["article_label"] == "BAG, 1 ABR 22/21"
|
||||||
assert out["regulation_code"] == "BAG_1_ABR_22_21"
|
assert out["regulation_code"] == "BAG_1_ABR_22_21"
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_regulation_short():
    """Pin the casing normalisation of regulation_short values.

    Each entry is (raw input, expected output). Covers title-cased acronyms,
    preserved mixed case, whole-value overrides, whitespace handling and
    empty inputs.
    """
    cases = [
        # Title-cased acronyms -> correct upper case (token-based).
        ("Dsgvo", "DSGVO"),
        ("Gpsr", "GPSR"),
        ("Ifrs", "IFRS"),
        ("EU Mdr", "EU MDR"),
        ("OWASP Api TOP10", "OWASP API TOP10"),
        (
            "Osha Otm Section-4-safety-hazards Chapter-1",
            "OSHA OTM Section-4-safety-hazards Chapter-1",
        ),
        # Legitimate mixed case stays untouched (NOT in the acronym list).
        ("GeschGehG", "GeschGehG"),
        ("MuSchG", "MuSchG"),
        ("GoBD", "GoBD"),
        ("MiCA", "MiCA"),
        ("eIDAS", "eIDAS"),
        ("EuGH C-252/21 (Meta)", "EuGH C-252/21 (Meta)"),
        # Whole-value overrides (both entries of the override table).
        ("Dataact", "Data Act"),
        ("Ecommerce", "E-Commerce"),
        # Surrounding whitespace is stripped, inner runs collapse to one space.
        ("  Dsgvo  ", "DSGVO"),
        ("Osha   Otm", "OSHA OTM"),
        # Empty inputs.
        ("", ""),
        (None, ""),
    ]
    for raw, expected in cases:
        assert L.normalize_regulation_short(raw) == expected, repr(raw)
|
||||||
|
|||||||
Reference in New Issue
Block a user