Files
breakpilot-core/rag-service/tests/test_legal_metadata.py
T
Benjamin Admin 0c5f1fd7a4 feat(rag): regulation_short Casing-Normalisierung am Ingest-Rand
Der Re-Ingest leitet regulation_short z.T. via title()-Casing aus Dateinamen ab
('dsgvo'->'Dsgvo', 'osha otm'->'Osha Otm') -> falsche Akronyme im Payload UND im
article_label ('Art. 37 Dsgvo'). NEU: normalize_regulation_short() in legal_metadata,
token-basiert mit kuratiertem Akronym-Set -> nur gelistete Akronyme werden gross,
legitimes Mixed-Case (GeschGehG, MuSchG, GoBD, MiCA, eIDAS, EuGH) bleibt unberuehrt.
Angewandt am Ingest-Rand in documents.py (greift fuer Payload-Feld + display_name).
+13 Tests gruen. Bestandsdaten brauchen separaten einmaligen Qdrant-Patch.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-21 18:31:45 +02:00

140 lines
5.7 KiB
Python

"""Tests fuer legal_metadata (rag_reingest_spec.md §3-Normalisierung)."""
import sys
from pathlib import Path
# Prepend the directory two levels above this file (i.e. the parent of the
# directory holding this test module) to sys.path so the import below can
# resolve -- presumably that is where legal_metadata lives; verify on layout changes.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import legal_metadata as L
def test_detect_citation_style():
    """'§' references are classified as 'paragraph', 'Artikel'/'Art.' as 'article'."""
    expected_styles = {
        "§ 38": "paragraph",
        "Artikel 13": "article",
        "Art. 13": "article",
    }
    for raw, style in expected_styles.items():
        assert L.detect_citation_style(raw) == style
def test_normalize_article():
    """normalize_article yields the bare article number, or '' when there is none."""
    citations = [
        ("§ 38", "38"),
        ("Artikel 13", "13"),
        ("Art. 13a", "13a"),
        ("[Artikel 17]", "17"),
        ("", ""),
    ]
    for raw, number in citations:
        assert L.normalize_article(raw) == number

    # Strict: a date / page header / title is NOT a citation, so the result
    # must be empty (no fabricated reference).
    non_citations = (
        "4.5.2016 DE Amtsblatt der Europaeischen Union L 119/9",
        "Begriffsbestimmungen",
    )
    for raw in non_citations:
        assert L.normalize_article(raw) == ""
def test_normalize_paragraph():
    """normalize_paragraph strips the '(…)' / 'Abs.' decoration and keeps the number."""
    cases = (
        ("(1)", "1"),
        ("Abs. 2", "2"),
        ("", ""),
    )
    for raw, number in cases:
        assert L.normalize_paragraph(raw) == number
def test_extract_sub():
    """extract_sub returns the 'lit.' / 'Satz' fragment found in the text, else ''."""
    cases = (
        ("... lit. c ...", "lit. c"),
        ("Satz 2 des Absatzes", "Satz 2"),
        ("kein Sub hier", ""),
    )
    for text, sub in cases:
        assert L.extract_sub(text) == sub
def test_format_article_label_paragraph_style():
    """Paragraph-style labels read '<name> § <article>[ Abs. <paragraph>]'."""
    # The display name (regulation_short) is no longer upper-cased, so a
    # camelCase name such as "ProdHaftG" must survive unchanged.
    cases = [
        (("ProdHaftG", "paragraph", "1", "", ""), "ProdHaftG § 1"),
        (("BDSG", "paragraph", "38", "1", ""), "BDSG § 38 Abs. 1"),
        (("BDSG", "paragraph", "38", "", ""), "BDSG § 38"),
    ]
    for args, label in cases:
        assert L.format_article_label(*args) == label
def test_format_article_label_article_style():
    """Article-style labels read 'Art. <article>[ Abs. <p>][ <sub>] <name>'."""
    cases = [
        (("DSGVO", "article", "13", "1", "lit. c"), "Art. 13 Abs. 1 lit. c DSGVO"),
        (("DSGVO", "article", "30", "", ""), "Art. 30 DSGVO"),
        # Swiss law: display name combined with "Art."
        (("RevDSG", "article", "12", "", ""), "Art. 12 RevDSG"),
    ]
    for args, label in cases:
        assert L.format_article_label(*args) == label
def test_compute_chunk_hash_stable_and_ws_insensitive():
    """A trailing newline must not change the hash; the digest is 64 characters long."""
    with_newline = L.compute_chunk_hash("Hallo Welt\n")
    without_newline = L.compute_chunk_hash("Hallo Welt")
    assert with_newline == without_newline
    assert len(with_newline) == 64
def test_deterministic_point_id_stable():
    """Identical arguments give the same id; changing the fourth argument gives another."""
    leading_args = ("BDSG", "38", "1")
    stamp = "2026-06-19"

    first = L.deterministic_point_id(*leading_args, 0, stamp)
    repeat = L.deterministic_point_id(*leading_args, 0, stamp)
    shifted = L.deterministic_point_id(*leading_args, 1, stamp)

    assert first == repeat
    assert first != shifted
    assert len(first) == 36  # UUID
def test_build_legal_fields_citation_style_override():
    """An explicit citation_style wins over the '§' marker in the section string."""
    # DSGVO is an EU regulation: the citation style 'article' is set per
    # regulation and must NOT be derived from the '§' in the section string.
    chunk_meta = {
        "section": "§ 4",
        "section_title": "Begriffsbestimmungen",
        "paragraph": "",
    }
    fields = L.build_legal_fields(chunk_meta, "DSGVO", citation_style="article")

    expected = {
        "citation_style": "article",
        "article": "4",
        "article_label": "Art. 4 DSGVO",
    }
    for key, value in expected.items():
        assert fields[key] == value
def test_build_legal_fields_integration():
    """End-to-end check of build_legal_fields for a '§'-style BDSG chunk."""
    chunk_meta = {
        "section": "§ 38",
        "section_title": "Datenschutzbeauftragte",
        "paragraph": "(1)",
    }
    fields = L.build_legal_fields(
        chunk_meta,
        "bdsg",
        chunk_text="... in der Regel mindestens 20 Personen ...",
        display_name="BDSG",
    )

    expected = {
        "citation_style": "paragraph",
        "article": "38",
        "paragraph": "",  # article-level precision, no guessed paragraph
        "regulation_code": "BDSG",  # field value stays upper case
        "article_label": "BDSG § 38",
        "section_header": "Datenschutzbeauftragte",
    }
    for key, value in expected.items():
        assert fields[key] == value
    # Identity check on purpose: the flag must be the bool False, not merely falsy.
    assert fields["is_recital"] is False
def test_build_legal_fields_camelcase_display_name():
    """The label uses the camelCase display name; regulation_code stays upper case."""
    chunk_meta = {"section": "§ 1", "section_title": "Haftung"}
    fields = L.build_legal_fields(chunk_meta, "PRODHAFTG", display_name="ProdHaftG")

    expected = {
        "regulation_code": "PRODHAFTG",
        "article_label": "ProdHaftG § 1",
    }
    for key, value in expected.items():
        assert fields[key] == value
def test_build_legal_fields_ruling_uses_aktenzeichen_not_paragraph():
    """For source_type 'urteil' the label is the display name, with no article."""
    # Court ruling: a cross-reference such as "§ 87 BetrVG" in the text must
    # NOT produce a label like "AZ § 87".
    chunk_meta = {"section": "§ 87", "section_title": "Mitbestimmung"}
    fields = L.build_legal_fields(
        chunk_meta,
        "BAG_1_ABR_22_21",
        source_type="urteil",
        display_name="BAG, 1 ABR 22/21",
    )

    expected = {
        "article": "",
        "article_label": "BAG, 1 ABR 22/21",
        "regulation_code": "BAG_1_ABR_22_21",
    }
    for key, value in expected.items():
        assert fields[key] == value
def test_normalize_regulation_short():
    """normalize_regulation_short repairs title-cased acronyms and keeps mixed case."""
    # Title-cased acronyms are restored to upper case, token by token.
    acronym_cases = [
        ("Dsgvo", "DSGVO"),
        ("Gpsr", "GPSR"),
        ("Ifrs", "IFRS"),
        ("EU Mdr", "EU MDR"),
        ("OWASP Api TOP10", "OWASP API TOP10"),
        (
            "Osha Otm Section-4-safety-hazards Chapter-1",
            "OSHA OTM Section-4-safety-hazards Chapter-1",
        ),
    ]
    for raw, normalized in acronym_cases:
        assert L.normalize_regulation_short(raw) == normalized

    # Legitimate mixed case (NOT in the acronym list) must come back untouched.
    untouched = (
        "GeschGehG",
        "MuSchG",
        "GoBD",
        "MiCA",
        "eIDAS",
        "EuGH C-252/21 (Meta)",
    )
    for name in untouched:
        assert L.normalize_regulation_short(name) == name

    # Whole-value override and empty inputs.
    assert L.normalize_regulation_short("Dataact") == "Data Act"
    for empty in ("", None):
        assert L.normalize_regulation_short(empty) == ""