feat(rag): Legal-Metadaten — article_label + deterministische IDs + chunk_hash

Neues pures Modul legal_metadata.py (nur stdlib, lokal+CI testbar): §3-Normalisierung
section->article, strikte Header-Extraktion (Datum/Seiten-Rauschen -> kein Falsch-Zitat),
citation_style pro Regulierung (EU/CH=article, DE=paragraph), Urteil=Aktenzeichen statt §,
camelCase-Klarnamen (ProdHaftG), deterministische uuid5-Point-ID + chunk_hash (sha256).
documents.py verdrahtet build_legal_fields in den Payload-Build + document_version.
10 Tests gruen. Vertrag: rag_reingest_spec.md (§2/§3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-20 14:35:07 +02:00
parent adb7c6802c
commit dac2a9f685
3 changed files with 285 additions and 9 deletions
+115
View File
@@ -0,0 +1,115 @@
"""Tests fuer legal_metadata (rag_reingest_spec.md §3-Normalisierung)."""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import legal_metadata as L
def test_detect_citation_style():
    """Check how detect_citation_style classifies section markers.

    A "§" marker is expected to map to "paragraph"; both the spelled-out
    "Artikel" and the abbreviated "Art." are expected to map to "article".
    """
    expected_by_marker = (
        ("§ 38", "paragraph"),
        ("Artikel 13", "article"),
        ("Art. 13", "article"),
    )
    for marker, expected_style in expected_by_marker:
        assert L.detect_citation_style(marker) == expected_style
def test_normalize_article():
    """Check that normalize_article reduces a section header to its number.

    Recognised markers ("§", "Artikel", "Art.", optionally bracketed) must
    yield just the article number, including a letter suffix such as "13a".
    Headers without such a marker must yield the empty string.
    """
    recognised_headers = (
        ("§ 38", "38"),
        ("Artikel 13", "13"),
        ("Art. 13a", "13a"),
        ("[Artikel 17]", "17"),
        ("", ""),
    )
    for header, expected_number in recognised_headers:
        assert L.normalize_article(header) == expected_number

    # Strict: a date/page line or a bare title is NOT a legal reference, so
    # the result must be empty (otherwise a wrong citation would be built).
    non_reference_headers = (
        "4.5.2016 DE Amtsblatt der Europaeischen Union L 119/9",
        "Begriffsbestimmungen",
    )
    for header in non_reference_headers:
        assert L.normalize_article(header) == ""
def test_normalize_paragraph():
    """Check that normalize_paragraph strips "(n)" / "Abs. n" down to "n".

    An empty input is expected to stay empty.
    """
    cases = (
        ("(1)", "1"),
        ("Abs. 2", "2"),
        ("", ""),
    )
    for raw_paragraph, expected_number in cases:
        assert L.normalize_paragraph(raw_paragraph) == expected_number
def test_extract_sub():
    """Check that extract_sub pulls a "lit. x" or "Satz n" marker from text.

    Text that contains no such marker must yield the empty string.
    """
    cases = (
        ("... lit. c ...", "lit. c"),
        ("Satz 2 des Absatzes", "Satz 2"),
        ("kein Sub hier", ""),
    )
    for text, expected_sub in cases:
        assert L.extract_sub(text) == expected_sub
def test_format_article_label_paragraph_style():
    """Check the "<name> § <article>[ Abs. <paragraph>]" label layout.

    Applies to citation style "paragraph"; an empty paragraph argument must
    not add an "Abs." part to the label.
    """
    # The display name (regulation_short) is no longer upper-cased, so a
    # camelCase name such as "ProdHaftG" must survive unchanged.
    cases = (
        (("ProdHaftG", "paragraph", "1", "", ""), "ProdHaftG § 1"),
        (("BDSG", "paragraph", "38", "1", ""), "BDSG § 38 Abs. 1"),
        (("BDSG", "paragraph", "38", "", ""), "BDSG § 38"),
    )
    for label_args, expected_label in cases:
        assert L.format_article_label(*label_args) == expected_label
def test_format_article_label_article_style():
    """Check the "Art. <article>[ Abs. <p>][ <sub>] <name>" label layout.

    Applies to citation style "article"; empty paragraph/sub arguments must
    not leave any trace in the label.
    """
    full_label = L.format_article_label("DSGVO", "article", "13", "1", "lit. c")
    assert full_label == "Art. 13 Abs. 1 lit. c DSGVO"

    article_only_label = L.format_article_label("DSGVO", "article", "30", "", "")
    assert article_only_label == "Art. 30 DSGVO"

    # Swiss law: display name combined with "Art.".
    swiss_label = L.format_article_label("RevDSG", "article", "12", "", "")
    assert swiss_label == "Art. 12 RevDSG"
def test_compute_chunk_hash_stable_and_ws_insensitive():
    """Check that compute_chunk_hash ignores a trailing newline.

    The same text with and without a trailing newline must hash to the same
    value, and that value must be 64 characters long.
    """
    hash_with_newline = L.compute_chunk_hash("Hallo Welt\n")
    hash_without_newline = L.compute_chunk_hash("Hallo Welt")

    assert hash_with_newline == hash_without_newline
    assert len(hash_with_newline) == 64
def test_deterministic_point_id_stable():
    """Check that deterministic_point_id is repeatable and input-sensitive.

    Identical arguments must give the identical ID, changing only the fourth
    positional argument must give a different ID, and the ID must be 36
    characters long (the length of a textual UUID).
    """

    def point_id(varying_arg):
        # Only the fourth positional argument differs between the calls.
        return L.deterministic_point_id("BDSG", "38", "1", varying_arg, "2026-06-19")

    first_id = point_id(0)
    repeated_id = point_id(0)
    other_id = point_id(1)

    assert first_id == repeated_id
    assert first_id != other_id
    assert len(first_id) == 36  # UUID
def test_build_legal_fields_citation_style_override():
    """Check that an explicit citation_style wins over the section marker.

    The section string uses "§", but the caller passes
    citation_style="article"; the resulting fields and label must follow the
    explicit style.
    """
    # DSGVO is an EU regulation -> citation_style "article" is set per
    # regulation and must NOT be derived from the "§" in the section string.
    chunk_meta = {
        "section": "§ 4",
        "section_title": "Begriffsbestimmungen",
        "paragraph": "",
    }
    fields = L.build_legal_fields(chunk_meta, "DSGVO", citation_style="article")

    expected_fields = (
        ("citation_style", "article"),
        ("article", "4"),
        ("article_label", "Art. 4 DSGVO"),
    )
    for key, expected_value in expected_fields:
        assert fields[key] == expected_value
def test_build_legal_fields_integration():
    """Check the full field set built for a "§"-style BDSG chunk.

    The regulation code is passed in lower case together with a display
    name and chunk text; the test pins every field it reads from the result.
    """
    chunk_meta = {
        "section": "§ 38",
        "section_title": "Datenschutzbeauftragte",
        "paragraph": "(1)",
    }
    fields = L.build_legal_fields(
        chunk_meta,
        "bdsg",
        chunk_text="... in der Regel mindestens 20 Personen ...",
        display_name="BDSG",
    )

    expected_fields = (
        ("citation_style", "paragraph"),
        ("article", "38"),
        # Article-level precision: no guessed paragraph is emitted.
        ("paragraph", ""),
        # The stored field value stays upper-case.
        ("regulation_code", "BDSG"),
        ("article_label", "BDSG § 38"),
        ("section_header", "Datenschutzbeauftragte"),
    )
    for key, expected_value in expected_fields:
        assert fields[key] == expected_value
    assert fields["is_recital"] is False
def test_build_legal_fields_camelcase_display_name():
    """Check that a camelCase display name is used verbatim in the label.

    The regulation_code field must keep the upper-case code that was passed
    in, while article_label must use the camelCase display name.
    """
    # The camelCase display name is preserved in the label; the
    # regulation_code field stays upper-case.
    chunk_meta = {"section": "§ 1", "section_title": "Haftung"}
    fields = L.build_legal_fields(chunk_meta, "PRODHAFTG", display_name="ProdHaftG")

    assert fields["regulation_code"] == "PRODHAFTG"
    assert fields["article_label"] == "ProdHaftG § 1"
def test_build_legal_fields_ruling_uses_aktenzeichen_not_paragraph():
    """Check that a court ruling is labelled by its case reference only.

    With source_type="urteil" the article field must be empty and the label
    must equal the display name, even though the section string holds "§ 87".
    """
    # Court ruling: a cross-reference such as "§ 87 BetrVG" in the text must
    # NOT produce a label like "AZ § 87".
    chunk_meta = {"section": "§ 87", "section_title": "Mitbestimmung"}
    fields = L.build_legal_fields(
        chunk_meta,
        "BAG_1_ABR_22_21",
        source_type="urteil",
        display_name="BAG, 1 ABR 22/21",
    )

    expected_fields = (
        ("article", ""),
        ("article_label", "BAG, 1 ABR 22/21"),
        ("regulation_code", "BAG_1_ABR_22_21"),
    )
    for key, expected_value in expected_fields:
        assert fields[key] == expected_value