dac2a9f685
Neues pures Modul legal_metadata.py (nur stdlib, lokal+CI testbar): §3-Normalisierung section->article, strikte Header-Extraktion (Datum/Seiten-Rauschen -> kein Falsch-Zitat), citation_style pro Regulierung (EU/CH=article, DE=paragraph), Urteil=Aktenzeichen statt §, camelCase-Klarnamen (ProdHaftG), deterministische uuid5-Point-ID + chunk_hash (sha256). documents.py verdrahtet build_legal_fields in den Payload-Build + document_version. 10 Tests gruen. Vertrag: rag_reingest_spec.md (§2/§3). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
116 lines
4.5 KiB
Python
116 lines
4.5 KiB
Python
"""Tests fuer legal_metadata (rag_reingest_spec.md §3-Normalisierung)."""
|
|
|
|
import sys
from pathlib import Path

# Put the parent of this file's directory at the front of sys.path before the
# import below (presumably that is where legal_metadata lives -- confirm).
_PARENT_DIR = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_PARENT_DIR))

import legal_metadata as L
|
|
|
|
|
|
def test_detect_citation_style():
    """Check the citation style detected from a raw section string."""
    expected_by_section = {
        "§ 38": "paragraph",
        "Artikel 13": "article",
        "Art. 13": "article",
    }
    for section, style in expected_by_section.items():
        assert L.detect_citation_style(section) == style
|
|
|
|
|
|
def test_normalize_article():
    """Check that section strings reduce to the bare article identifier."""
    cases = [
        ("§ 38", "38"),
        ("Artikel 13", "13"),
        ("Art. 13a", "13a"),
        ("[Artikel 17]", "17"),
        ("", ""),
        # Strict: a date/page line or a bare title is NOT a citation, so the
        # result must be empty (no false citation).
        ("4.5.2016 DE Amtsblatt der Europaeischen Union L 119/9", ""),
        ("Begriffsbestimmungen", ""),
    ]
    for raw, expected in cases:
        assert L.normalize_article(raw) == expected
|
|
|
|
|
|
def test_normalize_paragraph():
    """Check that paragraph markers reduce to the bare paragraph number."""
    cases = [
        ("(1)", "1"),
        ("Abs. 2", "2"),
        ("", ""),
    ]
    for raw, expected in cases:
        assert L.normalize_paragraph(raw) == expected
|
|
|
|
|
|
def test_extract_sub():
    """Check extraction of a sub-reference (``lit.``/``Satz``) from free text."""
    cases = [
        ("... lit. c ...", "lit. c"),
        ("Satz 2 des Absatzes", "Satz 2"),
        ("kein Sub hier", ""),  # no sub-reference present -> empty string
    ]
    for text, expected in cases:
        assert L.extract_sub(text) == expected
|
|
|
|
|
|
def test_format_article_label_paragraph_style():
    """Check labels built in 'paragraph' style: name first, then ``§``."""
    # The display name (regulation_short) is no longer upper-cased, so a
    # camelCase name is kept as-is.
    cases = [
        (("ProdHaftG", "paragraph", "1", "", ""), "ProdHaftG § 1"),
        (("BDSG", "paragraph", "38", "1", ""), "BDSG § 38 Abs. 1"),
        (("BDSG", "paragraph", "38", "", ""), "BDSG § 38"),
    ]
    for args, expected in cases:
        assert L.format_article_label(*args) == expected
|
|
|
|
|
|
def test_format_article_label_article_style():
    """Check labels built in 'article' style: ``Art.`` first, name last."""
    cases = [
        (("DSGVO", "article", "13", "1", "lit. c"), "Art. 13 Abs. 1 lit. c DSGVO"),
        (("DSGVO", "article", "30", "", ""), "Art. 30 DSGVO"),
        # Swiss law: display name + Art.
        (("RevDSG", "article", "12", "", ""), "Art. 12 RevDSG"),
    ]
    for args, expected in cases:
        assert L.format_article_label(*args) == expected
|
|
|
|
|
|
def test_compute_chunk_hash_stable_and_ws_insensitive():
    """Check the chunk hash ignores a trailing newline and has length 64."""
    with_newline = L.compute_chunk_hash("Hallo Welt\n")
    without_newline = L.compute_chunk_hash("Hallo Welt")
    assert with_newline == without_newline
    assert len(with_newline) == 64
|
|
|
|
|
|
def test_deterministic_point_id_stable():
    """Check point IDs repeat for equal inputs and change with the 4th argument."""
    leading_args = ("BDSG", "38", "1")
    trailing_arg = "2026-06-19"
    first = L.deterministic_point_id(*leading_args, 0, trailing_arg)
    repeated = L.deterministic_point_id(*leading_args, 0, trailing_arg)
    changed = L.deterministic_point_id(*leading_args, 1, trailing_arg)
    assert first == repeated
    assert first != changed
    assert len(first) == 36  # UUID
|
|
|
|
|
|
def test_build_legal_fields_citation_style_override():
    """Check an explicit ``citation_style`` wins over the '§' in the section."""
    # DSGVO is an EU regulation -> citation_style 'article' is given per
    # regulation, NOT taken from the '§' in the section string.
    section_meta = {
        "section": "§ 4",
        "section_title": "Begriffsbestimmungen",
        "paragraph": "",
    }
    out = L.build_legal_fields(section_meta, "DSGVO", citation_style="article")
    expected = {
        "citation_style": "article",
        "article": "4",
        "article_label": "Art. 4 DSGVO",
    }
    for key, value in expected.items():
        assert out[key] == value
|
|
|
|
|
|
def test_build_legal_fields_integration():
    """Check the fields built end-to-end for a BDSG § 38 chunk."""
    section_meta = {
        "section": "§ 38",
        "section_title": "Datenschutzbeauftragte",
        "paragraph": "(1)",
    }
    out = L.build_legal_fields(
        section_meta,
        "bdsg",
        chunk_text="... in der Regel mindestens 20 Personen ...",
        display_name="BDSG",
    )
    expected = {
        "citation_style": "paragraph",
        "article": "38",
        "paragraph": "",  # article-level precision, no guessed paragraph
        "regulation_code": "BDSG",  # field value stays upper-case
        "article_label": "BDSG § 38",
        "section_header": "Datenschutzbeauftragte",
    }
    for key, value in expected.items():
        assert out[key] == value
    # Identity check on purpose: the value must be the bool False itself.
    assert out["is_recital"] is False
|
|
|
|
|
|
def test_build_legal_fields_camelcase_display_name():
    """Check the label keeps a camelCase display name unchanged."""
    # The camelCase display name is kept in the label, while the
    # regulation_code field stays upper-case.
    section_meta = {"section": "§ 1", "section_title": "Haftung"}
    out = L.build_legal_fields(section_meta, "PRODHAFTG", display_name="ProdHaftG")
    expected = {
        "regulation_code": "PRODHAFTG",
        "article_label": "ProdHaftG § 1",
    }
    for key, value in expected.items():
        assert out[key] == value
|
|
|
|
|
|
def test_build_legal_fields_ruling_uses_aktenzeichen_not_paragraph():
    """Check a ruling is labelled with its display name and has no article."""
    # Court ruling: a cross-reference such as "§ 87 BetrVG" in the text must
    # NOT produce a label like "AZ § 87".
    section_meta = {"section": "§ 87", "section_title": "Mitbestimmung"}
    out = L.build_legal_fields(
        section_meta,
        "BAG_1_ABR_22_21",
        source_type="urteil",
        display_name="BAG, 1 ABR 22/21",
    )
    expected = {
        "article": "",
        "article_label": "BAG, 1 ABR 22/21",
        "regulation_code": "BAG_1_ABR_22_21",
    }
    for key, value in expected.items():
        assert out[key] == value
|