feat(control-pipeline): production LegalActIngester for EU acts (Parser 1)
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 32s
CI / test-bqas (push) Successful in 30s

Add services/legal_act_ingester.py — the EU eur-lex LegalActIngester engine:
CELLAR download (with eur-lex fallback, bypassing the HTTP 202 web block on
large acts like DORA), parse into articles + annexes with full authority
metadata + forward citation edges (references_out), and a self-test gate before
upload. Refactor scripts/ingest_eu_regulations.py to use it: parse-based,
per-unit upload with a skip-by-CELEX guard (no automatic re-ingest). Recitals
are intentionally left to a separate ingester (Parser 2).

Tested: parser / metadata / self-test / refs_out over a synthetic eur-lex
fixture (7 tests), ruff + mypy clean, real CELLAR fetch of DORA verified
end-to-end (64 articles, full authority metadata).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-24 08:17:56 +02:00
parent f398088fbb
commit 569f64a400
4 changed files with 556 additions and 160 deletions
+17
View File
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html><body>
<p class="oj-doc-ti">VERORDNUNG (EU) 2099/1 DES TESTGEBERS</p>
<p class="oj-normal">(1) Dieser Erwaegungsgrund steht vor den Artikeln und darf NICHT als Artikel geparst werden.</p>
<p class="oj-ti-grseq-1">KAPITEL I</p>
<p class="oj-ti-art">Artikel 1</p>
<p class="oj-sti-art">Gegenstand</p>
<p class="oj-normal">Diese Verordnung legt Anforderungen fest; Einzelheiten regeln Artikel 2 und Anhang I.</p>
<p class="oj-ti-art">Artikel 2</p>
<p class="oj-sti-art">Begriffsbestimmungen</p>
<p class="oj-normal">Im Sinne dieser Verordnung bezeichnet der Ausdruck Produkt eine Sache mit digitalen Elementen.</p>
<p class="oj-doc-ti">ANHANG I</p>
<p class="oj-ti-grseq-1">GRUNDLEGENDE ANFORDERUNGEN</p>
<p class="oj-normal">Die Produkte muessen die grundlegenden Anforderungen gemaess Artikel 1 dauerhaft erfuellen.</p>
<p class="oj-doc-ti">ANHANG II</p>
<p class="oj-normal">x</p>
</body></html>
@@ -0,0 +1,108 @@
"""Unit tests for the LegalActIngester engine (Parser 1).
Pure parser + metadata tests against a synthetic eur-lex fixture — no network,
no RAG service. Covers: article/annex parsing, recital exclusion, references_out,
the self-test gate, full authority metadata and empty-annex skipping.
"""
import os
from services.legal_act_ingester import (
RegSpec,
build_upload_units,
parse_html,
refs_out,
self_test,
)
# Path to the synthetic eur-lex HTML document, resolved relative to this test
# module so the suite does not depend on the current working directory.
FIXTURE = os.path.join(
    os.path.dirname(__file__),
    "fixtures",
    "sample_eurlex_act.html",
)

# Regulation spec shared by the build_upload_units tests; the CELEX number and
# dates are synthetic values matching the fixture's made-up act.
SPEC = RegSpec(
    reg="TEST",
    celex="32099R0001",
    name_de="Testverordnung",
    version_date="2099-01-01",
    legal_basis_rank="eu_regulation",
)
def _raw() -> str:
    """Return the complete text of the synthetic eur-lex fixture file.

    The file at ``FIXTURE`` is decoded as UTF-8 and read in one go.
    """
    with open(FIXTURE, encoding="utf-8") as fixture_file:
        contents = fixture_file.read()
    return contents
def test_parse_articles_and_annexes():
    """The fixture parses into articles 1-2 and annexes I-II with their metadata."""
    parsed = parse_html(_raw(), "TEST")

    article_numbers = [article.num for article in parsed.articles]
    assert article_numbers == ["1", "2"]

    annex_numbers = [annex.num for annex in parsed.annexes]
    assert annex_numbers == ["I", "II"]

    # Article 1 carries its sub-title and the chapter heading that precedes it.
    first_article = parsed.articles[0]
    assert first_article.title == "Gegenstand"
    assert first_article.chapter == "KAPITEL I"

    # The first body paragraph of Annex I is the annex's running text.
    first_annex_paragraph = parsed.annexes[0].body[0]
    assert "grundlegenden Anforderungen" in first_annex_paragraph
def test_recital_before_articles_is_ignored():
    """No article body may contain the recital text that precedes Article 1.

    The fixture places a "(1) Dieser Erwaegungsgrund …" paragraph before the
    first article; recitals are Parser 2's job and must not leak in here.
    """
    parsed = parse_html(_raw(), "TEST")

    paragraphs = []
    for article in parsed.articles:
        paragraphs.extend(article.body)
    combined_text = " ".join(paragraphs)

    assert "Erwaegungsgrund" not in combined_text
def test_refs_out_extracts_article_and_annex_edges():
    """refs_out finds the forward citations in Article 1 and in Annex I."""
    parsed = parse_html(_raw(), "TEST")

    # Article 1 of the fixture mentions both Article 2 and Annex I.
    article_text = " ".join(parsed.articles[0].body)
    article_edges = refs_out("TEST", article_text)
    for expected_edge in ("Art. 2 TEST", "TEST Anhang I"):
        assert expected_edge in article_edges

    # The annex points back to Article 1 (bidirectional graph is built later).
    annex_text = " ".join(parsed.annexes[0].body)
    annex_edges = refs_out("TEST", annex_text)
    assert "Art. 1 TEST" in annex_edges
def test_self_test_passes_clean_act():
    """The parsed fixture passes the self-test gate.

    On failure the reported problems are used as the assertion message.
    """
    parsed = parse_html(_raw(), "TEST")
    passed, problems = self_test(parsed)
    assert passed, problems
def test_self_test_flags_empty_and_duplicate():
    """self_test rejects duplicate article numbers and near-empty bodies."""
    from services.legal_act_ingester import Article, ParsedAct

    # Two articles sharing the number "1" must be reported as a duplicate.
    duplicated = ParsedAct(
        reg="X",
        articles=[
            Article("1", body=["enough text here ok"]),
            Article("1", body=["also enough text"]),
        ],
        annexes=[],
    )
    dup_passed, dup_problems = self_test(duplicated)
    assert not dup_passed
    assert any("duplicate" in problem for problem in dup_problems)

    # A single-character body must be reported as empty.
    near_empty = ParsedAct(
        reg="X",
        articles=[Article("1", body=["x"])],
        annexes=[],
    )
    empty_passed, empty_problems = self_test(near_empty)
    assert not empty_passed
    assert any("empty" in problem for problem in empty_problems)
def test_build_upload_units_skips_empty_annex_and_tags_authority():
    """Upload units omit the empty annex and carry the authority metadata."""
    parsed = parse_html(_raw(), "TEST")
    units = build_upload_units(parsed, SPEC, "2099-test")

    # 2 articles + Annex I (Annex II body "x" is skipped) = 3 units
    assert len(units) == 3

    units_by_citation = {}
    for unit in units:
        units_by_citation[unit.meta["citation_unit"]] = unit
    assert set(units_by_citation) == {"Art. 1 TEST", "Art. 2 TEST", "TEST Anhang I"}

    article_unit = units_by_citation["Art. 1 TEST"]
    expected_article_meta = (
        ("chunk_scope", "section"),
        ("source_class", "binding_law"),
        ("authority_weight", 100),
        ("jurisdiction", "EU"),
    )
    for key, expected_value in expected_article_meta:
        assert article_unit.meta[key] == expected_value
    # Identity check on purpose: the flag must be the bool True, not just truthy.
    assert article_unit.meta["use_for_primary"] is True
    assert article_unit.document_version == "2099-test-test"

    annex_unit = units_by_citation["TEST Anhang I"]
    assert annex_unit.meta["chunk_scope"] == "annex"
    assert annex_unit.meta["article"] == "Anhang-I"
    # per-annex document_version prevents point-ID collisions across annexes
    assert annex_unit.document_version == "2099-test-test-anhangI"
def test_build_upload_units_distinct_annex_versions():
    """Each annex unit gets its own document_version suffixed with the annex number."""
    from services.legal_act_ingester import Annex, Article, ParsedAct

    annex_one = Annex("I", body=["annex one body long enough"])
    annex_two = Annex("II", body=["annex two body long enough"])
    act = ParsedAct(
        reg="TEST",
        articles=[Article("1", body=["body text long enough"])],
        annexes=[annex_one, annex_two],
    )

    units = build_upload_units(act, SPEC, "run9")

    # Keep only the annex units, in the order build_upload_units returned them.
    annex_versions = []
    for unit in units:
        if unit.meta["chunk_scope"] == "annex":
            annex_versions.append(unit.document_version)

    assert annex_versions == ["run9-test-anhangI", "run9-test-anhangII"]
    assert len(set(annex_versions)) == 2