feat(control-pipeline): production LegalActIngester for EU acts (Parser 1)
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 32s
CI / test-bqas (push) Successful in 30s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 32s
CI / test-bqas (push) Successful in 30s
Add services/legal_act_ingester.py — the EU eur-lex LegalActIngester engine: CELLAR download (with eur-lex fallback, bypassing the HTTP 202 web block on large acts like DORA), parse into articles + annexes with full authority metadata + forward citation edges (references_out), and a self-test gate before upload.

Refactor scripts/ingest_eu_regulations.py to use it: parse-based, per-unit upload with a skip-by-CELEX guard (no automatic re-ingest). Recitals are intentionally left to a separate ingester (Parser 2).

Tested: parser / metadata / self-test / refs_out over a synthetic eur-lex fixture (7 tests), ruff + mypy clean, real CELLAR fetch of DORA verified end-to-end (64 articles, full authority metadata).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,17 @@
|
||||
<!DOCTYPE html>
|
||||
<html><body>
|
||||
<p class="oj-doc-ti">VERORDNUNG (EU) 2099/1 DES TESTGEBERS</p>
|
||||
<p class="oj-normal">(1) Dieser Erwaegungsgrund steht vor den Artikeln und darf NICHT als Artikel geparst werden.</p>
|
||||
<p class="oj-ti-grseq-1">KAPITEL I</p>
|
||||
<p class="oj-ti-art">Artikel 1</p>
|
||||
<p class="oj-sti-art">Gegenstand</p>
|
||||
<p class="oj-normal">Diese Verordnung legt Anforderungen fest; Einzelheiten regeln Artikel 2 und Anhang I.</p>
|
||||
<p class="oj-ti-art">Artikel 2</p>
|
||||
<p class="oj-sti-art">Begriffsbestimmungen</p>
|
||||
<p class="oj-normal">Im Sinne dieser Verordnung bezeichnet der Ausdruck Produkt eine Sache mit digitalen Elementen.</p>
|
||||
<p class="oj-doc-ti">ANHANG I</p>
|
||||
<p class="oj-ti-grseq-1">GRUNDLEGENDE ANFORDERUNGEN</p>
|
||||
<p class="oj-normal">Die Produkte muessen die grundlegenden Anforderungen gemaess Artikel 1 dauerhaft erfuellen.</p>
|
||||
<p class="oj-doc-ti">ANHANG II</p>
|
||||
<p class="oj-normal">x</p>
|
||||
</body></html>
|
||||
@@ -0,0 +1,108 @@
|
||||
"""Unit tests for the LegalActIngester engine (Parser 1).
|
||||
|
||||
Pure parser + metadata tests against a synthetic eur-lex fixture — no network,
|
||||
no RAG service. Covers: article/annex parsing, recital exclusion, references_out,
|
||||
the self-test gate, full authority metadata and empty-annex skipping.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from services.legal_act_ingester import (
|
||||
RegSpec,
|
||||
build_upload_units,
|
||||
parse_html,
|
||||
refs_out,
|
||||
self_test,
|
||||
)
|
||||
|
||||
# Synthetic eur-lex HTML document used by every test in this module; it lives
# in the "fixtures" directory next to this file.
FIXTURE = os.path.join(
    os.path.dirname(__file__),
    "fixtures",
    "sample_eurlex_act.html",
)

# Regulation spec handed to build_upload_units for the synthetic test act.
SPEC = RegSpec(
    reg="TEST",
    celex="32099R0001",
    name_de="Testverordnung",
    version_date="2099-01-01",
    legal_basis_rank="eu_regulation",
)
|
||||
|
||||
|
||||
def _raw() -> str:
    """Return the complete text of the fixture file at ``FIXTURE`` (UTF-8)."""
    with open(FIXTURE, encoding="utf-8") as fixture_file:
        html = fixture_file.read()
    return html
|
||||
|
||||
|
||||
def test_parse_articles_and_annexes():
    """The fixture parses into articles 1-2 and annexes I-II with their fields."""
    parsed = parse_html(_raw(), "TEST")

    article_nums = [article.num for article in parsed.articles]
    assert article_nums == ["1", "2"]

    annex_nums = [annex.num for annex in parsed.annexes]
    assert annex_nums == ["I", "II"]

    # Article 1 carries its sub-title and the chapter heading it appears under.
    first_article = parsed.articles[0]
    assert first_article.title == "Gegenstand"
    assert first_article.chapter == "KAPITEL I"

    # The first body paragraph of Annex I is the fixture's requirement sentence.
    first_annex = parsed.annexes[0]
    assert "grundlegenden Anforderungen" in first_annex.body[0]
|
||||
|
||||
|
||||
def test_recital_before_articles_is_ignored():
    """Recital text placed ahead of Article 1 must not end up in an article body."""
    # The fixture's "(1) Dieser Erwaegungsgrund …" paragraph comes before
    # Article 1; recitals are Parser 2's job, so it must not leak in here.
    parsed = parse_html(_raw(), "TEST")

    paragraphs = []
    for article in parsed.articles:
        for paragraph in article.body:
            paragraphs.append(paragraph)
    all_article_text = " ".join(paragraphs)

    assert "Erwaegungsgrund" not in all_article_text
|
||||
|
||||
|
||||
def test_refs_out_extracts_article_and_annex_edges():
    """refs_out yields citation edges for article and annex mentions in a body."""
    parsed = parse_html(_raw(), "TEST")

    # Article 1 of the fixture mentions both Article 2 and Annex I.
    article_text = " ".join(parsed.articles[0].body)
    art1_refs = refs_out("TEST", article_text)
    for expected_edge in ("Art. 2 TEST", "TEST Anhang I"):
        assert expected_edge in art1_refs

    # The annex points back to Article 1 (bidirectional graph is built later).
    annex_text = " ".join(parsed.annexes[0].body)
    annex_refs = refs_out("TEST", annex_text)
    assert "Art. 1 TEST" in annex_refs
|
||||
|
||||
|
||||
def test_self_test_passes_clean_act():
    """The self-test gate accepts the parsed fixture act."""
    parsed = parse_html(_raw(), "TEST")
    ok, issues = self_test(parsed)
    # The reported issues double as the failure message for easier debugging.
    assert ok, issues
|
||||
|
||||
|
||||
def test_self_test_flags_empty_and_duplicate():
    """The self-test gate rejects duplicate article numbers and near-empty bodies."""
    from services.legal_act_ingester import Article, ParsedAct

    # Case 1: two articles sharing the number "1".
    duplicated_articles = [
        Article("1", body=["enough text here ok"]),
        Article("1", body=["also enough text"]),
    ]
    dup_act = ParsedAct(reg="X", articles=duplicated_articles, annexes=[])
    dup_ok, dup_issues = self_test(dup_act)
    assert not dup_ok
    assert any("duplicate" in issue for issue in dup_issues)

    # Case 2: a single article whose body is just "x".
    empty_act = ParsedAct(
        reg="X",
        articles=[Article("1", body=["x"])],
        annexes=[],
    )
    empty_ok, empty_issues = self_test(empty_act)
    assert not empty_ok
    assert any("empty" in issue for issue in empty_issues)
|
||||
|
||||
|
||||
def test_build_upload_units_skips_empty_annex_and_tags_authority():
    """Upload units omit the near-empty annex and carry authority metadata."""
    parsed = parse_html(_raw(), "TEST")
    units = build_upload_units(parsed, SPEC, "2099-test")

    # 2 articles + Annex I (Annex II body "x" is skipped) = 3 units
    assert len(units) == 3

    # Index the units by their citation unit for direct lookup.
    by_cu = {}
    for unit in units:
        by_cu[unit.meta["citation_unit"]] = unit
    assert set(by_cu) == {"Art. 1 TEST", "Art. 2 TEST", "TEST Anhang I"}

    article_unit = by_cu["Art. 1 TEST"]
    expected_article_meta = (
        ("chunk_scope", "section"),
        ("source_class", "binding_law"),
        ("authority_weight", 100),
        ("jurisdiction", "EU"),
    )
    for key, expected_value in expected_article_meta:
        assert article_unit.meta[key] == expected_value
    assert article_unit.meta["use_for_primary"] is True
    assert article_unit.document_version == "2099-test-test"

    annex_unit = by_cu["TEST Anhang I"]
    assert annex_unit.meta["chunk_scope"] == "annex"
    assert annex_unit.meta["article"] == "Anhang-I"
    # per-annex document_version prevents point-ID collisions across annexes
    assert annex_unit.document_version == "2099-test-test-anhangI"
|
||||
|
||||
|
||||
def test_build_upload_units_distinct_annex_versions():
    """Each annex unit gets its own document_version, in annex order."""
    from services.legal_act_ingester import Annex, Article, ParsedAct

    only_article = Article("1", body=["body text long enough"])
    annex_one = Annex("I", body=["annex one body long enough"])
    annex_two = Annex("II", body=["annex two body long enough"])
    act = ParsedAct(
        reg="TEST",
        articles=[only_article],
        annexes=[annex_one, annex_two],
    )

    units = build_upload_units(act, SPEC, "run9")

    # Collect the versions of annex-scoped units only, preserving unit order.
    annex_versions = []
    for unit in units:
        if unit.meta["chunk_scope"] == "annex":
            annex_versions.append(unit.document_version)

    assert annex_versions == ["run9-test-anhangI", "run9-test-anhangII"]
    assert len(set(annex_versions)) == 2
|
||||
Reference in New Issue
Block a user