Files
breakpilot-core/control-pipeline/tests/test_recital_ingester.py
T
Benjamin Admin c258fbc3de
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 30s
CI / test-python-voice (push) Successful in 38s
CI / test-bqas (push) Successful in 40s
feat(control-pipeline): RecitalIngester for EU act recitals (Parser 2)
Add services/recital_ingester.py — parses EU act recitals (Erwägungsgründe)
from the eur-lex/CELLAR preamble via the id="rct_N" markers (the table layout
that defeats a naive article parser) and tags them as a SEPARATE interpretative
source: source_class=recital, authority_weight=60, use_for_primary=false, so
they rank below binding articles and surface only as interpretation context.
Reuses the Parser-1 download + helpers. Add scripts/ingest_recitals.py
(skip-by-existing, no auto re-ingest) + tests/fixture.

Tested: 4 unit tests over a synthetic rct_N fixture, ruff + mypy clean, real
CELLAR parse of DORA verified end-to-end (106 recitals, interpretative metadata).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-24 08:49:30 +02:00

57 lines
2.2 KiB
Python

"""Unit tests for the RecitalIngester engine (Parser 2).
Pure parser + metadata tests against a synthetic eur-lex recital fixture (the
id="rct_N" preamble-table structure). Covers: recital extraction, exclusion of
article text, the self-test gate, and the interpretative (non-primary) metadata.
"""
import os
from services.legal_act_ingester import RegSpec
from services.recital_ingester import build_upload_units, parse_recitals, self_test
# Path to the synthetic eur-lex recital HTML fixture, resolved relative to this
# test file so the tests do not depend on the current working directory.
FIXTURE = os.path.join(os.path.dirname(__file__), "fixtures", "sample_eurlex_recitals.html")
# Regulation spec passed to build_upload_units below. The values look like
# placeholders (reg "TEST", year 2099) rather than a real act — TODO confirm.
SPEC = RegSpec(reg="TEST", celex="32099R0001", name_de="Testverordnung", version_date="2099-01-01")
def _raw() -> str:
    """Return the full text of the recital fixture file, decoded as UTF-8."""
    with open(FIXTURE, encoding="utf-8") as fixture_file:
        html = fixture_file.read()
    return html
def test_parse_recitals_from_rct_markers():
    """The fixture yields recitals numbered "1" and "2", in that order."""
    recitals = parse_recitals(_raw(), "TEST")

    numbers = [recital.num for recital in recitals]
    assert numbers == ["1", "2"]

    # The first recital's text must contain the fixture keyword.
    first_text = recitals[0].text
    assert "Hintergrund" in first_text
def test_article_text_is_not_captured_as_recital():
    """Recital text contains neither the article body nor the "(N)" markers."""
    texts = [recital.text for recital in parse_recitals(_raw(), "TEST")]
    combined = " ".join(texts)

    # the article body must stay out of recitals
    assert "Artikeltext" not in combined

    # the "(N)" markers are stripped
    for marker in ("(1)", "(2)"):
        assert marker not in combined
def test_self_test_passes_and_flags_empty():
    """self_test accepts the parsed fixture and rejects an empty recital list."""
    parsed = parse_recitals(_raw(), "TEST")
    passed, _ = self_test(parsed)
    assert passed

    # An empty input must fail the gate and name the cause in its first problem.
    empty_passed, problems = self_test([])
    assert not empty_passed
    assert "0 recitals" in problems[0]
def test_recital_units_are_interpretative_not_primary():
    """Upload units built from recitals carry recital (non-primary) metadata."""
    recitals = parse_recitals(_raw(), "TEST")
    uploads = build_upload_units(recitals, SPEC, "run")
    assert len(uploads) == 2

    first_meta = uploads[0].meta

    # Classification: tagged as a recital and excluded from primary use.
    assert first_meta["source_class"] == "recital"
    assert first_meta["authority_weight"] == 60
    assert first_meta["use_for_primary"] is False
    assert first_meta["is_recital"] is True
    assert first_meta["chunk_scope"] == "recital"

    # Citation identifiers for the first recital.
    assert first_meta["citation_unit"] == "TEST Erwägungsgrund 1"
    assert first_meta["article"] == "Erwaegungsgrund-1"

    # per-recital document_version prevents point-ID collisions
    versions = [upload.document_version for upload in uploads]
    assert versions == ["run-test-rec1", "run-test-rec2"]

    # recital 1 cites Artikel 5 → forward edge for the citation graph
    outgoing_refs = first_meta["references_out"]
    assert "Art. 5 TEST" in outgoing_refs