c258fbc3de
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 30s
CI / test-python-voice (push) Successful in 38s
CI / test-bqas (push) Successful in 40s
Add services/recital_ingester.py — parses EU act recitals (Erwägungsgründe) from the eur-lex/CELLAR preamble via the id="rct_N" markers (the table layout that defeats a naive article parser) and tags them as a SEPARATE interpretative source: source_class=recital, authority_weight=60, use_for_primary=false, so they rank below binding articles and surface only as interpretation context. Reuses the Parser-1 download + helpers. Add scripts/ingest_recitals.py (skip-by-existing, no auto re-ingest) + tests/fixture. Tested: 4 unit tests over a synthetic rct_N fixture, ruff + mypy clean, real CELLAR parse of DORA verified end-to-end (106 recitals, interpretative metadata). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
57 lines
2.2 KiB
Python
57 lines
2.2 KiB
Python
"""Unit tests for the RecitalIngester engine (Parser 2).
|
|
|
|
Pure parser + metadata tests against a synthetic eur-lex recital fixture (the
|
|
id="rct_N" preamble-table structure). Covers: recital extraction, exclusion of
|
|
article text, the self-test gate, and the interpretative (non-primary) metadata.
|
|
"""
|
|
|
|
import os
|
|
|
|
from services.legal_act_ingester import RegSpec
|
|
from services.recital_ingester import build_upload_units, parse_recitals, self_test
|
|
|
|
# Path of the synthetic eur-lex recital HTML fixture, resolved relative to this
# test module so the tests do not depend on the current working directory.
FIXTURE = os.path.join(
    os.path.dirname(__file__),
    "fixtures",
    "sample_eurlex_recitals.html",
)

# Regulation spec handed to build_upload_units in the metadata test below.
SPEC = RegSpec(
    reg="TEST",
    celex="32099R0001",
    name_de="Testverordnung",
    version_date="2099-01-01",
)
|
|
|
|
|
|
def _raw() -> str:
    """Return the full text of the fixture file at FIXTURE, decoded as UTF-8."""
    with open(FIXTURE, encoding="utf-8") as handle:
        contents = handle.read()
    return contents
|
|
|
|
|
|
def test_parse_recitals_from_rct_markers():
    """The fixture yields recitals numbered "1" and "2", in that order."""
    recitals = parse_recitals(_raw(), "TEST")
    numbers = [recital.num for recital in recitals]
    assert numbers == ["1", "2"]
    # The first recital must carry its own body text.
    first_text = recitals[0].text
    assert "Hintergrund" in first_text
|
|
|
|
|
|
def test_article_text_is_not_captured_as_recital():
    """Recital texts contain neither the article body nor the "(N)" markers."""
    recitals = parse_recitals(_raw(), "TEST")
    joined = " ".join(recital.text for recital in recitals)
    # the article body must stay out of recitals
    assert "Artikeltext" not in joined
    # the "(N)" markers are stripped
    assert not any(marker in joined for marker in ("(1)", "(2)"))
|
|
|
|
|
|
def test_self_test_passes_and_flags_empty():
    """self_test accepts the parsed fixture and rejects an empty recital list."""
    parsed = parse_recitals(_raw(), "TEST")
    passed, _ = self_test(parsed)
    assert passed

    # An empty list must fail the gate and report the "0 recitals" problem first.
    empty_passed, problems = self_test([])
    assert not empty_passed
    assert "0 recitals" in problems[0]
|
|
|
|
|
|
def test_recital_units_are_interpretative_not_primary():
    """Upload units built from recitals are tagged as non-primary recital sources."""
    recitals = parse_recitals(_raw(), "TEST")
    units = build_upload_units(recitals, SPEC, "run")
    assert len(units) == 2

    meta = units[0].meta
    assert meta["source_class"] == "recital"
    assert meta["authority_weight"] == 60
    # Identity checks: the flags must be real booleans, not merely falsy/truthy.
    assert meta["use_for_primary"] is False
    assert meta["is_recital"] is True

    expected_fields = (
        ("chunk_scope", "recital"),
        ("citation_unit", "TEST Erwägungsgrund 1"),
        ("article", "Erwaegungsgrund-1"),
    )
    for key, expected in expected_fields:
        assert meta[key] == expected

    # per-recital document_version prevents point-ID collisions
    versions = [unit.document_version for unit in units]
    assert versions == ["run-test-rec1", "run-test-rec2"]

    # recital 1 cites Artikel 5 → forward edge for the citation graph
    assert "Art. 5 TEST" in meta["references_out"]
|