"""Unit tests for the RecitalIngester engine (Parser 2).

Pure parser + metadata tests against a synthetic eur-lex recital fixture (the
id="rct_N" preamble-table structure). Covers: recital extraction, exclusion of
article text, the self-test gate, and the interpretative (non-primary)
metadata.
"""

import os

from services.legal_act_ingester import RegSpec
from services.recital_ingester import build_upload_units, parse_recitals, self_test

# Synthetic HTML fixture stored next to this test module.
FIXTURE = os.path.join(
    os.path.dirname(__file__), "fixtures", "sample_eurlex_recitals.html"
)

# Regulation spec shared by the metadata tests.
SPEC = RegSpec(
    reg="TEST",
    celex="32099R0001",
    name_de="Testverordnung",
    version_date="2099-01-01",
)


def _raw() -> str:
    """Return the full text of the recital fixture, decoded as UTF-8."""
    with open(FIXTURE, encoding="utf-8") as fh:
        return fh.read()


def test_parse_recitals_from_rct_markers() -> None:
    """The fixture yields recitals "1" and "2", in that order."""
    recs = parse_recitals(_raw(), "TEST")

    assert [r.num for r in recs] == ["1", "2"]
    assert "Hintergrund" in recs[0].text


def test_article_text_is_not_captured_as_recital() -> None:
    """Recital text contains neither article body text nor "(N)" markers."""
    joined = " ".join(r.text for r in parse_recitals(_raw(), "TEST"))

    # the article body must stay out of recitals
    assert "Artikeltext" not in joined
    # the "(N)" markers are stripped
    assert "(1)" not in joined
    assert "(2)" not in joined


def test_self_test_passes_and_flags_empty() -> None:
    """self_test accepts the parsed fixture and rejects an empty list."""
    ok, problems = self_test(parse_recitals(_raw(), "TEST"))
    # Surface the reported problems if the gate unexpectedly fails.
    assert ok, problems

    empty_ok, empty_problems = self_test([])
    assert not empty_ok
    assert "0 recitals" in empty_problems[0]


def test_recital_units_are_interpretative_not_primary() -> None:
    """Upload units built from recitals carry interpretative metadata."""
    units = build_upload_units(parse_recitals(_raw(), "TEST"), SPEC, "run")
    assert len(units) == 2

    meta = units[0].meta
    assert meta["source_class"] == "recital"
    assert meta["authority_weight"] == 60
    assert meta["use_for_primary"] is False
    assert meta["is_recital"] is True
    assert meta["chunk_scope"] == "recital"
    assert meta["citation_unit"] == "TEST Erwägungsgrund 1"
    assert meta["article"] == "Erwaegungsgrund-1"

    # per-recital document_version prevents point-ID collisions
    assert units[0].document_version == "run-test-rec1"
    assert units[1].document_version == "run-test-rec2"

    # recital 1 cites Artikel 5 → forward edge for the citation graph
    assert "Art. 5 TEST" in meta["references_out"]