"""Unit tests for the GuidanceIngester engine (Parser 3). Pure tests on the text + metadata path (PDF extraction is a lazy pdfplumber wrapper, exercised in the container). Covers: normalisation, HTML stripping, references_out to the interpreted norm, the interpretative (non-primary) metadata and the self-test gate. """ from services.guidance_ingester import ( GuidanceSpec, build_upload_unit, extract_html, guidance_meta, guidance_refs_out, normalize_text, self_test, ) SPEC = GuidanceSpec( source_id="edpb_dpo", short="EDPB DPO", title="EDPB Leitlinien zum Datenschutzbeauftragten", publisher="EDPB", url="https://edpb.europa.eu/guidelines/dpo", interpreted_reg="DSGVO", version_date="2017-04-05", ) def test_normalize_text_collapses_whitespace_and_blank_runs(): assert normalize_text("a b\t c\n\n\n\nd") == "a b c\n\nd" def test_extract_html_strips_tags(): assert "Hallo Welt" in extract_html("

Hallo Welt

") def test_guidance_refs_out_links_to_interpreted_reg(): text = "Gemaess Artikel 37, Art. 38 und Article 9 der Verordnung sowie § 38 BDSG." refs = guidance_refs_out("DSGVO", text) assert "Art. 37 DSGVO" in refs assert "Art. 38 DSGVO" in refs assert "Art. 9 DSGVO" in refs assert "§ 38 BDSG" in refs def test_guidance_meta_is_interpretative_not_primary(): meta = guidance_meta(SPEC, "Diese Leitlinie erlaeutert Artikel 37 DSGVO im Detail.") assert meta["source_class"] == "supervisory_guidance" assert meta["authority_weight"] == 70 assert meta["use_for_primary"] is False assert meta["bindingness"] == "interpretative" assert meta["chunk_scope"] == "guidance" assert meta["regulation_short"] == "EDPB DPO" assert meta["interprets"] == "DSGVO" assert meta["issuer"] == "EDPB" assert "Art. 37 DSGVO" in meta["references_out"] def test_self_test_passes_long_and_flags_short(): ok, _ = self_test("x" * 300) assert ok bad, problems = self_test("too short") assert not bad and "too short" in problems[0] def test_build_upload_unit_tags_collection_and_version(): unit = build_upload_unit(SPEC, "A" * 300 + " Artikel 35 DSGVO", "run9") assert unit.document_version == "run9-edpb_dpo" assert unit.collection == "bp_compliance_datenschutz" assert unit.filename == "edpb_dpo.txt" assert unit.meta["use_for_primary"] is False