Files
breakpilot-core/control-pipeline/tests/test_guidance_ingester.py
T
Benjamin Admin 24c618ca2e
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 42s
CI / test-bqas (push) Successful in 39s
feat(control-pipeline): GuidanceIngester engine for supervisory guidance (Parser 3)
Add services/guidance_ingester.py — extracts guidance documents (pdfplumber for
PDF, an HTML stripper otherwise; pdfplumber is imported lazily so the module and
its tests load without it) and tags them as a SEPARATE interpretative source:
source_class=supervisory_guidance / authority_weight=70 / bindingness=
interpretative / use_for_primary=false, with references_out to the binding norms
they interpret (Art. N DSGVO / § N BDSG). Guidance therefore ranks below binding
law for obligation questions yet stays retrievable as interpretation context.

supervisory_guidance is reused deliberately: the live re-ranker already weights
it 70 and 8k+ chunks use it (no classifier change, no schema drift). EDPB is the
first target; technical standards (weight 80) are a later separate class.

Tested: 6 unit tests on the text + metadata path (PDF extraction is exercised in
the container), ruff + mypy clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-24 09:41:14 +02:00

73 lines
2.4 KiB
Python

"""Unit tests for the GuidanceIngester engine (Parser 3).
Pure tests on the text + metadata path (PDF extraction is a lazy pdfplumber
wrapper, exercised in the container). Covers: normalisation, HTML stripping,
references_out to the interpreted norm, the interpretative (non-primary)
metadata and the self-test gate.
"""
from services.guidance_ingester import (
GuidanceSpec,
build_upload_unit,
extract_html,
guidance_meta,
guidance_refs_out,
normalize_text,
self_test,
)
SPEC = GuidanceSpec(
source_id="edpb_dpo",
short="EDPB DPO",
title="EDPB Leitlinien zum Datenschutzbeauftragten",
publisher="EDPB",
url="https://edpb.europa.eu/guidelines/dpo",
interpreted_reg="DSGVO",
version_date="2017-04-05",
)
def test_normalize_text_collapses_whitespace_and_blank_runs():
assert normalize_text("a b\t c\n\n\n\nd") == "a b c\n\nd"
def test_extract_html_strips_tags():
assert "Hallo Welt" in extract_html("<p>Hallo <b>Welt</b></p>")
def test_guidance_refs_out_links_to_interpreted_reg():
text = "Gemaess Artikel 37, Art. 38 und Article 9 der Verordnung sowie § 38 BDSG."
refs = guidance_refs_out("DSGVO", text)
assert "Art. 37 DSGVO" in refs
assert "Art. 38 DSGVO" in refs
assert "Art. 9 DSGVO" in refs
assert "§ 38 BDSG" in refs
def test_guidance_meta_is_interpretative_not_primary():
meta = guidance_meta(SPEC, "Diese Leitlinie erlaeutert Artikel 37 DSGVO im Detail.")
assert meta["source_class"] == "supervisory_guidance"
assert meta["authority_weight"] == 70
assert meta["use_for_primary"] is False
assert meta["bindingness"] == "interpretative"
assert meta["chunk_scope"] == "guidance"
assert meta["regulation_short"] == "EDPB DPO"
assert meta["interprets"] == "DSGVO"
assert meta["issuer"] == "EDPB"
assert "Art. 37 DSGVO" in meta["references_out"]
def test_self_test_passes_long_and_flags_short():
ok, _ = self_test("x" * 300)
assert ok
bad, problems = self_test("too short")
assert not bad and "too short" in problems[0]
def test_build_upload_unit_tags_collection_and_version():
unit = build_upload_unit(SPEC, "A" * 300 + " Artikel 35 DSGVO", "run9")
assert unit.document_version == "run9-edpb_dpo"
assert unit.collection == "bp_compliance_datenschutz"
assert unit.filename == "edpb_dpo.txt"
assert unit.meta["use_for_primary"] is False