feat(control-pipeline): GuidanceIngester engine for supervisory guidance (Parser 3)
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 42s
CI / test-bqas (push) Successful in 39s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 42s
CI / test-bqas (push) Successful in 39s
Add services/guidance_ingester.py — extracts guidance documents (pdfplumber for PDF, an HTML stripper otherwise; pdfplumber is imported lazily so the module and its tests load without it) and tags them as a SEPARATE interpretative source: source_class=supervisory_guidance / authority_weight=70 / bindingness=interpretative / use_for_primary=false, with references_out to the binding norms they interpret (Art. N DSGVO / § N BDSG). Guidance therefore ranks below binding law for obligation questions yet stays retrievable as interpretation context. The supervisory_guidance class is reused deliberately: the live re-ranker already weights it 70 and 8k+ chunks use it (no classifier change, no schema drift). EDPB is the first target; technical standards (weight 80) are a later separate class. Tested: 6 unit tests on the text + metadata path (PDF extraction is exercised in the container), ruff + mypy clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,72 @@
|
||||
"""Unit tests for the GuidanceIngester engine (Parser 3).
|
||||
|
||||
Pure tests on the text + metadata path (PDF extraction is a lazy pdfplumber
|
||||
wrapper, exercised in the container). Covers: normalisation, HTML stripping,
|
||||
references_out to the interpreted norm, the interpretative (non-primary)
|
||||
metadata and the self-test gate.
|
||||
"""
|
||||
|
||||
from services.guidance_ingester import (
|
||||
GuidanceSpec,
|
||||
build_upload_unit,
|
||||
extract_html,
|
||||
guidance_meta,
|
||||
guidance_refs_out,
|
||||
normalize_text,
|
||||
self_test,
|
||||
)
|
||||
|
||||
# Shared fixture used by the metadata and upload-unit tests below: one
# guidance document (the EDPB DPO guideline) declared as interpreting the DSGVO.
SPEC = GuidanceSpec(
    # Identity of the guidance document.
    source_id="edpb_dpo",
    short="EDPB DPO",
    title="EDPB Leitlinien zum Datenschutzbeauftragten",
    # Provenance.
    publisher="EDPB",
    url="https://edpb.europa.eu/guidelines/dpo",
    version_date="2017-04-05",
    # Regulation this guidance interprets.
    interpreted_reg="DSGVO",
)
|
||||
|
||||
|
||||
def test_normalize_text_collapses_whitespace_and_blank_runs():
    """A tab+space run becomes one space and four newlines become two."""
    raw = "a b\t c\n\n\n\nd"
    expected = "a b c\n\nd"
    assert normalize_text(raw) == expected
|
||||
|
||||
|
||||
def test_extract_html_strips_tags():
    """The visible text survives once the <p>/<b> markup is removed."""
    stripped = extract_html("<p>Hallo <b>Welt</b></p>")
    assert "Hallo Welt" in stripped
|
||||
|
||||
|
||||
def test_guidance_refs_out_links_to_interpreted_reg():
    """Article mentions in three spellings and a BDSG paragraph all yield refs.

    The sample text uses "Artikel 37", "Art. 38" and "Article 9" without
    naming the regulation next to them, plus an explicit "§ 38 BDSG".
    """
    sample = "Gemaess Artikel 37, Art. 38 und Article 9 der Verordnung sowie § 38 BDSG."
    found = guidance_refs_out("DSGVO", sample)
    for wanted in ("Art. 37 DSGVO", "Art. 38 DSGVO", "Art. 9 DSGVO", "§ 38 BDSG"):
        assert wanted in found
|
||||
|
||||
|
||||
def test_guidance_meta_is_interpretative_not_primary():
    """Metadata built from SPEC marks the chunk as non-primary guidance."""
    meta = guidance_meta(SPEC, "Diese Leitlinie erlaeutert Artikel 37 DSGVO im Detail.")

    # Plain equality checks, table-driven.
    expected_fields = {
        "source_class": "supervisory_guidance",
        "authority_weight": 70,
        "bindingness": "interpretative",
        "chunk_scope": "guidance",
        "regulation_short": "EDPB DPO",
        "interprets": "DSGVO",
        "issuer": "EDPB",
    }
    for key, wanted in expected_fields.items():
        assert meta[key] == wanted

    # Identity check on purpose: the flag must be the bool False, not merely falsy.
    assert meta["use_for_primary"] is False
    assert "Art. 37 DSGVO" in meta["references_out"]
|
||||
|
||||
|
||||
def test_self_test_passes_long_and_flags_short():
    """A 300-character text passes the gate; a 9-character text is rejected.

    For the rejected text, the first reported problem must contain the
    phrase "too short".
    """
    long_ok, _ = self_test("x" * 300)
    assert long_ok

    short_ok, problems = self_test("too short")
    assert not short_ok
    assert "too short" in problems[0]
|
||||
|
||||
|
||||
def test_build_upload_unit_tags_collection_and_version():
    """The upload unit built from SPEC carries version, collection, filename and flag."""
    body = "A" * 300 + " Artikel 35 DSGVO"
    run_id = "run9"

    unit = build_upload_unit(SPEC, body, run_id)

    assert unit.document_version == "run9-edpb_dpo"
    assert unit.collection == "bp_compliance_datenschutz"
    assert unit.filename == "edpb_dpo.txt"
    # Guidance must never be offered as a primary (binding) source.
    assert unit.meta["use_for_primary"] is False
|
||||
Reference in New Issue
Block a user