24c618ca2e
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 42s
CI / test-bqas (push) Successful in 39s
Add services/guidance_ingester.py — extracts guidance documents (pdfplumber for PDF, an HTML stripper otherwise; pdfplumber is imported lazily so the module and its tests load without it) and tags them as a SEPARATE interpretative source: source_class=supervisory_guidance / authority_weight=70 / bindingness= interpretative / use_for_primary=false, with references_out to the binding norms they interpret (Art. N DSGVO / § N BDSG). Guidance therefore ranks below binding law for obligation questions yet stays retrievable as interpretation context. supervisory_guidance is reused deliberately: the live re-ranker already weights it 70 and 8k+ chunks use it (no classifier change, no schema drift). EDPB is the first target; technical standards (weight 80) are a later separate class. Tested: 6 unit tests on the text + metadata path (PDF extraction is exercised in the container), ruff + mypy clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
73 lines
2.4 KiB
Python
73 lines
2.4 KiB
Python
"""Unit tests for the GuidanceIngester engine (Parser 3).
|
|
|
|
Pure tests on the text + metadata path (PDF extraction is a lazy pdfplumber
|
|
wrapper, exercised in the container). Covers: normalisation, HTML stripping,
|
|
references_out to the interpreted norm, the interpretative (non-primary)
|
|
metadata and the self-test gate.
|
|
"""
|
|
|
|
from services.guidance_ingester import (
|
|
GuidanceSpec,
|
|
build_upload_unit,
|
|
extract_html,
|
|
guidance_meta,
|
|
guidance_refs_out,
|
|
normalize_text,
|
|
self_test,
|
|
)
|
|
|
|
SPEC = GuidanceSpec(
|
|
source_id="edpb_dpo",
|
|
short="EDPB DPO",
|
|
title="EDPB Leitlinien zum Datenschutzbeauftragten",
|
|
publisher="EDPB",
|
|
url="https://edpb.europa.eu/guidelines/dpo",
|
|
interpreted_reg="DSGVO",
|
|
version_date="2017-04-05",
|
|
)
|
|
|
|
|
|
def test_normalize_text_collapses_whitespace_and_blank_runs():
|
|
assert normalize_text("a b\t c\n\n\n\nd") == "a b c\n\nd"
|
|
|
|
|
|
def test_extract_html_strips_tags():
|
|
assert "Hallo Welt" in extract_html("<p>Hallo <b>Welt</b></p>")
|
|
|
|
|
|
def test_guidance_refs_out_links_to_interpreted_reg():
|
|
text = "Gemaess Artikel 37, Art. 38 und Article 9 der Verordnung sowie § 38 BDSG."
|
|
refs = guidance_refs_out("DSGVO", text)
|
|
assert "Art. 37 DSGVO" in refs
|
|
assert "Art. 38 DSGVO" in refs
|
|
assert "Art. 9 DSGVO" in refs
|
|
assert "§ 38 BDSG" in refs
|
|
|
|
|
|
def test_guidance_meta_is_interpretative_not_primary():
|
|
meta = guidance_meta(SPEC, "Diese Leitlinie erlaeutert Artikel 37 DSGVO im Detail.")
|
|
assert meta["source_class"] == "supervisory_guidance"
|
|
assert meta["authority_weight"] == 70
|
|
assert meta["use_for_primary"] is False
|
|
assert meta["bindingness"] == "interpretative"
|
|
assert meta["chunk_scope"] == "guidance"
|
|
assert meta["regulation_short"] == "EDPB DPO"
|
|
assert meta["interprets"] == "DSGVO"
|
|
assert meta["issuer"] == "EDPB"
|
|
assert "Art. 37 DSGVO" in meta["references_out"]
|
|
|
|
|
|
def test_self_test_passes_long_and_flags_short():
|
|
ok, _ = self_test("x" * 300)
|
|
assert ok
|
|
bad, problems = self_test("too short")
|
|
assert not bad and "too short" in problems[0]
|
|
|
|
|
|
def test_build_upload_unit_tags_collection_and_version():
|
|
unit = build_upload_unit(SPEC, "A" * 300 + " Artikel 35 DSGVO", "run9")
|
|
assert unit.document_version == "run9-edpb_dpo"
|
|
assert unit.collection == "bp_compliance_datenschutz"
|
|
assert unit.filename == "edpb_dpo.txt"
|
|
assert unit.meta["use_for_primary"] is False
|