feat(control-pipeline): GuidanceIngester engine for supervisory guidance (Parser 3)
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 42s
CI / test-bqas (push) Successful in 39s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 42s
CI / test-bqas (push) Successful in 39s
Add services/guidance_ingester.py — extracts guidance documents (pdfplumber for PDF, an HTML stripper otherwise; pdfplumber is imported lazily so the module and its tests load without it) and tags them as a SEPARATE interpretative source: source_class=supervisory_guidance / authority_weight=70 / bindingness=interpretative / use_for_primary=false, with references_out to the binding norms they interpret (Art. N DSGVO / § N BDSG). Guidance therefore ranks below binding law for obligation questions yet stays retrievable as interpretation context. supervisory_guidance is reused deliberately: the live re-ranker already weights it 70 and 8k+ chunks use it (no classifier change, no schema drift). EDPB is the first target; technical standards (weight 80) are a later separate class. Tested: 6 unit tests on the text + metadata path (PDF extraction is exercised in the container), ruff + mypy clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,132 @@
|
|||||||
|
"""GuidanceIngester (Parser 3): ingests supervisory guidance (EDPB / DSK / ENISA
|
||||||
|
/ BSI / CNIL) as a SEPARATE interpretative source — never a primary obligation.
|
||||||
|
|
||||||
|
Guidance documents are heterogeneous PDFs / HTML, unlike the uniform eur-lex
|
||||||
|
XHTML of Parsers 1-2. This module extracts the text (pdfplumber for PDF, a small
|
||||||
|
HTML stripper otherwise), normalises it, and tags it
|
||||||
|
source_class=supervisory_guidance / authority_weight=70 /
|
||||||
|
bindingness=interpretative / use_for_primary=false, with references_out to the
|
||||||
|
binding norms it interprets (Art. N DSGVO / § N BDSG). So guidance ranks BELOW
|
||||||
|
binding law for obligation questions, yet stays fully retrievable as
|
||||||
|
interpretation context (and is the right Top-1 for "what does the EDPB say?").
|
||||||
|
|
||||||
|
Chunking is left to the RAG service (chunk_strategy='legal'); each resulting
|
||||||
|
chunk inherits the guidance metadata. pdfplumber is imported lazily so the
|
||||||
|
module (and its tests) load without it.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import html as html_lib
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from services.legal_act_ingester import UploadUnit
|
||||||
|
|
||||||
|
# Authority score for supervisory guidance; guidance_meta() writes this value to
# both "authority_level" and "authority_weight".
GUIDANCE_WEIGHT = 70

# A "<" followed by one or more non-">" characters and a closing ">", i.e. one tag.
_TAG_RE = re.compile(r"<[^>]+>")
# A run of spaces and/or tabs (newlines are not part of the class).
_WS_RE = re.compile(r"[ \t]+")
# Three or more consecutive newlines.
_BLANK_RE = re.compile(r"\n{3,}")
# "Artikel 37", "Art. 38", "Article 9" → the interpreted article number
# (digits plus an optional single letter; IGNORECASE applies to that letter too).
_ART_REF_RE = re.compile(r"\bArt(?:ikel|icle|\.)?\s*(\d+[a-z]?)", re.IGNORECASE)
# "§ 38" → the paragraph number (digits plus an optional lowercase letter).
_PARA_REF_RE = re.compile(r"§\s*(\d+[a-z]?)")
# Minimum stripped text length that self_test() accepts.
_MIN_GUIDANCE_CHARS = 200
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class GuidanceSpec:
    """One guidance document + the binding norm it interprets.

    The first six fields are required. ``collection``, ``version_date`` and
    ``jurisdiction`` have defaults. guidance_meta() and build_upload_unit()
    read these fields to produce the upload metadata.
    """

    source_id: str  # stable handle, e.g. "edpb_dpo"; used for the filename and norm_id
    short: str  # display handle used as regulation_short, e.g. "EDPB DPO"
    title: str  # full title
    publisher: str  # EDPB / DSK / ENISA / BSI / CNIL; stored as "issuer"
    url: str  # stored as the "source" metadata value
    interpreted_reg: str  # binding norm it interprets, e.g. "DSGVO" (for references_out)
    collection: str = "bp_compliance_datenschutz"  # target collection of the UploadUnit
    version_date: str = ""  # copied verbatim into the metadata; empty when unknown
    jurisdiction: str = "EU"  # copied verbatim into the metadata
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_text(text: str) -> str:
    """Return *text* with HTML entities decoded and whitespace normalised.

    Steps, in order: entities are unescaped; non-breaking spaces become
    ordinary spaces; runs of spaces/tabs collapse to a single space; every
    line is stripped; runs of three or more newlines collapse to exactly one
    blank line; finally the whole result is stripped.
    """
    text = html_lib.unescape(text)
    # "&nbsp;" decodes to U+00A0, which _WS_RE (space/tab only) does not match.
    # Without this step non-breaking spaces survive mid-line and are never
    # collapsed, even though str.strip() already removes them at line ends.
    text = text.replace("\u00a0", " ")
    text = _WS_RE.sub(" ", text)
    text = "\n".join(line.strip() for line in text.split("\n"))
    return _BLANK_RE.sub("\n\n", text).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf(path: str) -> str:
    """Return the normalised text of the PDF at *path*.

    Pages for which pdfplumber yields no text are skipped; the remaining page
    texts are joined with newlines and passed through normalize_text().
    pdfplumber is imported inside the function so the module loads without it.
    """
    import pdfplumber  # noqa: PLC0415 — heavy, optional dep; only needed at ingest time

    with pdfplumber.open(path) as document:
        extracted = (
            page.extract_text(x_tolerance=3, y_tolerance=4) for page in document.pages
        )
        # Consume the generator while the PDF is still open.
        page_texts = [chunk for chunk in extracted if chunk]
    return normalize_text("\n".join(page_texts))
|
||||||
|
|
||||||
|
|
||||||
|
# Spans whose *content* is not document text: <script>/<style> elements
# (including their bodies) and HTML comments. A plain tag strip would remove
# only the delimiters and leave JavaScript / CSS / comment text behind.
_NON_TEXT_RE = re.compile(
    r"<(script|style)\b[^>]*>.*?</\1\s*>|<!--.*?-->",
    re.IGNORECASE | re.DOTALL,
)


def extract_html(raw: str) -> str:
    """Strip markup from *raw* HTML and return normalised plain text.

    Closed ``<script>``/``<style>`` elements and HTML comments are removed
    together with their content, then every remaining tag is replaced by a
    space and the result is passed through normalize_text(). An unclosed
    ``<script>``/``<style>`` is not matched by the first step, so only its
    opening tag is removed.
    """
    visible = _NON_TEXT_RE.sub(" ", raw)
    return normalize_text(_TAG_RE.sub(" ", visible))
|
||||||
|
|
||||||
|
|
||||||
|
# "Article 29 (Data Protection) Working Party" / "Artikel-29-Datenschutzgruppe"
# is the name of the EDPB's predecessor body, not a citation of Art. 29.
_WP29_NAME_RE = re.compile(
    r"\bArt(?:ikel|icle)[\s-]*29[\s-]*"
    r"(?:(?:Data\s+Protection\s+)?Working\s+Party|(?:Datenschutz)?gruppe)\b",
    re.IGNORECASE,
)


def guidance_refs_out(interpreted_reg: str, text: str) -> list[str]:
    """Return the forward edges from a guidance text to the norms it cites.

    Every article reference ("Artikel 37", "Art. 38", "Article 9") becomes
    ``"Art. N <interpreted_reg>"`` and every ``§ N`` reference becomes
    ``"§ N BDSG"``. Mentions of the Article 29 Working Party are removed
    first so the body's name does not produce a spurious ``Art. 29`` edge;
    a genuine "Article 29" citation elsewhere in the text is still found.

    Returns:
        The de-duplicated references, sorted as plain strings.
    """
    citable = _WP29_NAME_RE.sub(" ", text)
    refs = {f"Art. {num} {interpreted_reg}" for num in _ART_REF_RE.findall(citable)}
    refs.update(f"§ {num} BDSG" for num in _PARA_REF_RE.findall(citable))
    return sorted(refs)
|
||||||
|
|
||||||
|
|
||||||
|
def guidance_meta(spec: GuidanceSpec, text: str) -> dict[str, Any]:
    """Build the metadata dict attached to a guidance document.

    The fixed values mark the document as interpretative supervisory guidance
    (``use_for_primary`` is False); the rest is copied from *spec*.
    ``references_out`` is derived from *text* via guidance_refs_out().
    """
    handle = spec.short
    heading = spec.title
    kind = "guidance"
    weight = GUIDANCE_WEIGHT
    cited_norms = guidance_refs_out(spec.interpreted_reg, text)

    # NOTE(review): "license" is fixed to "public_eu" although jurisdiction is
    # configurable per spec — confirm this is intended for non-EU publishers.
    return {
        "regulation_code": handle,
        "regulation_short": handle,
        "regulation_name_de": heading,
        "citation_style": kind,
        "document_type": kind,
        "source_class": "supervisory_guidance",
        "bindingness": "interpretative",
        "authority_level": weight,
        "authority_weight": weight,
        "source_type": kind,
        "issuer": spec.publisher,
        "jurisdiction": spec.jurisdiction,
        "version_date": spec.version_date,
        "source": spec.url,
        "license": "public_eu",
        "category": kind,
        # Interpretative — never a primary obligation source.
        "use_for_primary": False,
        "is_citable": True,
        "citation_unit": heading,
        "article_label": handle,
        "chunk_scope": kind,
        "interprets": spec.interpreted_reg,
        "references_out": cited_norms,
        "norm_id": f"GUIDANCE-{spec.source_id}",
    }
|
||||||
|
|
||||||
|
|
||||||
|
def self_test(text: str) -> tuple[bool, list[str]]:
    """Check an extraction result before upload.

    Returns:
        ``(True, [])`` when the stripped text has at least
        ``_MIN_GUIDANCE_CHARS`` characters, otherwise ``(False, [message])``
        where the message reports the stripped length.
    """
    length = len(text.strip())
    if length < _MIN_GUIDANCE_CHARS:
        return False, [f"extracted text too short ({length} chars)"]
    return True, []
|
||||||
|
|
||||||
|
|
||||||
|
def build_upload_unit(spec: GuidanceSpec, text: str, run_tag: str) -> UploadUnit:
    """Wrap the whole guidance document in a single UploadUnit.

    The unit is named ``<source_id>.txt``, versioned ``<run_tag>-<source_id>``
    and carries the metadata from guidance_meta(). Chunking is left to the RAG
    service, where each chunk inherits the guidance metadata.
    """
    handle = spec.source_id
    metadata = guidance_meta(spec, text)
    return UploadUnit(
        filename=f"{handle}.txt",
        text=text,
        meta=metadata,
        document_version=f"{run_tag}-{handle}",
        collection=spec.collection,
    )
|
||||||
@@ -0,0 +1,72 @@
|
|||||||
|
"""Unit tests for the GuidanceIngester engine (Parser 3).
|
||||||
|
|
||||||
|
Pure tests on the text + metadata path (PDF extraction is a lazy pdfplumber
|
||||||
|
wrapper, exercised in the container). Covers: normalisation, HTML stripping,
|
||||||
|
references_out to the interpreted norm, the interpretative (non-primary)
|
||||||
|
metadata and the self-test gate.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from services.guidance_ingester import (
|
||||||
|
GuidanceSpec,
|
||||||
|
build_upload_unit,
|
||||||
|
extract_html,
|
||||||
|
guidance_meta,
|
||||||
|
guidance_refs_out,
|
||||||
|
normalize_text,
|
||||||
|
self_test,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Shared fixture for the tests below. collection and jurisdiction are not
# passed, so they take the GuidanceSpec defaults.
SPEC = GuidanceSpec(
    source_id="edpb_dpo",
    short="EDPB DPO",
    title="EDPB Leitlinien zum Datenschutzbeauftragten",
    publisher="EDPB",
    url="https://edpb.europa.eu/guidelines/dpo",
    interpreted_reg="DSGVO",
    version_date="2017-04-05",
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_text_collapses_whitespace_and_blank_runs():
    """Space/tab runs become one space and 3+ newlines become one blank line."""
    raw = "a b\t c\n\n\n\nd"
    expected = "a b c\n\nd"
    assert normalize_text(raw) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_html_strips_tags():
    """Tags are removed and the surrounding text is kept."""
    stripped = extract_html("<p>Hallo <b>Welt</b></p>")
    assert "Hallo Welt" in stripped
|
||||||
|
|
||||||
|
|
||||||
|
def test_guidance_refs_out_links_to_interpreted_reg():
    """Article references map to the interpreted regulation, § references to BDSG."""
    text = "Gemaess Artikel 37, Art. 38 und Article 9 der Verordnung sowie § 38 BDSG."
    refs = guidance_refs_out("DSGVO", text)
    for expected in ("Art. 37 DSGVO", "Art. 38 DSGVO", "Art. 9 DSGVO", "§ 38 BDSG"):
        assert expected in refs
|
||||||
|
|
||||||
|
|
||||||
|
def test_guidance_meta_is_interpretative_not_primary():
    """guidance_meta() tags the document as non-primary interpretation."""
    meta = guidance_meta(SPEC, "Diese Leitlinie erlaeutert Artikel 37 DSGVO im Detail.")
    expected = {
        "source_class": "supervisory_guidance",
        "authority_weight": 70,
        "bindingness": "interpretative",
        "chunk_scope": "guidance",
        "regulation_short": "EDPB DPO",
        "interprets": "DSGVO",
        "issuer": "EDPB",
    }
    for key, value in expected.items():
        assert meta[key] == value
    # Identity check on purpose: the flag must be the bool False, not just falsy.
    assert meta["use_for_primary"] is False
    assert "Art. 37 DSGVO" in meta["references_out"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_self_test_passes_long_and_flags_short():
    """Long text passes the gate; short text fails with a 'too short' problem."""
    long_ok, _ = self_test("x" * 300)
    assert long_ok

    short_ok, problems = self_test("too short")
    assert not short_ok
    assert "too short" in problems[0]
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_upload_unit_tags_collection_and_version():
    """The upload unit carries the run-tagged version, collection and filename."""
    body = "A" * 300 + " Artikel 35 DSGVO"
    unit = build_upload_unit(SPEC, body, "run9")
    assert (unit.document_version, unit.collection, unit.filename) == (
        "run9-edpb_dpo",
        "bp_compliance_datenschutz",
        "edpb_dpo.txt",
    )
    assert unit.meta["use_for_primary"] is False
|
||||||
Reference in New Issue
Block a user