# File: control-pipeline/services/guidance_ingester.py
"""GuidanceIngester (Parser 3): ingests supervisory guidance (EDPB / DSK / ENISA
/ BSI / CNIL) as a SEPARATE interpretative source — never a primary obligation.

Guidance documents are heterogeneous PDFs / HTML, unlike the uniform eur-lex
XHTML of Parsers 1-2. This module extracts the text (pdfplumber for PDF, a small
HTML stripper otherwise), normalises it, and tags it
source_class=supervisory_guidance / authority_weight=70 /
bindingness=interpretative / use_for_primary=false, with references_out to the
binding norms it interprets (Art. N DSGVO / § N BDSG). So guidance ranks BELOW
binding law for obligation questions, yet stays fully retrievable as
interpretation context (and is the right Top-1 for "what does the EDPB say?").

Chunking is left to the RAG service (chunk_strategy='legal'); each resulting
chunk inherits the guidance metadata. pdfplumber is imported lazily so the
module (and its tests) load without it.
"""

from __future__ import annotations

import html as html_lib
import re
from dataclasses import dataclass
from typing import Any

from services.legal_act_ingester import UploadUnit

GUIDANCE_WEIGHT = 70

# HTML comments and complete <script>/<style> elements (content included).
# These are removed BEFORE generic tag stripping: their bodies are not document
# text, and a comment containing ">" would otherwise be cut in half by _TAG_RE.
_NON_CONTENT_RE = re.compile(
    r"<!--.*?-->|<(script|style)\b[^>]*>.*?</\1\s*>",
    re.IGNORECASE | re.DOTALL,
)
_TAG_RE = re.compile(r"<[^>]+>")
# Any run of whitespace except "\n". Besides space/tab this also covers the
# non-breaking space that html.unescape() produces for "&nbsp;".
_WS_RE = re.compile(r"[^\S\n]+")
_BLANK_RE = re.compile(r"\n{3,}")
# "Artikel 37", "Art. 38", "Article 9" → the interpreted article number.
# A single letter suffix ("37a") is only accepted when it ends the token, so
# "Art. 13ff." yields "13" and "Art. 6Abs." yields "6" instead of "13f" / "6A".
_ART_REF_RE = re.compile(
    r"\bArt(?:ikel|icle|\.)?\s*(\d+(?:[a-z]\b)?)", re.IGNORECASE
)
_PARA_REF_RE = re.compile(r"§\s*(\d+(?:[a-z]\b)?)")
_MIN_GUIDANCE_CHARS = 200


@dataclass
class GuidanceSpec:
    """One guidance document + the binding norm it interprets."""

    source_id: str  # stable handle, e.g. "edpb_dpo"
    short: str  # display handle used as regulation_short, e.g. "EDPB DPO"
    title: str  # full title
    publisher: str  # EDPB / DSK / ENISA / BSI / CNIL
    url: str
    interpreted_reg: str  # binding norm it interprets, e.g. "DSGVO" (for references_out)
    collection: str = "bp_compliance_datenschutz"
    version_date: str = ""
    jurisdiction: str = "EU"


def normalize_text(text: str) -> str:
    """Unescape HTML entities, then collapse intra-line whitespace and blank runs.

    Every run of non-newline whitespace becomes a single space, each line is
    stripped, three or more consecutive newlines are reduced to two, and the
    overall result is stripped.
    """
    text = html_lib.unescape(text)
    text = _WS_RE.sub(" ", text)
    text = "\n".join(line.strip() for line in text.split("\n"))
    return _BLANK_RE.sub("\n\n", text).strip()


def extract_pdf(path: str) -> str:
    """Extract text from a PDF. pdfplumber is imported lazily (container only).

    Pages for which pdfplumber returns no text are skipped; the remaining page
    texts are joined with newlines and passed through normalize_text().
    """
    import pdfplumber  # noqa: PLC0415 — heavy, optional dep; only needed at ingest time

    parts: list[str] = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text(x_tolerance=3, y_tolerance=4)
            if page_text:
                parts.append(page_text)
    return normalize_text("\n".join(parts))


def extract_html(raw: str) -> str:
    """Strip markup to plain text (for guidance served as HTML).

    Comments and whole <script>/<style> elements are dropped first so that
    JavaScript / CSS source never ends up in the ingested text; every remaining
    tag is then replaced by a space and the result is normalised. Entities are
    unescaped only afterwards (inside normalize_text), so an escaped "&lt;b&gt;"
    stays literal text instead of being stripped as a tag.
    """
    content_only = _NON_CONTENT_RE.sub(" ", raw)
    return normalize_text(_TAG_RE.sub(" ", content_only))


def guidance_refs_out(interpreted_reg: str, text: str) -> list[str]:
    """Forward edges from the guidance to the binding norms it cites.

    Every article reference found in *text* is attributed to *interpreted_reg*
    ("Art. N <interpreted_reg>"); every paragraph reference is emitted as
    "§ N BDSG". The result is de-duplicated and sorted as plain strings.
    """
    # NOTE(review): attribution is purely pattern-based — an article of a
    # different act cited in the text is still labelled with interpreted_reg,
    # and every "§ N" is labelled BDSG.
    out = {f"Art. {m} {interpreted_reg}" for m in _ART_REF_RE.findall(text)}
    out |= {f"§ {m} BDSG" for m in _PARA_REF_RE.findall(text)}
    return sorted(out)


def guidance_meta(spec: GuidanceSpec, text: str) -> dict[str, Any]:
    """Build the metadata dict that marks a document as interpretative guidance.

    Fixed tags (source_class, bindingness, use_for_primary=False, weight) are
    combined with the per-document fields of *spec*; references_out is derived
    from *text* via guidance_refs_out().
    """
    return {
        "regulation_code": spec.short,
        "regulation_short": spec.short,
        "regulation_name_de": spec.title,
        "citation_style": "guidance",
        "document_type": "guidance",
        "source_class": "supervisory_guidance",
        "bindingness": "interpretative",
        "authority_level": GUIDANCE_WEIGHT,
        "authority_weight": GUIDANCE_WEIGHT,
        "source_type": "guidance",
        "issuer": spec.publisher,
        "jurisdiction": spec.jurisdiction,
        "version_date": spec.version_date,
        "source": spec.url,
        "license": "public_eu",
        "category": "guidance",
        "use_for_primary": False,  # interpretative — never a primary obligation source
        "is_citable": True,
        "citation_unit": spec.title,
        "article_label": spec.short,
        "chunk_scope": "guidance",
        "interprets": spec.interpreted_reg,
        "references_out": guidance_refs_out(spec.interpreted_reg, text),
        "norm_id": f"GUIDANCE-{spec.source_id}",
    }


def self_test(text: str) -> tuple[bool, list[str]]:
    """Gate before upload — guard against an empty/failed extraction.

    Returns (ok, problems); ok is True exactly when problems is empty.
    """
    problems: list[str] = []
    stripped_len = len(text.strip())
    if stripped_len < _MIN_GUIDANCE_CHARS:
        problems.append(f"extracted text too short ({stripped_len} chars)")
    return (not problems, problems)


def build_upload_unit(spec: GuidanceSpec, text: str, run_tag: str) -> UploadUnit:
    """One UploadUnit for the whole document; the RAG service chunks it and each
    chunk inherits the guidance metadata."""
    return UploadUnit(
        filename=f"{spec.source_id}.txt",
        text=text,
        meta=guidance_meta(spec, text),
        document_version=f"{run_tag}-{spec.source_id}",
        collection=spec.collection,
    )
# File: control-pipeline/tests/test_guidance_ingester.py
"""Unit tests for the GuidanceIngester engine (Parser 3).

Pure tests on the text + metadata path (PDF extraction is a lazy pdfplumber
wrapper, exercised in the container). Covers: normalisation, HTML stripping,
references_out to the interpreted norm, the interpretative (non-primary)
metadata and the self-test gate.
"""

from services.guidance_ingester import (
    GuidanceSpec,
    build_upload_unit,
    extract_html,
    guidance_meta,
    guidance_refs_out,
    normalize_text,
    self_test,
)

# Shared fixture: one EDPB guideline interpreting the DSGVO.
SPEC = GuidanceSpec(
    source_id="edpb_dpo",
    short="EDPB DPO",
    title="EDPB Leitlinien zum Datenschutzbeauftragten",
    publisher="EDPB",
    url="https://edpb.europa.eu/guidelines/dpo",
    interpreted_reg="DSGVO",
    version_date="2017-04-05",
)


def test_normalize_text_collapses_whitespace_and_blank_runs():
    """Tabs/spaces collapse to one space; 3+ newlines collapse to one blank line."""
    assert normalize_text("a b\t c\n\n\n\nd") == "a b c\n\nd"


def test_extract_html_strips_tags():
    """Markup is removed and the visible text survives."""
    # NOTE(review): the markup inside this literal was lost when the patch was
    # rendered; the tags below are a reconstruction — confirm against the
    # committed test.
    assert "Hallo Welt" in extract_html("<div><p>Hallo Welt</p></div>")


def test_guidance_refs_out_links_to_interpreted_reg():
    """Article references attach to the interpreted regulation, § refs to BDSG."""
    text = "Gemaess Artikel 37, Art. 38 und Article 9 der Verordnung sowie § 38 BDSG."
    refs = guidance_refs_out("DSGVO", text)
    assert "Art. 37 DSGVO" in refs
    assert "Art. 38 DSGVO" in refs
    assert "Art. 9 DSGVO" in refs
    assert "§ 38 BDSG" in refs


def test_guidance_meta_is_interpretative_not_primary():
    """Guidance metadata carries the interpretative, non-primary tags."""
    meta = guidance_meta(SPEC, "Diese Leitlinie erlaeutert Artikel 37 DSGVO im Detail.")
    assert meta["source_class"] == "supervisory_guidance"
    assert meta["authority_weight"] == 70
    assert meta["use_for_primary"] is False
    assert meta["bindingness"] == "interpretative"
    assert meta["chunk_scope"] == "guidance"
    assert meta["regulation_short"] == "EDPB DPO"
    assert meta["interprets"] == "DSGVO"
    assert meta["issuer"] == "EDPB"
    assert "Art. 37 DSGVO" in meta["references_out"]


def test_self_test_passes_long_and_flags_short():
    """Long text passes the gate; short text fails with a 'too short' problem."""
    ok, _ = self_test("x" * 300)
    assert ok
    # The first tuple element is the ok flag, so it must be falsy for short text.
    ok_short, problems = self_test("too short")
    assert not ok_short and "too short" in problems[0]


def test_build_upload_unit_tags_collection_and_version():
    """The upload unit carries filename, version tag, collection and metadata."""
    unit = build_upload_unit(SPEC, "A" * 300 + " Artikel 35 DSGVO", "run9")
    assert unit.document_version == "run9-edpb_dpo"
    assert unit.collection == "bp_compliance_datenschutz"
    assert unit.filename == "edpb_dpo.txt"
    assert unit.meta["use_for_primary"] is False