# breakpilot-core/control-pipeline/services/guidance_ingester.py

"""GuidanceIngester (Parser 3): ingests supervisory guidance (EDPB / DSK / ENISA
/ BSI / CNIL) as a SEPARATE interpretative source — never a primary obligation.

Guidance documents are heterogeneous PDFs / HTML, unlike the uniform eur-lex
XHTML of Parsers 1-2. This module extracts the text (pdfplumber for PDF, a small
HTML stripper otherwise), normalises it, and tags it
source_class=supervisory_guidance / authority_weight=70 /
bindingness=interpretative / use_for_primary=false, with references_out to the
binding norms it interprets (Art. N DSGVO / § N BDSG). So guidance ranks BELOW
binding law for obligation questions, yet stays fully retrievable as
interpretation context (and is the right Top-1 for "what does the EDPB say?").

Chunking is left to the RAG service (chunk_strategy='legal'); each resulting
chunk inherits the guidance metadata. pdfplumber is imported lazily so the
module (and its tests) load without it.
"""

from __future__ import annotations

import html as html_lib
import re
from dataclasses import dataclass
from typing import Any

from services.legal_act_ingester import UploadUnit

# Authority weight for supervisory guidance; guidance_meta emits it as both
# "authority_level" and "authority_weight".
GUIDANCE_WEIGHT = 70

# Any "<...>" span (an HTML tag); used by extract_html to strip markup.
_TAG_RE = re.compile(r"<[^>]+>")
# Runs of spaces/tabs only — newlines are not matched, so line breaks survive.
_WS_RE = re.compile(r"[ \t]+")
# Three or more consecutive newlines (collapsed to one blank line by normalize_text).
_BLANK_RE = re.compile(r"\n{3,}")
# "Artikel 37", "Art. 38", "Article 9" → the interpreted article number
# (case-insensitive; captures the digits plus an optional single letter suffix).
_ART_REF_RE = re.compile(r"\bArt(?:ikel|icle|\.)?\s*(\d+[a-z]?)", re.IGNORECASE)
# "§ 38", "§38a" → the paragraph number plus an optional lowercase letter suffix.
_PARA_REF_RE = re.compile(r"§\s*(\d+[a-z]?)")
# Minimum stripped text length accepted by self_test; shorter extractions are
# reported as a problem.
_MIN_GUIDANCE_CHARS = 200


@dataclass
class GuidanceSpec:
    """One guidance document + the binding norm it interprets.

    Plain value object read by guidance_meta and build_upload_unit in this
    module. The first six fields are required; the last three have defaults.
    Field order is part of the interface (positional construction).
    """

    # Stable handle, e.g. "edpb_dpo". Used for the upload filename
    # ("<source_id>.txt"), the document_version suffix and the norm_id
    # ("GUIDANCE-<source_id>").
    source_id: str
    # Display handle, e.g. "EDPB DPO". Emitted as regulation_code,
    # regulation_short and article_label.
    short: str
    # Full title. Emitted as regulation_name_de and citation_unit.
    title: str
    # Issuing body (EDPB / DSK / ENISA / BSI / CNIL). Emitted as "issuer".
    publisher: str
    # Where the document was obtained. Emitted as "source".
    url: str
    # Binding norm the guidance interprets, e.g. "DSGVO". Emitted as
    # "interprets" and appended to every "Art. N ..." entry in references_out.
    interpreted_reg: str
    # Target collection handed to the UploadUnit.
    collection: str = "bp_compliance_datenschutz"
    # Emitted as "version_date"; empty string when not supplied.
    version_date: str = ""
    # Emitted as "jurisdiction".
    jurisdiction: str = "EU"


# No-break space variants mapped to a plain space before whitespace collapsing.
# U+00A0 is what html.unescape produces for "&nbsp;"; U+2007 (figure space) and
# U+202F (narrow no-break space) are its typographic siblings.
_NBSP_TO_SPACE = str.maketrans({"\u00a0": " ", "\u2007": " ", "\u202f": " "})


def normalize_text(text: str) -> str:
    """Collapse intra-line whitespace and runs of blank lines.

    Steps, in order:
      1. HTML character references are decoded (``&amp;`` → ``&``).
      2. No-break spaces are turned into ordinary spaces, so that
         ``a&nbsp;&nbsp;b`` collapses like ``a  b`` instead of surviving as
         two U+00A0 characters inside the line.
      3. Runs of spaces/tabs become a single space.
      4. Every line is stripped (this also drops a trailing ``\\r`` from
         CRLF input).
      5. Three or more consecutive newlines become exactly one blank line and
         the result is stripped at both ends.

    Args:
        text: Raw extracted text (from PDF or tag-stripped HTML).

    Returns:
        The normalised text; ``""`` for empty or whitespace-only input.
    """
    # Decode entities first so "&nbsp;" is visible to the no-break-space mapping.
    text = html_lib.unescape(text).translate(_NBSP_TO_SPACE)
    text = _WS_RE.sub(" ", text)
    text = "\n".join(line.strip() for line in text.split("\n"))
    return _BLANK_RE.sub("\n\n", text).strip()


def extract_pdf(path: str) -> str:
    """Return the normalised text of the PDF at *path*.

    Each page is extracted with pdfplumber (x_tolerance=3, y_tolerance=4);
    pages that produce no text are skipped and the rest are joined with
    newlines before being passed through normalize_text.

    pdfplumber is imported inside the function so the module can be imported
    without it installed.
    """
    import pdfplumber  # noqa: PLC0415 — heavy, optional dep; only needed at ingest time

    with pdfplumber.open(path) as document:
        # Materialise inside the context manager: pages need the open file.
        page_texts = [
            page.extract_text(x_tolerance=3, y_tolerance=4) for page in document.pages
        ]
    # filter(None, ...) drops falsy results, i.e. pages without extractable text.
    joined = "\n".join(filter(None, page_texts))
    return normalize_text(joined)


# <script>/<style> elements (including their content) and HTML comments.
# Generic tag stripping only removes the tags themselves, which would leave
# JavaScript / CSS source in the output as if it were guidance prose, and
# would cut a comment short at the first ">" inside it.
_NON_TEXT_RE = re.compile(
    r"<(script|style)\b[^>]*>.*?</\1\s*>|<!--.*?-->",
    re.IGNORECASE | re.DOTALL,
)


def extract_html(raw: str) -> str:
    """Strip tags to plain text (for guidance served as HTML).

    Script and style elements are removed together with their content, and
    HTML comments are removed whole, before the remaining tags are replaced
    by spaces. The result is passed through normalize_text, which also
    decodes character references — decoding happens *after* tag stripping so
    that escaped markup such as ``&lt;b&gt;`` stays literal text.

    Unterminated ``<script>`` / ``<style>`` / comment openers do not match the
    removal pattern and fall through to plain tag stripping.

    Args:
        raw: The HTML source.

    Returns:
        Normalised plain text.
    """
    visible = _NON_TEXT_RE.sub(" ", raw)
    return normalize_text(_TAG_RE.sub(" ", visible))


def guidance_refs_out(interpreted_reg: str, text: str) -> list[str]:
    """Collect the binding-norm citations found in *text* as forward edges.

    Every article reference becomes ``"Art. <n> <interpreted_reg>"`` and every
    ``§`` reference becomes ``"§ <n> BDSG"``. Duplicates are removed and the
    result is returned in plain (lexicographic) string order.
    """
    edges: set[str] = set()
    for article in _ART_REF_RE.findall(text):
        edges.add(f"Art. {article} {interpreted_reg}")
    for paragraph in _PARA_REF_RE.findall(text):
        edges.add(f"§ {paragraph} BDSG")
    return sorted(edges)


def guidance_meta(spec: GuidanceSpec, text: str) -> dict[str, Any]:
    """Build the metadata dict for one guidance document.

    The dict marks the document as interpretative supervisory guidance
    (``use_for_primary`` is False) and records, under ``references_out``, the
    norms cited in *text*. Keys are emitted in a fixed order.
    """
    cited_norms = guidance_refs_out(spec.interpreted_reg, text)

    identity = {
        "regulation_code": spec.short,
        "regulation_short": spec.short,
        "regulation_name_de": spec.title,
    }
    classification = {
        "citation_style": "guidance",
        "document_type": "guidance",
        "source_class": "supervisory_guidance",
        "bindingness": "interpretative",
        "authority_level": GUIDANCE_WEIGHT,
        "authority_weight": GUIDANCE_WEIGHT,
        "source_type": "guidance",
    }
    provenance = {
        "issuer": spec.publisher,
        "jurisdiction": spec.jurisdiction,
        "version_date": spec.version_date,
        "source": spec.url,
        "license": "public_eu",
        "category": "guidance",
    }
    citation = {
        # Interpretative — never a primary obligation source.
        "use_for_primary": False,
        "is_citable": True,
        "citation_unit": spec.title,
        "article_label": spec.short,
        "chunk_scope": "guidance",
    }
    linkage = {
        "interprets": spec.interpreted_reg,
        "references_out": cited_norms,
        "norm_id": f"GUIDANCE-{spec.source_id}",
    }
    # Unpacking in this sequence keeps the original key order.
    return {**identity, **classification, **provenance, **citation, **linkage}


def self_test(text: str) -> tuple[bool, list[str]]:
    """Pre-upload gate that rejects an empty or failed extraction.

    Returns ``(ok, problems)``: ``ok`` is True and ``problems`` is empty when
    the stripped text has at least ``_MIN_GUIDANCE_CHARS`` characters;
    otherwise ``ok`` is False and ``problems`` holds one message stating the
    stripped length.
    """
    stripped_len = len(text.strip())
    if stripped_len >= _MIN_GUIDANCE_CHARS:
        return True, []
    return False, [f"extracted text too short ({stripped_len} chars)"]


def build_upload_unit(spec: GuidanceSpec, text: str, run_tag: str) -> UploadUnit:
    """Wrap a whole guidance document in a single UploadUnit.

    The document is not split here: the RAG service chunks it and each chunk
    inherits the guidance metadata. The unit is named ``<source_id>.txt`` and
    versioned as ``<run_tag>-<source_id>``.
    """
    handle = spec.source_id
    metadata = guidance_meta(spec, text)
    version = f"{run_tag}-{handle}"
    return UploadUnit(
        filename=f"{handle}.txt",
        text=text,
        meta=metadata,
        document_version=version,
        collection=spec.collection,
    )