"""GuidanceIngester (Parser 3): ingests supervisory guidance (EDPB / DSK / ENISA / BSI / CNIL)
as a SEPARATE interpretative source — never a primary obligation.

Guidance documents are heterogeneous PDFs / HTML, unlike the uniform eur-lex XHTML of
Parsers 1-2. This module extracts the text (pdfplumber for PDF, a small HTML stripper
otherwise), normalises it, and tags it source_class=supervisory_guidance /
authority_weight=70 / bindingness=interpretative / use_for_primary=false, with
references_out to the binding norms it interprets (Art. N DSGVO / § N BDSG).

So guidance ranks BELOW binding law for obligation questions, yet stays fully retrievable
as interpretation context (and is the right Top-1 for "what does the EDPB say?").

Chunking is left to the RAG service (chunk_strategy='legal'); each resulting chunk
inherits the guidance metadata. pdfplumber is imported lazily so the module (and its
tests) load without it.
"""

from __future__ import annotations

import html as html_lib
import re
from dataclasses import dataclass
from typing import Any

from services.legal_act_ingester import UploadUnit

GUIDANCE_WEIGHT = 70

# Spans whose *content* is not document text: HTML comments and <script>/<style>
# elements. A single alternation is used so that whichever construct starts first
# in the input wins, instead of one pass accidentally swallowing the other.
_NON_CONTENT_RE = re.compile(
    r"<!--.*?-->|<(script|style)\b[^>]*>.*?</\1\s*>",
    re.IGNORECASE | re.DOTALL,
)
_TAG_RE = re.compile(r"<[^>]+>")
# Horizontal whitespace, including the non-breaking spaces produced by "&nbsp;"
# (U+00A0) and by French typography (U+202F). Newlines are deliberately excluded:
# they carry the line structure that normalize_text preserves.
_WS_RE = re.compile(r"[ \t\u00a0\u202f]+")
_BLANK_RE = re.compile(r"\n{3,}")
# "Artikel 37", "Art. 38", "Article 9" → the interpreted article number
_ART_REF_RE = re.compile(r"\bArt(?:ikel|icle|\.)?\s*(\d+[a-z]?)", re.IGNORECASE)
_PARA_REF_RE = re.compile(r"§\s*(\d+[a-z]?)")
_MIN_GUIDANCE_CHARS = 200


@dataclass
class GuidanceSpec:
    """One guidance document + the binding norm it interprets."""

    source_id: str  # stable handle, e.g. "edpb_dpo"
    short: str  # display handle used as regulation_short, e.g. "EDPB DPO"
    title: str  # full title
    publisher: str  # EDPB / DSK / ENISA / BSI / CNIL
    url: str
    interpreted_reg: str  # binding norm it interprets, e.g. "DSGVO" (for references_out)
    collection: str = "bp_compliance_datenschutz"
    version_date: str = ""
    jurisdiction: str = "EU"


def normalize_text(text: str) -> str:
    """Collapse intra-line whitespace and runs of blank lines.

    HTML entities are decoded first, so entity-encoded whitespace such as
    ``&nbsp;`` is collapsed together with ordinary spaces and tabs. Every line
    is then stripped, three or more consecutive newlines are reduced to a
    single blank line, and the result is stripped as a whole.

    Args:
        text: Raw extracted text (may still contain HTML entities).

    Returns:
        The normalised text.
    """
    text = html_lib.unescape(text)
    text = _WS_RE.sub(" ", text)
    text = "\n".join(line.strip() for line in text.split("\n"))
    return _BLANK_RE.sub("\n\n", text).strip()


def extract_pdf(path: str) -> str:
    """Extract text from a PDF. pdfplumber is imported lazily (container only).

    Pages for which pdfplumber yields no text are skipped; the remaining page
    texts are joined with newlines and normalised.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The normalised text of all pages that produced text.
    """
    import pdfplumber  # noqa: PLC0415 — heavy, optional dep; only needed at ingest time

    parts: list[str] = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text(x_tolerance=3, y_tolerance=4)
            if page_text:
                parts.append(page_text)
    return normalize_text("\n".join(parts))


def extract_html(raw: str) -> str:
    """Strip tags to plain text (for guidance served as HTML).

    Comments and the bodies of ``<script>`` / ``<style>`` elements are removed
    before the tags themselves: stripping only the tags would leave JavaScript
    and CSS source in the guidance text. Unterminated comments or elements are
    left to the plain tag stripper, so no trailing content is discarded.

    Args:
        raw: HTML markup.

    Returns:
        The normalised visible text.
    """
    visible = _NON_CONTENT_RE.sub(" ", raw)
    return normalize_text(_TAG_RE.sub(" ", visible))


def guidance_refs_out(interpreted_reg: str, text: str) -> list[str]:
    """Forward edges from the guidance to the binding norms it cites.

    Every article mention ("Art. N", "Artikel N", "Article N") is attributed to
    ``interpreted_reg``; every "§ N" mention is attributed to the BDSG.

    Args:
        interpreted_reg: Short name of the interpreted regulation, e.g. "DSGVO".
        text: Normalised guidance text to scan.

    Returns:
        De-duplicated references in lexicographic (string) order.
    """
    out = {f"Art. {m} {interpreted_reg}" for m in _ART_REF_RE.findall(text)}
    out |= {f"§ {m} BDSG" for m in _PARA_REF_RE.findall(text)}
    return sorted(out)


def guidance_meta(spec: GuidanceSpec, text: str) -> dict[str, Any]:
    """Build the metadata dict attached to a guidance document.

    Args:
        spec: Description of the guidance document.
        text: Normalised guidance text; scanned for ``references_out``.

    Returns:
        Metadata marking the document as interpretative supervisory guidance.
    """
    return {
        "regulation_code": spec.short,
        "regulation_short": spec.short,
        "regulation_name_de": spec.title,
        "citation_style": "guidance",
        "document_type": "guidance",
        "source_class": "supervisory_guidance",
        "bindingness": "interpretative",
        "authority_level": GUIDANCE_WEIGHT,
        "authority_weight": GUIDANCE_WEIGHT,
        "source_type": "guidance",
        "issuer": spec.publisher,
        "jurisdiction": spec.jurisdiction,
        "version_date": spec.version_date,
        "source": spec.url,
        "license": "public_eu",
        "category": "guidance",
        "use_for_primary": False,  # interpretative — never a primary obligation source
        "is_citable": True,
        "citation_unit": spec.title,
        "article_label": spec.short,
        "chunk_scope": "guidance",
        "interprets": spec.interpreted_reg,
        "references_out": guidance_refs_out(spec.interpreted_reg, text),
        "norm_id": f"GUIDANCE-{spec.source_id}",
    }


def self_test(text: str) -> tuple[bool, list[str]]:
    """Gate before upload — guard against an empty/failed extraction.

    Args:
        text: Extracted guidance text.

    Returns:
        ``(ok, problems)``: ``ok`` is True when no problem was found,
        ``problems`` lists a human-readable message per failed check.
    """
    problems: list[str] = []
    stripped_len = len(text.strip())
    if stripped_len < _MIN_GUIDANCE_CHARS:
        problems.append(f"extracted text too short ({stripped_len} chars)")
    return (not problems, problems)


def build_upload_unit(spec: GuidanceSpec, text: str, run_tag: str) -> UploadUnit:
    """One UploadUnit for the whole document; the RAG service chunks it and each
    chunk inherits the guidance metadata.

    Args:
        spec: Description of the guidance document.
        text: Normalised guidance text.
        run_tag: Prefix combined with ``spec.source_id`` to form the document version.

    Returns:
        The upload unit for ``spec.collection``.
    """
    return UploadUnit(
        filename=f"{spec.source_id}.txt",
        text=text,
        meta=guidance_meta(spec, text),
        document_version=f"{run_tag}-{spec.source_id}",
        collection=spec.collection,
    )