From 3b466be14024b2ae29fe0a9717a2a145143bbaa8 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Wed, 24 Jun 2026 11:08:20 +0200
Subject: [PATCH] feat(control-pipeline): StandardIngester engine for technical
 standards (Parser 4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add services/standard_ingester.py — tags technical standards / control
frameworks (NIST / OWASP / BSI Grundschutz / CSA CCM)
source_class=technical_standard / authority_weight=80 /
bindingness=best_practice / use_for_primary=false, so a standard ranks
below binding law and guidance for obligation/interpretation questions
but surfaces for "which controls/measures fit?" (control-intent, a
follow-up retriever step). Reuses the guidance_ingester extraction
helpers. The per-source license is carried on every unit so the
commercial gate can refuse a non-commercial source (e.g. CSA CCM =
CC-BY-NC).

Tested: 3 unit tests on the metadata path, ruff + mypy clean.

Co-Authored-By: Claude Opus 4.7
---
Reviewer note: the line structure of this patch was reconstructed from a
whitespace-collapsed copy. The added-line counts were checked against the
hunk headers (81 and 35), but indentation depth, inline-comment spacing and
commit-message wrapping are inferred, so the blob ids on the `index` lines
may not match the reconstructed content.

 .../services/standard_ingester.py             | 81 +++++++++++++++++++
 .../tests/test_standard_ingester.py           | 35 ++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 control-pipeline/services/standard_ingester.py
 create mode 100644 control-pipeline/tests/test_standard_ingester.py

diff --git a/control-pipeline/services/standard_ingester.py b/control-pipeline/services/standard_ingester.py
new file mode 100644
index 0000000..0dc3ef1
--- /dev/null
+++ b/control-pipeline/services/standard_ingester.py
@@ -0,0 +1,81 @@
+"""StandardIngester (Parser 4): ingests technical standards / control frameworks
+(NIST / OWASP / BSI Grundschutz / CSA CCM) as best-practice CONTROLS — never a
+primary obligation.
+
+Tagged source_class=technical_standard / authority_weight=80 /
+bindingness=best_practice / use_for_primary=false, so a standard ranks below
+binding law AND supervisory guidance for an obligation/interpretation question,
+but surfaces for "which controls / measures fit?" questions (control-intent in
+the retriever). Reuses the PDF/HTML extraction helpers from guidance_ingester.
+
+LICENSE per source matters: NIST = US public domain (free), OWASP = CC-BY-SA
+(free, share-alike), BSI Grundschutz = check terms. CSA CCM is CC-BY-NC (NON
+commercial) → NOT usable in a commercial product; carry the license on each unit
+so the gate can refuse it.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from services.legal_act_ingester import UploadUnit
+
+# Extraction (extract_pdf / extract_html / normalize_text) + the self_test gate are
+# shared with guidance_ingester — the operational runner imports them from there.
+
+STANDARD_WEIGHT = 80
+
+
+@dataclass
+class StandardSpec:
+    """One technical standard / control framework document."""
+
+    source_id: str  # stable handle, e.g. "nist_csf_2_0"
+    short: str  # display handle / regulation_short, e.g. "NIST CSF 2.0"
+    title: str
+    publisher: str  # NIST / OWASP / BSI / CSA
+    url: str
+    license: str  # e.g. "public_domain" | "CC-BY-SA-4.0" | "CC-BY-NC" (→ commercial-blocked)
+    collection: str = "bp_compliance_ce"
+    version_date: str = ""
+    jurisdiction: str = "international"
+
+
+def standard_meta(spec: StandardSpec) -> dict[str, Any]:
+    return {
+        "regulation_code": spec.short,
+        "regulation_short": spec.short,
+        "regulation_name_de": spec.title,
+        "citation_style": "standard",
+        "document_type": "standard",
+        "source_class": "technical_standard",
+        "bindingness": "best_practice",
+        "authority_level": STANDARD_WEIGHT,
+        "authority_weight": STANDARD_WEIGHT,
+        "source_type": "standard",
+        "issuer": spec.publisher,
+        "jurisdiction": spec.jurisdiction,
+        "version_date": spec.version_date,
+        "source": spec.url,
+        "license": spec.license,
+        "category": "standard",
+        "use_for_primary": False,  # best-practice control — never a primary obligation
+        "is_citable": True,
+        "citation_unit": spec.title,
+        "article_label": spec.short,
+        "chunk_scope": "standard",
+        "norm_id": f"STANDARD-{spec.source_id}",
+    }
+
+
+def build_upload_unit(spec: StandardSpec, text: str, run_tag: str) -> UploadUnit:
+    """One UploadUnit for the whole standard; the RAG service chunks it and each
+    chunk inherits the technical_standard metadata."""
+    return UploadUnit(
+        filename=f"{spec.source_id}.txt",
+        text=text,
+        meta=standard_meta(spec),
+        document_version=f"{run_tag}-{spec.source_id}",
+        collection=spec.collection,
+    )
diff --git a/control-pipeline/tests/test_standard_ingester.py b/control-pipeline/tests/test_standard_ingester.py
new file mode 100644
index 0000000..0e28a5b
--- /dev/null
+++ b/control-pipeline/tests/test_standard_ingester.py
@@ -0,0 +1,35 @@
+"""Unit tests for the StandardIngester engine (Parser 4)."""
+
+from services.standard_ingester import StandardSpec, build_upload_unit, standard_meta
+
+SPEC = StandardSpec(
+    source_id="nist_csf_2_0", short="NIST CSF 2.0",
+    title="NIST Cybersecurity Framework 2.0", publisher="NIST",
+    url="https://nist.gov/csf", license="public_domain", version_date="2024-02-26",
+)
+
+
+def test_standard_meta_is_best_practice_not_primary():
+    m = standard_meta(SPEC)
+    assert m["source_class"] == "technical_standard"
+    assert m["authority_weight"] == 80
+    assert m["bindingness"] == "best_practice"
+    assert m["use_for_primary"] is False
+    assert m["chunk_scope"] == "standard"
+    assert m["regulation_short"] == "NIST CSF 2.0"
+    assert m["issuer"] == "NIST"
+    assert m["license"] == "public_domain"
+
+
+def test_build_upload_unit_tags_version_and_collection():
+    unit = build_upload_unit(SPEC, "A" * 300, "run9")
+    assert unit.document_version == "run9-nist_csf_2_0"
+    assert unit.collection == "bp_compliance_ce"
+    assert unit.filename == "nist_csf_2_0.txt"
+    assert unit.meta["use_for_primary"] is False
+
+
+def test_noncommercial_license_is_carried_for_the_gate():
+    ccm = StandardSpec(source_id="csa_ccm", short="CSA CCM", title="Cloud Controls Matrix",
+                       publisher="CSA", url="https://...", license="CC-BY-NC")
+    assert standard_meta(ccm)["license"] == "CC-BY-NC"  # commercial gate can refuse it