Files
breakpilot-core/control-pipeline/services/standard_ingester.py
T
Benjamin Admin 3b466be140
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 25s
CI / test-python-voice (push) Successful in 28s
CI / test-bqas (push) Successful in 26s
feat(control-pipeline): StandardIngester engine for technical standards (Parser 4)
Add services/standard_ingester.py — tags technical standards / control frameworks
(NIST / OWASP / BSI Grundschutz / CSA CCM) source_class=technical_standard /
authority_weight=80 / bindingness=best_practice / use_for_primary=false, so a
standard ranks below binding law and guidance for obligation/interpretation
questions but surfaces for "which controls/measures fit?" (control-intent, a
follow-up retriever step). Reuses the guidance_ingester extraction helpers. The
per-source license is carried on every unit so the commercial gate can refuse a
non-commercial source (e.g. CSA CCM = CC-BY-NC).

Tested: 3 unit tests on the metadata path, ruff + mypy clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-24 11:08:20 +02:00

82 lines
3.0 KiB
Python

"""StandardIngester (Parser 4): ingests technical standards / control frameworks
(NIST / OWASP / BSI Grundschutz / CSA CCM) as best-practice CONTROLS — never a
primary obligation.
Tagged source_class=technical_standard / authority_weight=80 /
bindingness=best_practice / use_for_primary=false, so a standard ranks below
binding law AND supervisory guidance for an obligation/interpretation question,
but surfaces for "which controls / measures fit?" questions (control-intent in
the retriever). Reuses the PDF/HTML extraction helpers from guidance_ingester.
LICENSE per source matters: NIST = US public domain (free), OWASP = CC-BY-SA
(free, share-alike), BSI Grundschutz = check terms. CSA CCM is CC-BY-NC (NON
commercial) → NOT usable in a commercial product; carry the license on each unit
so the gate can refuse it.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from services.legal_act_ingester import UploadUnit
# Extraction (extract_pdf / extract_html / normalize_text) + the self_test gate are
# shared with guidance_ingester — the operational runner imports them from there.
# Authority score applied to every technical-standard unit: standard_meta()
# writes this value to both "authority_level" and "authority_weight". Per the
# module docstring it is meant to rank standards below binding law and
# supervisory guidance for obligation/interpretation questions.
STANDARD_WEIGHT = 80
@dataclass
class StandardSpec:
    """Describe a single technical standard / control framework document.

    Attributes:
        source_id: Stable handle, e.g. "nist_csf_2_0".
        short: Display handle / regulation_short, e.g. "NIST CSF 2.0".
        title: Title of the document.
        publisher: Issuing body: NIST / OWASP / BSI / CSA.
        url: URL recorded for the document.
        license: License identifier, e.g. "public_domain", "CC-BY-SA-4.0"
            or "CC-BY-NC" (the latter is commercial-blocked).
        collection: Collection name; defaults to "bp_compliance_ce".
        version_date: Version date string; empty by default.
        jurisdiction: Jurisdiction label; defaults to "international".
    """

    # Required descriptors (positional order is part of the interface).
    source_id: str
    short: str
    title: str
    publisher: str
    url: str
    license: str

    # Optional descriptors with defaults.
    collection: str = "bp_compliance_ce"
    version_date: str = ""
    jurisdiction: str = "international"
def standard_meta(spec: StandardSpec) -> dict[str, Any]:
    """Build the metadata mapping attached to a technical-standard unit.

    Args:
        spec: The standard whose descriptors are copied into the mapping.

    Returns:
        A new dict that tags the unit as a best-practice technical standard
        (never a primary obligation) and carries the spec's handle, title,
        publisher, jurisdiction, version date, URL and license.
    """
    handle = spec.short
    kind = "standard"

    # How the document is named and displayed.
    naming: dict[str, Any] = {
        "regulation_code": handle,
        "regulation_short": handle,
        "regulation_name_de": spec.title,
    }
    # What kind of source this is and how much authority it carries.
    classification: dict[str, Any] = {
        "citation_style": kind,
        "document_type": kind,
        "source_class": "technical_standard",
        "bindingness": "best_practice",
        "authority_level": STANDARD_WEIGHT,
        "authority_weight": STANDARD_WEIGHT,
        "source_type": kind,
    }
    # Where the document comes from and under which license.
    provenance: dict[str, Any] = {
        "issuer": spec.publisher,
        "jurisdiction": spec.jurisdiction,
        "version_date": spec.version_date,
        "source": spec.url,
        "license": spec.license,
    }
    # How the unit may be used and cited.
    citation: dict[str, Any] = {
        "category": kind,
        # best-practice control — never a primary obligation
        "use_for_primary": False,
        "is_citable": True,
        "citation_unit": spec.title,
        "article_label": handle,
        "chunk_scope": kind,
        "norm_id": f"STANDARD-{spec.source_id}",
    }

    # Merge in this order so the resulting key order matches the flat layout.
    return {**naming, **classification, **provenance, **citation}
def build_upload_unit(spec: StandardSpec, text: str, run_tag: str) -> UploadUnit:
    """Wrap an entire standard in a single UploadUnit.

    The whole document travels as one unit; the RAG service is expected to
    chunk it, with each chunk inheriting the technical_standard metadata.

    Args:
        spec: The standard being ingested.
        text: Full text of the standard.
        run_tag: Tag of the ingestion run, used as the document_version prefix.

    Returns:
        An UploadUnit named after the spec's source_id, carrying the
        standard's metadata and placed in the spec's collection.
    """
    handle = spec.source_id
    metadata = standard_meta(spec)
    version = f"{run_tag}-{handle}"
    return UploadUnit(
        filename=f"{handle}.txt",
        text=text,
        meta=metadata,
        document_version=version,
        collection=spec.collection,
    )