feat(control-pipeline): StandardIngester engine for technical standards (Parser 4)
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 25s
CI / test-python-voice (push) Successful in 28s
CI / test-bqas (push) Successful in 26s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 25s
CI / test-python-voice (push) Successful in 28s
CI / test-bqas (push) Successful in 26s
Add services/standard_ingester.py — tags technical standards / control frameworks (NIST / OWASP / BSI Grundschutz / CSA CCM) with source_class=technical_standard / authority_weight=80 / bindingness=best_practice / use_for_primary=false, so a standard ranks below binding law and guidance for obligation/interpretation questions but still surfaces for "which controls/measures fit?" questions (control-intent, a follow-up retriever step). Reuses the guidance_ingester extraction helpers. The per-source license is carried on every unit so the commercial gate can refuse a non-commercial source (e.g. CSA CCM = CC-BY-NC). Tested: 3 unit tests on the metadata path, ruff + mypy clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
"""StandardIngester (Parser 4): ingests technical standards / control frameworks
|
||||
(NIST / OWASP / BSI Grundschutz / CSA CCM) as best-practice CONTROLS — never a
|
||||
primary obligation.
|
||||
|
||||
Tagged source_class=technical_standard / authority_weight=80 /
|
||||
bindingness=best_practice / use_for_primary=false, so a standard ranks below
|
||||
binding law AND supervisory guidance for an obligation/interpretation question,
|
||||
but surfaces for "which controls / measures fit?" questions (control-intent in
|
||||
the retriever). Reuses the PDF/HTML extraction helpers from guidance_ingester.
|
||||
|
||||
LICENSE per source matters: NIST = US public domain (free), OWASP = CC-BY-SA
|
||||
(free, share-alike), BSI Grundschutz = check terms. CSA CCM is CC-BY-NC (NON
|
||||
commercial) → NOT usable in a commercial product; carry the license on each unit
|
||||
so the gate can refuse it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from services.legal_act_ingester import UploadUnit
|
||||
|
||||
# Extraction (extract_pdf / extract_html / normalize_text) + the self_test gate are
|
||||
# shared with guidance_ingester — the operational runner imports them from there.
|
||||
|
||||
# Authority weight applied to every technical standard; standard_meta() emits it
# under both the "authority_level" and "authority_weight" keys.
STANDARD_WEIGHT = 80
|
||||
|
||||
|
||||
@dataclass
class StandardSpec:
    """Describe a single technical standard / control framework document.

    The first six fields are required; their positional order is part of the
    constructor contract. The remaining three carry defaults.

    Attributes:
        source_id: Stable handle, e.g. "nist_csf_2_0".
        short: Display handle / regulation_short, e.g. "NIST CSF 2.0".
        title: Title of the document.
        publisher: Issuing body, e.g. NIST / OWASP / BSI / CSA.
        url: URL of the document.
        license: License identifier, e.g. "public_domain" | "CC-BY-SA-4.0" |
            "CC-BY-NC" (the last one is commercial-blocked).
        collection: Target collection name; defaults to "bp_compliance_ce".
        version_date: Version date string; defaults to "".
        jurisdiction: Jurisdiction label; defaults to "international".
    """

    # --- required ---------------------------------------------------------
    source_id: str
    short: str
    title: str
    publisher: str
    url: str
    license: str

    # --- optional (defaulted) ----------------------------------------------
    collection: str = "bp_compliance_ce"
    version_date: str = ""
    jurisdiction: str = "international"
|
||||
|
||||
|
||||
def standard_meta(spec: StandardSpec) -> dict[str, Any]:
    """Build the metadata dict attached to a technical-standard upload.

    Args:
        spec: The standard whose fields populate the metadata.

    Returns:
        A new dict combining the fixed technical-standard tags
        (source_class, bindingness, authority weight, ...) with the
        per-document values taken from ``spec``. Keys are inserted in the
        same order on every call.
    """
    # How the document is named and referenced.
    naming: dict[str, Any] = {
        "regulation_code": spec.short,
        "regulation_short": spec.short,
        "regulation_name_de": spec.title,
    }

    # Fixed classification shared by every technical standard.
    classification: dict[str, Any] = {
        "citation_style": "standard",
        "document_type": "standard",
        "source_class": "technical_standard",
        "bindingness": "best_practice",
        "authority_level": STANDARD_WEIGHT,
        "authority_weight": STANDARD_WEIGHT,
        "source_type": "standard",
    }

    # Where the document comes from and under which license.
    provenance: dict[str, Any] = {
        "issuer": spec.publisher,
        "jurisdiction": spec.jurisdiction,
        "version_date": spec.version_date,
        "source": spec.url,
        "license": spec.license,
    }

    # Retrieval / citation behaviour.
    citation: dict[str, Any] = {
        "category": "standard",
        # best-practice control — never a primary obligation
        "use_for_primary": False,
        "is_citable": True,
        "citation_unit": spec.title,
        "article_label": spec.short,
        "chunk_scope": "standard",
        "norm_id": f"STANDARD-{spec.source_id}",
    }

    return {**naming, **classification, **provenance, **citation}
|
||||
|
||||
|
||||
def build_upload_unit(spec: StandardSpec, text: str, run_tag: str) -> UploadUnit:
    """Wrap an entire standard in a single UploadUnit.

    The whole document travels as one unit; the RAG service chunks it and each
    chunk inherits the technical_standard metadata.

    Args:
        spec: The standard being ingested.
        text: Text of the standard to upload.
        run_tag: Run identifier used as the prefix of the document version.

    Returns:
        An UploadUnit named after ``spec.source_id``, versioned as
        ``<run_tag>-<source_id>`` and targeted at ``spec.collection``.
    """
    target_name = f"{spec.source_id}.txt"
    metadata = standard_meta(spec)
    version = f"{run_tag}-{spec.source_id}"
    return UploadUnit(
        filename=target_name,
        text=text,
        meta=metadata,
        document_version=version,
        collection=spec.collection,
    )
|
||||
@@ -0,0 +1,35 @@
|
||||
"""Unit tests for the StandardIngester engine (Parser 4)."""
|
||||
|
||||
from services.standard_ingester import StandardSpec, build_upload_unit, standard_meta
|
||||
|
||||
# Shared fixture: a public-domain standard that relies on the default
# collection and jurisdiction.
SPEC = StandardSpec(
    source_id="nist_csf_2_0",
    short="NIST CSF 2.0",
    title="NIST Cybersecurity Framework 2.0",
    publisher="NIST",
    url="https://nist.gov/csf",
    license="public_domain",
    version_date="2024-02-26",
)
|
||||
|
||||
|
||||
def test_standard_meta_is_best_practice_not_primary():
    """The metadata marks a standard as a best-practice, non-primary source."""
    meta = standard_meta(SPEC)

    expected_values = {
        "source_class": "technical_standard",
        "authority_weight": 80,
        "bindingness": "best_practice",
        "chunk_scope": "standard",
        "regulation_short": "NIST CSF 2.0",
        "issuer": "NIST",
        "license": "public_domain",
    }
    for key, want in expected_values.items():
        assert meta[key] == want

    # Identity check: the flag must be the real ``False``, not merely falsy.
    assert meta["use_for_primary"] is False
|
||||
|
||||
|
||||
def test_build_upload_unit_tags_version_and_collection():
    """The upload unit carries the run-tagged version, collection and filename."""
    body_text = "A" * 300
    unit = build_upload_unit(SPEC, body_text, "run9")

    observed = (unit.document_version, unit.collection, unit.filename)
    assert observed == ("run9-nist_csf_2_0", "bp_compliance_ce", "nist_csf_2_0.txt")
    assert unit.meta["use_for_primary"] is False
|
||||
|
||||
|
||||
def test_noncommercial_license_is_carried_for_the_gate():
    """A non-commercial license reaches the metadata unchanged."""
    ccm = StandardSpec(
        source_id="csa_ccm",
        short="CSA CCM",
        title="Cloud Controls Matrix",
        publisher="CSA",
        url="https://...",
        license="CC-BY-NC",
    )
    meta = standard_meta(ccm)
    # The commercial gate can refuse the source based on this value.
    assert meta["license"] == "CC-BY-NC"
|
||||
Reference in New Issue
Block a user