3b466be140
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 25s
CI / test-python-voice (push) Successful in 28s
CI / test-bqas (push) Successful in 26s
Add services/standard_ingester.py — tags technical standards / control frameworks (NIST / OWASP / BSI Grundschutz / CSA CCM) source_class=technical_standard / authority_weight=80 / bindingness=best_practice / use_for_primary=false, so a standard ranks below binding law and guidance for obligation/interpretation questions but surfaces for "which controls/measures fit?" (control-intent, a follow-up retriever step). Reuses the guidance_ingester extraction helpers. The per-source license is carried on every unit so the commercial gate can refuse a non-commercial source (e.g. CSA CCM = CC-BY-NC). Tested: 3 unit tests on the metadata path, ruff + mypy clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
82 lines
3.0 KiB
Python
82 lines
3.0 KiB
Python
"""StandardIngester (Parser 4): ingests technical standards / control frameworks
|
|
(NIST / OWASP / BSI Grundschutz / CSA CCM) as best-practice CONTROLS — never a
|
|
primary obligation.
|
|
|
|
Tagged source_class=technical_standard / authority_weight=80 /
|
|
bindingness=best_practice / use_for_primary=false, so a standard ranks below
|
|
binding law AND supervisory guidance for an obligation/interpretation question,
|
|
but surfaces for "which controls / measures fit?" questions (control-intent in
|
|
the retriever). Reuses the PDF/HTML extraction helpers from guidance_ingester.
|
|
|
|
LICENSE per source matters: NIST = US public domain (free), OWASP = CC-BY-SA
|
|
(free, share-alike), BSI Grundschutz = check terms. CSA CCM is CC-BY-NC (NON
|
|
commercial) → NOT usable in a commercial product; carry the license on each unit
|
|
so the gate can refuse it.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
from services.legal_act_ingester import UploadUnit
|
|
|
|
# Extraction (extract_pdf / extract_html / normalize_text) + the self_test gate are
# shared with guidance_ingester — the operational runner imports them from there.
|
|
|
|
# Authority weight assigned to every technical standard; standard_meta emits it
# as both "authority_level" and "authority_weight".
STANDARD_WEIGHT = 80
|
|
|
|
|
|
@dataclass
class StandardSpec:
    """One technical standard / control framework document.

    The first six fields are required and positional; the remaining three
    carry defaults. Field order is part of the constructor interface.
    """

    source_id: str  # stable handle, e.g. "nist_csf_2_0"
    short: str  # display handle / regulation_short, e.g. "NIST CSF 2.0"
    title: str
    publisher: str  # NIST / OWASP / BSI / CSA
    url: str
    license: str  # e.g. "public_domain" | "CC-BY-SA-4.0" | "CC-BY-NC" (→ commercial-blocked)
    collection: str = "bp_compliance_ce"
    version_date: str = ""
    jurisdiction: str = "international"
|
|
|
|
|
|
def standard_meta(spec: StandardSpec) -> dict[str, Any]:
    """Build the metadata dict attached to an ingested technical standard.

    Every standard is tagged source_class="technical_standard",
    bindingness="best_practice" and use_for_primary=False, with
    STANDARD_WEIGHT as both authority_level and authority_weight. The
    per-source license is copied from the spec into the metadata.

    Args:
        spec: The standard whose identifying fields are copied into the metadata.

    Returns:
        A new dict of metadata keys for this standard.
    """
    return {
        "regulation_code": spec.short,
        "regulation_short": spec.short,
        "regulation_name_de": spec.title,
        "citation_style": "standard",
        "document_type": "standard",
        "source_class": "technical_standard",
        "bindingness": "best_practice",
        "authority_level": STANDARD_WEIGHT,
        "authority_weight": STANDARD_WEIGHT,
        "source_type": "standard",
        "issuer": spec.publisher,
        "jurisdiction": spec.jurisdiction,
        "version_date": spec.version_date,
        "source": spec.url,
        "license": spec.license,
        "category": "standard",
        "use_for_primary": False,  # best-practice control — never a primary obligation
        "is_citable": True,
        "citation_unit": spec.title,
        "article_label": spec.short,
        "chunk_scope": "standard",
        "norm_id": f"STANDARD-{spec.source_id}",
    }
|
|
|
|
|
|
def build_upload_unit(spec: StandardSpec, text: str, run_tag: str) -> UploadUnit:
    """One UploadUnit for the whole standard; the RAG service chunks it and each
    chunk inherits the technical_standard metadata.

    Args:
        spec: The standard being ingested; supplies the filename stem, the
            metadata (via standard_meta) and the target collection.
        text: The extracted text of the standard, passed through unchanged.
        run_tag: Prefix for the unit's document_version, which is
            "<run_tag>-<source_id>".

    Returns:
        The UploadUnit describing the whole standard as a single ".txt" upload.
    """
    return UploadUnit(
        filename=f"{spec.source_id}.txt",
        text=text,
        meta=standard_meta(spec),
        document_version=f"{run_tag}-{spec.source_id}",
        collection=spec.collection,
    )
|