"""StandardIngester (Parser 4): ingests technical standards / control frameworks (NIST / OWASP / BSI Grundschutz / CSA CCM) as best-practice CONTROLS — never a primary obligation. Tagged source_class=technical_standard / authority_weight=80 / bindingness=best_practice / use_for_primary=false, so a standard ranks below binding law AND supervisory guidance for an obligation/interpretation question, but surfaces for "which controls / measures fit?" questions (control-intent in the retriever). Reuses the PDF/HTML extraction helpers from guidance_ingester. LICENSE per source matters: NIST = US public domain (free), OWASP = CC-BY-SA (free, share-alike), BSI Grundschutz = check terms. CSA CCM is CC-BY-NC (NON commercial) → NOT usable in a commercial product; carry the license on each unit so the gate can refuse it. """ from __future__ import annotations from dataclasses import dataclass from typing import Any from services.legal_act_ingester import UploadUnit # Extraction (extract_pdf / extract_html / normalize_text) + the self_test gate are # shared with guidance_ingester — the operational runner imports them from there. STANDARD_WEIGHT = 80 @dataclass class StandardSpec: """One technical standard / control framework document.""" source_id: str # stable handle, e.g. "nist_csf_2_0" short: str # display handle / regulation_short, e.g. "NIST CSF 2.0" title: str publisher: str # NIST / OWASP / BSI / CSA url: str license: str # e.g. "public_domain" | "CC-BY-SA-4.0" | "CC-BY-NC" (→ commercial-blocked) collection: str = "bp_compliance_ce" version_date: str = "" jurisdiction: str = "international" def standard_meta(spec: StandardSpec) -> dict[str, Any]: return { "regulation_code": spec.short, "regulation_short": spec.short, "regulation_name_de": spec.title, "citation_style": "standard", "document_type": "standard", "source_class": "technical_standard", "bindingness": "best_practice", "authority_level": STANDARD_WEIGHT, "authority_weight": STANDARD_WEIGHT, "source_type": "standard", "issuer": spec.publisher, "jurisdiction": spec.jurisdiction, "version_date": spec.version_date, "source": spec.url, "license": spec.license, "category": "standard", "use_for_primary": False, # best-practice control — never a primary obligation "is_citable": True, "citation_unit": spec.title, "article_label": spec.short, "chunk_scope": "standard", "norm_id": f"STANDARD-{spec.source_id}", } def build_upload_unit(spec: StandardSpec, text: str, run_tag: str) -> UploadUnit: """One UploadUnit for the whole standard; the RAG service chunks it and each chunk inherits the technical_standard metadata.""" return UploadUnit( filename=f"{spec.source_id}.txt", text=text, meta=standard_meta(spec), document_version=f"{run_tag}-{spec.source_id}", collection=spec.collection, )