"""Provenance / license classification for norm references.

Encodes the BreakPilot mapping methodology (idea/expression): a *reference* to
where a topic sits in a standard is a fact and citable; the paid normative
*text* is never stored or reproduced. This classifier marks which sources are
freely usable (EU law, US-gov public domain, open licenses) vs. paid standards
we may only REFERENCE by clause/control ID.

See docs-src/development/mapping-methodology.md.
"""

import re

LAW = "eu_law"  # EU legislation — public, reproducible (EUR-Lex)
PUBLIC_DOMAIN = "public_domain"  # e.g. NIST (US gov work) — reproducible
OPEN = "open"  # OWASP (CC), ETSI EN 303 645, BSI — freely available
PAID_REFERENCE = "paid_reference"  # ISO/IEC/EN/DIN — REFERENCE ONLY, no text stored

# User-facing labels (German UI strings) keyed by license class.
LABEL = {
    LAW: "EU-Recht (frei)",
    PUBLIC_DOMAIN: "Public Domain (frei)",
    OPEN: "offen lizenziert",
    PAID_REFERENCE: "kostenpflichtige Norm — nur Verweis",
}

# Lower-case keywords that hint at EU legislation.
_LAW = (
    "2024/2847",
    "2023/1230",
    "verordnung (eu)",
    "maschinenverordnung",
    "(cra)",
    "anhang",
    "nis2",
    "nis-2",
    " art. ",
    "dsgvo",
    "2016/679",
    "2022/2555",
)
# "Anhang" (annex) and "Art." also appear in references into paid standards
# ("ISO 13849-1 Anhang A"), so on their own they only count as a law hint when
# the reference does not name a paid standard.
_LAW_GENERIC = ("anhang", " art. ")
_LAW_SPECIFIC = tuple(k for k in _LAW if k not in _LAW_GENERIC)

_PUBLIC = ("nist", "ntia", "nvd", "cisa")
# "nist privacy framework" is listed for completeness; classify_norm_ref
# handles it with an explicit guard before the public-domain check.
_OPEN = (
    "owasp",
    "slsa",
    "etsi en 303 645",
    "bsi",
    "cyclonedx",
    "spdx",
    "nist privacy framework",
)
_PAID = (
    "iso",
    "iec",
    "din",
    "en iso",
    "62443",
    "27002",
    "27035",
    "29147",
    "30111",
    "15408",
    "18045",
    "13849",
    "13850",
    "13857",
    "14119",
    "14120",
    "61496",
    "61800",
    "62061",
    "60204",
    "82079",
    "15066",
    "10218",
    "13855",
)


def _word_start_pattern(keywords):
    """Compile a pattern matching any of *keywords* at the start of a word."""
    alternatives = "|".join(re.escape(k) for k in keywords)
    # The look-behind rejects hits inside a longer word ("nist" in
    # "administration", "bsi" in "website") while still accepting prefixes
    # such as "NISTIR".
    return re.compile(r"(?<![a-z0-9])(?:" + alternatives + ")")


_PUBLIC_RE = _word_start_pattern(_PUBLIC)
_OPEN_RE = _word_start_pattern(_OPEN)

# Explicit mention of a paid standard: an organisation acronym as its own word
# (not "din" inside "Bedingungen") or one of the known standard numbers as a
# standalone digit run.
_PAID_RE = re.compile(
    r"(?<![a-z0-9])(?:{names})(?![a-z])|(?<!\d)(?:{numbers})(?!\d)".format(
        names="|".join(re.escape(k) for k in _PAID if not k.isdigit()),
        numbers="|".join(k for k in _PAID if k.isdigit()),
    )
)


def classify_norm_ref(ref: str) -> str:
    """Return the license class for a single norm reference.

    Matching is case-insensitive and keyword based. Checks run in this order:

    1. "NIST Privacy Framework" -> ``OPEN`` (must precede the NIST check).
    2. A specific EU-law marker (regulation number, "DSGVO", ...) -> ``LAW``.
    3. A generic law hint ("Anhang", " Art. ") -> ``LAW``, but only if the
       reference does not explicitly name a paid standard.
    4. A public-domain source acronym at a word start -> ``PUBLIC_DOMAIN``.
    5. An open source/spec name at a word start -> ``OPEN``.
    6. Anything else -> ``PAID_REFERENCE`` (conservative default: unknown
       references are treated as reference-only).

    Args:
        ref: The reference text; ``None`` or an empty string is accepted.

    Returns:
        One of ``LAW``, ``PUBLIC_DOMAIN``, ``OPEN`` or ``PAID_REFERENCE``.
    """
    text = (ref or "").lower()

    if "nist privacy" in text:
        return OPEN
    if any(k in text for k in _LAW_SPECIFIC):
        return LAW

    names_paid_standard = _PAID_RE.search(text) is not None
    if not names_paid_standard and any(k in text for k in _LAW_GENERIC):
        return LAW

    if _PUBLIC_RE.search(text):
        return PUBLIC_DOMAIN
    if _OPEN_RE.search(text):
        return OPEN
    return PAID_REFERENCE


def classify_refs(refs) -> list:
    """Classify every reference in *refs*.

    Args:
        refs: An iterable of reference strings; ``None`` or an empty iterable
            yields an empty list.

    Returns:
        A list of ``{"ref", "license_class", "label"}`` dicts, one per input
        reference, in input order.
    """
    return [
        {
            "ref": ref,
            "license_class": (license_class := classify_norm_ref(ref)),
            "label": LABEL[license_class],
        }
        for ref in (refs or [])
    ]