feat(cra): Maßnahmen-Provenienz + Lizenzklasse je Normquelle

Jede Normreferenz einer Maßnahme wird lizenzklassifiziert (eu_law / public_domain / open / paid_reference) — paid-reference-Normen werden nur als Verweis geführt, nie im Text gespeichert (idea/expression). Kuratierte Maßnahmen tragen Tier 'core', KI-/Fallback-Maßnahmen 'review' (indikativ). Frontend zeigt Quellen-Badges + "indikativ"-Kennzeichnung. Methodik in docs-src/development/mapping-methodology.md (Szenario C, Due-Diligence).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-16 10:10:20 +02:00
parent 6c619ecc42
commit 7a4f086151
8 changed files with 204 additions and 3 deletions
@@ -268,6 +268,8 @@ SEVERITY_WEIGHT = {
 import json as _json
 import os as _os

+from compliance.data.norm_sources import classify_refs as _classify_refs
+
 MEASURE_DETAILS: dict = {}

 try:
@@ -275,6 +277,12 @@ try:
    with open(_MEAS_PATH, encoding="utf-8") as _fh:
        _curated = _json.load(_fh)
    MEASURE_DETAILS = {m["id"]: m for m in _curated if m.get("id")}
+    # Provenance: curated measures are expert/standards-based ('core' tier); each
+    # norm_ref is license-classified (law/public/open vs paid-reference-only).
+    for _m in MEASURE_DETAILS.values():
+        _m.setdefault("tier", "core")
+        _m.setdefault("provenance", "curated_expert_standards")
+        _m["norm_sources"] = _classify_refs(_m.get("norm_refs", []))
    for _m in _curated:
        if _m.get("id") and _m.get("name"):
            MEASURES[_m["id"]] = _m["name"]
@@ -0,0 +1,55 @@
+"""Provenance / license classification for norm references.
+
+Encodes the BreakPilot mapping methodology (idea/expression): a *reference* to
+where a topic sits in a standard is a fact and citable; the paid normative *text*
+is never stored or reproduced. This classifier marks which sources are freely
+usable (EU law, US-gov public domain, open licenses) vs. paid standards we may
+only REFERENCE by clause/control ID.
+
+See docs-src/development/mapping-methodology.md.
+"""
+
+# License classes a norm reference can be assigned. The string values are the
+# identifiers emitted in the "license_class" field of each classified reference.
+LAW = "eu_law"                  # EU legislation — public, reproducible (EUR-Lex)
+PUBLIC_DOMAIN = "public_domain"  # e.g. NIST (US gov work) — reproducible
+OPEN = "open"                   # OWASP (CC), ETSI EN 303 645, BSI — freely available
+PAID_REFERENCE = "paid_reference"  # ISO/IEC/EN/DIN — REFERENCE ONLY, no text stored
+
+# Display label (German) for each license class, keyed by the constants above.
+LABEL = {
+    LAW: "EU-Recht (frei)",
+    PUBLIC_DOMAIN: "Public Domain (frei)",
+    OPEN: "offen lizenziert",
+    PAID_REFERENCE: "kostenpflichtige Norm — nur Verweis",
+}
+
+# Keyword tuples used by classify_norm_ref. Every entry is a lower-case
+# SUBSTRING that is tested against the lower-cased reference, so entries must
+# stay lower-case and short entries can also match inside longer words.
+_LAW = ("2024/2847", "2023/1230", "verordnung (eu)", "maschinenverordnung", "(cra)",
+        "anhang", "nis2", "nis-2", " art. ", "dsgvo", "2016/679", "2022/2555")
+_PUBLIC = ("nist", "ntia", "nvd", "cisa")
+# NOTE: "nist privacy framework" is never the deciding match from this tuple;
+# classify_norm_ref returns OPEN for any "nist privacy" reference up front.
+_OPEN = ("owasp", "slsa", "etsi en 303 645", "bsi", "cyclonedx", "spdx",
+         "nist privacy framework")
+# Paid standards bodies and standard numbers (each number listed once).
+_PAID = ("iso", "iec", "din", "en iso", "62443", "27002", "27035", "29147",
+         "30111", "15408", "18045", "13849", "13850", "13857", "14119", "14120",
+         "61496", "61800", "62061", "60204", "82079", "15066", "10218", "13855")
+
+
+def classify_norm_ref(ref: str) -> str:
+    """Return the license class for a single norm reference.
+
+    The reference is lower-cased and tested for keyword substrings in this
+    order: the "nist privacy" special case (OPEN), law keywords (LAW),
+    public-domain keywords (PUBLIC_DOMAIN), open keywords (OPEN). Everything
+    else — paid-standard matches, unknown references, ``None`` and the empty
+    string — is PAID_REFERENCE, the conservative reference-only default.
+
+    The generic law keywords "anhang" and " art. " also appear in citations of
+    paid standards (e.g. "ISO/IEC 27001 Anhang A"). They are therefore ignored
+    when the reference names a paid standard, so such a citation is not
+    reported as freely reproducible EU law. Specific law keywords (regulation
+    numbers and names) still win regardless.
+
+    Args:
+        ref: Free-text norm reference; falsy values are treated as "".
+
+    Returns:
+        One of LAW, PUBLIC_DOMAIN, OPEN or PAID_REFERENCE.
+    """
+    import re  # function-scope import: this module has no import section
+
+    text = (ref or "").lower()
+    # Must run before the public-domain check, where the bare "nist" keyword
+    # would otherwise classify the NIST Privacy Framework as public domain.
+    if "nist privacy" in text:
+        return OPEN
+
+    # Structural words that say nothing about WHICH document is cited.
+    generic_law = ("anhang", " art. ")
+    # A paid standard is named either by its body (as a stand-alone token, so
+    # "advisory" or "bedingungen" do not count) or by a listed standard number.
+    names_paid_standard = (
+        re.search(r"(?<![a-z])(?:iso|iec|din)(?![a-z])", text) is not None
+        or any(code in text for code in _PAID if code.isdigit())
+    )
+    for keyword in _LAW:
+        if keyword not in text:
+            continue
+        if names_paid_standard and keyword in generic_law:
+            continue  # "Anhang"/"Art." of a paid standard is not EU law
+        return LAW
+    if any(keyword in text for keyword in _PUBLIC):
+        return PUBLIC_DOMAIN
+    if any(keyword in text for keyword in _OPEN):
+        return OPEN
+    # Paid-standard matches and unknown references share the same outcome:
+    # reference-only, never stored as text.
+    return PAID_REFERENCE
+
+
+def classify_refs(refs) -> list:
+    """Classify every norm reference in *refs*.
+
+    Returns a list with one dict per reference, in input order, holding the
+    reference itself ("ref"), its license class ("license_class") and the
+    display label for that class ("label"). A falsy *refs* (``None``, empty
+    list) yields an empty list.
+    """
+    classified = []
+    for entry in refs or []:
+        license_class = classify_norm_ref(entry)
+        classified.append(
+            {"ref": entry, "license_class": license_class, "label": LABEL[license_class]}
+        )
+    return classified
@@ -24,8 +24,10 @@ def _measure_obj(mid: str) -> dict:
    d = MEASURE_DETAILS.get(mid)
    if d:
        return {"id": mid, "name": d.get("name", ""), "description": d.get("description", ""),
-                "norm_refs": d.get("norm_refs", [])}
-    return {"id": mid, "name": MEASURES.get(mid, ""), "description": MEASURES.get(mid, ""), "norm_refs": []}
+                "norm_refs": d.get("norm_refs", []), "norm_sources": d.get("norm_sources", []),
+                "tier": d.get("tier", "core"), "provenance": d.get("provenance", "")}
+    return {"id": mid, "name": MEASURES.get(mid, ""), "description": MEASURES.get(mid, ""),
+            "norm_refs": [], "norm_sources": [], "tier": "review", "provenance": ""}


 _REQ_INDEX = {r["req_id"]: r for r in ANNEX_I_REQUIREMENTS}