feat(canonical-controls): Canonical Control Library — rechtssichere Security Controls

Eigenstaendig formulierte Security Controls mit unabhaengiger Taxonomie und Open-Source-Verankerung (OWASP, NIST, ENISA). Keine BSI-Nomenklatur.

- Migration 044: 5 DB-Tabellen (frameworks, controls, sources, licenses, mappings)
- 10 Seed Controls mit 39 Open-Source-Referenzen
- License Gate: Quellen-Berechtigungspruefung (analysis/excerpt/embeddings/product)
- Too-Close-Detektor: 5 Metriken (exact-phrase, token-overlap, ngram, embedding, LCS)
- REST API: 8 Endpoints unter /v1/canonical/
- Go Loader mit Multi-Index (ID, domain, severity, framework)
- Frontend: Control Library Browser + Provenance Wiki
- CI/CD: validate-controls.py Job (schema, no-leak, open-anchors)
- 67 Tests (8 Go + 59 Python), alle PASS
- MkDocs Dokumentation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 19:55:06 +01:00
parent 8442115e7c
commit 050f353192
20 changed files with 3935 additions and 0 deletions
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+"""
+Canonical Control Library — CI/CD Validator
+
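+Checks performed by this script:
+  1. Schema Validation     — JSON against defined structure
+  2. No-Leak Scanner       — Regex against forbidden locator patterns (e.g. O.Auth_*, O.Netz_*)
+  3. Open Anchor Check     — Every control has >= 1 open anchor
+  4. Independent Taxonomy  — Control IDs must not use a BSI-style "O." prefix
+  5. Evidence Fields       — Every evidence item has a type and a description
+
+Not implemented in this script (no corresponding check function exists):
+  - License Gate           — Every mapping reference fulfils allowed_usages
+  - Provenance Integrity   — Every referenced source_id exists in seed data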
+
+Usage:
+  python scripts/validate-controls.py
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Paths
+# ---------------------------------------------------------------------------
+
+# Two directory levels above this script file.
+REPO_ROOT = Path(__file__).resolve().parent.parent
+# Canonical control definitions that main() loads and validates.
+CONTROLS_FILE = REPO_ROOT.joinpath(
+    "ai-compliance-sdk", "policies", "canonical_controls_v1.json"
+)
+# Path of the accompanying SQL migration; no check in this script reads it.
+MIGRATION_FILE = REPO_ROOT.joinpath(
+    "backend-compliance", "migrations", "044_canonical_control_library.sql"
+)
+
+# ---------------------------------------------------------------------------
+# Forbidden patterns (BSI proprietary nomenclature — must NOT appear in controls)
+# ---------------------------------------------------------------------------
+
+# Compiled once at import time; check_no_leak() runs each of them with
+# .search() against every product-facing text value.
+FORBIDDEN_PATTERNS = [
+    re.compile(source)
+    for source in (
+        r"O\.[A-Za-z]+_[0-9]+",        # BSI objective IDs: O.Auth_1, O.Netz_3
+        r"TR-03161",                   # Direct TR reference in control text
+        r"BSI-TR-",                    # Direct BSI-TR reference
+        r"Anforderung\s+[A-Z]\.\d+",   # BSI requirement format
+    )
+]
+
+# Control fields that are product-facing and therefore scanned for the
+# forbidden patterns above. A field may hold one string or a list of strings.
+PRODUCT_FIELDS = [
+    "objective",
+    "rationale",
+    "title",
+    "requirements",
+    "test_procedure",
+]
+
+# ---------------------------------------------------------------------------
+# Known open sources (from migration seed)
+# ---------------------------------------------------------------------------
+
+# Source identifiers classified as open. No check in this script reads
+# either of these sets.
+KNOWN_OPEN_SOURCES = {
+    "CIS_CONTROLS",
+    "ENISA_GOOD_PRACTICES",
+    "NIST_SP800_53",
+    "NIST_SP800_63B",
+    "OWASP_ASVS",
+    "OWASP_MASVS",
+    "OWASP_TOP10",
+}
+
+# Every known source identifier: the open ones plus the three BSI TR-03161 parts.
+KNOWN_ALL_SOURCES = KNOWN_OPEN_SOURCES.union(
+    {"BSI_TR03161_1", "BSI_TR03161_2", "BSI_TR03161_3"}
+)
+
+# ---------------------------------------------------------------------------
+# Validators
+# ---------------------------------------------------------------------------
+
+# Module-level accumulators. The check_* functions report findings through
+# error(); main() prints both lists and returns 1 when `errors` is non-empty.
+errors: list[str] = []
+warnings: list[str] = []
+
+
+def error(msg: str) -> None:
+    """Append *msg* to the module-level list of validation errors."""
+    errors.append(msg)
+
+
+def warn(msg: str) -> None:
+    """Append *msg* to the module-level list of validation warnings."""
+    warnings.append(msg)
+
+
+def check_schema(data: dict) -> None:
+    """Validate the top-level layout of the controls file and of each control.
+
+    Findings are recorded through error(); nothing is returned. Values of the
+    wrong JSON type (a non-object control, a non-string control_id, a
+    non-numeric risk_score, a domain entry without "id") are reported as
+    [SCHEMA] errors instead of aborting the run with a traceback.
+
+    Args:
+        data: Parsed content of the controls JSON file.
+    """
+    if not isinstance(data, dict):
+        error("[SCHEMA] Top-level JSON value must be an object")
+        return
+
+    required_top = ["version", "schema", "framework", "total_controls", "domains", "controls"]
+    for key in required_top:
+        if key not in data:
+            error(f"[SCHEMA] Missing top-level key: {key}")
+
+    required_control = [
+        "control_id", "title", "domain", "severity", "objective",
+        "rationale", "scope", "requirements", "test_procedure",
+        "evidence", "open_anchors",
+    ]
+    control_id_pattern = re.compile(r"^[A-Z]{2,6}-[0-9]{3}$")
+    valid_severities = {"low", "medium", "high", "critical"}
+
+    controls = data.get("controls", [])
+    if not isinstance(controls, list):
+        error("[SCHEMA] 'controls' must be a list")
+        controls = []
+
+    for idx, ctrl in enumerate(controls):
+        if not isinstance(ctrl, dict):
+            error(f"[SCHEMA] controls[{idx}]: expected an object, got {type(ctrl).__name__}")
+            continue
+
+        cid = ctrl.get("control_id", "???")
+        for key in required_control:
+            if key not in ctrl:
+                error(f"[SCHEMA] Control {cid}: missing field '{key}'")
+
+        # fullmatch() rather than match(): "$" alone would also accept an ID
+        # that is followed by a trailing newline.
+        if not isinstance(cid, str) or not control_id_pattern.fullmatch(cid):
+            error(f"[SCHEMA] Control {cid}: ID does not match ^[A-Z]{{2,6}}-[0-9]{{3}}$")
+
+        sev = ctrl.get("severity", "")
+        # The isinstance guard keeps an unhashable value (list/object) from
+        # raising TypeError in the set membership test.
+        if not isinstance(sev, str) or sev not in valid_severities:
+            error(f"[SCHEMA] Control {cid}: invalid severity '{sev}'")
+
+        rs = ctrl.get("risk_score")
+        if rs is not None:
+            # bool is a subclass of int; JSON true/false is not a score.
+            if isinstance(rs, bool) or not isinstance(rs, (int, float)):
+                error(f"[SCHEMA] Control {cid}: risk_score {rs!r} is not a number")
+            elif not (0 <= rs <= 10):
+                error(f"[SCHEMA] Control {cid}: risk_score {rs} out of range [0, 10]")
+
+    domains = data.get("domains", [])
+    if not isinstance(domains, list):
+        error("[SCHEMA] 'domains' must be a list")
+        domains = []
+
+    domain_ids = set()
+    for idx, dom in enumerate(domains):
+        if not isinstance(dom, dict) or "id" not in dom:
+            error(f"[SCHEMA] domains[{idx}]: missing 'id'")
+            continue
+        try:
+            domain_ids.add(dom["id"])
+        except TypeError:  # unhashable id (list/object)
+            error(f"[SCHEMA] domains[{idx}]: invalid 'id'")
+
+    for ctrl in controls:
+        if not isinstance(ctrl, dict):
+            continue  # already reported in the per-control loop above
+        cid = ctrl.get("control_id", "???")
+        domain = ctrl.get("domain")
+        try:
+            known = domain in domain_ids
+        except TypeError:  # unhashable domain value
+            known = False
+        if not known:
+            error(f"[SCHEMA] Control {cid}: domain '{domain}' not in domains list")
+
+
+def check_no_leak(data: dict) -> None:
+    """Scan product-facing control fields for forbidden BSI nomenclature.
+
+    For every control, each field named in PRODUCT_FIELDS is read as either a
+    single value or a list of values. Every string value is searched with
+    each entry of FORBIDDEN_PATTERNS, and the first hit per pattern is
+    recorded through error(). Non-string values are skipped.
+    """
+    for control in data.get("controls", []):
+        control_id = control.get("control_id", "???")
+        for field in PRODUCT_FIELDS:
+            raw = control.get(field, "")
+            candidates = raw if isinstance(raw, list) else [raw]
+
+            for candidate in candidates:
+                if isinstance(candidate, str):
+                    for forbidden in FORBIDDEN_PATTERNS:
+                        hit = forbidden.search(candidate)
+                        if not hit:
+                            continue
+                        error(
+                            f"[NO-LEAK] Control {control_id}.{field}: "
+                            f"forbidden pattern '{hit.group()}' found"
+                        )
+
+
+def check_open_anchors(data: dict) -> None:
+    """Require at least one well-formed open anchor on every control.
+
+    An anchor is well-formed when it is an object with non-empty
+    "framework", "ref" and "url" values. Findings are recorded through
+    error(). A wrongly typed "open_anchors" value or anchor entry is reported
+    as an [ANCHOR] error instead of raising, matching how
+    check_evidence_fields() treats non-object evidence items.
+    """
+    for ctrl in data.get("controls", []):
+        cid = ctrl.get("control_id", "???")
+        anchors = ctrl.get("open_anchors", [])
+        if not isinstance(anchors, list):
+            # e.g. null or a bare string; len()/enumerate() would raise or
+            # iterate characters.
+            error(f"[ANCHOR] Control {cid}: 'open_anchors' is not a list")
+            continue
+        if not anchors:
+            error(f"[ANCHOR] Control {cid}: no open anchors — every control needs >= 1")
+        # Check anchor structure
+        for i, anchor in enumerate(anchors):
+            if not isinstance(anchor, dict):
+                # On a string, "key in anchor" would be a substring test and
+                # anchor[key] would raise TypeError.
+                error(f"[ANCHOR] Control {cid}: open_anchor[{i}] is not an object")
+                continue
+            for key in ("framework", "ref", "url"):
+                if not anchor.get(key):  # absent or empty
+                    error(f"[ANCHOR] Control {cid}: open_anchor[{i}] missing '{key}'")
+
+
+def check_independent_taxonomy(data: dict) -> None:
+    """Reject control IDs that start with the BSI objective prefix "O.".
+
+    The prefix is matched case-insensitively at the start of each control_id;
+    every hit is recorded through error().
+    """
+    bsi_prefixes = (
+        re.compile(r"^O\.", re.IGNORECASE),  # BSI objective prefix
+    )
+    for control in data.get("controls", []):
+        control_id = control.get("control_id", "???")
+        for prefix in bsi_prefixes:
+            if not prefix.match(control_id):
+                continue
+            error(f"[TAXONOMY] Control {control_id}: uses BSI-style ID prefix")
+
+
+def check_evidence_fields(data: dict) -> None:
+    """Check that each evidence item is an object with "type" and "description".
+
+    A key counts as missing when it is absent or its value is empty/falsy.
+    Findings are recorded through error().
+    """
+    for control in data.get("controls", []):
+        control_id = control.get("control_id", "???")
+        for idx, item in enumerate(control.get("evidence", [])):
+            if isinstance(item, dict):
+                for required in ("type", "description"):
+                    if not item.get(required):
+                        error(f"[EVIDENCE] Control {control_id}: evidence[{idx}] missing '{required}'")
+            else:
+                error(f"[EVIDENCE] Control {control_id}: evidence[{idx}] is not an object")
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main() -> int:
+    """Load the controls file, run every check and print a report.
+
+    Returns:
+        0 when no errors were recorded; 1 when the file is missing, cannot be
+        decoded or parsed as JSON, is not a JSON object at the top level, or
+        any check recorded an error.
+    """
+    print(f"Validating: {CONTROLS_FILE}")
+    print()
+
+    if not CONTROLS_FILE.exists():
+        print(f"ERROR: File not found: {CONTROLS_FILE}")
+        return 1
+
+    try:
+        # Explicit UTF-8: open() otherwise falls back to the locale encoding,
+        # which breaks on non-ASCII text under non-UTF-8 locales.
+        with open(CONTROLS_FILE, encoding="utf-8") as f:
+            data = json.load(f)
+    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
+        print(f"ERROR: Cannot parse {CONTROLS_FILE}: {exc}")
+        return 1
+
+    if not isinstance(data, dict):
+        # The checks below all start from data.get(...).
+        print(f"ERROR: Top-level JSON value must be an object, got {type(data).__name__}")
+        return 1
+
+    check_schema(data)
+    check_no_leak(data)
+    check_open_anchors(data)
+    check_independent_taxonomy(data)
+    check_evidence_fields(data)
+
+    # Summary statistics; tolerate malformed entries so that the recorded
+    # errors are still printed instead of a traceback.
+    controls = data.get("controls", [])
+    if not isinstance(controls, list):
+        controls = []
+    total_controls = len(controls)
+    total_anchors = sum(
+        len(c["open_anchors"])
+        for c in controls
+        if isinstance(c, dict) and isinstance(c.get("open_anchors"), list)
+    )
+
+    print(f"Controls:     {total_controls}")
+    print(f"Open Anchors: {total_anchors}")
+    print()
+
+    if warnings:
+        print(f"WARNINGS ({len(warnings)}):")
+        for w in warnings:
+            print(f"  ⚠ {w}")
+        print()
+
+    if errors:
+        print(f"ERRORS ({len(errors)}):")
+        for e in errors:
+            print(f"  ✗ {e}")
+        print()
+        print("VALIDATION FAILED")
+        return 1
+
+    print("ALL CHECKS PASSED")
+    return 0
+
+
+# Run only when executed as a script; main()'s return value becomes the
+# process exit status.
+if __name__ == "__main__":
+    sys.exit(main())