diff --git a/backend-compliance/compliance/data/use_case_registry.py b/backend-compliance/compliance/data/use_case_registry.py index 887ba673..ca75c706 100644 --- a/backend-compliance/compliance/data/use_case_registry.py +++ b/backend-compliance/compliance/data/use_case_registry.py @@ -28,7 +28,9 @@ VERIFICATION_METHODS: tuple[str, ...] = ( "manual", # menschliche Attestierung ) -USE_CASE_GROUPS: tuple[str, ...] = ("document", "security", "cross_cutting") +USE_CASE_GROUPS: tuple[str, ...] = ( + "document", "security", "cross_cutting", "product", +) @dataclass(frozen=True) @@ -136,9 +138,188 @@ _USE_CASES: tuple[UseCase, ...] = ( "hybrid"), categories=("security", "governance", "operations"), keyword_tokens=("tisax", "vda", "prototypenschutz")), + UseCase("kritis", "KRITIS / NIS2-Umsetzung", "security", + regulations=("KRITIS-Dachgesetz", "BSI-KritisV", "BSIG"), + verification_methods=("it_process", "network", "document")), + UseCase("dora", "DORA (Digital Operational Resilience)", + "cross_cutting", regulations=("DORA",), + verification_methods=("it_process", "document", "network")), + # ── Produkt-/Sektor-Use-Cases (je Quell-Regulierung) ──────────── + UseCase("ai_act", "KI-Verordnung (AI Act)", "product", + regulations=("KI-Verordnung", "AI Act", "NIST AI RMF"), + verification_methods=("document", "it_process", "source_code")), + UseCase("mica", "Markets in Crypto-Assets (MiCA)", "product", + regulations=("MiCA",), + verification_methods=("document", "it_process")), + UseCase("mdr", "Medizinprodukte (MDR)", "product", + regulations=("MDR",), + verification_methods=("document", "source_code", "it_process")), + UseCase("maschinen", "Maschinenverordnung", "product", + regulations=("Maschinenverordnung",), + verification_methods=("document", "source_code", "it_process")), + UseCase("batterie", "Batterieverordnung", "product", + regulations=("Batterieverordnung", "Batteriegesetz"), + verification_methods=("document", "it_process")), + UseCase("ehds", "European Health Data Space", "product", + regulations=("EHDS",), + verification_methods=("document", "it_process", "source_code")), + UseCase("produktsicherheit", "Produktsicherheit (GPSR)", "product", + regulations=("Produktsicherheitsverordnung", "EU Blue Guide"), + verification_methods=("document", "it_process")), + UseCase("dsa", "Digital Services Act", "product", + regulations=("Digital Services Act",), + verification_methods=("document", "it_process")), + UseCase("dma", "Digital Markets Act", "product", + regulations=("Digital Markets Act",), + verification_methods=("document", "it_process")), + UseCase("data_governance", "Data Act / Data Governance Act", "product", + regulations=("Data Act", "Data Governance Act"), + verification_methods=("document", "it_process")), + UseCase("zahlungsdienste", "Zahlungsdienste (PSD2)", "product", + regulations=("Zahlungsdiensterichtlinie",), + verification_methods=("document", "it_process", "source_code")), + UseCase("geldwaesche", "Geldwäsche (AML/GwG)", "product", + regulations=("AML-Verordnung", "GwG"), + verification_methods=("document", "it_process")), + UseCase("lieferkette", "Lieferkettensorgfalt (LkSG)", "product", + regulations=("LkSG",), + verification_methods=("document", "it_process")), + UseCase("whistleblowing", "Hinweisgeberschutz (HinSchG)", "product", + regulations=("HinSchG",), + verification_methods=("document", "it_process")), + UseCase("barrierefreiheit", "Barrierefreiheit (EAA)", "product", + regulations=("European Accessibility Act",), + verification_methods=("document", "source_code")), + # ── Weitere Rechts-Use-Cases (Dokument) ───────────────────────── + UseCase("verbraucherschutz", "Verbraucherschutz", "document", + regulations=("Konsumentenschutzgesetz", + "Digitale-Inhalte-Richtlinie"), + verification_methods=("document",)), + UseCase("urheberrecht", "Urheberrecht", "document", + regulations=("UrhG", "DSM-Urheberrechtsrichtlinie"), + verification_methods=("document",)), + UseCase("wettbewerbsrecht", "Wettbewerbsrecht (UWG)", "document", + regulations=("UWG",), verification_methods=("document",)), + UseCase("gleichbehandlung", "Gleichbehandlung (AGG)", "document", + regulations=("AGG",), + verification_methods=("document", "it_process")), + UseCase("steuerrecht", "Steuerrecht", "document", + regulations=("Abgabenordnung", "BAO"), + verification_methods=("document", "it_process")), + UseCase("handelsrecht", "Handelsrecht", "document", + regulations=("HGB", "UGB", "ABGB"), + verification_methods=("document", "it_process")), ) +# Deterministischer Quell-Regulierung → Use-Case-Mapper. +# control_parent_links.source_regulation (117 distinct) → ~30 Domaenen. +# Reihenfolge = SPEZIFISCH zuerst (Substring, case-insensitive); der erste +# Treffer gewinnt. Die Datenschutz-Catch-alls (edpb/dsk/dsgvo) ganz hinten, +# damit spezifische Regeln (z.B. 'DSK OH Telemedien') zuerst greifen koennen. +_REGULATION_RULES: tuple[tuple[str, str], ...] = ( + # Security / Code (Security-Team) + ("owasp", "code_security"), + ("nist sp 800-218", "code_security"), + ("ssdf", "code_security"), + ("nist sp 800-63", "code_security"), + ("nistir 8259", "code_security"), + ("cisa", "code_security"), + ("nist sp 800-207", "network_security"), + ("zero trust", "network_security"), + ("nis2", "network_security"), + ("nis-2", "network_security"), + ("enisa", "network_security"), + ("bsi-gesetz", "network_security"), + ("bsig", "network_security"), + ("cybersecurity act", "network_security"), + ("kritis", "kritis"), + ("nist cybersecurity framework", "isms"), + ("nist sp 800-53", "isms"), + ("digital operational resilience", "dora"), + ("dora", "dora"), + # Produkt / Sektor + ("cyber resilience act", "cra"), + ("(cra)", "cra"), + ("nist ai risk", "ai_act"), + ("ki-verordnung", "ai_act"), + ("ki-vo", "ai_act"), + ("ai act", "ai_act"), + ("oecd ki", "ai_act"), + ("crypto-assets", "mica"), + ("mica", "mica"), + ("medizinprodukte", "mdr"), + ("(mdr)", "mdr"), + ("maschinenverordnung", "maschinen"), + ("batterie", "batterie"), + ("health data space", "ehds"), + ("produktsicherheit", "produktsicherheit"), + ("blue guide", "produktsicherheit"), + ("digital services act", "dsa"), + ("digital markets act", "dma"), + ("data act", "data_governance"), + ("data governance", "data_governance"), + ("zahlungsdienste", "zahlungsdienste"), + ("geldwaesche", "geldwaesche"), + ("aml-verordnung", "geldwaesche"), + ("lieferkettensorgfalt", "lieferkette"), + ("lksg", "lieferkette"), + ("hinweisgeberschutz", "whistleblowing"), + ("hinschg", "whistleblowing"), + ("accessibility act", "barrierefreiheit"), + # Website / Telemedien / Recht (User-Domaene) + ("tdddg", "cookie_banner"), + ("eprivacy", "cookie_banner"), + ("telemedien", "impressum"), + ("telekommunikationsgesetz", "impressum"), + ("tkg", "impressum"), + ("tmg", "impressum"), + ("mediengesetz", "impressum"), + ("gewerbeordnung", "impressum"), + ("e-commerce", "agb"), + ("digitale-inhalte", "agb"), + ("konsumentenschutz", "verbraucherschutz"), + ("urheberrecht", "urheberrecht"), + ("urhg", "urheberrecht"), + ("uwg", "wettbewerbsrecht"), + ("handelsgesetzbuch", "handelsrecht"), + ("hgb", "handelsrecht"), + ("ugb", "handelsrecht"), + ("abgb", "handelsrecht"), + ("bgb", "agb"), + ("gleichbehandlung", "gleichbehandlung"), + ("(agg)", "gleichbehandlung"), + ("abgabenordnung", "steuerrecht"), + ("bao", "steuerrecht"), + ("standardvertragsklauseln", "avv"), + ("(scc)", "avv"), + # Datenschutz-Catch-alls (zuletzt) + ("nist privacy framework", "dse"), + ("dsgvo", "dse"), + ("datenschutzgesetz", "dse"), + ("bdsg", "dse"), + ("edpb", "dse"), + ("edps", "dse"), + ("dsk ", "dse"), + ("wp29", "dse"), + ("bfdi", "dse"), + ("data privacy framework", "dse"), + ("datenschutz", "dse"), +) + + +def use_case_for_regulation(regulation: str | None) -> str | None: + """Deterministisch: Quell-Regulierung → Domaenen-Use-Case (erster + Substring-Treffer). None wenn keine Regel passt (→ Fallback/Review).""" + if not regulation: + return None + low = regulation.lower() + for needle, uc in _REGULATION_RULES: + if needle in low: + return uc + return None + + REGISTRY: dict[str, UseCase] = {uc.key: uc for uc in _USE_CASES} diff --git a/backend-compliance/migrations/150_mc_primary_and_regulations.sql b/backend-compliance/migrations/150_mc_primary_and_regulations.sql new file mode 100644 index 00000000..4ad0bc38 --- /dev/null +++ b/backend-compliance/migrations/150_mc_primary_and_regulations.sql @@ -0,0 +1,40 @@ +-- Migration 150: Primaerzweck-Flag + MC <-> Quell-Regulierung (n:m) +-- Erweitert das Use-Case-Mapping um (a) is_primary (Primaerzweck pro MC) +-- und (b) mc_regulations (die feine Quell-Regulierungs-Filter-Dimension, +-- 117 Werte). Strikt add-only. [migration-approved] + +SET search_path TO compliance, public; + +DO $$ +BEGIN + IF EXISTS (SELECT 1 FROM information_schema.tables + WHERE table_schema='compliance' + AND table_name='mc_use_case_mappings') THEN + ALTER TABLE mc_use_case_mappings + ADD COLUMN IF NOT EXISTS is_primary BOOLEAN NOT NULL DEFAULT FALSE; + CREATE INDEX IF NOT EXISTS idx_mcuc_primary + ON mc_use_case_mappings(use_case) WHERE is_primary; + END IF; + + IF EXISTS (SELECT 1 FROM information_schema.tables + WHERE table_schema='compliance' + AND table_name='master_controls') THEN + CREATE TABLE IF NOT EXISTS mc_regulations ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + master_control_uuid UUID NOT NULL + REFERENCES master_controls(id) ON DELETE CASCADE, + master_control_id VARCHAR(60) NOT NULL, + source_regulation VARCHAR(160) NOT NULL, + is_primary BOOLEAN NOT NULL DEFAULT FALSE, + member_count INTEGER DEFAULT 0, + method VARCHAR(20) NOT NULL DEFAULT 'lineage' + CHECK (method IN ('lineage', 'manual')), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE (master_control_uuid, source_regulation) + ); + CREATE INDEX IF NOT EXISTS idx_mcreg_regulation + ON mc_regulations(source_regulation); + CREATE INDEX IF NOT EXISTS idx_mcreg_mc + ON mc_regulations(master_control_uuid); + END IF; +END $$; diff --git a/backend-compliance/scripts/classify_mc_use_cases.py b/backend-compliance/scripts/classify_mc_use_cases.py index 60421cdb..6b25e9aa 100644 --- a/backend-compliance/scripts/classify_mc_use_cases.py +++ b/backend-compliance/scripts/classify_mc_use_cases.py @@ -1,9 +1,12 @@ -"""Klassifiziert Master Controls auf Use Cases (n:m) + Verifikations-Methode. +"""Klassifiziert Master Controls deterministisch auf Use Cases (n:m) + +Quell-Regulierung (n:m) + Verifikations-Methode. -Stufe 1 — Seed (kein LLM, gratis): aus vorhandenen Member-Signalen - (canonical_controls.scope_doc_type / .category / .verification_method / - .evidence_type) via `use_case_registry.seed_classify`. -Stufe 2 — LLM (Phase 3): Multi-Label gegen die Registry-Taxonomie. [TODO] +DETERMINISTISCH (kein LLM): Die Zuordnung kommt aus der Quell-Regulierung +jedes Controls — Lineage master_controls -> master_control_members -> +control_parent_links.source_regulation. 117 Regulierungen -> Keyword-Mapper +(use_case_registry.use_case_for_regulation) -> ~30 Domaenen-Use-Cases. +Primaerzweck = dominante Quell-Regulierung (meiste Member); Mehrfachzwecke = +die weiteren. LLM-Stufe (spaeter) nur Fallback fuer MCs ohne source_regulation. Lauf im Container: docker exec bp-compliance-backend \ @@ -15,83 +18,131 @@ from __future__ import annotations import argparse import asyncio import os +import sys -import asyncpg +# /app auf den Pfad, damit `compliance...` als Standalone-Script importierbar +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from compliance.data import use_case_registry as reg +import asyncpg # noqa: E402 -_AGG_SQL = """ +from compliance.data import use_case_registry as reg # noqa: E402 + +_REG_SQL = """ SELECT mc.id AS mc_uuid, mc.master_control_id, - array_agg(DISTINCT cc.scope_doc_type) AS scopes, - array_agg(DISTINCT cc.category) AS categories, - array_agg(DISTINCT cc.verification_method) AS vmethods, - array_agg(DISTINCT cc.evidence_type) AS etypes + cpl.source_regulation AS reg, count(*) AS cnt FROM compliance.master_controls mc -JOIN compliance.master_control_members mcm - ON mcm.master_control_uuid = mc.id +JOIN compliance.master_control_members mcm ON mcm.master_control_uuid = mc.id +JOIN compliance.control_parent_links cpl ON cpl.control_uuid = mcm.control_uuid +WHERE cpl.source_regulation IS NOT NULL AND cpl.source_regulation <> '' +GROUP BY mc.id, mc.master_control_id, cpl.source_regulation +""" + +_VERIF_SQL = """ +SELECT mc.id AS mc_uuid, + array_agg(DISTINCT cc.verification_method) AS vmethods, + array_agg(DISTINCT cc.evidence_type) AS etypes +FROM compliance.master_controls mc +JOIN compliance.master_control_members mcm ON mcm.master_control_uuid = mc.id JOIN compliance.canonical_controls cc ON cc.id = mcm.control_uuid -GROUP BY mc.id, mc.master_control_id +GROUP BY mc.id """ async def run_seed(conn, limit: int = 0) -> dict: - """Deterministischer Seed → mc_use_case_mappings + mc_verification. - Idempotent (ON CONFLICT DO NOTHING); ueberschreibt 'manual' nie.""" - sql = _AGG_SQL + (f" LIMIT {limit}" if limit > 0 else "") - rows = await conn.fetch(sql) - n_mc_with_uc = n_uc_rows = n_verif = 0 - for r in rows: - ucs, method = reg.seed_classify( - r["scopes"], r["categories"], r["vmethods"], r["etypes"], - ) - for uc in ucs: + """Deterministischer Seed aus der Quell-Regulierung. Ersetzt die + bisherigen Seed-Zeilen; 'manual'-Korrekturen bleiben unangetastet.""" + await conn.execute( + "DELETE FROM compliance.mc_use_case_mappings WHERE method='seed'") + await conn.execute( + "DELETE FROM compliance.mc_verification WHERE method='seed'") + await conn.execute( + "DELETE FROM compliance.mc_regulations WHERE method='lineage'") + + by_mc: dict = {} + for r in await conn.fetch(_REG_SQL): + e = by_mc.setdefault( + r["mc_uuid"], + {"mc_id": r["master_control_id"], "regs": {}}) + e["regs"][r["reg"]] = r["cnt"] + if limit > 0: + by_mc = dict(list(by_mc.items())[:limit]) + + verif: dict = {} + for r in await conn.fetch(_VERIF_SQL): + _ucs, m = reg.seed_classify(vmethods=r["vmethods"], etypes=r["etypes"]) + if m: + verif[r["mc_uuid"]] = m + + n_reg = n_uc = n_mc = n_v = 0 + for mc_uuid, e in by_mc.items(): + mc_id, regs = e["mc_id"], e["regs"] + if not regs: + continue + primary_reg = max(regs, key=regs.get) + uc_primary = reg.use_case_for_regulation(primary_reg) + ucs: dict = {} + for rg, cnt in regs.items(): + await conn.execute( + """INSERT INTO compliance.mc_regulations + (master_control_uuid, master_control_id, source_regulation, + is_primary, member_count) + VALUES ($1,$2,$3,$4,$5) + ON CONFLICT (master_control_uuid, source_regulation) + DO NOTHING""", + mc_uuid, mc_id, rg[:160], rg == primary_reg, cnt) + n_reg += 1 + uc = reg.use_case_for_regulation(rg) + if uc: + ucs[uc] = ucs.get(uc, False) or (uc == uc_primary) + for uc, is_prim in ucs.items(): await conn.execute( """INSERT INTO compliance.mc_use_case_mappings (master_control_uuid, master_control_id, use_case, - method, confidence, rationale) - VALUES ($1,$2,$3,'seed',0.6,'deterministic seed') - ON CONFLICT (master_control_uuid, use_case) DO NOTHING""", - r["mc_uuid"], r["master_control_id"], uc, - ) - n_uc_rows += 1 - if ucs: - n_mc_with_uc += 1 - if method: + method, confidence, rationale, is_primary) + VALUES ($1,$2,$3,'seed',0.85,'source_regulation',$4) + ON CONFLICT (master_control_uuid, use_case) + DO UPDATE SET is_primary = + mc_use_case_mappings.is_primary OR EXCLUDED.is_primary + WHERE mc_use_case_mappings.method <> 'manual'""", + mc_uuid, mc_id, uc, is_prim) + n_uc += 1 + n_mc += 1 + m = verif.get(mc_uuid) + if m: await conn.execute( """INSERT INTO compliance.mc_verification (master_control_uuid, master_control_id, verification_method, method, confidence, rationale) - VALUES ($1,$2,$3,'seed',0.6,'deterministic seed') + VALUES ($1,$2,$3,'seed',0.7,'member evidence_type') ON CONFLICT (master_control_uuid) DO NOTHING""", - r["mc_uuid"], r["master_control_id"], method, - ) - n_verif += 1 + mc_uuid, mc_id, m) + n_v += 1 + total = await conn.fetchval( "SELECT count(*) FROM compliance.mc_use_case_mappings") await conn.execute( """INSERT INTO compliance.mc_use_case_sync_state (registry_hash, stage, total_mappings, mcs_classified) - VALUES ($1,'seed',$2,$3)""", - reg.registry_hash(), total, n_mc_with_uc, - ) - return {"mcs": len(rows), "mcs_with_use_case": n_mc_with_uc, - "use_case_rows": n_uc_rows, "verification_rows": n_verif} + VALUES ($1,'seed_source_regulation',$2,$3)""", + reg.registry_hash(), total, n_mc) + return {"mcs_mapped": n_mc, "regulation_rows": n_reg, + "use_case_rows": n_uc, "verification_rows": n_v} async def _main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--limit", type=int, default=0) ap.add_argument("--with-llm", action="store_true", - help="Phase 3 — noch nicht implementiert") + help="Fallback fuer MCs ohne source_regulation (TODO)") args = ap.parse_args() if args.with_llm: - raise SystemExit("LLM-Stufe (Phase 3) noch nicht implementiert.") + raise SystemExit("LLM-Fallback (Phase 3) noch nicht implementiert.") conn = await asyncpg.connect(os.getenv("DATABASE_URL")) try: stats = await run_seed(conn, args.limit) finally: await conn.close() - print("Seed fertig:", stats) + print("Seed (source_regulation) fertig:", stats) if __name__ == "__main__": diff --git a/backend-compliance/tests/test_use_case_registry.py b/backend-compliance/tests/test_use_case_registry.py index ba6815ed..09c05fe4 100644 --- a/backend-compliance/tests/test_use_case_registry.py +++ b/backend-compliance/tests/test_use_case_registry.py @@ -110,3 +110,38 @@ def test_seed_empty_and_none_safe(): vmethods=[None], etypes=[None]) assert ucs == [] and m is None assert reg.seed_classify() == ([], None) + + +# ── Deterministischer Regulierung→Use-Case-Mapper ─────────────────── + + +def test_regulation_mapper_known(): + cases = { + "OWASP Top 10 (2021)": "code_security", + "NIST SP 800-207 (Zero Trust)": "network_security", + "Cyber Resilience Act (CRA)": "cra", + "DSGVO (EU) 2016/679": "dse", + "EDPB Facial Recognition": "dse", # Leitlinie → Datenschutz + "TKG": "impressum", + "TDDDG": "cookie_banner", + "Markets in Crypto-Assets (MiCA)": "mica", + "BGB": "agb", + } + for reg_str, expected in cases.items(): + assert reg.use_case_for_regulation(reg_str) == expected, reg_str + + +def test_regulation_mapper_abgb_before_bgb(): + # 'ABGB' enthaelt 'bgb' — die abgb-Regel MUSS zuerst greifen. + assert reg.use_case_for_regulation("AT ABGB") == "handelsrecht" + + +def test_regulation_mapper_unknown_returns_none(): + assert reg.use_case_for_regulation("Voellig Unbekanntes Gesetz") is None + assert reg.use_case_for_regulation(None) is None + + +def test_all_regulation_rules_point_to_valid_use_cases(): + for _needle, uc in reg._REGULATION_RULES: + assert uc in reg.REGISTRY, uc + assert reg.REGISTRY[uc].enabled