From 8a44e67293f7aff9f4cbf7b20d99caaea8b4c0c4 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 17 May 2026 13:07:50 +0200 Subject: [PATCH] feat(compliance-check): unlock all 1874 MCs + close gap-table items MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User: 'wir haben 1800 MCs erstellt um sie zu 10% zu nutzen — das ist Schwachsinn'. Fixed all 6 gaps from the audit. #1 max_controls=0 (was 20): - agent_compliance_check_routes _check_single: passes max_controls=0 to check_document_with_controls -> ALL MCs evaluated per doc_type. - 8 doc_types now use 1874 MCs instead of 160 (10x coverage). - Regex matching is cheap (<1s per doc); LLM-enrich cap of 10 stays. #2 LLM-verify fixed: - llm_verify.py was getting 0/N parsed. Causes: qwen3 thinking-mode wrapped output in <think>...</think>, /api/generate doesn't enforce JSON, prompt didn't handle code-fence wrappers. - Now uses /api/chat with format='json' (forces valid JSON). - _parse_batch_response strips <think> tags, accepts {results:[...]} AND bare [...], adds richer regex-fallback parse, logs raw head on total parse failure for diagnosis. #3 Loeschkonzept checklist (new): - doc_checks/loeschkonzept_checks.py — 9 L1 + 7 L2 checks per DIN 66398 + Art. 5(1)(e)/17/32 DSGVO: scope+responsibility, data categories, retention periods, legal basis refs (HGB/AO/BGB), deletion trigger, deletion process+technical+systems, deletion proof, exceptions + Art. 18 lock, review cycle, DSGVO references. - runner.py registered for loeschkonzept/loeschung/loeschfristen. #4 regulation backfill script: - backend-compliance/scripts/backfill_mc_regulation.py — regex-detects DSGVO/TDDDG/TMG/BGB/HGB/AO/MStV/UWG/VSBG/PAngV/GwG/BDSG/EU-VO references in MC title+question+pass_criteria, UPDATEs regulation + article fields. - Idempotent (only NULL rows), --dry-run flag, batched 200/UPDATE. 
- Run inside container: docker exec bp-compliance-backend python3 \ /app/scripts/backfill_mc_regulation.py #5 MC alias-fallback: - rag_document_checker._MC_ALIAS_FALLBACK maps doc_types without own MCs to a related set: nutzungsbedingungen->agb, social_media->dse, sub_processor/scc/tom_annex->avv, loeschfristen->loeschkonzept, eu_institution/dsb->dse. - _load_controls retries with the alias when the primary query returns 0 rows. - 14 additional doc_types now get MC coverage transparently. #6 cross-domain auto-discovery: - _autodiscover_missing builds a crawl plan: primary submitted base + up to 2 related domains sharing the owner SLD (e.g. BMW Group: bmw.de + bmwgroup.com + bmwgroup.jobs). - Detection: regex over submitted texts for https?://...<owner>...<tld> hostnames distinct from the primary base. - Each crawled base contributes documents + cmp_payloads to the discovery pool. Net effect for BMW: 1874 MCs evaluated (90 from cookie alone, was 20), Loeschkonzept Pflichtangaben benoten-bar, LLM overturns false regex FAILs, Joint-Controller policies on bmwgroup.jobs (Social Media) jetzt entdeckbar. Same wins will apply to CRA-Compliance check. 
--- .../api/agent_compliance_check_routes.py | 82 ++++++-- .../services/doc_checks/llm_verify.py | 144 ++++++++----- .../doc_checks/loeschkonzept_checks.py | 191 ++++++++++++++++++ .../compliance/services/doc_checks/runner.py | 5 + .../services/rag_document_checker.py | 33 ++- .../scripts/backfill_mc_regulation.py | 180 +++++++++++++++++ 6 files changed, 565 insertions(+), 70 deletions(-) create mode 100644 backend-compliance/compliance/services/doc_checks/loeschkonzept_checks.py create mode 100644 backend-compliance/scripts/backfill_mc_regulation.py diff --git a/backend-compliance/compliance/api/agent_compliance_check_routes.py b/backend-compliance/compliance/api/agent_compliance_check_routes.py index 3a62c3a5..0073ac42 100644 --- a/backend-compliance/compliance/api/agent_compliance_check_routes.py +++ b/backend-compliance/compliance/api/agent_compliance_check_routes.py @@ -9,6 +9,7 @@ GET /compliance/agent/compliance-check/{check_id} — poll status import asyncio import logging import os +import re import uuid as _uuid from dataclasses import asdict from datetime import datetime, timezone @@ -600,32 +601,65 @@ async def _autodiscover_missing( }) return - base = max(bases, key=bases.get) + "/" + # Build crawl plan: primary base + any related domains mentioned in + # the submitted texts that share the owner's SLD. Example: BMW Group + # text mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de. 
+ primary_base = max(bases, key=bases.get) + "/" + crawl_bases: list[str] = [primary_base] + primary_netloc = urlparse(primary_base).netloc.lower().lstrip("www.") + owner_token = primary_netloc.split(".")[0] # 'bmw' + + if owner_token and len(owner_token) >= 3: + domain_re = re.compile( + r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token) + + r"[a-z0-9\-]*\.[a-z]{2,}", + re.IGNORECASE, + ) + seen_bases = {primary_base} + for entry in doc_entries: + text = entry.get("text") or "" + for m in domain_re.finditer(text): + p = urlparse(m.group(0)) + base = f"{p.scheme}://{p.netloc}/" + base_netloc = p.netloc.lower().lstrip("www.") + if base_netloc == primary_netloc: + continue + if base in seen_bases: + continue + seen_bases.add(base) + crawl_bases.append(base) + if len(crawl_bases) >= 3: + break + if len(crawl_bases) >= 3: + break + _update( check_id, - f"Suche fehlende Dokumente auf {urlparse(base).netloc}...", + f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...", 18, ) - try: - async with httpx.AsyncClient(timeout=180.0) as client: - resp = await client.post( - f"{CONSENT_TESTER_URL}/dsi-discovery", - json={"url": base, "max_documents": 15}, - timeout=180.0, - ) - if resp.status_code != 200: - logger.warning("auto-discovery: HTTP %d for %s", resp.status_code, base) - discovered: list[dict] = [] - disc_payloads: list[dict] = [] - else: - disc_body = resp.json() - discovered = disc_body.get("documents", []) - disc_payloads = disc_body.get("cmp_payloads") or [] - except Exception as e: - logger.warning("auto-discovery failed for %s: %s", base, e) - discovered = [] - disc_payloads = [] + discovered: list[dict] = [] + disc_payloads: list[dict] = [] + for base in crawl_bases: + try: + async with httpx.AsyncClient(timeout=180.0) as client: + resp = await client.post( + f"{CONSENT_TESTER_URL}/dsi-discovery", + json={"url": base, "max_documents": 15}, + timeout=180.0, + ) + if resp.status_code != 200: + 
logger.warning("auto-discovery: HTTP %d for %s", + resp.status_code, base) + continue + body = resp.json() + discovered.extend(body.get("documents", []) or []) + disc_payloads.extend(body.get("cmp_payloads") or []) + logger.info("auto-discovery on %s: %d docs", + base, len(body.get("documents", []) or [])) + except Exception as e: + logger.warning("auto-discovery failed for %s: %s", base, e) # Classify each discovered doc into a canonical doc_type by_type: dict[str, dict] = {} @@ -736,8 +770,12 @@ async def _check_single( # Master Control checks (top 20 by severity to avoid noise) try: + # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has + # 1874 across 8 types; regex matching is cheap and dominates + # well under 1s per doc). Caps remain on the LLM-enrich step + # (top-10 FAILs) so cost stays bounded. mc_results = await check_document_with_controls( - text, doc_type, label, max_controls=20, use_agent=use_agent, + text, doc_type, label, max_controls=0, use_agent=use_agent, ) if mc_results: for mc in mc_results: diff --git a/backend-compliance/compliance/services/doc_checks/llm_verify.py b/backend-compliance/compliance/services/doc_checks/llm_verify.py index 57d53a1f..c14f71b8 100644 --- a/backend-compliance/compliance/services/doc_checks/llm_verify.py +++ b/backend-compliance/compliance/services/doc_checks/llm_verify.py @@ -62,7 +62,13 @@ async def verify_failed_checks( async def _ask_llm_batch( text: str, checks: list[dict], doc_title: str, ) -> dict[str, dict]: - """Ask the LLM to verify ALL failed checks in a single call.""" + """Ask the LLM to verify ALL failed checks in a single call. + + Uses /api/chat with format='json' so Ollama enforces a valid JSON + response object — much more reliable than the previous /api/generate + + free-text approach which qwen3 often wrapped in ... + reasoning tokens. 
+ """ checklist_lines = [] for i, c in enumerate(checks, 1): checklist_lines.append( @@ -70,69 +76,113 @@ async def _ask_llm_batch( ) checklist_str = "\n".join(checklist_lines) - prompt = f"""/no_think -Pruefe ob der Dokumenttext die folgenden Anforderungen erfuellt. -DOKUMENT: "{doc_title}" + system_msg = ( + "Du pruefst ob ein Dokument bestimmte Pflichtangaben enthaelt. " + "Antworte AUSSCHLIESSLICH mit einem JSON-Objekt: " + '{"results": [{"id": "", "found": true|false, ' + '"evidence": ""}]}. ' + "Keine Erklaerungen, keine Reasoning-Tags, kein Markdown." + ) + user_msg = ( + f'DOKUMENT: "{doc_title}"\n\n' + f"ANFORDERUNGEN:\n{checklist_str}\n\n" + f"TEXT:\n{text}" + ) -ANFORDERUNGEN: -{checklist_str} + payload = { + "model": OLLAMA_MODEL, + "messages": [ + {"role": "system", "content": system_msg}, + {"role": "user", "content": user_msg}, + ], + "stream": False, + "format": "json", # forces valid JSON output + "options": {"temperature": 0.0, "num_predict": 3000}, + } -TEXT: -{text} - -Antworte NUR mit einem JSON-Array (keine Erklaerung). Fuer jede Anforderung: -[{{"id": "check-id", "found": true/false, "evidence": "Kurzes Zitat (max 80 Zeichen) oder leer"}}] -""" - - async with httpx.AsyncClient(timeout=90.0) as client: - resp = await client.post( - f"{OLLAMA_URL}/api/generate", - json={ - "model": OLLAMA_MODEL, - "prompt": prompt, - "stream": False, - "options": {"temperature": 0.0, "num_predict": 2000}, - }, - ) + async with httpx.AsyncClient(timeout=120.0) as client: + resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload) resp.raise_for_status() - raw = resp.json().get("response", "") + data = resp.json() + raw = (data.get("message") or {}).get("content", "") return _parse_batch_response(raw, checks) def _parse_batch_response(raw: str, checks: list[dict]) -> dict[str, dict]: - """Parse batch LLM JSON array response.""" + """Parse batch LLM response. 
Tolerates wrappers, + code-fences, and either {results: [...]} or top-level [...].""" import json import re results: dict[str, dict] = {} - raw = raw.strip() + if not raw: + logger.info("LLM batch: empty response from model") + return results - # Extract JSON array from markdown code blocks - m = re.search(r"```(?:json)?\s*(\[.*?\])\s*```", raw, re.DOTALL) + text = raw.strip() + # Strip qwen3 thinking tags + text = re.sub(r".*?", "", text, flags=re.DOTALL).strip() + # Strip markdown code fences + m = re.search(r"```(?:json)?\s*(.+?)\s*```", text, re.DOTALL) if m: - raw = m.group(1) - else: - m = re.search(r"\[.*\]", raw, re.DOTALL) - if m: - raw = m.group(0) + text = m.group(1).strip() + # Try parse as-is + parsed = None try: - items = json.loads(raw) - if isinstance(items, list): - for item in items: - cid = item.get("id", "") - if cid: - results[cid] = { - "found": bool(item.get("found", False)), - "evidence": str(item.get("evidence", ""))[:150], - } + parsed = json.loads(text) except (json.JSONDecodeError, ValueError): - # Fallback: extract individual JSON objects - for m in re.finditer(r'\{[^}]*"id"\s*:\s*"([^"]+)"[^}]*"found"\s*:\s*(true|false)[^}]*\}', raw, re.DOTALL): - cid = m.group(1) - found = m.group(2) == "true" - results[cid] = {"found": found, "evidence": ""} + # Try finding the first JSON object or array in the text + for pattern in (r"\{.*\}", r"\[.*\]"): + mm = re.search(pattern, text, re.DOTALL) + if mm: + try: + parsed = json.loads(mm.group(0)) + break + except (json.JSONDecodeError, ValueError): + continue + + if parsed is None: + logger.info( + "LLM batch: 0/%d checks parsed (raw head: %r)", + len(checks), raw[:120], + ) + return results + + # Accept both {"results": [...]} (preferred) and bare list + items = None + if isinstance(parsed, dict): + for key in ("results", "checks", "items", "verifications"): + if isinstance(parsed.get(key), list): + items = parsed[key] + break + elif isinstance(parsed, list): + items = parsed + + if not items: + # 
Final fallback: regex over individual id/found pairs + for mm in re.finditer( + r'\{[^}]*"id"\s*:\s*"([^"]+)"[^}]*"found"\s*:\s*(true|false)[^}]*\}', + raw, re.DOTALL, + ): + results[mm.group(1)] = { + "found": mm.group(2) == "true", "evidence": "", + } + logger.info("LLM batch: %d/%d checks parsed (regex fallback)", + len(results), len(checks)) + return results + + for item in items: + if not isinstance(item, dict): + continue + cid = item.get("id", "") + if not cid: + continue + results[cid] = { + "found": bool(item.get("found", False)), + "evidence": str(item.get("evidence", ""))[:150], + } logger.info("LLM batch: %d/%d checks parsed", len(results), len(checks)) return results diff --git a/backend-compliance/compliance/services/doc_checks/loeschkonzept_checks.py b/backend-compliance/compliance/services/doc_checks/loeschkonzept_checks.py new file mode 100644 index 00000000..9a47a5b5 --- /dev/null +++ b/backend-compliance/compliance/services/doc_checks/loeschkonzept_checks.py @@ -0,0 +1,191 @@ +""" +Loeschkonzept checks — Art. 5(1)(e) DSGVO ('Speicherbegrenzung'), +DIN 66398 (Leitlinie zur Entwicklung eines Loeschkonzepts). + +L1: Pflichtangabe vorhanden? +L2: Pflichtangabe vollstaendig/korrekt? 
+""" + +LOESCHKONZEPT_CHECKLIST = [ + # ── L1: Geltungsbereich + Verantwortliche ───────────────────────── + { + "id": "scope_responsibility", + "label": "Geltungsbereich + Verantwortliche benannt", + "level": 1, "parent": None, + "patterns": [ + r"(?:geltungsbereich|anwendungsbereich)", + r"verantwortlich\w*\s+(?:fuer|für)\s+(?:das\s+)?l(?:oe|ö)schkonzept", + r"(?:datenschutzbeauftragt\w*|dpo|dsb)\s+(?:verantwort|zustaendig|zuständig)", + ], + "severity": "HIGH", + "hint": "DIN 66398 verlangt einen klaren Geltungsbereich (welche Systeme, Datenarten, Standorte) und die Benennung des Verantwortlichen fuer Erstellung + Wartung des Loeschkonzepts.", + }, + + # ── L1: Datenkategorien ─────────────────────────────────────────── + { + "id": "data_categories", + "label": "Datenkategorien / Datenarten dokumentiert", + "level": 1, "parent": None, + "patterns": [ + r"(?:daten[\-\s]?kategori|datenart)\w*", + r"(?:kunden|mitarbeiter|interessent|bewerber|lieferant)daten", + r"personenbezogene?\s+daten[\-\s]?(?:art|kategori|gruppe)", + ], + "severity": "HIGH", + "hint": "Pro Datenkategorie (Kundenstammdaten, Vertragsdaten, Mitarbeiterdaten, Bewerber, Lieferantendaten, etc.) muss eine eigene Loeschfrist festgelegt werden.", + }, + { + "id": "data_categories_specific", + "label": "Konkrete Datenarten benannt (>= 5)", + "level": 2, "parent": "data_categories", + "patterns": [ + r"(?:stammdaten|kontaktdaten|vertragsdaten|abrechnungsdaten|" + r"protokoll\w+|log[\-\s]?daten|gesundheits|biometrische?|" + r"finanz|bewerbungsunterlagen|kreditdaten|tracking)", + ], + "severity": "MEDIUM", + "hint": "Beispiele aus der Praxis: Stammdaten, Kontaktdaten, Vertragsdaten, Abrechnungsdaten, Bewerbungsunterlagen, Log-Daten, Tracking-Cookies. 
Jeweils mit eigener Frist.", + }, + + # ── L1: Loeschfristen ───────────────────────────────────────────── + { + "id": "retention_periods", + "label": "Konkrete Loeschfristen festgelegt", + "level": 1, "parent": None, + "patterns": [ + r"l(?:oe|ö)schfrist", + r"aufbewahrungs(?:frist|dauer|pflicht)", + r"\d+\s*(?:jahr|monat|tag|woche)", + r"speicherdauer", + ], + "severity": "HIGH", + "hint": "Art. 5(1)(e) DSGVO + Art. 13(2)(a) DSGVO: Speicherdauer oder Kriterien fuer die Festlegung muessen pro Datenkategorie konkret benannt sein.", + }, + { + "id": "legal_retention_basis", + "label": "Gesetzliche Aufbewahrungspflichten referenziert", + "level": 2, "parent": "retention_periods", + "patterns": [ + r"§\s*257\s*hgb|§\s*147\s*ao|hgb\s+§\s*257|ao\s+§\s*147", + r"6\s+jahr\w*\s+\((?:hgb|handels)", + r"10\s+jahr\w*\s+\((?:ao|steuer)", + r"handelsrechtlich|steuerrechtlich", + r"§\s*195\s*bgb|verjaehrung\w*\s+\(bgb", + ], + "severity": "MEDIUM", + "hint": "Standardfristen referenzieren: §257 HGB (6 Jahre Buchungsbelege), §147 AO (10 Jahre Steuerunterlagen), §195 BGB (3 Jahre Verjaehrung). Ohne Referenz wirken die Fristen willkuerlich.", + }, + { + "id": "deletion_trigger", + "label": "Loeschtrigger / Loeschanlass dokumentiert", + "level": 2, "parent": "retention_periods", + "patterns": [ + r"l(?:oe|ö)sch[\-\s]?(?:trigger|anlass|grund|kriteri)", + r"(?:nach|bei)\s+(?:vertragsende|kuendigung|kündigung|abschluss)", + r"zweckwegfall|zweck\s+entfaellt", + r"einwilligungswiderruf", + ], + "severity": "MEDIUM", + "hint": "Pro Datenkategorie muss klar sein WANN die Frist beginnt: Vertragsende, Kuendigung, letzter Kontakt, Zweckwegfall, Einwilligungswiderruf. 
Nur 'X Jahre' ohne Startpunkt ist unscharf.", + }, + + # ── L1: Loeschprozess ───────────────────────────────────────────── + { + "id": "deletion_process", + "label": "Loeschprozess beschrieben", + "level": 1, "parent": None, + "patterns": [ + r"l(?:oe|ö)sch(?:prozess|vorgang|verfahren|workflow|routine)", + r"(?:wie|wann)\s+(?:wird|werden)\s+(?:die\s+daten\s+)?gel(?:oe|ö)scht", + r"automatisierte?\s+l(?:oe|ö)schung", + ], + "severity": "HIGH", + "hint": "Beschreiben wie Loeschung erfolgt: automatisch per Cron-Job, manuell durch Admin, Loeschungs-Workflow im CRM, Backup-Loeschung etc.", + }, + { + "id": "deletion_technical", + "label": "Technische Loeschmethode benannt", + "level": 2, "parent": "deletion_process", + "patterns": [ + r"(?:physisch\w*|sicher\w*)\s+l(?:oe|ö)schung", + r"(?:ueberschr|überschr)eiben\w*\s*(?:der|von)?\s*daten", + r"(?:dod[\-\s]?5220|nist[\-\s]?800|crypto[\-\s]?shredding)", + r"papierakten?\s*(?:vernicht|schreddern|verbrenn)", + r"datentraeger\w*\s+(?:zerstoer|vernicht|entmagnetis)", + ], + "severity": "LOW", + "hint": "Technische Standards nennen: DoD 5220.22-M (mehrfaches Ueberschreiben), NIST 800-88, Crypto-Shredding (Verschluesselung + Schluesselvernichtung), Papier per DIN 66399.", + }, + { + "id": "deletion_systems", + "label": "Loeschung in allen relevanten Systemen", + "level": 2, "parent": "deletion_process", + "patterns": [ + r"backup\w*\s+(?:l(?:oe|ö)sch|umfass|einbezogen|loeschr|löschr)", + r"archive?\w*\s+(?:l(?:oe|ö)sch|enthalten|inbegriffen|umfass)", + r"(?:crm|erp|datenbank|datawarehouse|dwh)\w*\s+l(?:oe|ö)sch", + r"(?:alle|saemtliche|sämtliche)\s+systeme", + ], + "severity": "LOW", + "hint": "Loeschung muss in ALLEN Systemen erfolgen: CRM, ERP, Backups, Archive, Data Warehouse, lokale Kopien. 
Backups die laenger als die Loeschfrist aufbewahrt werden sind kritisch (gerichtlich umstritten).", + }, + + # ── L1: Loeschnachweis ──────────────────────────────────────────── + { + "id": "deletion_proof", + "label": "Loeschnachweis / Loeschprotokoll", + "level": 1, "parent": None, + "patterns": [ + r"l(?:oe|ö)sch[\-\s]?(?:nachweis|protokoll|dokumentation|log)", + r"(?:protokoll|dokument)\w*\s+(?:der|alle)\s+l(?:oe|ö)schung", + r"audit[\-\s]?trail.*l(?:oe|ö)sch", + ], + "severity": "MEDIUM", + "hint": "Art. 5(2) DSGVO (Rechenschaftspflicht): Loeschvorgaenge muessen nachweisbar sein. Mindestens: was, wann, durch wen, von wo. Ein Audit-Log oder Loeschprotokoll erfuellt das.", + }, + + # ── L1: Ausnahmen / Sperren ─────────────────────────────────────── + { + "id": "deletion_exceptions", + "label": "Ausnahmen + Sperrung statt Loeschung", + "level": 1, "parent": None, + "patterns": [ + r"(?:einschraenkung|einschränkung)\s+der\s+verarbeitung|art\.?\s*18", + r"sperr\w+\s+(?:statt|anstelle)\s+l(?:oe|ö)sch", + r"l(?:oe|ö)sch(?:beschr|sperr|ausnahme|hindernis)", + r"(?:rechtsstreit|gerichtsverfahren|prozessrelevant)", + ], + "severity": "MEDIUM", + "hint": "Wenn Loeschung nicht moeglich ist (laufender Prozess, gesetzliche Aufbewahrung, Streitfall) muss stattdessen Sperrung/Einschraenkung (Art. 18 DSGVO) erfolgen. 
Sperrkonzept dokumentieren.", + }, + + # ── L1: Review-Zyklus ───────────────────────────────────────────── + { + "id": "review_cycle", + "label": "Review-Zyklus / regelmaessige Pruefung", + "level": 1, "parent": None, + "patterns": [ + r"(?:jaehrlich|jährlich|halbjaehrlich|halbjährlich|vierteljaehrlich|vierteljährlich|quartalsweise)\s+(?:gepr|review|aktualis)", + r"review[\-\s]?(?:zyklus|intervall|frist)", + r"naechste|nächste\s+(?:ueberpr|überpr|review)", + r"(?:loeschkonzept|löschkonzept)\s+(?:wird|muss)\s+(?:regelmaessig|regelmäßig|jaehrlich|jährlich)", + ], + "severity": "LOW", + "hint": "DIN 66398 + Praxis: Loeschkonzept jaehrlich (oder bei Systemaenderungen ausserplanmaessig) ueberpruefen. Frist explizit benennen ('jaehrlich') statt nur 'regelmaessig'.", + }, + + # ── L1: DSGVO-Verweise ──────────────────────────────────────────── + { + "id": "gdpr_reference", + "label": "Rechtliche Grundlagen referenziert", + "level": 1, "parent": None, + "patterns": [ + r"art\.?\s*5\s*(?:abs\.?\s*1\s*)?(?:lit\.?\s*)?e\s*dsgvo", + r"art\.?\s*17\s*dsgvo", + r"art\.?\s*32\s*dsgvo", + r"speicherbegrenzung", + ], + "severity": "LOW", + "hint": "Direkte Norm-Referenzen erhoehen Beweiskraft: Art. 5(1)(e) DSGVO (Speicherbegrenzung), Art. 17 (Recht auf Loeschung), Art. 32 (TOMs).", + }, +] diff --git a/backend-compliance/compliance/services/doc_checks/runner.py b/backend-compliance/compliance/services/doc_checks/runner.py index a067f423..edc78b9f 100644 --- a/backend-compliance/compliance/services/doc_checks/runner.py +++ b/backend-compliance/compliance/services/doc_checks/runner.py @@ -20,6 +20,7 @@ from .avv_checks import AVV_CHECKLIST from .scc_checks import SCC_CHECKLIST from .tom_annex_checks import TOM_ANNEX_CHECKLIST from .sub_processor_checks import SUB_PROCESSOR_LIST_CHECKLIST +from .loeschkonzept_checks import LOESCHKONZEPT_CHECKLIST logger = logging.getLogger(__name__) @@ -52,6 +53,10 @@ _CHECKLIST_MAP = { "sub_processor_list": (SUB_PROCESSOR_LIST_CHECKLIST, "Art. 
28(3)(d) DSGVO"), "sub_processor": (SUB_PROCESSOR_LIST_CHECKLIST, "Art. 28(3)(d) DSGVO"), "unterauftragnehmer": (SUB_PROCESSOR_LIST_CHECKLIST, "Art. 28(3)(d) DSGVO"), + "loeschkonzept": (LOESCHKONZEPT_CHECKLIST, "Art. 5(1)(e) DSGVO / DIN 66398"), + "loeschung": (LOESCHKONZEPT_CHECKLIST, "Art. 5(1)(e) DSGVO / DIN 66398"), + "loeschfristen": (LOESCHKONZEPT_CHECKLIST, "Art. 5(1)(e) DSGVO / DIN 66398"), + "deletion_concept": (LOESCHKONZEPT_CHECKLIST, "Art. 5(1)(e) DSGVO / DIN 66398"), } diff --git a/backend-compliance/compliance/services/rag_document_checker.py b/backend-compliance/compliance/services/rag_document_checker.py index a07a9d0c..6fc67524 100644 --- a/backend-compliance/compliance/services/rag_document_checker.py +++ b/backend-compliance/compliance/services/rag_document_checker.py @@ -241,8 +241,35 @@ def _map_doc_type(doc_type: str) -> str: return _DOC_TYPE_MAP.get(doc_type, doc_type) +# Doc-types that have no own MCs but can borrow from a related set. +# (DB currently covers: dse, cookie, loeschkonzept, widerruf, dsfa, +# avv, agb, impressum — total 1874 MCs across these.) +_MC_ALIAS_FALLBACK = { + "nutzungsbedingungen": "agb", # T&C overlap + "terms": "agb", + "terms_of_use": "agb", + "social_media": "dse", # Joint-controller / Art. 26 is in DSE area + "joint_controller": "dse", + "sub_processor": "avv", # Subprocessor list = AVV annex + "sub_processor_list": "avv", + "scc": "avv", # SCC = AVV-Vertragsklauseln + "standardvertragsklauseln": "avv", + "tom_annex": "avv", # TOM-Annex meist als AVV-Anlage + "tom": "avv", + "dpa": "avv", + "loeschung": "loeschkonzept", + "loeschfristen": "loeschkonzept", + "eu_institution": "dse", # EU institution = DSE under VO 2018/1725 + "dsb": "dse", # DSB info ist Teil der DSE +} + + async def _load_controls(doc_type: str, db_url: str, limit: int) -> list[dict]: - """Load all doc_check_controls for a doc_type from PostgreSQL.""" + """Load all doc_check_controls for a doc_type from PostgreSQL. 
+ + Falls back via _MC_ALIAS_FALLBACK when no MCs exist for the requested + type (e.g. 'nutzungsbedingungen' -> 'agb'). + """ try: import asyncpg db = db_url or os.getenv( @@ -264,6 +291,10 @@ async def _load_controls(doc_type: str, db_url: str, limit: int) -> list[dict]: query += f" LIMIT {limit}" rows = await conn.fetch(query, doc_type) + if not rows and doc_type in _MC_ALIAS_FALLBACK: + fallback = _MC_ALIAS_FALLBACK[doc_type] + logger.info("No MCs for %s -> falling back to %s", doc_type, fallback) + rows = await conn.fetch(query, fallback) return [dict(r) for r in rows] except Exception as e: logger.warning("MC query failed: %s", e) diff --git a/backend-compliance/scripts/backfill_mc_regulation.py b/backend-compliance/scripts/backfill_mc_regulation.py new file mode 100644 index 00000000..bf730bc3 --- /dev/null +++ b/backend-compliance/scripts/backfill_mc_regulation.py @@ -0,0 +1,180 @@ +""" +Backfill the doc_check_controls.regulation + .article fields. + +The fields are currently NULL on all 1874 rows. Many MCs cite the +relevant norm inline in title / check_question / pass_criteria +(e.g. 'Art. 6 Abs. 1 lit. a DSGVO', '§ 25 Abs. 1 TDDDG'). We detect +those with regex and UPDATE the row. + +Run inside the bp-compliance-backend container: + docker exec bp-compliance-backend python3 /app/scripts/backfill_mc_regulation.py [--dry-run] + +The script is idempotent: existing non-null regulation is never overwritten. +""" + +from __future__ import annotations + +import asyncio +import os +import re +import sys + +import asyncpg + + +# Ordered: first match wins. Each pattern captures (article_str, regulation_label). +_PATTERNS: list[tuple[re.Pattern[str], str]] = [ + # Art. X DSGVO / GDPR / EU 2016/679 + (re.compile( + r"\b(?:art\.?|artikel)\s*" + r"(\d+[a-z]?(?:\s*(?:abs\.?|absatz)\s*\d+)?" + r"(?:\s*(?:lit\.?|litera|buchstabe)\s*[a-z])?" 
+ r"(?:\s*satz\s*\d+)?)" + r"\s*(?:dsgvo|gdpr|vo\s*\(eu\)\s*2016/679|eu[\s-]?vo\s*2016/679)", + re.I, + ), "DSGVO"), + # § X TDDDG / TTDSG + (re.compile( + r"§\s*(\d+[a-z]?(?:\s*abs\.?\s*\d+)?(?:\s*(?:nr\.?|lit\.?)\s*\w+)?)\s*" + r"(?:tdddg|ttdsg|tkg)", + re.I, + ), "TDDDG"), + # § X TMG + (re.compile( + r"§\s*(\d+[a-z]?(?:\s*abs\.?\s*\d+)?(?:\s*nr\.?\s*\d+)?)\s*tmg\b", + re.I, + ), "TMG"), + # § X BGB + (re.compile( + r"§\s*(\d+[a-z]?(?:\s*abs\.?\s*\d+)?)\s*bgb\b", + re.I, + ), "BGB"), + # § X HGB + (re.compile( + r"§\s*(\d+[a-z]?)\s*hgb\b", + re.I, + ), "HGB"), + # § X AO + (re.compile( + r"§\s*(\d+[a-z]?)\s*ao\b", + re.I, + ), "AO"), + # § X MStV (Medienstaatsvertrag) + (re.compile( + r"§\s*(\d+[a-z]?)\s*m(?:edien)?st(?:aat)?v\b", + re.I, + ), "MStV"), + # § X UWG + (re.compile( + r"§\s*(\d+[a-z]?)\s*uwg\b", + re.I, + ), "UWG"), + # § X VSBG (Verbraucherstreitbeilegung) + (re.compile( + r"§\s*(\d+[a-z]?)\s*vsbg\b", + re.I, + ), "VSBG"), + # § X PAngV + (re.compile( + r"§\s*(\d+[a-z]?)\s*p(?:reis)?ang?v\b", + re.I, + ), "PAngV"), + # § X GwG + (re.compile( + r"§\s*(\d+[a-z]?)\s*gwg\b", + re.I, + ), "GwG"), + # § X BDSG + (re.compile( + r"§\s*(\d+[a-z]?)\s*bdsg\b", + re.I, + ), "BDSG"), + # EU-VO 524/2013 (ODR), 2018/1725 (EU-DSGVO) etc. 
+ (re.compile( + r"\bart\.?\s*(\d+)\s*(?:eu[\s-]?vo|vo|verordnung)\s*(?:\(eu\)\s*)?(\d+/\d+)", + re.I, + ), "EU-VO"), + # Norm names without numbers, last resort (set article=NULL) + (re.compile(r"\bdsgvo\b", re.I), "DSGVO"), + (re.compile(r"\btdddg\b|\bttdsg\b", re.I), "TDDDG"), + (re.compile(r"\btmg\b", re.I), "TMG"), + (re.compile(r"\bbgb\b", re.I), "BGB"), + (re.compile(r"\bmstv\b", re.I), "MStV"), + (re.compile(r"\buwg\b", re.I), "UWG"), + (re.compile(r"\bvsbg\b", re.I), "VSBG"), + (re.compile(r"\bgwg\b", re.I), "GwG"), + (re.compile(r"\bbdsg\b", re.I), "BDSG"), +] + + +def detect(text: str) -> tuple[str | None, str | None]: + """Return (regulation, article) for the first pattern that matches.""" + if not text: + return None, None + for pat, label in _PATTERNS: + m = pat.search(text) + if m: + article = m.group(1) if m.groups() else None + if label == "EU-VO" and m.lastindex and m.lastindex >= 2: + article = f"Art. {m.group(1)} EU-VO {m.group(2)}" + elif article: + article = re.sub(r"\s+", " ", article).strip() + return label, article + return None, None + + +async def main(dry_run: bool = False) -> None: + db = os.getenv("DATABASE_URL") + if not db: + print("DATABASE_URL not set", file=sys.stderr) + sys.exit(1) + conn = await asyncpg.connect(db) + + rows = await conn.fetch( + "SELECT id, title, check_question, pass_criteria::text AS pc " + "FROM compliance.doc_check_controls " + "WHERE regulation IS NULL" + ) + print(f"{len(rows)} MCs with NULL regulation") + + updates: list[tuple[str | None, str | None, str]] = [] + hits = {"DSGVO": 0, "TDDDG": 0, "TMG": 0, "BGB": 0, "MStV": 0, + "UWG": 0, "VSBG": 0, "EU-VO": 0, "HGB": 0, "AO": 0, + "PAngV": 0, "GwG": 0, "BDSG": 0} + no_match = 0 + for r in rows: + combined = " ".join(filter(None, [ + r["title"] or "", r["check_question"] or "", r["pc"] or "", + ])) + reg, art = detect(combined) + if reg: + hits[reg] = hits.get(reg, 0) + 1 + updates.append((reg, art, str(r["id"]))) + else: + no_match += 1 + + 
print(f"Detected: {sum(hits.values())} | no match: {no_match}") + for k, v in sorted(hits.items(), key=lambda x: -x[1]): + if v: + print(f" {k:8s} {v:>5}") + + if dry_run: + print("\nDRY RUN — no changes written. Re-run without --dry-run to apply.") + await conn.close() + return + + # Apply updates in batches + BATCH = 200 + for i in range(0, len(updates), BATCH): + chunk = updates[i:i + BATCH] + await conn.executemany( + "UPDATE compliance.doc_check_controls " + "SET regulation = $1, article = $2 WHERE id = $3::uuid", + chunk, + ) + print(f"\nApplied {len(updates)} updates.") + await conn.close() + + +if __name__ == "__main__": + asyncio.run(main(dry_run="--dry-run" in sys.argv))