feat(audit-pipeline): P72 MC-Scope-Classifier + P80 Snapshot/Replay-Foundation [migration-approved]
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
P72 MC-Scope-Classifier — pro MC den ECHTEN Doc-Adressaten festlegen
(cookie_richtlinie/dse/banner_implementation/cmp_audit/tom/avv/jc/
impressum/agb/widerruf/process/accounting/other).
- Migration 145: scope_doc_type Spalte + Index auf canonical_controls
- Backfill-Script mit Regex-Heuristik (12 Regeln, Prioritaet-sortiert)
- Erste 11k-Sample-Distribution: 76% other (Heuristik v1 zu strikt —
v2 braucht lockerere Patterns fuer DSE/TOM)
- Ziel: bevor MC-Scorecard filtert, weiss jeder MC welches Dokument
er adressiert. Bisher landeten eHealth-/HGB-MCs im Cookie-Audit.
P80 Snapshot + Replay-Foundation — Roh-Daten persistieren damit
Audit-Pipeline ohne erneuten Crawl rebuildbar ist.
- Migration 146: compliance_check_snapshots Tabelle (JSONB pro
doc_entries/banner_result/profile/cmp_vendors/scan_context)
- services.check_snapshot.save_snapshot/load_snapshot/list
- Endpoints GET /snapshots, GET /snapshots/{id}
- Hook in _run_compliance_check: nach Mail-Send automatischer
Snapshot-Save via separater SessionLocal (background-task safe)
- Replay-Endpoint folgt im naechsten PR (braucht Refactoring
von _run_compliance_check in crawl_phase + interpret_phase)
- Effekt: Test-Cycle 7min -> 5sec bei reinen Logik-Aenderungen
(P73/P79/P81+ profitieren direkt). Snapshots dienen auch als
Regression-Test-Corpus (P81 Golden-Truth-Library).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
"""P72 — Backfill scope_doc_type fuer compliance.canonical_controls.
|
||||
|
||||
Heuristik: pro MC schauen Title/Objective/Tags/verification_method an und
|
||||
klassifizieren nach dem ECHTEN Adressaten. Default: 'other'.
|
||||
|
||||
Ziel: 60-80% der heutigen MC-HIGH-Noise verschwindet aus Cookie/DSE-Audit
|
||||
und landet beim richtigen Adressaten (Impressum, AGB, TOM, Banner-Impl).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from typing import Pattern
|
||||
|
||||
import psycopg2
|
||||
|
||||
# ---------------------------------------------------------------------------
# Classification rules (order = priority, the first matching rule wins).
# Tuple: (scope_doc_type, compiled regex).
#
# Patterns are written in lower case and use ASCII transliterations of German
# umlauts (ae/oe/ue), so they must be applied to lower-cased text.
#
# Conventions used below:
#   * _BOUNDARY replaces a plain leading r"\b" in every rule that has a
#     "§ ..." alternative. r"\b§" can only match when a word character sits
#     directly in front of "§", so "nach § 5 tmg" would never be found.
#   * Short acronyms (adv, avv, dpa, tom, agb) carry a trailing r"\b" (with an
#     optional plural "s") so that e.g. "advanced" or "tomcat" do not match.
#   * Article numbers carry _NO_DIGIT so that "art. 130" is not read as
#     "art. 13".
# ---------------------------------------------------------------------------
_BOUNDARY = r"(?:\b|(?=§))"
_NO_DIGIT = r"(?!\d)"

RULES: list[tuple[str, Pattern[str]]] = [
    # Banner implementation (UI, not text) — highest priority
    (
        "banner_implementation",
        re.compile(
            r"\b(banner|cookie[-\s]?wall|pre[-\s]?ticked|"
            r"vorausgewaehlt(e|en)?\s*checkbox|"
            r"browser[-\s]?(default|standard|einstellung)|"
            r"dark[-\s]?pattern|reject.{0,20}button|ablehn.{0,20}button|"
            r"einwilligung.{0,30}aktive.{0,20}handlung|"
            r"floating.{0,20}icon)"
        ),
    ),
    # CMP audit trail
    (
        "cmp_audit",
        re.compile(
            r"\b(consent[-\s]?(log|trail|audit)|"
            r"konsent[-\s]?trag(er|er-?id)|"
            r"einwilligungs(nachweis|log|trail|protokoll)|"
            r"datensaetze?.{0,30}einwilligung|"
            r"zeitstempel.{0,30}einwilligung|"
            r"cmp[-\s]?audit)"
        ),
    ),
    # AVV / data processing agreement (Art. 28)
    (
        "avv",
        re.compile(
            r"\b(art\.?\s*28" + _NO_DIGIT + r"|auftragsverarbeit|"
            r"(adv|avv)s?\b|"
            r"data[-\s]?processing[-\s]?agreement|dpas?\b)"
        ),
    ),
    # JC / joint controllership (Art. 26)
    (
        "jc",
        re.compile(
            r"\b(art\.?\s*26" + _NO_DIGIT + r"|joint[-\s]?controller|"
            r"gemeinsam(e|er)\s*verantwortlich|"
            r"konzern.{0,40}(verantwortlich|verarbeit))"
        ),
    ),
    # Impressum / legal notice (§5 TMG / §18 MStV)
    (
        "impressum",
        re.compile(
            _BOUNDARY
            + r"((paragraph|§)\s*5\s*(tmg|ddg)|"
            r"§\s*18\s*mstv|"
            r"impressum|anbieterkennzeichnung|"
            r"geschaeftsbrief|firma.{0,20}kaufmann|"
            r"vollstaendige.{0,20}geschaeftsadresse|"
            r"identitaet.{0,20}leistungserbringer|"
            r"postalische?.{0,30}adresse)"
        ),
    ),
    # AGB / general terms and conditions
    (
        "agb",
        re.compile(
            _BOUNDARY
            + r"(agbs?\b|allgemeine\s*geschaeftsbedingungen|"
            r"vertragsbedingungen|"
            r"§\s*305.{0,5}(bgb)?)"
        ),
    ),
    # Widerruf / right of withdrawal
    (
        "widerruf",
        re.compile(
            r"\b(widerrufsbelehrung|widerrufsrecht|"
            r"14.{0,10}tage.{0,10}frist|"
            r"musterwiderruf)"
        ),
    ),
    # Accounting (UStG / invoicing — NOT part of the compliance audit)
    (
        "accounting",
        re.compile(
            _BOUNDARY
            + r"((rechnung|invoice).{0,30}(angeben|enthalten|fuehren)|"
            r"§\s*14\s*ustg|"
            # Accepts "umsatzsteuer-identifikationsnummer" as well as the
            # originally required "umsatzsteueridentifikation nummer".
            r"umsatzsteuer[-\s]?identifikations?[-\s]*nummer.{0,30}rechnung|"
            r"buchhaltung|"
            r"steuernummer.{0,30}rechnung)"
        ),
    ),
    # TOM (Art. 32 + technical security)
    (
        "tom",
        re.compile(
            r"\b(art\.?\s*32" + _NO_DIGIT + r"|verschluesselung|backup|"
            r"pseudonymisier|anonymisier|"
            r"zugriffskontrolle|berechtigungskonzept|"
            r"penetrationstest|security[-\s]?incident|"
            r"intrusion[-\s]?detection|firewall|"
            r"toms?\b|technisch[-\s]?organisatorische)"
        ),
    ),
    # Cookie policy (checked before DSE because it is narrower)
    (
        "cookie_richtlinie",
        re.compile(
            _BOUNDARY
            + r"(cookie[-\s]?richtlinie|cookie[-\s]?(policy|liste|tabelle)|"
            r"§\s*25\s*(tddg|tdddg|ttdsg)|"
            r"cookie.{0,30}(zweck|speicherdauer|drittland|anbieter))"
        ),
    ),
    # DSE / privacy notice (Art. 13/14 — broad, therefore late)
    (
        "dse",
        re.compile(
            r"\b(art\.?\s*1[34]" + _NO_DIGIT + r"|datenschutzerklaerung|"
            r"datenschutzhinweis|datenschutzinformation|"
            r"informationspflicht|"
            r"empfaenger(\s*oder\s*empfaengerkategorien)?|"
            r"drittland.{0,30}(transfer|uebermittlung)|"
            r"verantwortlich(er|en)\s+benennen|"
            r"betroffenenrecht|"
            r"art\.?\s*1[5-9]" + _NO_DIGIT + r"|"
            r"art\.?\s*2[0-2]" + _NO_DIGIT + r")"
        ),
    ),
    # Process (not text-based; cannot be fulfilled by inserting text)
    (
        "process",
        re.compile(
            r"\b(prozess|verfahren|workflow|"
            r"durchfuehren|umsetzen|implementieren|"
            r"schulung|mitarbeiterunterweis|"
            r"regelmaessig.{0,30}pruefen|"
            r"kontinuierlich|laufend|"
            r"datenpannenmeldung|art\.?\s*33" + _NO_DIGIT + r"|"
            r"loeschkonzept.{0,30}umsetz)"
        ),
    ),
]
|
||||
|
||||
|
||||
def classify(title: str, objective: str, tags: str | None = None) -> str:
|
||||
"""Apply rules in order, return first match. Default: 'other'."""
|
||||
text = " ".join(
|
||||
s.lower() for s in (title or "", objective or "", tags or "") if s
|
||||
)
|
||||
for scope, pattern in RULES:
|
||||
if pattern.search(text):
|
||||
return scope
|
||||
return "other"
|
||||
|
||||
|
||||
_UPDATE_SCOPE_SQL = (
    "UPDATE compliance.canonical_controls SET scope_doc_type=%s WHERE id=%s"
)
_BATCH_SIZE = 1000


def _flush_batch(conn, cur, batch) -> None:
    """Write one batch of (scope, id) pairs and commit it."""
    cur.executemany(_UPDATE_SCOPE_SQL, batch)
    conn.commit()


def main() -> int:
    """Backfill scope_doc_type for all unscoped, unmerged canonical controls.

    Reads the connection string from the DATABASE_URL environment variable,
    classifies every row whose scope_doc_type is NULL and whose
    merged_into_uuid is NULL, writes the result back in committed batches and
    prints the resulting distribution to stdout (progress goes to stderr).

    Returns:
        0 on success, 1 if DATABASE_URL is missing or empty.
    """
    dsn = os.environ.get("DATABASE_URL")
    if not dsn:
        print("DATABASE_URL missing", file=sys.stderr)
        return 1

    from collections import Counter
    from contextlib import closing

    stats = Counter()
    # closing() guarantees the connection is closed even if a statement
    # fails; psycopg2's own "with connection" only ends the transaction.
    with closing(psycopg2.connect(dsn)) as conn, conn.cursor() as cur:
        cur.execute("""
            SELECT id, title, objective, tags
            FROM compliance.canonical_controls
            WHERE scope_doc_type IS NULL
            AND merged_into_uuid IS NULL
        """)
        rows = cur.fetchall()
        print(f"Backfilling {len(rows)} unscoped MCs...", file=sys.stderr)

        batch = []
        for done, (row_id, title, objective, tags) in enumerate(rows, start=1):
            scope = classify(title or "", objective or "", tags)
            stats[scope] += 1
            batch.append((scope, row_id))
            if len(batch) >= _BATCH_SIZE:
                _flush_batch(conn, cur, batch)
                print(f" committed {done} so far", file=sys.stderr)
                batch = []
        if batch:
            _flush_batch(conn, cur, batch)

    print("\n=== Distribution ===")
    total = max(1, len(rows))  # avoid division by zero when nothing to do
    # most_common() sorts by count descending, keeping first-seen order on ties.
    for scope, n in stats.most_common():
        print(f" {scope:25s} {n:>6} ({100*n/total:.1f}%)")
    return 0
|
||||
|
||||
|
||||
# Script entry point: run the backfill and use its return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user