Files
breakpilot-compliance/backend-compliance/compliance/services/unified_findings_store.py
T
Benjamin Admin 6c223c7c9b
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m43s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(compliance-check): exec-summary + voll-audit + TDM-respect + cookie-KB-extended + saving-scan-funnel
P1 — Exec-Summary oben im Email-Report (4 KPIs + 2 CTAs, dunkler Gradient)
P3 — no_direct_sales-Flag fuer OEM-Konfigurator-Sites; AGB/Widerruf als
     "NICHT ANWENDBAR" (grau) statt "NICHT GEFUNDEN" (rot)
P5 — Voll-Audit Unification: alle Findings (MC + Pflichtangaben + Vendor +
     Redundanz) in /data/compliance_audits.db.unified_findings; neuer
     /api/compliance/agent/findings/<id> Endpoint + FindingsTab im Audit-UI
     mit Filter + CSV-Export
P7 — Crawl-Hardening: TDM-Reservation-Check (robots.txt / ai.txt / Header /
     Meta) vor jedem Run mit 24h-Cache; HeadlessChrome-UA (Firma noch nicht
     gegruendet — Switch via BREAKPILOT_BRANDED_UA env); per-Domain
     Rate-Limit 1 req/s + max 2 concurrent
P2 — Cookie-Knowledge-DB additiv erweitert (35 -> 74 Cookies): Adobe, Meta,
     Microsoft, LinkedIn, TikTok, HubSpot, Marketo, Salesforce, Hotjar,
     FullStory, Mouseflow, Intercom, Drift, Zendesk, Cloudflare, Stripe,
     OneTrust/Cookiebot/Usercentrics, Matomo, Pinterest, Snapchat, X/Twitter,
     YouTube, Vimeo, Klaviyo, Mailchimp, Mixpanel, Segment, Amplitude,
     Optimizely, Datadog; Wire-in in cookie_function_classifier liefert
     compliance_risk-Label (kritisch/hoch/mittel/gering) pro Vendor
A  — k-Anonymitaets-Helper (benchmark_k_anonymity) fuer P6-Vorbereitung
B  — Cross-Tenant-Domain-Assertion im /findings-Endpoint (expected_domain
     Query-Param -> 403 bei Mismatch)
C  — Saving-Scan-Funnel: /api/compliance/agent/saving-scan/start mit
     Validierung + 24h-Rate-Limit pro Domain + Lead-Persistenz in
     saving_scan_leads + Auto-Discovery via _run_compliance_check; 6 Tests
D  — Risk-Badge im Email-Vendor-Row

Rechtliche Leitplanken (Memory feedback_oem_data_legal.md): nur eigene
Knapp-Bewertungen + Source-Pointer, keine 1:1-Kopien fremder CMP-Texte.
TDM-Opt-Out-Respect nach § 44b UrhG. KEINE Schema-Aenderungen — alles in
Sidecar-SQLite.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 23:48:34 +02:00

191 lines
7.5 KiB
Python

"""
Unified-Findings sidecar store.
A compliance check produces findings from 4 sources today:
- Master-Controls (mc_results table — already persisted)
- Pflichtangaben (L1/L2 doc checks, e.g. Impressum-Vollstaendigkeit)
- Vendor scans (per cmp_vendor: missing privacy url, no opt-out, ...)
- Redundancies (multi-vendor in same category)
Previously the DSB had to look in 4 different blocks of the email to
find everything. This store flattens all of them into ONE searchable
table so the /audit/<check_id> frontend can show a unified list with
source / severity / status / doc_type filters.
Sidecar SQLite (same DB as compliance_audit_log) — no Postgres
migration needed.
"""
from __future__ import annotations
import json
import logging
import os
import sqlite3
from pathlib import Path
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Path of the sidecar SQLite file. Read once at import time from the
# COMPLIANCE_AUDIT_DB environment variable, with a fixed default otherwise.
DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")
def _ensure_table() -> None:
    """Create the ``unified_findings`` table and its indexes if missing.

    The parent directory of ``DB_PATH`` is created first so that
    ``sqlite3.connect`` can create the database file on first use. All
    statements use ``IF NOT EXISTS``, so calling this repeatedly is safe.

    The connection is closed explicitly: sqlite3's connection context
    manager only commits or rolls back, it does not close the handle.
    """
    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        with conn:
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS unified_findings (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    check_id TEXT NOT NULL,
                    source_type TEXT NOT NULL, -- mc|pflichtangabe|vendor|redundanz
                    doc_type TEXT, -- impressum|dse|cookie|... or '-' for vendor/redundanz
                    severity TEXT, -- CRITICAL|HIGH|MEDIUM|LOW|INFO
                    status TEXT, -- failed|passed|skipped|na|info
                    regulation TEXT,
                    label TEXT,
                    hint TEXT,
                    action_recipe TEXT, -- JSON {what,why,fix_text,where,example}
                    anchor_excerpt TEXT,
                    anchor_conf REAL,
                    vendor_name TEXT,
                    category TEXT,
                    payload TEXT -- JSON extras (matched_text, cookies count, ...)
                );
                CREATE INDEX IF NOT EXISTS idx_uf_check ON unified_findings(check_id);
                CREATE INDEX IF NOT EXISTS idx_uf_source ON unified_findings(check_id, source_type);
                CREATE INDEX IF NOT EXISTS idx_uf_status ON unified_findings(check_id, status);
                CREATE INDEX IF NOT EXISTS idx_uf_severity ON unified_findings(check_id, severity);
            """)
    finally:
        conn.close()
def record_findings(check_id: str, findings: list[dict]) -> int:
    """Replace all stored findings of ``check_id`` with ``findings``.

    Existing rows for the check are deleted and the new ones inserted in a
    single transaction, so calling this again for the same ``check_id``
    overwrites rather than duplicates. If anything fails, the transaction is
    rolled back and the previously stored rows remain.

    Args:
        check_id: Identifier of the compliance check. Falsy values are
            rejected without touching the database.
        findings: One dict per finding. Missing or falsy fields get defaults
            (``source_type`` "mc", ``severity`` "MEDIUM", ``status``
            "failed", empty string / ``{}`` / ``0.0`` otherwise). Text
            fields are truncated to fixed maximum lengths; ``action_recipe``
            and ``payload`` are stored as JSON.

    Returns:
        Number of rows written, or 0 when ``check_id`` is empty,
        ``findings`` is empty, or an error occurred (errors are logged, not
        raised — the store is best-effort).
    """
    if not check_id:
        return 0
    try:
        _ensure_table()
        rows = [
            (
                check_id,
                (f.get("source_type") or "mc")[:24],
                (f.get("doc_type") or "")[:32],
                (f.get("severity") or "MEDIUM").upper()[:16],
                (f.get("status") or "failed")[:16],
                (f.get("regulation") or "")[:64],
                (f.get("label") or "")[:400],
                (f.get("hint") or "")[:1200],
                json.dumps(f.get("action_recipe") or {}, ensure_ascii=False),
                (f.get("anchor_excerpt") or "")[:800],
                float(f.get("anchor_conf") or 0.0),
                (f.get("vendor_name") or "")[:160],
                (f.get("category") or "")[:64],
                json.dumps(f.get("payload") or {}, ensure_ascii=False),
            )
            for f in findings or ()
        ]
        conn = sqlite3.connect(DB_PATH)
        try:
            # ``with conn`` commits on success and rolls back DELETE + INSERT
            # together on error; it does not close the connection.
            with conn:
                conn.execute(
                    "DELETE FROM unified_findings WHERE check_id=?", (check_id,),
                )
                if rows:
                    conn.executemany(
                        "INSERT INTO unified_findings "
                        "(check_id, source_type, doc_type, severity, status, regulation, "
                        " label, hint, action_recipe, anchor_excerpt, anchor_conf, "
                        " vendor_name, category, payload) "
                        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                        rows,
                    )
        finally:
            conn.close()
        if not rows:
            return 0
        # Report the normalized source types actually stored (column 1 of each
        # row). Sorting the raw input values could raise on a None/str mix
        # after the commit and make a successful write look like a failure.
        logger.info(
            "unified_findings: %s rows=%d sources=%s",
            check_id, len(rows),
            sorted({row[1] for row in rows}),
        )
        return len(rows)
    except Exception as e:
        # Deliberately broad: persisting findings must never break the caller.
        logger.warning("record_findings failed for %s: %s", check_id, e)
        return 0
def list_findings(
    check_id: str,
    source_type: str | None = None,
    severity: str | None = None,
    doc_type: str | None = None,
    status: str | None = None,
    q: str | None = None,
    limit: int = 1000,
) -> list[dict]:
    """Return the stored findings of a check, filtered and severity-sorted.

    Args:
        check_id: Identifier of the compliance check.
        source_type: Exact-match filter; ``None``, empty or ``"all"`` disables it.
        severity: Exact-match filter (upper-cased before comparison);
            ``None``, empty or ``"all"`` disables it.
        doc_type: Exact-match filter; ``None``, empty or ``"all"`` disables it.
        status: Exact-match filter; ``None``, empty or ``"all"`` disables it.
        q: Literal substring searched in ``label`` OR ``vendor_name``. ``%``
            and ``_`` are matched literally, not as LIKE wildcards.
        limit: Maximum number of rows returned.

    Returns:
        A list of row dicts ordered CRITICAL, HIGH, MEDIUM, LOW, anything
        else, then by ``source_type`` and ``label``. ``action_recipe`` and
        ``payload`` are decoded from JSON. Returns ``[]`` on any error
        (errors are logged, not raised).
    """
    try:
        _ensure_table()
        where = ["check_id = ?"]
        params: list = [check_id]
        # Column names come from this fixed tuple, never from caller input.
        exact_filters = (
            ("source_type", source_type),
            ("severity", severity),
            ("doc_type", doc_type),
            ("status", status),
        )
        for column, value in exact_filters:
            if value and value != "all":
                where.append(f"{column} = ?")
                params.append(value.upper() if column == "severity" else value)
        if q:
            # Escape LIKE metacharacters so names such as "_ga" or "100%" are
            # searched literally instead of acting as wildcards.
            # NOTE(review): SQLite's built-in LOWER() folds ASCII only, so
            # stored text with non-ASCII capitals (e.g. umlauts) may not match
            # a lower-cased needle — confirm whether that matters here.
            escaped = (
                q.lower()
                .replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
            )
            where.append(
                "(LOWER(label) LIKE ? ESCAPE '\\' "
                "OR LOWER(vendor_name) LIKE ? ESCAPE '\\')"
            )
            needle = f"%{escaped}%"
            params.extend([needle, needle])
        sql = ("SELECT * FROM unified_findings WHERE " + " AND ".join(where) +
               " ORDER BY CASE severity "
               " WHEN 'CRITICAL' THEN 0 WHEN 'HIGH' THEN 1 "
               " WHEN 'MEDIUM' THEN 2 WHEN 'LOW' THEN 3 "
               " ELSE 4 END, source_type, label LIMIT ?")
        params.append(int(limit))
        conn = sqlite3.connect(DB_PATH)
        try:
            conn.row_factory = sqlite3.Row
            rows = conn.execute(sql, params).fetchall()
        finally:
            # The sqlite3 context manager would not close the handle.
            conn.close()
        out = []
        for r in rows:
            d = dict(r)
            d["action_recipe"] = json.loads(d.get("action_recipe") or "{}")
            d["payload"] = json.loads(d.get("payload") or "{}")
            out.append(d)
        return out
    except Exception as e:
        # Deliberately broad: a broken sidecar store yields an empty list.
        logger.warning("list_findings failed: %s", e)
        return []
def findings_summary(check_id: str) -> dict:
    """Return aggregate counts for the filter UI of one check.

    Args:
        check_id: Identifier of the compliance check.

    Returns:
        A dict with ``total`` plus ``by_source``, ``by_severity``,
        ``by_status`` and ``by_doc_type``, each mapping a column value to
        its row count. NULL and empty values are reported together under
        ``"-"``. On error the counts gathered so far (initially all
        empty / 0) are returned and the error is logged, not raised.
    """
    out = {
        "total": 0,
        "by_source": {},
        "by_severity": {},
        "by_status": {},
        "by_doc_type": {},
    }
    # Grouped column -> key in the result. Column names are interpolated into
    # SQL below, which is safe only because they come from this fixed mapping.
    buckets = {
        "source_type": "by_source",
        "severity": "by_severity",
        "status": "by_status",
        "doc_type": "by_doc_type",
    }
    try:
        _ensure_table()
        conn = sqlite3.connect(DB_PATH)
        try:
            conn.row_factory = sqlite3.Row
            for col, bucket in buckets.items():
                rows = conn.execute(
                    f"SELECT {col} AS k, COUNT(*) AS n FROM unified_findings "
                    f"WHERE check_id=? GROUP BY {col}",
                    (check_id,),
                ).fetchall()
                counts: dict = {}
                for r in rows:
                    # NULL and '' are separate SQL groups but share the "-"
                    # key, so add the counts up instead of overwriting.
                    key = r["k"] or "-"
                    counts[key] = counts.get(key, 0) + r["n"]
                out[bucket] = counts
                # Every grouping sums to the same row count; keep the largest.
                out["total"] = max(out["total"], sum(counts.values()))
        finally:
            # The sqlite3 context manager would not close the handle.
            conn.close()
        return out
    except Exception as e:
        # Deliberately broad: the summary is best-effort for the UI.
        logger.warning("findings_summary failed: %s", e)
        return out