Files
breakpilot-compliance/backend-compliance/compliance/services/run_diff.py
T
Benjamin Admin df8832c521
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 15s
CI / loc-budget (push) Failing after 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 42s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(audit): P75 Banner-vs-CMP + P84 Diff-Mode + P74/P96/P97 Doc-Types
P75 — check_banner_vs_cmp_partner_count: wenn Banner-Text 'N Partner'
nennt und N < cmp_vendors * 0.6, HIGH-Finding (Art. 13(1)(e) DSGVO).
Erkennt Verharmlosung der tatsaechlichen Vendor-Anzahl.

P84 — run_diff.py: vergleicht aktuellen Lauf mit letztem Snapshot
derselben Site (set-Diff auf normalisierten Finding-Labels). Block
ueber dem GF-1-Pager: 'Seit letztem Lauf: X Findings weg, Y neue'.
USP — keiner der grossen Anbieter hat das.

P74/P96/P97 — Labels fuer legal_notice (Rechtliche Hinweise / IP /
Forward-Looking), dsa (Art. 12+17 Digital Services Act), lizenzhinweise
(OSS-Compliance) in _DOC_TYPE_LABELS registriert. Echte Pflichtangaben-
Checks kommen separat.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 16:38:25 +02:00

183 lines
6.2 KiB
Python

"""
P84 — Diff-Mode pro Mail.
Vergleicht den aktuellen Lauf mit dem letzten Snapshot derselben Site:
"Seit letztem Lauf 3 Findings weg, 1 neues." USP — keiner der grossen
Anbieter (Borlabs, OneTrust, Cookiebot, Usercentrics) hat das.
Wird in der Mail-Composition nach dem GF-1-Pager gerendert (klein,
neutral). Wenn kein vorheriger Lauf existiert: skip silently.
Heuristik: Extrahiert Finding-Labels aus banner_result.phases[].findings
und (wenn vorhanden) scorecard.failed. Vergleicht set-basiert auf
normalisiertem Label.
"""
from __future__ import annotations

import html
import logging
import re
from datetime import datetime, timezone
from typing import Any

from sqlalchemy import text
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
def _norm_label(s: str) -> str:
    """Normalise a finding label so labels from two runs can be compared.

    The label is lower-cased, every character that is neither a word
    character nor whitespace is removed, runs of whitespace are collapsed to
    a single space, and the result is capped at 200 characters.

    Args:
        s: Raw label text. Falsy values yield ``""``; other non-string
            values are converted with ``str()`` instead of raising.

    Returns:
        The normalised label, without leading or trailing whitespace.
    """
    if not s:
        return ""
    if not isinstance(s, str):
        s = str(s)
    s = s.lower()
    # Drop punctuation first: removing it can leave doubled or trailing
    # spaces ("a - b" -> "a  b"), which the whitespace pass below cleans up.
    # ``\w`` is Unicode-aware for str patterns, so umlauts and "ß" survive.
    s = re.sub(r"[^\w\s]", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    # Truncation may cut right after a space, so strip once more.
    return s[:200].rstrip()
def _entry_labels(entries, keys: tuple[str, ...]):
    """Yield the first non-empty string found under *keys* for each dict in *entries*."""
    if not isinstance(entries, (list, tuple)):
        return
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        label = next((entry[k] for k in keys if entry.get(k)), "")
        if label and isinstance(label, str):
            yield label


def _extract_finding_labels(
    banner_result: dict | None,
    scorecard: dict | None = None,
) -> set[str]:
    """Collect the normalised finding labels of one run.

    Labels are read from ``banner_result["phases"][*]["findings"]`` (keys
    ``label``, ``title`` or ``check``, first non-empty wins) and, when a
    scorecard is given, from ``scorecard["failed"]`` or, if that is empty,
    ``scorecard["items"]`` (keys ``label`` or ``title``).

    Args:
        banner_result: Result dict of the banner check; anything that is not
            a dict is ignored.
        scorecard: Optional scorecard dict; anything that is not a dict is
            ignored.

    Returns:
        The set of labels after ``_norm_label``. Labels that normalise to an
        empty string are left out.
    """
    raw: list[str] = []

    if isinstance(banner_result, dict):
        phases = banner_result.get("phases")
        # Accept both a mapping of phases and a plain list of phases;
        # calling ``.values()`` on a list would raise AttributeError.
        if isinstance(phases, dict):
            phase_entries = phases.values()
        elif isinstance(phases, (list, tuple)):
            phase_entries = phases
        else:
            phase_entries = ()
        for phase in phase_entries:
            if isinstance(phase, dict):
                raw.extend(
                    _entry_labels(phase.get("findings"),
                                  ("label", "title", "check"))
                )

    if isinstance(scorecard, dict):
        raw.extend(
            _entry_labels(scorecard.get("failed") or scorecard.get("items"),
                          ("label", "title"))
        )

    normalised = (_norm_label(label) for label in raw)
    # A label made only of punctuation normalises to "" and carries no
    # information for the diff, so it is skipped.
    return {label for label in normalised if label}
def _previous_snapshot(db: Session, site_domain: str,
                       exclude_check_id: str) -> dict | None:
    """Fetch the newest stored snapshot of a site, ignoring one check.

    Queries ``compliance.compliance_check_snapshots`` for rows of
    ``site_domain`` whose ``check_id`` differs from ``exclude_check_id`` and
    takes the one with the latest ``created_at``.

    Returns:
        ``{"check_id", "banner_result", "created_at"}`` for that row, with a
        falsy ``banner_result`` replaced by ``{}``, or ``None`` when no row
        matches.
    """
    query = text(
        """
SELECT check_id, banner_result, created_at
FROM compliance.compliance_check_snapshots
WHERE site_domain = :dom AND check_id != :ex
ORDER BY created_at DESC LIMIT 1
"""
    )
    params = {"dom": site_domain, "ex": exclude_check_id}
    hit = db.execute(query, params).fetchone()
    if not hit:
        return None

    snapshot = {"check_id": hit[0]}
    snapshot["banner_result"] = hit[1] or {}
    snapshot["created_at"] = hit[2]
    return snapshot
def compute_diff(
    db: Session,
    current_check_id: str,
    site_domain: str,
    banner_result: dict | None,
    scorecard: dict | None = None,
) -> dict | None:
    """Diff the current run's finding labels against the previous snapshot.

    Returns:
        A dict with ``prev_check_id``, ``prev_at``, ``added`` and ``removed``
        (each sorted and capped at 20 labels) and ``unchanged_count``.
        ``None`` when the site has no earlier snapshot or when neither run
        produced any label.
    """
    previous = _previous_snapshot(db, site_domain, current_check_id)
    if not previous:
        return None

    # NOTE(review): the current side includes scorecard labels, while the
    # previous side is extracted from banner_result only (scorecard=None).
    # Labels that exist only in the scorecard can therefore never match and
    # will be listed as "added" on every run - confirm this is intended.
    now_labels = _extract_finding_labels(banner_result, scorecard)
    then_labels = _extract_finding_labels(previous["banner_result"], None)
    if not (now_labels or then_labels):
        return None

    appeared = sorted(now_labels.difference(then_labels))
    vanished = sorted(then_labels.difference(now_labels))
    still_there = now_labels.intersection(then_labels)
    return {
        "prev_check_id": previous["check_id"],
        "prev_at": previous["created_at"],
        "added": appeared[:20],
        "removed": vanished[:20],
        "unchanged_count": len(still_there),
    }
def _fmt_age(when: Any) -> str:
    """Format how long ago *when* was, as a short German phrase.

    Args:
        when: Timestamp of the previous run. A naive ``datetime`` is treated
            as UTC. Anything that is not a ``datetime`` yields ``"frueher"``.

    Returns:
        ``"soeben"`` for less than an hour (including timestamps in the
        future), ``"vor {n}h"`` within the first day, ``"vor 1 Tag"``,
        ``"vor {n} Tagen"`` below 14 days, otherwise ``"vor {n} Wochen"``.
    """
    if not isinstance(when, datetime):
        return "frueher"
    if when.tzinfo is None:
        when = when.replace(tzinfo=timezone.utc)

    elapsed = datetime.now(timezone.utc) - when
    seconds = int(elapsed.total_seconds())
    # A negative timedelta is stored as days=-1 plus a large positive
    # ``.seconds``, so a timestamp a few seconds ahead (clock skew) would
    # otherwise be reported as "vor 23h". Treat it as "just now".
    if seconds <= 0:
        return "soeben"

    days = elapsed.days
    if days == 0:
        hours = seconds // 3600
        return f"vor {hours}h" if hours else "soeben"
    if days == 1:
        return "vor 1 Tag"
    if days < 14:
        return f"vor {days} Tagen"
    return f"vor {days // 7} Wochen"
def _diff_list_html(heading: str, labels: list, color: str,
                    bullet: str = "") -> str:
    """Render a coloured heading followed by a bullet list of escaped labels."""
    bullets = "".join(
        f'<li style="font-size:11px;color:{color};margin-bottom:2px">'
        # Labels come from scan results, so escape them before they are
        # placed into the mail's HTML.
        f'{bullet}{html.escape(str(label))}</li>'
        for label in labels
    )
    return (
        f'<div style="font-size:11px;color:{color};margin-bottom:4px">'
        f'<strong>{heading}</strong></div>'
        '<ul style="margin:0 0 8px 18px;padding:0">'
        + bullets + '</ul>'
    )


def build_diff_block_html(diff: dict) -> str:
    """Render the "changes since the last run" block as an HTML fragment.

    Args:
        diff: Result of ``compute_diff``. The keys ``added``, ``removed``,
            ``prev_at`` and ``unchanged_count`` are read; a falsy ``diff``
            yields an empty string.

    Returns:
        ``""`` for a falsy ``diff``; a neutral grey box when nothing was
        added or removed; otherwise a yellow box listing up to six removed
        and up to six added labels, followed by the unchanged count.
    """
    if not diff:
        return ""

    added = diff.get("added") or []
    removed = diff.get("removed") or []
    age = _fmt_age(diff.get("prev_at"))
    unchanged = diff.get("unchanged_count", 0)

    if not added and not removed:
        return (
            '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
            'max-width:760px;margin:0 auto 12px;padding:10px 14px;'
            'background:#f1f5f9;border:1px solid #cbd5e1;border-radius:6px;'
            'font-size:11px;color:#475569">'
            f'<strong>Vergleich zum letzten Lauf '
            f'({age}):</strong> keine Veraenderungen '
            f'in den erkannten Findings ({unchanged} '
            'identisch geblieben).'
            '</div>'
        )

    sections: list[str] = []
    if removed:
        plural = "s" if len(removed) != 1 else ""
        sections.append(_diff_list_html(
            f'{len(removed)} Finding{plural} nicht mehr vorhanden:',
            removed[:6],
            "#166534",
        ))
    if added:
        # German agreement: "1 neues Finding" vs. "2 neue Findings".
        adjective = "neues" if len(added) == 1 else "neue"
        plural = "s" if len(added) != 1 else ""
        sections.append(_diff_list_html(
            f'{len(added)} {adjective} Finding{plural}:',
            added[:6],
            "#991b1b",
            bullet="! ",
        ))

    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:760px;margin:0 auto 12px;padding:12px 16px;'
        'background:#fffbeb;border:1px solid #fde68a;border-radius:6px">'
        '<div style="font-size:11px;color:#92400e;text-transform:uppercase;'
        'letter-spacing:1.2px;margin-bottom:6px;font-weight:600">'
        f'Was hat sich seit dem letzten Lauf veraendert '
        f'({age})'
        '</div>'
        + "".join(sections) +
        f'<div style="font-size:10px;color:#94a3b8;margin-top:4px">'
        f'{unchanged} weitere Findings unveraendert '
        '— vollstaendige Liste weiter unten.</div>'
        '</div>'
    )