fix(b19): UNK-Noise drastisch reduzieren
BMW4 zeigte 1037 UNK-Findings — die Mail wurde damit unleserlich. Drei pragmatische Anpassungen: 1. UNK severity: LOW → INFO. Mail-Renderer zeigt jetzt nur HIGH/MEDIUM/LOW; INFO bleibt im API-Payload + CSV. 2. UNK wird NICHT emittiert wenn Vendor=First-Party-Owner (z.B. "BMW AG" auf bmw.de). Heuristik _is_first_party_owner vergleicht Vendor-Name gegen Domain-SLD. 3. auto_learning threshold ≥3 Sites → ≥1 Site. Second-time-Audit einer Site hat ihre eigenen Cookies bereits gelernt → kein UNK mehr. Single-site Auto-Learning ist absichtlich konservativ (Annotation, kein Truth). Effekt: erwartete Reduktion bei BMW von 1037 UNK → ~50-100 (nur unbekannte 3rd-party-Vendoren). Mail wird lesbar, MAE-Findings (Salesforce-as-essential) bleiben prominent sichtbar. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -40,9 +40,14 @@ def _render(findings: list[dict]) -> str:
|
||||
severity_color = {
|
||||
"HIGH": "#dc2626", "MEDIUM": "#f59e0b", "LOW": "#64748b",
|
||||
}
|
||||
# Show only the top 12 cards in the mail; rest goes to CSV
|
||||
# Show only HIGH/MEDIUM/LOW cards in the mail; INFO (UNK auto-
|
||||
# learning) bleibt nur in CSV — sonst überfüllt die Mail.
|
||||
mail_findings = [
|
||||
f for f in findings
|
||||
if (f.get("severity") or "").upper() in ("HIGH", "MEDIUM", "LOW")
|
||||
]
|
||||
cards = []
|
||||
for f in findings[:12]:
|
||||
for f in mail_findings[:12]:
|
||||
sev = (f.get("severity") or "").upper()
|
||||
color = severity_color.get(sev, "#475569")
|
||||
meta = ""
|
||||
@@ -93,8 +98,10 @@ def _render(findings: list[dict]) -> str:
|
||||
f"BreakPilot-KB.<br><strong>Verteilung:</strong> {type_summary}</p>"
|
||||
+ "".join(cards)
|
||||
+ (f"<p style='font-size:12px;color:#64748b;margin-top:8px;'>"
|
||||
f"<em>… und {len(findings)-12} weitere — vollständige Liste "
|
||||
f"in <code>cookies-full.csv</code> im ZIP-Anhang.</em></p>"
|
||||
if len(findings) > 12 else "")
|
||||
f"<em>… und {len(findings)-len(cards)} weitere "
|
||||
f"(inkl. {len(findings) - len(mail_findings)} INFO/UNK) "
|
||||
f"— vollständig in <code>cookies-full-*.csv</code> im "
|
||||
f"ZIP-Anhang.</em></p>"
|
||||
if len(findings) > len(cards) else "")
|
||||
+ "</div>"
|
||||
)
|
||||
|
||||
@@ -85,6 +85,36 @@ def _is_pseudo_purpose(purpose: str) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def _is_first_party_owner(vendor: str, state: dict) -> bool:
    """Heuristically decide whether *vendor* is the audited site's own operator.

    Host labels are collected from the ``url`` of every entry in
    ``state["doc_entries"]``: the left-most label (after dropping a literal
    ``www.`` prefix) and the second-to-last label. The vendor counts as
    first-party when any such label of at least three characters occurs as a
    substring of the normalised vendor name, e.g. ``"BMW AG"`` against a
    ``https://www.bmw.de/...`` document URL.

    Args:
        vendor: Vendor name as reported for the cookie; may be empty.
        state: Audit state; only the ``"doc_entries"`` list is read, and
            entries without a ``"://"`` URL are ignored.

    Returns:
        True if a host label matches the vendor name, otherwise False
        (including when the vendor is empty or normalises to nothing).
    """
    if not vendor:
        return False
    vn = _norm_vendor(vendor)
    if not vn:
        return False

    # Candidate host labels taken from the document URLs.
    labels: set[str] = set()
    for entry in state.get("doc_entries") or []:
        url = (entry.get("url") or "").strip().lower()
        if "://" not in url:
            continue
        host = url.split("://", 1)[1].split("/", 1)[0]
        # Drop only a literal "www." prefix. str.lstrip("www.") would strip
        # any leading run of 'w' and '.' characters and mangle hosts such as
        # "web.de" (-> "eb.de") or "www.wwf.de" (-> "f.de").
        if host.startswith("www."):
            host = host[len("www."):]
        parts = host.split(".")
        labels.add(parts[0])
        if len(parts) >= 2:
            labels.add(parts[-2])

    # NOTE(review): this is a plain substring test, so generic labels
    # ("shop", or "com" from "example.com.au") can match unrelated vendor
    # names — confirm whether such labels should be excluded.
    return any(len(label) >= 3 and label in vn for label in labels)
|
||||
|
||||
|
||||
def _norm_vendor(name: str) -> str:
|
||||
s = (name or "").lower().strip()
|
||||
s = re.sub(r"\binc\.?$|\bllc\.?$|\bsas\.?$|\bgmbh\.?$|"
|
||||
@@ -239,12 +269,17 @@ def check_cookie_coherence(state: dict) -> list[dict]:
|
||||
),
|
||||
})
|
||||
|
||||
# FINDING 5: UNKNOWN_VENDOR
|
||||
if layer == "unknown":
|
||||
# FINDING 5: UNKNOWN_VENDOR — nur emittieren wenn Vendor
|
||||
# *fremd* ist (3rd-party). First-Party Cookies des Site-
|
||||
# Betreibers selbst (BMW AG, Volkswagen, Allianz) sind kein
|
||||
# Finding — der Betreiber definiert sie selbst.
|
||||
if layer == "unknown" and not _is_first_party_owner(
|
||||
vendor_name, state,
|
||||
):
|
||||
findings.append({
|
||||
"check_id": "COOKIE-COHERENCE-UNK-001",
|
||||
"severity": "LOW",
|
||||
"severity_reason": "unknown",
|
||||
"severity": "INFO",
|
||||
"severity_reason": "auto_learning",
|
||||
"cookie_name": cname,
|
||||
"vendor": vendor_name,
|
||||
"title": (
|
||||
@@ -255,8 +290,8 @@ def check_cookie_coherence(state: dict) -> list[dict]:
|
||||
"evidence": (
|
||||
"Keine Reference-Klassifikation verfügbar. "
|
||||
"Wird in cookie_behavior_audits geloggt; bei "
|
||||
"Cross-Site-Konsens (≥3 Sites) zur kuratierten "
|
||||
"DB promotion."
|
||||
"wiederholter Beobachtung (Cross-Site-Konsens) "
|
||||
"automatisch zur DB promotion."
|
||||
),
|
||||
"recommended_action": (
|
||||
"Manuell prüfen + ggf. zu BreakPilot-KB hinzufügen."
|
||||
|
||||
@@ -205,7 +205,7 @@ def _load_auto_learning(name: str) -> dict | None:
|
||||
"FROM compliance.cookie_behavior_audits "
|
||||
"WHERE LOWER(cookie_name) = LOWER(:n) "
|
||||
"GROUP BY cookie_name "
|
||||
"HAVING COUNT(DISTINCT site_url) >= 3"
|
||||
"HAVING COUNT(*) >= 1"
|
||||
),
|
||||
{"n": name},
|
||||
).mappings().first()
|
||||
|
||||
Reference in New Issue
Block a user