57c0f940a2
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / nodejs-build (push) Successful in 2m19s
CI / test-go (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
P56 Anti-Auditing-Detection als constructive Compliance-Finding (Audit-API-
Empfehlung statt Anklage, weil Mercedes berechtigt Bots blockiert)
P57 Phase G vendor_details Union mit cmp_vendors -> 42 Anbieter sichtbar
P58 Anti-Audit-Detection robuster (Script-Domain-Check + Settings-spezifisch)
P59 Cookie-Behavior-Validator (4 Layer, 3-Tier-Severity: MEDIUM=Kategorie-
Mismatch / HIGH=Zweck-Mismatch / CRITICAL=beide=Vorsatz-Indiz)
+ Open Cookie Database (CC0) als Library-Seed (2264 Cookies)
P59b Cookie-Behavior in Banner-Check verdrahtet + Mail-Block (BUGFIX:
SessionLocal selbst oeffnen, db war im Background-Task nicht im Scope)
Mail-Polish nach Mercedes-Review:
P63 Banner-Footer-Links auch im wb7-link/role=link erkennen (Shadow-DOM-
Walker label-based statt nur <a href>)
P64 Re-Access-Severity: MEDIUM statt HIGH, wenn Footer "Einstellungen" oder
Mercedes-typisch existiert; OEM-Footer-Detection (wb7-footer)
P65 Text-Truncation: Word-Boundary statt Zeichen-Cut (kein "einfa"-Bruch
mehr in Sofortmassnahmen)
P66 GF-Aktionen: Service-Zweck vs Cookie-Zweck explizit erklaert
(haeufige Verwechslung Marketing/GF: "Akamai-Beschreibung" != Cookie-
Zweck pro DSK-OH 2024)
P67 Stirring-Finding mit "Verlust-Framing"-Erklaerung + Alt-vs-Neutral-
Beispiel, statt nur EDPB-Fachbegriff
Compliance-Advisor FAQ (admin agent-core/soul):
+ CNIL/EDPB Top-Bussgelder (Google 100M, Meta 60M, Amazon 35M)
+ Deutsche Praezedenz (LG Muenchen Google Fonts, EuGH Planet49, BGH I ZR 7/16)
+ 4 Risiko-Pfade (Bussgeld/Abmahnung/Sammelklage/NOYB) + Berechnungs-Methodik
Document-Generator Templates: AGB-DE (142), Impressum (140), Widerrufs-
formular-Anlage (143), DSR-Process-Dedup (139), Cookie-Library (144).
Architektur: doc_action_mappings.py + banner_dom_walkers.py +
cookie_behavior_validator.py + vendor_detail_extractor.py rausgezogen,
um die 500-LOC-Caps in agent_doc_check_report.py und
banner_text_checker.py einzuhalten.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
116 lines
3.8 KiB
Python
116 lines
3.8 KiB
Python
#!/usr/bin/env python3
|
|
"""P59 Phase 2 — Seed compliance.cookie_library from Open Cookie Database (CC0).
|
|
|
|
Open Cookie Database: jkwakman/Open-Cookie-Database (CC0-1.0 Public Domain).
|
|
~700 categorised cookies maintained by Cybot/Cookiebot community."""
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import io
|
|
import os
|
|
import sys
|
|
import urllib.request
|
|
|
|
import psycopg2
|
|
|
|
# Raw CSV download location of the Open Cookie Database on GitHub.
OCD_URL = "/".join(
    [
        "https://raw.githubusercontent.com",
        "jkwakman",
        "Open-Cookie-Database",
        "master",
        "open-cookie-database.csv",
    ]
)
|
|
|
|
# Maps the lower-cased "Category" column of the source CSV to the value
# written into the actual_category column. Lookups that miss this table are
# stored as "unknown" by the caller.
CATEGORY_MAP = {
    # required / convenience cookies
    "strictly necessary": "essential",
    "functional": "functional",
    # measurement
    "performance": "statistics",
    "analytics": "statistics",
    # advertising
    "targeting": "marketing",
    "marketing": "marketing",
    "advertisement": "marketing",
    # everything else
    "social media": "social_media",
    "unclassified": "unknown",
}
|
|
|
|
|
|
def parse_max_age(retention: str) -> int | None:
    """Approximate a cookie lifetime in seconds from a free-text retention string.

    The first "<number> <unit>" pair found in the text is used. Units are
    matched by English or German stem (e.g. "year"/"jahr", "day"/"tag"), so
    plural forms such as "years" or "Tage" are accepted. Months are counted
    as 30 days and years as 365 days.

    Args:
        retention: Text such as "2 years", "30 days", "1.5 years" or "Session".

    Returns:
        0 if the text mentions "session", the approximate number of seconds
        for a recognised "<number> <unit>" pair, or None when the text is
        empty or contains no recognisable duration.
    """
    # Local import: this file has no module-level ``import re``.
    import re

    if not retention:
        return None
    text = retention.lower().strip()
    if "session" in text:
        return 0

    seconds_per_unit = {
        "jahr": 31536000, "year": 31536000,
        "month": 2592000, "monat": 2592000,
        "week": 604800, "woche": 604800,
        "day": 86400, "tag": 86400,
        "hour": 3600, "stund": 3600,
        "minute": 60,
        "second": 1, "sekunde": 1,
    }
    # The number may carry a decimal part ("1.5 years", "1,5 Jahre"). Matching
    # only ``\d+`` would pick up the fractional digits as the whole amount
    # ("1.5 years" -> "5 years").
    match = re.search(
        r"(\d+(?:[.,]\d+)?)\s*"
        r"(jahr|year|month|monat|week|woche|day|tag|hour|stund|minute"
        r"|second|sekunde)",
        text,
    )
    if match is None:
        return None
    amount = float(match.group(1).replace(",", "."))
    return round(amount * seconds_per_unit[match.group(2)])
|
|
|
|
|
|
def main() -> int:
    """Download the Open Cookie Database CSV and seed compliance.cookie_library.

    The PostgreSQL DSN is read from the ``DATABASE_URL`` environment
    variable. Every CSV row with a non-empty cookie name is inserted with
    ``ON CONFLICT DO NOTHING``; rows without a name are counted as skipped.
    After the commit, a per-category row count of the table is printed.

    Returns:
        0 on success, 1 if ``DATABASE_URL`` is unset or empty, 2 if the CSV
        download fails.

    Raises:
        Any exception raised by psycopg2 while connecting or executing
        statements; the connection is closed before it propagates.
    """
    dsn = os.environ.get("DATABASE_URL")
    if not dsn:
        print("DATABASE_URL missing", file=sys.stderr)
        return 1

    print(f"Fetching {OCD_URL} ...", file=sys.stderr)
    try:
        with urllib.request.urlopen(OCD_URL, timeout=30) as response:
            body = response.read().decode("utf-8", errors="replace")
    except Exception as e:  # script boundary: report any fetch error and exit
        print(f"Fetch failed: {e}", file=sys.stderr)
        return 2

    rows = list(csv.DictReader(io.StringIO(body)))
    print(f"Parsed {len(rows)} rows", file=sys.stderr)

    insert_sql = """
        INSERT INTO compliance.cookie_library
            (cookie_name, domain_pattern, vendor_name,
             vendor_privacy_url, actual_category, purpose_en,
             typical_max_age_seconds, source_name, source_url,
             source_license, confidence)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT DO NOTHING
    """

    conn = psycopg2.connect(dsn)
    try:
        cur = conn.cursor()
        inserted = 0
        skipped = 0
        for row in rows:
            name = (row.get("Cookie / Data Key name") or "").strip()
            if not name:
                skipped += 1
                continue
            domain = (row.get("Domain") or "").strip()
            category_raw = (row.get("Category") or "").strip().lower()
            actual_category = CATEGORY_MAP.get(category_raw, "unknown")
            # Strip before falling back so that a whitespace-only "Platform"
            # does not end up as an empty vendor name.
            vendor = (
                (row.get("Platform") or "").strip()
                or (row.get("Data Controller") or "").strip()
                or "Unknown"
            )
            purpose = (row.get("Description") or "").strip()[:1000]
            privacy_url = (
                row.get("User Privacy & GDPR Rights Portals") or ""
            ).strip()
            max_age = parse_max_age(row.get("Retention period") or "")
            # Rows without a domain are stored with the catch-all pattern "*".
            domain_pattern = domain or "*"
            cur.execute(
                insert_sql,
                (
                    name, domain_pattern, vendor[:200], privacy_url or None,
                    actual_category, purpose or None, max_age,
                    "Open Cookie Database", OCD_URL, "CC0-1.0", 0.75,
                ),
            )
            # rowcount is 0 when ON CONFLICT DO NOTHING suppressed the insert.
            inserted += cur.rowcount
        conn.commit()
        print(f"\nInserted {inserted}, skipped {skipped}")

        cur.execute(
            "SELECT actual_category, COUNT(*) "
            "FROM compliance.cookie_library GROUP BY actual_category "
            "ORDER BY 2 DESC"
        )
        for category, count in cur.fetchall():
            print(f" {category:15s}: {count}")
    finally:
        # Closing without a prior commit discards the open transaction, so a
        # failure mid-loop leaves no partial seed behind and no leaked
        # connection.
        conn.close()
    return 0
|
|
|
|
|
|
# Script entry point: the return value of main() becomes the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|