#!/usr/bin/env python3 """P59 Phase 2 — Seed compliance.cookie_library from Open Cookie Database (CC0). Open Cookie Database: jkwakman/Open-Cookie-Database (CC0-1.0 Public Domain). ~700 categorised cookies maintained by Cybot/Cookiebot community.""" from __future__ import annotations import csv import io import os import sys import urllib.request import psycopg2 OCD_URL = ( "https://raw.githubusercontent.com/jkwakman/Open-Cookie-Database/master/" "open-cookie-database.csv" ) CATEGORY_MAP = { "strictly necessary": "essential", "functional": "functional", "performance": "statistics", "analytics": "statistics", "targeting": "marketing", "marketing": "marketing", "advertisement": "marketing", "social media": "social_media", "unclassified": "unknown", } def parse_max_age(retention: str) -> int | None: """Approximate seconds from retention strings like '2 years' / '30 days'.""" if not retention: return None r = retention.lower().strip() if "session" in r: return 0 import re m = re.search(r"(\d+)\s*(jahr|year|day|tag|month|monat|hour|stund|minute)", r) if not m: return None n = int(m.group(1)) unit = m.group(2) multipliers = { "jahr": 31536000, "year": 31536000, "month": 2592000, "monat": 2592000, "day": 86400, "tag": 86400, "hour": 3600, "stund": 3600, "minute": 60, } return n * multipliers.get(unit, 1) def main() -> int: dsn = os.environ.get("DATABASE_URL") if not dsn: print("DATABASE_URL missing", file=sys.stderr); return 1 print(f"Fetching {OCD_URL} ...", file=sys.stderr) try: with urllib.request.urlopen(OCD_URL, timeout=30) as r: body = r.read().decode("utf-8", errors="replace") except Exception as e: print(f"Fetch failed: {e}", file=sys.stderr); return 2 reader = csv.DictReader(io.StringIO(body)) rows = list(reader) print(f"Parsed {len(rows)} rows", file=sys.stderr) conn = psycopg2.connect(dsn) cur = conn.cursor() inserted = 0 skipped = 0 for r in rows: name = (r.get("Cookie / Data Key name") or "").strip() domain = (r.get("Domain") or "").strip() if not name: skipped += 1 continue category_raw = (r.get("Category") or "").strip().lower() actual_category = CATEGORY_MAP.get(category_raw, "unknown") vendor = (r.get("Platform") or r.get("Data Controller") or "Unknown").strip() purpose = (r.get("Description") or "").strip()[:1000] privacy_url = (r.get("User Privacy & GDPR Rights Portals") or "").strip() max_age = parse_max_age(r.get("Retention period") or "") # Wildcard match flag → domain_pattern domain_pattern = domain or "*" cur.execute( """ INSERT INTO compliance.cookie_library (cookie_name, domain_pattern, vendor_name, vendor_privacy_url, actual_category, purpose_en, typical_max_age_seconds, source_name, source_url, source_license, confidence) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING """, (name, domain_pattern, vendor[:200], privacy_url or None, actual_category, purpose or None, max_age, "Open Cookie Database", OCD_URL, "CC0-1.0", 0.75), ) inserted += cur.rowcount conn.commit() print(f"\nInserted {inserted}, skipped {skipped}") cur.execute("SELECT actual_category, COUNT(*) " "FROM compliance.cookie_library GROUP BY actual_category " "ORDER BY 2 DESC") for row in cur.fetchall(): print(f" {row[0]:15s}: {row[1]}") return 0 if __name__ == "__main__": sys.exit(main())