Files
breakpilot-compliance/backend-compliance/compliance/services/compliance_user_agent.py
T
Benjamin Admin 6c223c7c9b
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m43s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(compliance-check): exec-summary + voll-audit + TDM-respect + cookie-KB-extended + saving-scan-funnel
P1 — Exec-Summary oben im Email-Report (4 KPIs + 2 CTAs, dunkler Gradient)
P3 — no_direct_sales-Flag fuer OEM-Konfigurator-Sites; AGB/Widerruf als
     "NICHT ANWENDBAR" (grau) statt "NICHT GEFUNDEN" (rot)
P5 — Voll-Audit Unification: alle Findings (MC + Pflichtangaben + Vendor +
     Redundanz) in /data/compliance_audits.db.unified_findings; neuer
     /api/compliance/agent/findings/<id> Endpoint + FindingsTab im Audit-UI
     mit Filter + CSV-Export
P7 — Crawl-Hardening: TDM-Reservation-Check (robots.txt / ai.txt / Header /
     Meta) vor jedem Run mit 24h-Cache; HeadlessChrome-UA (Firma noch nicht
     gegruendet — Switch via BREAKPILOT_BRANDED_UA env); per-Domain
     Rate-Limit 1 req/s + max 2 concurrent
P2 — Cookie-Knowledge-DB additiv erweitert (35 -> 74 Cookies): Adobe, Meta,
     Microsoft, LinkedIn, TikTok, HubSpot, Marketo, Salesforce, Hotjar,
     FullStory, Mouseflow, Intercom, Drift, Zendesk, Cloudflare, Stripe,
     OneTrust/Cookiebot/Usercentrics, Matomo, Pinterest, Snapchat, X/Twitter,
     YouTube, Vimeo, Klaviyo, Mailchimp, Mixpanel, Segment, Amplitude,
     Optimizely, Datadog; Wire-in in cookie_function_classifier liefert
     compliance_risk-Label (kritisch/hoch/mittel/gering) pro Vendor
A  — k-Anonymitaets-Helper (benchmark_k_anonymity) fuer P6-Vorbereitung
B  — Cross-Tenant-Domain-Assertion im /findings-Endpoint (expected_domain
     Query-Param -> 403 bei Mismatch)
C  — Saving-Scan-Funnel: /api/compliance/agent/saving-scan/start mit
     Validierung + 24h-Rate-Limit pro Domain + Lead-Persistenz in
     saving_scan_leads + Auto-Discovery via _run_compliance_check; 6 Tests
D  — Risk-Badge im Email-Vendor-Row

Rechtliche Leitplanken (Memory feedback_oem_data_legal.md): nur eigene
Knapp-Bewertungen + Source-Pointer, keine 1:1-Kopien fremder CMP-Texte.
TDM-Opt-Out-Respect nach § 44b UrhG. KEINE Schema-Aenderungen — alles in
Sidecar-SQLite.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 23:48:34 +02:00

142 lines
4.3 KiB
Python

"""
Zentraler User-Agent-Provider + Domain-Rate-Limiter fuer alle Crawls.
UA-Switch ist Trigger-gebunden an Firmengruendung:
- aktuell (Vor-Gruendung): generischer Headless-Chrome-UA
- nach Gruendung: env BREAKPILOT_BRANDED_UA=1 setzen
-> "BreakPilot-Compliance-Scanner/1.0 (+https://...)"
Memory: project_legal_contracts_2026_07.md (Punkt 0).
Rate-Limit:
- Default 1 req/sec/Domain, max 2 concurrent pro Domain.
- Saving-Scan-Funnel separat: max 1 vollstaendiger Run / Domain / 24h.
"""
from __future__ import annotations
import asyncio
import os
import time
from collections import defaultdict
from urllib.parse import urlparse
# User-Agent that names the scanner and links to its info page. Returned by
# crawler_user_agent() only when BREAKPILOT_BRANDED_UA is switched on.
_BRANDED_UA = (
"BreakPilot-Compliance-Scanner/1.0 "
"(+https://breakpilot.ai/scanner)"
)
# Generic headless-Chrome User-Agent. Returned by crawler_user_agent() whenever
# the branded switch is not set.
_NEUTRAL_UA = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36"
)
def crawler_user_agent() -> str:
    """Return the User-Agent string to send with every outgoing crawl.

    The branded scanner UA is selected when the ``BREAKPILOT_BRANDED_UA``
    environment variable is ``1``, ``true`` or ``yes`` (surrounding
    whitespace and letter case are ignored). In every other case, including
    an unset or empty variable, the neutral headless-Chrome UA is returned.
    """
    flag = os.environ.get("BREAKPILOT_BRANDED_UA") or ""
    use_branded = flag.strip().lower() in ("1", "true", "yes")
    return _BRANDED_UA if use_branded else _NEUTRAL_UA
def default_request_headers() -> dict:
    """Build the complete header set used for httpx calls.

    Returns a fresh dict on every call containing the current crawler
    User-Agent plus fixed ``Accept`` and ``Accept-Language`` values.
    """
    headers = {}
    headers["User-Agent"] = crawler_user_agent()
    headers["Accept"] = (
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    )
    headers["Accept-Language"] = "de-DE,de;q=0.9,en;q=0.8"
    return headers
def base_domain_of(url_or_host: str) -> str:
    """Normalize a URL or bare host name to a lower-cased host key.

    Args:
        url_or_host: A full URL (``https://www.example.com/x``) or a bare
            host (``example.com``).

    Returns:
        The lower-cased network location with a single leading ``www.``
        removed. An empty input yields ``""``. If no network location can
        be extracted, the (scheme-prefixed) input is returned unchanged.
    """
    if not url_or_host:
        return ""
    # urlparse only populates ``netloc`` when a scheme separator is present,
    # so bare hosts get a scheme prepended before parsing.
    if "://" not in url_or_host:
        url_or_host = "https://" + url_or_host
    netloc = urlparse(url_or_host).netloc.lower()
    # Strip only a *leading* "www." label. Replacing the substring anywhere
    # would corrupt hosts such as "awww.example.com" or
    # "shop.www.example.com" and make distinct hosts share one key.
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc or url_or_host
# --- per-domain rate limit ----------------------------------------------
# Minimum spacing, in seconds, that DomainRateLimiter keeps between two
# requests to the same domain.
_MIN_INTERVAL_S = 1.0 # 1 req/sec/domain
# Initial value of every per-domain semaphore created by _get_semaphore().
_MAX_CONCURRENT_PER_DOMAIN = 2
# domain -> time.monotonic() value recorded by DomainRateLimiter.
# Domains never seen before read as 0.0 (defaultdict(float)).
_last_request_at: dict[str, float] = defaultdict(float)
# domain -> semaphore; entries are created lazily by _get_semaphore().
_semaphores: dict[str, asyncio.Semaphore] = {}
# Held by _get_semaphore() while it looks up or creates a _semaphores entry.
_locks_lock = asyncio.Lock()
async def _get_semaphore(domain: str) -> asyncio.Semaphore:
    """Return the semaphore for ``domain``, creating it on first use.

    The lookup and the creation both happen while ``_locks_lock`` is held.
    A new semaphore starts with ``_MAX_CONCURRENT_PER_DOMAIN`` free slots.
    """
    async with _locks_lock:
        if domain not in _semaphores:
            _semaphores[domain] = asyncio.Semaphore(_MAX_CONCURRENT_PER_DOMAIN)
        return _semaphores[domain]
class DomainRateLimiter:
    """Async context manager that throttles requests per domain.

    Entering waits for a free per-domain concurrency slot and then for the
    next permitted send time; leaving releases the slot again.

    Usage::

        async with DomainRateLimiter(url):
            resp = await client.get(url)
    """

    def __init__(self, url_or_domain: str):
        """Store the normalized domain key for ``url_or_domain``."""
        self.domain = base_domain_of(url_or_domain)

    async def __aenter__(self):
        """Acquire a concurrency slot and wait for this request's send time.

        Returns:
            This limiter instance.
        """
        sem = await _get_semaphore(self.domain)
        await sem.acquire()
        try:
            now = time.monotonic()
            # Reserve the send slot *before* sleeping. There is no await
            # between reading and writing the timestamp, so two tasks that
            # hold the semaphore at the same time can never pick the same
            # slot and fire simultaneously.
            slot = max(now, _last_request_at[self.domain] + _MIN_INTERVAL_S)
            _last_request_at[self.domain] = slot
            if slot > now:
                await asyncio.sleep(slot - now)
        except BaseException:
            # __aexit__ is not invoked when __aenter__ raises (for example
            # on cancellation during the sleep), so give the slot back here.
            sem.release()
            raise
        self._sem = sem
        return self

    async def __aexit__(self, exc_type, exc, tb):
        """Release the concurrency slot; exceptions are never suppressed."""
        self._sem.release()
        return False
# --- per-domain "1 full run / 24h" (saving scan) -----------------------
# Path of the SQLite file queried by saving_scan_allowed(); can be overridden
# with the COMPLIANCE_AUDIT_DB environment variable.
_DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")
# Minimum gap between two saving scans of the same domain: 24 hours, in seconds.
_SAVING_SCAN_INTERVAL_S = 24 * 3600
def saving_scan_allowed(domain_or_url: str) -> tuple[bool, int]:
    """Report whether a saving scan may run for this domain right now.

    Reads the newest ``ts`` for the normalized domain from the ``check_runs``
    table of the SQLite database at ``_DB_PATH`` and compares its age with
    ``_SAVING_SCAN_INTERVAL_S``.

    Args:
        domain_or_url: Domain or URL of the site to be scanned.

    Returns:
        ``(allowed, seconds_until_allowed)``. ``seconds_until_allowed`` is 0
        whenever ``allowed`` is True.

    The check fails open: an empty domain, a missing row, or any error while
    reading or parsing yields ``(True, 0)``. Errors are logged as warnings.
    """
    import logging
    import sqlite3
    from contextlib import closing
    from datetime import datetime

    domain = base_domain_of(domain_or_url)
    if not domain:
        return True, 0
    try:
        # sqlite3's own connection context manager only commits or rolls
        # back; closing() makes sure the connection is actually closed.
        with closing(sqlite3.connect(_DB_PATH)) as conn:
            row = conn.execute(
                "SELECT MAX(ts) FROM check_runs WHERE base_domain=?",
                (domain,),
            ).fetchone()
        last = row[0] if row else None
        if not last:
            return True, 0
        # NOTE(review): a timestamp without UTC offset is interpreted as
        # local time by .timestamp() — confirm how check_runs.ts is written.
        elapsed = time.time() - datetime.fromisoformat(last).timestamp()
        if elapsed >= _SAVING_SCAN_INTERVAL_S:
            return True, 0
        return False, int(_SAVING_SCAN_INTERVAL_S - elapsed)
    except Exception:
        # Deliberately best-effort: a broken or missing audit DB must not
        # block scans, but the failure should be visible in the logs.
        logging.getLogger(__name__).warning(
            "saving_scan_allowed: lookup failed for %s; allowing scan",
            domain,
            exc_info=True,
        )
        return True, 0