""" Zentraler User-Agent-Provider + Domain-Rate-Limiter fuer alle Crawls. UA-Switch ist Trigger-gebunden an Firmengruendung: - aktuell (Vor-Gruendung): generischer Headless-Chrome-UA - nach Gruendung: env BREAKPILOT_BRANDED_UA=1 setzen -> "BreakPilot-Compliance-Scanner/1.0 (+https://...)" Memory: project_legal_contracts_2026_07.md (Punkt 0). Rate-Limit: - Default 1 req/sec/Domain, max 2 concurrent pro Domain. - Saving-Scan-Funnel separat: max 1 vollstaendiger Run / Domain / 24h. """ from __future__ import annotations import asyncio import os import time from collections import defaultdict from urllib.parse import urlparse _BRANDED_UA = ( "BreakPilot-Compliance-Scanner/1.0 " "(+https://breakpilot.ai/scanner)" ) _NEUTRAL_UA = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36" ) def crawler_user_agent() -> str: """Aktueller UA-String fuer alle ausgehenden Crawls. Switcht auf den Markennamen sobald BREAKPILOT_BRANDED_UA=1 gesetzt wird (nach Firmengruendung — siehe Memory). """ branded = (os.getenv("BREAKPILOT_BRANDED_UA") or "").strip().lower() if branded in ("1", "true", "yes"): return _BRANDED_UA return _NEUTRAL_UA def default_request_headers() -> dict: """Vollstaendiger Header-Satz fuer httpx-Calls.""" return { "User-Agent": crawler_user_agent(), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "de-DE,de;q=0.9,en;q=0.8", } def base_domain_of(url_or_host: str) -> str: if not url_or_host: return "" if "://" not in url_or_host: url_or_host = "https://" + url_or_host netloc = urlparse(url_or_host).netloc.lower() return netloc.replace("www.", "") or url_or_host # --- per-Domain Rate-Limit ---------------------------------------------- _MIN_INTERVAL_S = 1.0 # 1 req/sec/Domain _MAX_CONCURRENT_PER_DOMAIN = 2 _last_request_at: dict[str, float] = defaultdict(float) _semaphores: dict[str, asyncio.Semaphore] = {} _locks_lock = asyncio.Lock() async def _get_semaphore(domain: str) -> asyncio.Semaphore: async with _locks_lock: sem = _semaphores.get(domain) if sem is None: sem = asyncio.Semaphore(_MAX_CONCURRENT_PER_DOMAIN) _semaphores[domain] = sem return sem class DomainRateLimiter: """Async-Context: warte vor Request + halte concurrent-Slot. async with DomainRateLimiter(url): resp = await client.get(url) """ def __init__(self, url_or_domain: str): self.domain = base_domain_of(url_or_domain) async def __aenter__(self): sem = await _get_semaphore(self.domain) await sem.acquire() last = _last_request_at[self.domain] wait = (last + _MIN_INTERVAL_S) - time.monotonic() if wait > 0: await asyncio.sleep(wait) _last_request_at[self.domain] = time.monotonic() self._sem = sem return self async def __aexit__(self, exc_type, exc, tb): self._sem.release() return False # --- per-Domain "1 full run / 24h" (Saving-Scan) ----------------------- _DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db") _SAVING_SCAN_INTERVAL_S = 24 * 3600 def saving_scan_allowed(domain_or_url: str) -> tuple[bool, int]: """True wenn fuer diese Domain in den letzten 24h kein Saving-Scan lief. Liest aus compliance_audit_log.check_runs (existierende Tabelle). Liefert (allowed, seconds_until_allowed). """ import sqlite3 domain = base_domain_of(domain_or_url) if not domain: return True, 0 try: with sqlite3.connect(_DB_PATH) as conn: row = conn.execute( "SELECT MAX(ts) FROM check_runs WHERE base_domain=?", (domain,), ).fetchone() last = row[0] if row else None if not last: return True, 0 from datetime import datetime elapsed = time.time() - datetime.fromisoformat(last).timestamp() if elapsed >= _SAVING_SCAN_INTERVAL_S: return True, 0 return False, int(_SAVING_SCAN_INTERVAL_S - elapsed) except Exception: return True, 0