""" TDM-Reservation-Check (ยง 44b UrhG / EU CDSM Art. 4). Prueft pro Domain ob ein maschinenlesbarer Nutzungsvorbehalt fuer Text-and-Data-Mining gesetzt ist. Quellen: 1. robots.txt โ€” User-agent: * Disallow: / (oder spezifisch fuer uns) 2. /ai.txt โ€” neuer OpenAI-Standard 3. HTTP-Header `tdm-reservation: 1` auf Homepage 4. HTML auf Homepage 5. HTML Tags Status-Interpretation: status=allowed -> kein Vorbehalt, crawlbar status=reserved -> expliziter Vorbehalt, NICHT crawlen status=denied -> robots.txt-Zugriff aktiv blockiert (403/401) => konservativ: NICHT crawlen status=unknown -> Server-Error (500/timeout/DNS) auf robots.txt => crawlbar, aber 24h-Recheck markiert Cache via sidecar SQLite (gleiche DB wie compliance_audit_log), 24h TTL. """ from __future__ import annotations import json import logging import os import sqlite3 import time from datetime import datetime, timezone from pathlib import Path from typing import Literal from urllib.parse import urlparse import httpx logger = logging.getLogger(__name__) DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db") CACHE_TTL_SECONDS = 24 * 3600 Status = Literal["allowed", "reserved", "denied", "unknown"] _DEFAULT_UA = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36" ) def _ensure_cache_table() -> None: Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True) with sqlite3.connect(DB_PATH) as conn: conn.executescript(""" CREATE TABLE IF NOT EXISTS tdm_reservation_cache ( domain TEXT PRIMARY KEY, ts TEXT NOT NULL, status TEXT NOT NULL, signals TEXT NOT NULL -- JSON list[dict] ); CREATE INDEX IF NOT EXISTS idx_tdm_ts ON tdm_reservation_cache(ts); """) def _cache_get(domain: str) -> dict | None: try: _ensure_cache_table() with sqlite3.connect(DB_PATH) as conn: conn.row_factory = sqlite3.Row row = conn.execute( "SELECT * FROM tdm_reservation_cache WHERE domain=?", (domain,), ).fetchone() if not row: return None ts = datetime.fromisoformat(row["ts"]).timestamp() if time.time() - ts > CACHE_TTL_SECONDS: return None return { "domain": domain, "status": row["status"], "signals": json.loads(row["signals"]), "cached": True, "ts": row["ts"], } except Exception as e: logger.debug("tdm cache_get failed for %s: %s", domain, e) return None def _cache_put(domain: str, status: Status, signals: list[dict]) -> None: try: _ensure_cache_table() with sqlite3.connect(DB_PATH) as conn: conn.execute( "INSERT OR REPLACE INTO tdm_reservation_cache " "(domain, ts, status, signals) VALUES (?, ?, ?, ?)", ( domain, datetime.now(timezone.utc).isoformat(), status, json.dumps(signals, ensure_ascii=False), ), ) conn.commit() except Exception as e: logger.warning("tdm cache_put failed for %s: %s", domain, e) def _base_domain(url_or_domain: str) -> str: if not url_or_domain: return "" if "://" not in url_or_domain: url_or_domain = "https://" + url_or_domain netloc = urlparse(url_or_domain).netloc.lower() return netloc.replace("www.", "") async def _fetch_status(client: httpx.AsyncClient, url: str) -> tuple[int, str, dict]: """Return (status_code, body, headers). Body capped at 16 KiB.""" try: resp = await client.get(url) body = resp.text[:16384] if resp.content else "" return resp.status_code, body, dict(resp.headers) except Exception as e: logger.debug("tdm fetch %s failed: %s", url, e) return 0, "", {} def _robots_disallows_us(body: str) -> bool: """Parse robots.txt โ€” true if our group has Disallow: /.""" if not body: return False relevant_groups = ["*", "claudebot", "anthropic-ai", "gptbot", "google-extended", "ccbot", "breakpilot"] current_uas: list[str] = [] in_our_group = False for raw in body.splitlines(): line = raw.split("#", 1)[0].strip() if not line: in_our_group = False current_uas = [] continue if ":" not in line: continue key, val = (s.strip().lower() for s in line.split(":", 1)) if key == "user-agent": current_uas.append(val) in_our_group = any(ua in relevant_groups for ua in current_uas) elif key == "disallow" and in_our_group: if val == "/" or val == "": if val == "/": return True return False def _meta_has_reservation(body: str) -> bool: """Detect with noai/noimageai/1.""" low = body.lower() needles = [ 'name="tdm-reservation" content="1"', "name='tdm-reservation' content='1'", '"noai"', '"noimageai"', "content=\"noai", "content='noai", ] return any(n in low for n in needles) async def check_tdm_reservation(domain_or_url: str) -> dict: """Probe a domain for machine-readable TDM reservations. Returns: { domain, status, signals: [{src, detail}], cached, ts } """ domain = _base_domain(domain_or_url) if not domain: return {"domain": "", "status": "unknown", "signals": [], "cached": False} cached = _cache_get(domain) if cached: return cached signals: list[dict] = [] status: Status = "allowed" headers = {"User-Agent": _DEFAULT_UA, "Accept": "*/*"} async with httpx.AsyncClient( timeout=12.0, follow_redirects=True, headers=headers, ) as client: for scheme in ("https", "http"): r_code, r_body, _ = await _fetch_status( client, f"{scheme}://www.{domain}/robots.txt", ) if r_code == 0 and scheme == "https": continue signals.append({"src": "robots.txt", "status_code": r_code, "scheme": scheme}) if r_code in (401, 403): status = "denied" elif r_code == 200 and _robots_disallows_us(r_body): status = "reserved" signals[-1]["detail"] = "Disallow: / for relevant UA group" elif r_code not in (200, 404): status = "unknown" break if status == "allowed": ai_code, _, _ = await _fetch_status( client, f"https://www.{domain}/ai.txt", ) if ai_code == 200: status = "reserved" signals.append({"src": "ai.txt", "status_code": 200, "detail": "ai.txt present"}) if status == "allowed": h_code, h_body, h_hdrs = await _fetch_status( client, f"https://www.{domain}/", ) if h_code == 200: if h_hdrs.get("tdm-reservation") == "1": status = "reserved" signals.append({"src": "http-header", "detail": "tdm-reservation: 1"}) elif _meta_has_reservation(h_body): status = "reserved" signals.append({"src": "html-meta", "detail": "noai/tdm-reservation meta"}) _cache_put(domain, status, signals) return { "domain": domain, "status": status, "signals": signals, "cached": False, "ts": datetime.now(timezone.utc).isoformat(), } def is_crawl_allowed(result: dict) -> bool: """Strict: only 'allowed' and 'unknown' are crawlable.""" return (result.get("status") or "unknown") in ("allowed", "unknown")