# breakpilot-compliance/backend-compliance/compliance/services/tdm_reservation_check.py

"""
TDM-Reservation-Check (§ 44b UrhG / EU CDSM Art. 4).

Prueft pro Domain ob ein maschinenlesbarer Nutzungsvorbehalt fuer
Text-and-Data-Mining gesetzt ist. Quellen:
  1. robots.txt — User-agent: * Disallow: /  (oder spezifisch fuer uns)
  2. /ai.txt — neuer OpenAI-Standard
  3. HTTP-Header `tdm-reservation: 1` auf Homepage
  4. HTML <meta name="tdm-reservation" content="1"> auf Homepage
  5. HTML <meta name="robots" content="noai|noimageai"> Tags

Status-Interpretation:
  status=allowed   -> kein Vorbehalt, crawlbar
  status=reserved  -> expliziter Vorbehalt, NICHT crawlen
  status=denied    -> robots.txt-Zugriff aktiv blockiert (403/401)
                      => konservativ: NICHT crawlen
  status=unknown   -> Server-Error (500/timeout/DNS) auf robots.txt
                      => crawlbar, aber 24h-Recheck markiert

Cache via sidecar SQLite (gleiche DB wie compliance_audit_log), 24h TTL.
"""

from __future__ import annotations

import json
import logging
import os
import re
import sqlite3
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Literal
from urllib.parse import urlparse

import httpx

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Path of the SQLite database holding the cache table; can be overridden
# through the COMPLIANCE_AUDIT_DB environment variable.
DB_PATH = os.environ.get("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")
# Cache entries older than this many seconds (24 hours) are treated as stale.
CACHE_TTL_SECONDS = 24 * 60 * 60

# Closed set of result states a reservation check can produce.
Status = Literal["allowed", "reserved", "denied", "unknown"]

# Browser-like User-Agent string used for the probe requests.
_DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36"
)


def _ensure_cache_table() -> None:
    """Create the TDM cache table and its timestamp index if missing.

    The parent directory of ``DB_PATH`` is created first so that SQLite can
    create the database file on first use.

    Raises:
        OSError: If the parent directory cannot be created.
        sqlite3.Error: If the database cannot be opened or the DDL fails.
    """
    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        # ``with conn`` only commits or rolls back; it does not close the
        # connection, hence the explicit close() in ``finally``.
        with conn:
            conn.executescript("""
            CREATE TABLE IF NOT EXISTS tdm_reservation_cache (
                domain     TEXT PRIMARY KEY,
                ts         TEXT NOT NULL,
                status     TEXT NOT NULL,
                signals    TEXT NOT NULL    -- JSON list[dict]
            );
            CREATE INDEX IF NOT EXISTS idx_tdm_ts ON tdm_reservation_cache(ts);
        """)
    finally:
        conn.close()


def _cache_get(domain: str) -> dict | None:
    """Return the cached check result for *domain* if it is still fresh.

    Args:
        domain: Cache key, as produced by ``_base_domain``.

    Returns:
        A result dict with ``domain``, ``status``, ``signals``, ``ts`` and
        ``cached=True``, or ``None`` when there is no entry, the entry is
        older than ``CACHE_TTL_SECONDS``, or the lookup failed. Lookup
        failures are logged at debug level and treated as a cache miss.
    """
    try:
        _ensure_cache_table()
        conn = sqlite3.connect(DB_PATH)
        try:
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                "SELECT * FROM tdm_reservation_cache WHERE domain=?", (domain,),
            ).fetchone()
            if row is None:
                return None
            # Copy the values out before the connection is closed.
            ts_text = row["ts"]
            status = row["status"]
            signals_json = row["signals"]
        finally:
            # The sqlite3 context manager never closes the connection, so
            # close it explicitly instead of leaking a handle per lookup.
            conn.close()

        age_seconds = time.time() - datetime.fromisoformat(ts_text).timestamp()
        if age_seconds > CACHE_TTL_SECONDS:
            return None
        return {
            "domain": domain,
            "status": status,
            "signals": json.loads(signals_json),
            "cached": True,
            "ts": ts_text,
        }
    except Exception as e:
        # Best effort: a broken cache must never break the check itself.
        logger.debug("tdm cache_get failed for %s: %s", domain, e)
        return None


def _cache_put(domain: str, status: Status, signals: list[dict]) -> None:
    """Store (or overwrite) the check result for *domain* in the cache.

    Args:
        domain: Cache key, as produced by ``_base_domain``.
        status: Final status of the check.
        signals: Evidence collected during the check; stored as JSON text.

    Failures are logged as warnings and otherwise ignored, so a cache write
    problem never propagates to the caller.
    """
    try:
        _ensure_cache_table()
        conn = sqlite3.connect(DB_PATH)
        try:
            # ``with conn`` commits on success and rolls back on error.
            with conn:
                conn.execute(
                    "INSERT OR REPLACE INTO tdm_reservation_cache "
                    "(domain, ts, status, signals) VALUES (?, ?, ?, ?)",
                    (
                        domain,
                        datetime.now(timezone.utc).isoformat(),
                        status,
                        json.dumps(signals, ensure_ascii=False),
                    ),
                )
        finally:
            # The context manager above does not close the connection.
            conn.close()
    except Exception as e:
        logger.warning("tdm cache_put failed for %s: %s", domain, e)


def _base_domain(url_or_domain: str) -> str:
    """Normalise a URL or bare host name to a lower-case cache key.

    Args:
        url_or_domain: Full URL (``https://www.example.com/x``) or a bare
            host, optionally followed by a path.

    Returns:
        The lower-cased network location with a single leading ``www.``
        label removed, or ``""`` for empty input. A port, if present, is
        kept as part of the result.
    """
    if not url_or_domain:
        return ""
    if "://" not in url_or_domain:
        # urlparse only fills in netloc when a scheme separator is present.
        url_or_domain = "https://" + url_or_domain
    netloc = urlparse(url_or_domain).netloc.lower()
    # Strip only a leading "www." label; a blanket str.replace would also
    # mangle hosts such as "mywww.de" or "awww.example.org".
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc


async def _fetch_status(client: httpx.AsyncClient, url: str) -> tuple[int, str, dict]:
    """GET *url* and return ``(status_code, body, headers)``.

    The body is limited to the first 16384 characters of the decoded
    response text and is ``""`` when the response has no content. Any
    exception raised while fetching or reading the response is logged at
    debug level and reported as ``(0, "", {})``.
    """
    try:
        response = await client.get(url)
        if response.content:
            text = response.text[:16384]
        else:
            text = ""
        return response.status_code, text, dict(response.headers)
    except Exception as exc:
        logger.debug("tdm fetch %s failed: %s", url, exc)
        return 0, "", {}


def _robots_disallows_us(body: str) -> bool:
    """Return True if robots.txt has ``Disallow: /`` in a group that covers us.

    A group covers us when one of its ``User-agent`` lines names ``*`` or
    one of the AI/crawler agents listed below (compared case-insensitively).

    Group handling follows RFC 9309:
      * blank lines and comment-only lines are ignored and do not end a
        group;
      * consecutive ``User-agent`` lines share the rules that follow them;
      * a ``User-agent`` line that comes after an ``Allow``/``Disallow``
        rule starts a new group.

    Args:
        body: Raw robots.txt text.

    Returns:
        True only for a literal ``Disallow: /`` inside a relevant group; an
        empty ``Disallow:`` (which allows everything) does not count.
    """
    if not body:
        return False
    relevant_groups = frozenset((
        "*", "claudebot", "anthropic-ai", "gptbot",
        "google-extended", "ccbot", "breakpilot",
    ))
    in_our_group = False
    seen_rule = False  # True once the current group has an allow/disallow rule
    # A leading UTF-8 BOM would otherwise hide the very first directive.
    for raw in body.lstrip("\ufeff").splitlines():
        line = raw.split("#", 1)[0].strip()
        if not line or ":" not in line:
            continue
        key, val = (part.strip().lower() for part in line.split(":", 1))
        if key == "user-agent":
            if seen_rule:
                # Rules were already seen, so this line opens a new group.
                in_our_group = False
                seen_rule = False
            in_our_group = in_our_group or val in relevant_groups
        elif key in ("allow", "disallow"):
            seen_rule = True
            if key == "disallow" and in_our_group and val == "/":
                return True
    return False


# Start of a <meta ...> tag; group 1 is the raw attribute text up to the
# closing ">" (or the end of the input when the body was truncated mid-tag).
_META_TAG_RE = re.compile(r"<meta\b([^>]*)")
# One attribute=value pair with a double-quoted, single-quoted or bare value.
_META_ATTR_RE = re.compile(
    r"""([a-z][a-z0-9_:.-]*)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))"""
)
# Content tokens that signal an AI/TDM opt-out.
_AI_OPT_OUT_TOKENS = frozenset(("noai", "noimageai"))


def _meta_has_reservation(body: str) -> bool:
    """Return True if the HTML contains a ``<meta>`` tag reserving TDM rights.

    Two kinds of tags count as a reservation:
      * ``<meta name="tdm-reservation" content="1">``
      * any ``<meta>`` whose ``content`` lists ``noai`` or ``noimageai`` as
        a comma/space separated token (e.g. ``name="robots"``).

    Matching is case-insensitive and does not depend on attribute order,
    quote style, or surrounding whitespace.

    Args:
        body: HTML text of the page (may be truncated).
    """
    if not body:
        return False
    for tag in _META_TAG_RE.finditer(body.lower()):
        attrs: dict[str, str] = {}
        for attr in _META_ATTR_RE.finditer(tag.group(1)):
            # Exactly one of the three value groups matched.
            value = next(g for g in attr.groups()[1:] if g is not None)
            # First occurrence of an attribute wins, as in HTML parsing.
            attrs.setdefault(attr.group(1), value.strip())
        content = attrs.get("content", "")
        if attrs.get("name") == "tdm-reservation" and content == "1":
            return True
        if _AI_OPT_OUT_TOKENS.intersection(content.replace(",", " ").split()):
            return True
    return False


async def _probe_robots(
    client: httpx.AsyncClient, domain: str,
) -> tuple[str | None, str, int, str]:
    """Fetch robots.txt from the first origin of *domain* that answers.

    Tries ``www.<domain>`` first (https, then http) and falls back to the
    bare host, so hosts without a ``www`` alias are still probed.

    Returns:
        ``(origin, scheme, status_code, body)``. ``origin`` is
        ``"<scheme>://<host>"`` of the origin that answered, or ``None``
        (with ``status_code`` 0) when every attempt failed.
    """
    for host in (f"www.{domain}", domain):
        for scheme in ("https", "http"):
            origin = f"{scheme}://{host}"
            code, body, _ = await _fetch_status(client, f"{origin}/robots.txt")
            if code != 0:
                return origin, scheme, code, body
    return None, "http", 0, ""


async def check_tdm_reservation(domain_or_url: str) -> dict:
    """Probe a domain for machine-readable TDM reservations.

    Probes, in order: robots.txt, ``/ai.txt``, the ``tdm-reservation``
    response header of the homepage, and reservation ``<meta>`` tags in the
    homepage HTML. Probing stops at the first explicit reservation or
    denial. A fresh cache entry short-circuits all network access; a newly
    computed result is written to the cache.

    Status values:
      * ``allowed``  - no reservation found.
      * ``reserved`` - an explicit reservation signal was found.
      * ``denied``   - robots.txt answered 401/403.
      * ``unknown``  - robots.txt could not be evaluated (network failure
        or a status other than 200/404) and no explicit reservation was
        found by the remaining probes.

    Args:
        domain_or_url: Host name or URL of the site to check.

    Returns:
        ``{domain, status, signals: [{src, ...}], cached, ts}``. For empty
        input the status is ``unknown`` and no ``ts`` is included.
    """
    domain = _base_domain(domain_or_url)
    if not domain:
        return {"domain": "", "status": "unknown", "signals": [], "cached": False}

    cached = _cache_get(domain)
    if cached:
        return cached

    signals: list[dict] = []
    status: Status = "allowed"

    headers = {"User-Agent": _DEFAULT_UA, "Accept": "*/*"}
    async with httpx.AsyncClient(
        timeout=12.0, follow_redirects=True, headers=headers,
    ) as client:
        origin, scheme, r_code, r_body = await _probe_robots(client, domain)
        robots_signal = {"src": "robots.txt", "status_code": r_code,
                         "scheme": scheme}
        signals.append(robots_signal)
        if r_code in (401, 403):
            status = "denied"
        elif r_code == 200 and _robots_disallows_us(r_body):
            status = "reserved"
            robots_signal["detail"] = "Disallow: / for relevant UA group"
        elif r_code not in (200, 404):
            status = "unknown"

        # An unreadable robots.txt must not hide an explicit reservation
        # published elsewhere, so keep probing on "unknown" as long as the
        # host answered at all. Follow-up probes reuse the origin that
        # answered instead of assuming https://www.<domain>.
        keep_probing = origin is not None and status in ("allowed", "unknown")

        if keep_probing:
            ai_code, _, _ = await _fetch_status(client, f"{origin}/ai.txt")
            if ai_code == 200:
                status = "reserved"
                signals.append({"src": "ai.txt", "status_code": 200,
                                "detail": "ai.txt present"})

        if keep_probing and status != "reserved":
            h_code, h_body, h_hdrs = await _fetch_status(client, f"{origin}/")
            if h_code == 200:
                if h_hdrs.get("tdm-reservation") == "1":
                    status = "reserved"
                    signals.append({"src": "http-header",
                                    "detail": "tdm-reservation: 1"})
                elif _meta_has_reservation(h_body):
                    status = "reserved"
                    signals.append({"src": "html-meta",
                                    "detail": "noai/tdm-reservation meta"})

    _cache_put(domain, status, signals)
    return {
        "domain": domain,
        "status": status,
        "signals": signals,
        "cached": False,
        "ts": datetime.now(timezone.utc).isoformat(),
    }


def is_crawl_allowed(result: dict) -> bool:
    """Tell whether a check result permits crawling.

    Only the statuses ``allowed`` and ``unknown`` permit crawling. A missing
    or empty ``status`` is treated like ``unknown``.
    """
    status = result.get("status")
    if not status:
        status = "unknown"
    return status == "allowed" or status == "unknown"