"""
TDM reservation check (§ 44b UrhG / EU CDSM Art. 4).
Checks, per domain, whether a machine-readable usage reservation for
text and data mining has been declared. Sources:
1. robots.txt - User-agent: * Disallow: / (or specific to us)
2. /ai.txt - new OpenAI standard
3. HTTP header `tdm-reservation: 1` on the homepage
4. HTML `<meta name="tdm-reservation" content="1">` on the homepage
5. HTML `<meta name="robots" content="noai">` tags
Status interpretation:
status=allowed -> no reservation, crawlable
status=reserved -> explicit reservation, do NOT crawl
status=denied -> access to robots.txt actively blocked (403/401)
=> conservative: do NOT crawl
status=unknown -> server error (500/timeout/DNS) on robots.txt
=> crawlable, but flagged for a 24h recheck
Cache via sidecar SQLite (same DB as compliance_audit_log), 24h TTL.
"""
from __future__ import annotations

import contextlib
import json
import logging
import os
import re
import sqlite3
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Literal
from urllib.parse import urlparse

import httpx
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# SQLite file that holds the reservation cache; the COMPLIANCE_AUDIT_DB
# environment variable overrides the default location.
DB_PATH = os.environ.get("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")

# Lifetime of a cached verdict: 24 hours, expressed in seconds.
CACHE_TTL_SECONDS = 24 * 60 * 60

# The four verdicts a reservation check can produce.
Status = Literal["allowed", "reserved", "denied", "unknown"]

# User-Agent header value sent with every probe request.
_DEFAULT_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
    + " (KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36"
)
def _ensure_cache_table() -> None:
    """Create the cache database file, table and index if they are missing.

    The parent directory of ``DB_PATH`` is created when needed. The schema
    script only uses ``CREATE ... IF NOT EXISTS`` statements, so calling this
    function repeatedly is harmless.

    Raises:
        OSError: If the parent directory cannot be created.
        sqlite3.Error: If the database cannot be opened or the script fails.
    """
    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
    # A sqlite3 connection used as a context manager only ends the
    # transaction; closing() is what actually releases the connection.
    with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
        conn.executescript("""
            CREATE TABLE IF NOT EXISTS tdm_reservation_cache (
            domain TEXT PRIMARY KEY,
            ts TEXT NOT NULL,
            status TEXT NOT NULL,
            signals TEXT NOT NULL -- JSON list[dict]
            );
            CREATE INDEX IF NOT EXISTS idx_tdm_ts ON tdm_reservation_cache(ts);
        """)
        conn.commit()
def _cache_get(domain: str) -> dict | None:
    """Return the cached verdict for *domain*, or ``None`` if unusable.

    Args:
        domain: Cache key, looked up in the ``domain`` column.

    Returns:
        A dict with the keys ``domain``, ``status``, ``signals``, ``cached``
        (always ``True``) and ``ts`` when a row exists and is at most
        ``CACHE_TTL_SECONDS`` old. ``None`` when there is no row, the row is
        older than the TTL, or any error occurs while reading it.
    """
    try:
        _ensure_cache_table()
        # closing() releases the connection; sqlite3's own context manager
        # would only end the transaction and leave the handle open.
        with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                "SELECT * FROM tdm_reservation_cache WHERE domain=?", (domain,),
            ).fetchone()
        if row is None:
            return None
        stored_at = datetime.fromisoformat(row["ts"]).timestamp()
        if time.time() - stored_at > CACHE_TTL_SECONDS:
            return None
        return {
            "domain": domain,
            "status": row["status"],
            "signals": json.loads(row["signals"]),
            "cached": True,
            "ts": row["ts"],
        }
    except Exception as e:
        # The cache is best-effort: a broken cache must never block a check.
        logger.debug("tdm cache_get failed for %s: %s", domain, e)
        return None
def _cache_put(domain: str, status: Status, signals: list[dict]) -> None:
    """Store (or overwrite) the verdict for *domain* in the cache table.

    The row is stamped with the current UTC time in ISO format and the
    signals are stored as a JSON string. Failures are logged as warnings and
    otherwise ignored, so a broken cache never fails the calling check.

    Args:
        domain: Cache key (primary key of the table).
        status: Verdict to store.
        signals: JSON-serialisable list of signal dicts.
    """
    try:
        _ensure_cache_table()
        # closing() releases the connection; sqlite3's own context manager
        # would only end the transaction and leave the handle open.
        with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
            conn.execute(
                "INSERT OR REPLACE INTO tdm_reservation_cache "
                "(domain, ts, status, signals) VALUES (?, ?, ?, ?)",
                (
                    domain,
                    datetime.now(timezone.utc).isoformat(),
                    status,
                    json.dumps(signals, ensure_ascii=False),
                ),
            )
            conn.commit()
    except Exception as e:
        logger.warning("tdm cache_put failed for %s: %s", domain, e)
def _base_domain(url_or_domain: str) -> str:
    """Reduce a URL or bare host name to a lower-cased host without ``www.``.

    Input without a scheme is parsed as if it started with ``https://``.
    Only a *leading* ``www.`` label is removed; occurrences elsewhere in the
    host (``awww.example.com``, ``shop.www.example.com``) are left intact.
    Whatever else ``urlparse`` reports as part of the netloc (for example a
    port) is kept.

    Args:
        url_or_domain: Full URL or bare domain; may be empty.

    Returns:
        The normalised netloc, or ``""`` for empty or whitespace-only input.
    """
    if not url_or_domain:
        return ""
    candidate = url_or_domain.strip()
    if not candidate:
        return ""
    if "://" not in candidate:
        # urlparse only fills in netloc when a scheme separator is present.
        candidate = "https://" + candidate
    netloc = urlparse(candidate).netloc.lower()
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc
async def _fetch_status(client: httpx.AsyncClient, url: str) -> tuple[int, str, dict]:
    """GET *url* and summarise the response as ``(status_code, body, headers)``.

    The body is the response text cut to its first 16384 characters, or an
    empty string when the response has no content. Any exception raised while
    fetching or reading the response is logged at debug level and reported as
    ``(0, "", {})``.
    """
    try:
        response = await client.get(url)
        if response.content:
            text = response.text[:16384]
        else:
            text = ""
        return response.status_code, text, dict(response.headers)
    except Exception as exc:
        logger.debug("tdm fetch %s failed: %s", url, exc)
        return 0, "", {}
def _robots_disallows_us(body: str) -> bool:
    """Report whether robots.txt blocks the whole site for a relevant agent.

    A group is a run of consecutive ``User-agent`` lines followed by its
    rule lines. A ``User-agent`` line that appears after at least one
    ``Allow``/``Disallow`` line starts a new group, so agents of an earlier
    group do not leak into a later one. Blank lines and other record types
    are ignored and do not end a group. ``Allow`` lines are not evaluated
    beyond marking where a group's rules begin.

    Args:
        body: Raw robots.txt text; may be empty.

    Returns:
        ``True`` if a group naming one of the relevant user agents contains
        ``Disallow: /`` (or the equivalent ``Disallow: /*``), else ``False``.
    """
    if not body:
        return False
    relevant_groups = {
        "*", "claudebot", "anthropic-ai", "gptbot",
        "google-extended", "ccbot", "breakpilot",
    }
    group_is_ours = False
    group_has_rules = False
    # A leading byte-order mark would otherwise hide the first directive.
    for raw in body.lstrip("\ufeff").splitlines():
        line = raw.split("#", 1)[0].strip()
        if not line or ":" not in line:
            continue
        key, _, val = line.partition(":")
        key = key.strip().lower()
        val = val.strip().lower()
        if key == "user-agent":
            if group_has_rules:
                # Rules were already seen, so this line opens a new group.
                group_is_ours = False
                group_has_rules = False
            if val in relevant_groups:
                group_is_ours = True
        elif key in ("allow", "disallow"):
            group_has_rules = True
            # An empty Disallow value blocks nothing and is not a reservation.
            if key == "disallow" and group_is_ours and val in ("/", "/*"):
                return True
    return False
# Patterns are applied to lower-cased HTML, so no case-insensitive flag is needed.
_META_TAG_RE = re.compile(r"<meta\b[^>]*>")
_META_ATTR_RE = re.compile(
    r"""([a-z][a-z0-9_:.-]*)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))"""
)
_META_TOKEN_SPLIT_RE = re.compile(r"[,\s]+")
_NOAI_TOKENS = frozenset({"noai", "noimageai"})


def _meta_has_reservation(body: str) -> bool:
    """Detect a TDM / AI opt-out declared in the HTML of a page.

    Two passes are made over the lower-cased text:

    1. The fixed substrings that were always recognised, such as
       ``name="tdm-reservation" content="1"`` or a quoted ``"noai"``.
    2. Every ``<meta ...>`` tag is parsed into attributes, so attribute
       order, extra whitespace, unquoted values and multi-token ``content``
       values (``content="noindex, noai"``) are recognised as well.

    Args:
        body: HTML text of the page; may be empty.

    Returns:
        ``True`` if a ``tdm-reservation`` meta tag with content ``1`` or a
        ``noai`` / ``noimageai`` token is found, else ``False``.
    """
    if not body:
        return False
    low = body.lower()
    needles = [
        'name="tdm-reservation" content="1"',
        "name='tdm-reservation' content='1'",
        '"noai"', '"noimageai"',
        "content=\"noai", "content='noai",
    ]
    if any(n in low for n in needles):
        return True
    for tag in _META_TAG_RE.findall(low):
        attrs: dict[str, str] = {}
        for match in _META_ATTR_RE.finditer(tag):
            value = match.group(2) or match.group(3) or match.group(4) or ""
            # Keep the first occurrence if an attribute is repeated.
            attrs.setdefault(match.group(1), value.strip())
        content = attrs.get("content", "")
        if attrs.get("name") == "tdm-reservation" and content == "1":
            return True
        if _NOAI_TOKENS.intersection(_META_TOKEN_SPLIT_RE.split(content)):
            return True
    return False
async def check_tdm_reservation(domain_or_url: str) -> dict:
    """Probe a domain for machine-readable TDM reservations.

    Probes, in order: ``robots.txt`` (https first, http only if the https
    request failed outright), ``/ai.txt``, then the homepage's
    ``tdm-reservation`` response header and its HTML meta tags. A fresh cache
    entry short-circuits all network access; every freshly computed verdict
    is written back to the cache.

    The ai.txt and homepage probes run when robots.txt raised no objection
    and also when robots.txt answered with an error status. The "unknown"
    verdict is treated as crawlable by ``is_crawl_allowed``, so an explicit
    reservation published elsewhere must still be able to override it.

    Args:
        domain_or_url: Bare domain or full URL of the site to check.

    Returns:
        A dict ``{domain, status, signals, cached, ts}`` where ``status`` is
        one of "allowed", "reserved", "denied" or "unknown" and ``signals``
        lists the evidence collected. For empty input the dict has status
        "unknown", an empty ``domain`` and no ``ts`` key.
    """
    domain = _base_domain(domain_or_url)
    if not domain:
        return {"domain": "", "status": "unknown", "signals": [], "cached": False}
    cached = _cache_get(domain)
    if cached:
        return cached
    signals: list[dict] = []
    status: Status = "allowed"
    robots_answered = False
    headers = {"User-Agent": _DEFAULT_UA, "Accept": "*/*"}
    # NOTE(review): every probe targets the "www." host. If that host does
    # not answer, the verdict ends up "unknown" - confirm this is intended
    # for sites served only from the apex or a subdomain.
    async with httpx.AsyncClient(
        timeout=12.0, follow_redirects=True, headers=headers,
    ) as client:
        for scheme in ("https", "http"):
            r_code, r_body, _ = await _fetch_status(
                client, f"{scheme}://www.{domain}/robots.txt",
            )
            if r_code == 0 and scheme == "https":
                # https failed without any response: retry once over http.
                continue
            signals.append({"src": "robots.txt", "status_code": r_code,
                            "scheme": scheme})
            robots_answered = r_code != 0
            if r_code in (401, 403):
                status = "denied"
            elif r_code == 200 and _robots_disallows_us(r_body):
                status = "reserved"
                signals[-1]["detail"] = "Disallow: / for relevant UA group"
            elif r_code not in (200, 404):
                status = "unknown"
            break
        # Keep probing on "unknown" only if the server actually answered;
        # an unreachable host would just add two more failed requests.
        probe_further = status == "allowed" or (
            status == "unknown" and robots_answered
        )
        if probe_further:
            ai_code, _, _ = await _fetch_status(
                client, f"https://www.{domain}/ai.txt",
            )
            if ai_code == 200:
                status = "reserved"
                signals.append({"src": "ai.txt", "status_code": 200,
                                "detail": "ai.txt present"})
        if probe_further and status != "reserved":
            h_code, h_body, h_hdrs = await _fetch_status(
                client, f"https://www.{domain}/",
            )
            if h_code == 200:
                if (h_hdrs.get("tdm-reservation") or "").strip() == "1":
                    status = "reserved"
                    signals.append({"src": "http-header",
                                    "detail": "tdm-reservation: 1"})
                elif _meta_has_reservation(h_body):
                    status = "reserved"
                    signals.append({"src": "html-meta",
                                    "detail": "noai/tdm-reservation meta"})
    _cache_put(domain, status, signals)
    return {
        "domain": domain,
        "status": status,
        "signals": signals,
        "cached": False,
        "ts": datetime.now(timezone.utc).isoformat(),
    }
def is_crawl_allowed(result: dict) -> bool:
    """Tell whether a check result permits crawling.

    Only the verdicts "allowed" and "unknown" are crawlable. A missing or
    falsy ``status`` entry is treated as "unknown".
    """
    verdict = result.get("status")
    if not verdict:
        verdict = "unknown"
    return verdict == "allowed" or verdict == "unknown"