Files
breakpilot-compliance/backend-compliance/compliance/services/tdm_reservation_check.py
T
Benjamin Admin 6c223c7c9b
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 14s
CI / loc-budget (push) Failing after 15s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m43s
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
feat(compliance-check): exec-summary + voll-audit + TDM-respect + cookie-KB-extended + saving-scan-funnel
P1 — Exec-Summary oben im Email-Report (4 KPIs + 2 CTAs, dunkler Gradient)
P3 — no_direct_sales-Flag fuer OEM-Konfigurator-Sites; AGB/Widerruf/AGB als
     "NICHT ANWENDBAR" (grau) statt "NICHT GEFUNDEN" (rot)
P5 — Voll-Audit Unification: alle Findings (MC + Pflichtangaben + Vendor +
     Redundanz) in /data/compliance_audits.db.unified_findings; neuer
     /api/compliance/agent/findings/<id> Endpoint + FindingsTab im Audit-UI
     mit Filter + CSV-Export
P7 — Crawl-Hardening: TDM-Reservation-Check (robots.txt / ai.txt / Header /
     Meta) vor jedem Run mit 24h-Cache; HeadlessChrome-UA (Firma noch nicht
     gegruendet — Switch via BREAKPILOT_BRANDED_UA env); per-Domain
     Rate-Limit 1 req/s + max 2 concurrent
P2 — Cookie-Knowledge-DB additiv erweitert (35 -> 74 Cookies): Adobe, Meta,
     Microsoft, LinkedIn, TikTok, HubSpot, Marketo, Salesforce, Hotjar,
     FullStory, Mouseflow, Intercom, Drift, Zendesk, Cloudflare, Stripe,
     OneTrust/Cookiebot/Usercentrics, Matomo, Pinterest, Snapchat, X/Twitter,
     YouTube, Vimeo, Klaviyo, Mailchimp, Mixpanel, Segment, Amplitude,
     Optimizely, Datadog; Wire-in in cookie_function_classifier liefert
     compliance_risk-Label (kritisch/hoch/mittel/gering) pro Vendor
A  — k-Anonymitaets-Helper (benchmark_k_anonymity) fuer P6-Vorbereitung
B  — Cross-Tenant-Domain-Assertion im /findings-Endpoint (expected_domain
     Query-Param -> 403 bei Mismatch)
C  — Saving-Scan-Funnel: /api/compliance/agent/saving-scan/start mit
     Validierung + 24h-Rate-Limit pro Domain + Lead-Persistenz in
     saving_scan_leads + Auto-Discovery via _run_compliance_check; 6 Tests
D  — Risk-Badge im Email-Vendor-Row

Rechtliche Leitplanken (Memory feedback_oem_data_legal.md): nur eigene
Knapp-Bewertungen + Source-Pointer, keine 1:1-Kopien fremder CMP-Texte.
TDM-Opt-Out-Respect nach § 44b UrhG. KEINE Schema-Aenderungen — alles in
Sidecar-SQLite.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 23:48:34 +02:00

243 lines
8.2 KiB
Python

"""
TDM-Reservation-Check (§ 44b UrhG / EU CDSM Art. 4).
Prueft pro Domain ob ein maschinenlesbarer Nutzungsvorbehalt fuer
Text-and-Data-Mining gesetzt ist. Quellen:
1. robots.txt — User-agent: * Disallow: / (oder spezifisch fuer uns)
2. /ai.txt — neuer OpenAI-Standard
3. HTTP-Header `tdm-reservation: 1` auf Homepage
4. HTML <meta name="tdm-reservation" content="1"> auf Homepage
5. HTML <meta name="robots" content="noai|noimageai"> Tags
Status-Interpretation:
status=allowed -> kein Vorbehalt, crawlbar
status=reserved -> expliziter Vorbehalt, NICHT crawlen
status=denied -> robots.txt-Zugriff aktiv blockiert (403/401)
=> konservativ: NICHT crawlen
status=unknown -> Server-Error (500/timeout/DNS) auf robots.txt
=> crawlbar, aber 24h-Recheck markiert
Cache via sidecar SQLite (gleiche DB wie compliance_audit_log), 24h TTL.
"""
from __future__ import annotations

import json
import logging
import os
import sqlite3
import time
from contextlib import closing
from datetime import datetime, timezone
from html.parser import HTMLParser
from pathlib import Path
from typing import Literal
from urllib.parse import urlparse

import httpx

logger = logging.getLogger(__name__)
DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")
CACHE_TTL_SECONDS = 24 * 3600
Status = Literal["allowed", "reserved", "denied", "unknown"]
_DEFAULT_UA = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) HeadlessChrome/120.0.0.0 Safari/537.36"
)
def _ensure_cache_table() -> None:
    """Create the TDM cache table and its timestamp index if they are missing.

    The parent directory of ``DB_PATH`` is created first so that SQLite can
    create the database file on first use.

    Raises:
        OSError: if the parent directory cannot be created.
        sqlite3.Error: if the database cannot be opened or the DDL fails.
    """
    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
    # sqlite3's own context manager only commits / rolls back; closing() is
    # what actually releases the connection (and its file handle) on exit.
    with closing(sqlite3.connect(DB_PATH)) as conn, conn:
        conn.executescript(
            "\n"
            "CREATE TABLE IF NOT EXISTS tdm_reservation_cache (\n"
            "domain TEXT PRIMARY KEY,\n"
            "ts TEXT NOT NULL,\n"
            "status TEXT NOT NULL,\n"
            "signals TEXT NOT NULL -- JSON list[dict]\n"
            ");\n"
            "CREATE INDEX IF NOT EXISTS idx_tdm_ts ON tdm_reservation_cache(ts);\n"
        )
def _cache_get(domain: str) -> dict | None:
    """Return the cached verdict for *domain*, or ``None`` when unusable.

    ``None`` is returned when no row exists, when the row is older than
    ``CACHE_TTL_SECONDS``, or when reading / decoding the cache fails. The
    cache is best-effort: a failure is logged at debug level and never
    propagates to the caller.

    Returns:
        A dict with ``domain``, ``status``, ``signals`` (decoded from JSON),
        ``cached`` set to ``True`` and the stored ``ts`` string.
    """
    try:
        _ensure_cache_table()
        # closing() releases the connection; sqlite3's own context manager
        # would only end the transaction and leave the handle open.
        with closing(sqlite3.connect(DB_PATH)) as conn:
            conn.row_factory = sqlite3.Row
            row = conn.execute(
                "SELECT * FROM tdm_reservation_cache WHERE domain=?", (domain,),
            ).fetchone()
        if not row:
            return None
        stored_at = datetime.fromisoformat(row["ts"]).timestamp()
        if time.time() - stored_at > CACHE_TTL_SECONDS:
            return None
        return {
            "domain": domain,
            "status": row["status"],
            "signals": json.loads(row["signals"]),
            "cached": True,
            "ts": row["ts"],
        }
    except Exception as e:  # deliberate: a broken cache must not break the check
        logger.debug("tdm cache_get failed for %s: %s", domain, e)
        return None
def _cache_put(domain: str, status: Status, signals: list[dict]) -> None:
    """Insert or replace the cached verdict for *domain*.

    The row is stamped with the current UTC time in ISO-8601 form and the
    signals are stored as a JSON string. Writing is best-effort: any failure
    is logged as a warning and swallowed.

    Args:
        domain: Cache key (primary key of the table).
        status: Verdict to store.
        signals: JSON-serialisable list of signal dicts.
    """
    try:
        _ensure_cache_table()
        # Outer closing() releases the connection; the inner ``conn`` context
        # commits on success and rolls back on error.
        with closing(sqlite3.connect(DB_PATH)) as conn, conn:
            conn.execute(
                "INSERT OR REPLACE INTO tdm_reservation_cache "
                "(domain, ts, status, signals) VALUES (?, ?, ?, ?)",
                (
                    domain,
                    datetime.now(timezone.utc).isoformat(),
                    status,
                    json.dumps(signals, ensure_ascii=False),
                ),
            )
    except Exception as e:  # deliberate: a broken cache must not break the check
        logger.warning("tdm cache_put failed for %s: %s", domain, e)
def _base_domain(url_or_domain: str) -> str:
if not url_or_domain:
return ""
if "://" not in url_or_domain:
url_or_domain = "https://" + url_or_domain
netloc = urlparse(url_or_domain).netloc.lower()
return netloc.replace("www.", "")
async def _fetch_status(client: httpx.AsyncClient, url: str) -> tuple[int, str, dict]:
    """GET *url* and summarise the response as ``(status_code, body, headers)``.

    The body text is cut to its first 16384 characters; a response without
    content yields ``""``. Any exception raised while fetching or decoding is
    logged at debug level and reported as ``(0, "", {})``.
    """
    try:
        response = await client.get(url)
        if response.content:
            text = response.text[:16384]
        else:
            text = ""
        return response.status_code, text, dict(response.headers)
    except Exception as exc:
        logger.debug("tdm fetch %s failed: %s", url, exc)
        return 0, "", {}
def _robots_disallows_us(body: str) -> bool:
"""Parse robots.txt — true if our group has Disallow: /."""
if not body:
return False
relevant_groups = ["*", "claudebot", "anthropic-ai", "gptbot",
"google-extended", "ccbot", "breakpilot"]
current_uas: list[str] = []
in_our_group = False
for raw in body.splitlines():
line = raw.split("#", 1)[0].strip()
if not line:
in_our_group = False
current_uas = []
continue
if ":" not in line:
continue
key, val = (s.strip().lower() for s in line.split(":", 1))
if key == "user-agent":
current_uas.append(val)
in_our_group = any(ua in relevant_groups for ua in current_uas)
elif key == "disallow" and in_our_group:
if val == "/" or val == "":
if val == "/":
return True
return False
def _meta_has_reservation(body: str) -> bool:
"""Detect <meta name="tdm-reservation|robots|googlebot"> with noai/noimageai/1."""
low = body.lower()
needles = [
'name="tdm-reservation" content="1"',
"name='tdm-reservation' content='1'",
'"noai"', '"noimageai"',
"content=\"noai", "content='noai",
]
return any(n in low for n in needles)
def _body_looks_like_html(body: str) -> bool:
    """Return True if *body* starts like an HTML document (e.g. a soft-404 page)."""
    head = body.lstrip()[:256].lower()
    return head.startswith("<!doctype html") or "<html" in head


async def check_tdm_reservation(domain_or_url: str) -> dict:
    """Probe a domain for machine-readable TDM reservations.

    The probes run in order and stop at the first one that changes the
    verdict away from ``"allowed"``:

    1. ``/robots.txt`` — 401/403 gives ``"denied"``; a full disallow for a
       relevant user-agent group gives ``"reserved"``; any other status than
       200/404 (including transport failure) gives ``"unknown"``.
    2. ``/ai.txt`` — a 200 response that is not an HTML page gives
       ``"reserved"``.
    3. Homepage — a ``tdm-reservation: 1`` response header or a matching
       ``<meta>`` tag gives ``"reserved"``.

    Hosts are tried as ``www.<domain>`` first and the bare ``<domain>``
    second, over https and then http, so that sites without a ``www`` alias
    are actually checked instead of ending up as ``"unknown"``. The origin
    that answered the robots.txt probe is reused for the later probes.

    A fresh result is written to the cache; a cached result younger than the
    TTL is returned without any network access.

    Args:
        domain_or_url: Domain or URL of the site to check.

    Returns:
        A dict ``{domain, status, signals: [{src, ...}], cached, ts}``.
        For empty input the domain is ``""`` and the status ``"unknown"``.
    """
    domain = _base_domain(domain_or_url)
    if not domain:
        return {
            "domain": "",
            "status": "unknown",
            "signals": [],
            "cached": False,
            "ts": datetime.now(timezone.utc).isoformat(),
        }
    cached = _cache_get(domain)
    if cached:
        return cached

    signals: list[dict] = []
    status: Status = "allowed"
    headers = {"User-Agent": _DEFAULT_UA, "Accept": "*/*"}
    # "www." first keeps the previous probing order; the bare host is the
    # fallback for apex-only sites and sub-domains that have no www alias.
    attempts = [
        (scheme, host)
        for scheme in ("https", "http")
        for host in (f"www.{domain}", domain)
    ]
    origin = f"https://www.{domain}"
    async with httpx.AsyncClient(
        timeout=12.0, follow_redirects=True, headers=headers,
    ) as client:
        for attempt_no, (scheme, host) in enumerate(attempts, start=1):
            r_code, r_body, _ = await _fetch_status(
                client, f"{scheme}://{host}/robots.txt",
            )
            # Code 0 means the request itself failed; move on to the next
            # candidate unless this was the last one.
            if r_code == 0 and attempt_no < len(attempts):
                continue
            origin = f"{scheme}://{host}"
            signal = {"src": "robots.txt", "status_code": r_code,
                      "scheme": scheme, "host": host}
            signals.append(signal)
            if r_code in (401, 403):
                status = "denied"
            elif r_code == 200:
                if _robots_disallows_us(r_body):
                    status = "reserved"
                    signal["detail"] = "Disallow: / for relevant UA group"
            elif r_code != 404:
                status = "unknown"
            break

        if status == "allowed":
            ai_code, ai_body, _ = await _fetch_status(client, f"{origin}/ai.txt")
            # Many sites answer every unknown path with an HTML page and
            # status 200; that is not an ai.txt file.
            if ai_code == 200 and not _body_looks_like_html(ai_body):
                status = "reserved"
                signals.append({"src": "ai.txt", "status_code": 200,
                                "detail": "ai.txt present"})

        if status == "allowed":
            h_code, h_body, h_hdrs = await _fetch_status(client, f"{origin}/")
            if h_code == 200:
                if (h_hdrs.get("tdm-reservation") or "").strip() == "1":
                    status = "reserved"
                    signals.append({"src": "http-header",
                                    "detail": "tdm-reservation: 1"})
                elif _meta_has_reservation(h_body):
                    status = "reserved"
                    signals.append({"src": "html-meta",
                                    "detail": "noai/tdm-reservation meta"})

    _cache_put(domain, status, signals)
    return {
        "domain": domain,
        "status": status,
        "signals": signals,
        "cached": False,
        "ts": datetime.now(timezone.utc).isoformat(),
    }
def is_crawl_allowed(result: dict) -> bool:
    """Tell whether a reservation-check result permits crawling.

    Crawling is permitted for the verdicts ``"allowed"`` and ``"unknown"``.
    A missing or falsy ``status`` entry is treated as ``"unknown"``; every
    other value blocks crawling.
    """
    verdict = result.get("status")
    if not verdict:
        verdict = "unknown"
    return verdict == "allowed" or verdict == "unknown"