feat(agents): Test-Harness nutzt volle Compliance-Pipeline für Fetch

Statt der simplen dsi-discovery-Wrapper-Funktion ruft der Test-Harness jetzt _fetch_text() aus agent_check/_fetch.py auf — die VOLLE Pipeline, die auch der produktive Compliance-Check verwendet:

- consent-tester dsi-discovery mit 240s Timeout (statt 120s)
- doc_type-aware max_documents (1 für cookie/dse, 3 für impressum)
- CMP-Payload-Capture (ePaaS, OneTrust …)
- HTTP-Fallback mit Browser-User-Agent + DomainRateLimiter
- HTML-Tag-Strip, wenn Playwright fehlschlägt

Damit funktionieren Cloudflare-/Anti-Bot-geschützte Sites wie BMW und Elli auch im Test-Harness — vorher Timeout nach 90s.

Plus: Bei leerem Fetch erscheint eine klare Fehlermeldung im Slot ('Cloudflare-/Anti-Bot-geschützt — Tipp: Text manuell einfügen') statt silent-fail. cmp_payloads landen jetzt auch im Vault.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-08 18:38:59 +02:00
parent 702e7a6333
commit 361a5e7605
1 changed files with 34 additions and 36 deletions
@@ -16,16 +16,15 @@ from __future__ import annotations
 import asyncio
 import json
 import logging
-import os
 import uuid
 from collections.abc import AsyncGenerator
 from typing import Any

-import httpx
 from fastapi import APIRouter, HTTPException
 from fastapi.responses import FileResponse, StreamingResponse
 from pydantic import BaseModel, Field

+from compliance.api.agent_check._fetch import _fetch_text as full_fetch_text
 from compliance.services.specialist_agents import REGISTRY, AgentInput
 from compliance.services.specialist_agents._evidence_vault import (
    EvidenceVault,
@@ -35,11 +34,6 @@ from compliance.services.specialist_agents._evidence_vault import (

 logger = logging.getLogger(__name__)

-CONSENT_TESTER_URL = os.environ.get(
-    "CONSENT_TESTER_URL",
-    "http://bp-compliance-consent-tester:8094",
-)
-
 router = APIRouter(prefix="/specialist-agent", tags=["specialist-agent"])


@@ -214,16 +208,35 @@ async def _process_slot(
    req: TestStartRequest,
    vault: EvidenceVault,
 ) -> None:
-    """Holt den Text (URL oder raw), ruft Agent, vault-speichert Output."""
+    """Holt den Text (URL oder raw), ruft Agent, vault-speichert Output.
+
+    Nutzt für den URL-Fetch die VOLLE Compliance-Check-Pipeline
+    (_fetch_text aus _fetch.py): 240s Playwright-Discovery + HTTP-
+    Fallback mit Browser-UA + Multi-Page-Merge + CMP-Capture.
+    """
    label = url or f"text-slot-{slot}"
    await _emit(run_id, {"type": "slot_started", "slot": slot,
                         "label": label})
    text = raw_text
    fetch_err = ""
+    cmp_payloads: list[dict] = []
    if url and not raw_text:
        await _emit(run_id, {"type": "slot_fetching",
-                             "slot": slot, "url": url})
-        text, fetch_err = await _fetch_text(url)
+                             "slot": slot, "url": url,
+                             "doc_type": agent.doc_type})
+        try:
+            text, cmp_payloads = await full_fetch_text(
+                url, doc_type=agent.doc_type,
+            )
+        except Exception as e:
+            fetch_err = f"{type(e).__name__}: {str(e)[:160]}"
+            text = ""
+        if not text and not fetch_err:
+            fetch_err = (
+                "Fetch lieferte 0 Zeichen — Site möglicherweise "
+                "Cloudflare-/Anti-Bot-geschützt oder JS-only-Rendering. "
+                "Tipp: Text manuell ins raw_text-Feld einfügen."
+            )
        if fetch_err:
            await _emit(run_id, {
                "type": "slot_fetch_error",
@@ -234,10 +247,14 @@ async def _process_slot(
        vault.put_bytes("raw", slot, "source.txt",
                         text.encode("utf-8"),
                         mime="text/plain")
+    if cmp_payloads:
+        vault.put_json("raw", slot, "cmp_payloads.json", cmp_payloads)
    await _emit(run_id, {
        "type": "slot_text_ready",
        "slot": slot,
        "char_count": len(text),
+        "word_count": len(text.split()) if text else 0,
+        "cmp_payloads": len(cmp_payloads),
    })
    agent_input = AgentInput(
        doc_type=agent.doc_type,
@@ -246,6 +263,7 @@ async def _process_slot(
        business_scope=req.business_scope,
        company_name=req.company_name,
        origin_domain=req.origin_domain,
+        context={"cmp_payloads": cmp_payloads} if cmp_payloads else {},
    )
    await _emit(run_id, {"type": "slot_agent_running", "slot": slot})
    try:
@@ -257,6 +275,12 @@ async def _process_slot(
            "error": f"{type(e).__name__}: {str(e)[:160]}",
        })
        return
+    # Wenn Fetch fail war: füge die Fehlermeldung an die notes des Output
+    if fetch_err and not text:
+        output.notes = (
+            (output.notes + " · " if output.notes else "")
+            + f"fetch_error: {fetch_err}"
+        )
    # Persist findings as JSON in vault
    vault.put_json("finding", slot, "output.json",
                    json.loads(output.model_dump_json()))
@@ -299,32 +323,6 @@ async def _process_slot(
    })


-async def _fetch_text(url: str) -> tuple[str, str]:
-    """Nutzt den consent-tester DSI-Discovery für Volltext."""
-    try:
-        async with httpx.AsyncClient(timeout=120.0) as client:
-            resp = await client.post(
-                f"{CONSENT_TESTER_URL}/dsi-discovery",
-                json={"url": url, "max_documents": 5},
-                timeout=120.0,
-            )
-            if resp.status_code != 200:
-                return "", f"HTTP {resp.status_code}"
-            data = resp.json()
-            docs = data.get("documents", []) or []
-            if not docs:
-                return "", "no documents discovered"
-            texts: list[str] = []
-            for doc in docs:
-                t = (doc.get("full_text", "") or
-                     doc.get("text_preview", "") or "")
-                if t and len(t) > 50:
-                    texts.append(t)
-            return "\n\n".join(texts), ""
-    except Exception as e:
-        return "", f"{type(e).__name__}: {str(e)[:160]}"
-
-
 # ── Run / Vault Queries ──────────────────────────────────────────────