feat(agents): Test-Harness nutzt volle Compliance-Pipeline für Fetch
CI / detect-changes (push) Successful in 7s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 10s
CI / loc-budget (push) Successful in 12s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / test-python-backend (push) Successful in 28s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / detect-changes (push) Successful in 7s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 10s
CI / loc-budget (push) Successful in 12s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / test-python-backend (push) Successful in 28s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
Statt der simplen dsi-discovery-Wrapper-Funktion ruft der Test-Harness
jetzt _fetch_text() aus agent_check/_fetch.py auf — die VOLLE Pipeline,
die auch der produktive Compliance-Check verwendet:
- consent-tester dsi-discovery mit 240s Timeout (statt 120s)
- doc_type-aware max_documents (1 für cookie/dse, 3 für impressum)
- CMP-Payload-Capture (ePaaS, OneTrust …)
- HTTP-Fallback mit Browser-User-Agent + DomainRateLimiter
- HTML-Tag-Strip, wenn Playwright fehlschlägt
Damit funktionieren Cloudflare-/Anti-Bot-geschützte Sites wie BMW
und Elli auch im Test-Harness — vorher Timeout nach 90s.
Zusätzlich: Bei leerem Fetch erscheint eine klare Fehlermeldung im Slot
('Cloudflare-/Anti-Bot-geschützt — Tipp: Text manuell einfügen')
statt eines stillen Fehlschlags. Die cmp_payloads landen jetzt auch im Vault.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -16,16 +16,15 @@ from __future__ import annotations
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import uuid
|
import uuid
|
||||||
from collections.abc import AsyncGenerator
|
from collections.abc import AsyncGenerator
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import httpx
|
|
||||||
from fastapi import APIRouter, HTTPException
|
from fastapi import APIRouter, HTTPException
|
||||||
from fastapi.responses import FileResponse, StreamingResponse
|
from fastapi.responses import FileResponse, StreamingResponse
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from compliance.api.agent_check._fetch import _fetch_text as full_fetch_text
|
||||||
from compliance.services.specialist_agents import REGISTRY, AgentInput
|
from compliance.services.specialist_agents import REGISTRY, AgentInput
|
||||||
from compliance.services.specialist_agents._evidence_vault import (
|
from compliance.services.specialist_agents._evidence_vault import (
|
||||||
EvidenceVault,
|
EvidenceVault,
|
||||||
@@ -35,11 +34,6 @@ from compliance.services.specialist_agents._evidence_vault import (
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
CONSENT_TESTER_URL = os.environ.get(
|
|
||||||
"CONSENT_TESTER_URL",
|
|
||||||
"http://bp-compliance-consent-tester:8094",
|
|
||||||
)
|
|
||||||
|
|
||||||
router = APIRouter(prefix="/specialist-agent", tags=["specialist-agent"])
|
router = APIRouter(prefix="/specialist-agent", tags=["specialist-agent"])
|
||||||
|
|
||||||
|
|
||||||
@@ -214,16 +208,35 @@ async def _process_slot(
|
|||||||
req: TestStartRequest,
|
req: TestStartRequest,
|
||||||
vault: EvidenceVault,
|
vault: EvidenceVault,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Holt den Text (URL oder raw), ruft Agent, vault-speichert Output."""
|
"""Holt den Text (URL oder raw), ruft Agent, vault-speichert Output.
|
||||||
|
|
||||||
|
Nutzt für den URL-Fetch die VOLLE Compliance-Check-Pipeline
|
||||||
|
(_fetch_text aus _fetch.py): 240s Playwright-Discovery + HTTP-
|
||||||
|
Fallback mit Browser-UA + Multi-Page-Merge + CMP-Capture.
|
||||||
|
"""
|
||||||
label = url or f"text-slot-{slot}"
|
label = url or f"text-slot-{slot}"
|
||||||
await _emit(run_id, {"type": "slot_started", "slot": slot,
|
await _emit(run_id, {"type": "slot_started", "slot": slot,
|
||||||
"label": label})
|
"label": label})
|
||||||
text = raw_text
|
text = raw_text
|
||||||
fetch_err = ""
|
fetch_err = ""
|
||||||
|
cmp_payloads: list[dict] = []
|
||||||
if url and not raw_text:
|
if url and not raw_text:
|
||||||
await _emit(run_id, {"type": "slot_fetching",
|
await _emit(run_id, {"type": "slot_fetching",
|
||||||
"slot": slot, "url": url})
|
"slot": slot, "url": url,
|
||||||
text, fetch_err = await _fetch_text(url)
|
"doc_type": agent.doc_type})
|
||||||
|
try:
|
||||||
|
text, cmp_payloads = await full_fetch_text(
|
||||||
|
url, doc_type=agent.doc_type,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
fetch_err = f"{type(e).__name__}: {str(e)[:160]}"
|
||||||
|
text = ""
|
||||||
|
if not text and not fetch_err:
|
||||||
|
fetch_err = (
|
||||||
|
"Fetch lieferte 0 Zeichen — Site möglicherweise "
|
||||||
|
"Cloudflare-/Anti-Bot-geschützt oder JS-only-Rendering. "
|
||||||
|
"Tipp: Text manuell ins raw_text-Feld einfügen."
|
||||||
|
)
|
||||||
if fetch_err:
|
if fetch_err:
|
||||||
await _emit(run_id, {
|
await _emit(run_id, {
|
||||||
"type": "slot_fetch_error",
|
"type": "slot_fetch_error",
|
||||||
@@ -234,10 +247,14 @@ async def _process_slot(
|
|||||||
vault.put_bytes("raw", slot, "source.txt",
|
vault.put_bytes("raw", slot, "source.txt",
|
||||||
text.encode("utf-8"),
|
text.encode("utf-8"),
|
||||||
mime="text/plain")
|
mime="text/plain")
|
||||||
|
if cmp_payloads:
|
||||||
|
vault.put_json("raw", slot, "cmp_payloads.json", cmp_payloads)
|
||||||
await _emit(run_id, {
|
await _emit(run_id, {
|
||||||
"type": "slot_text_ready",
|
"type": "slot_text_ready",
|
||||||
"slot": slot,
|
"slot": slot,
|
||||||
"char_count": len(text),
|
"char_count": len(text),
|
||||||
|
"word_count": len(text.split()) if text else 0,
|
||||||
|
"cmp_payloads": len(cmp_payloads),
|
||||||
})
|
})
|
||||||
agent_input = AgentInput(
|
agent_input = AgentInput(
|
||||||
doc_type=agent.doc_type,
|
doc_type=agent.doc_type,
|
||||||
@@ -246,6 +263,7 @@ async def _process_slot(
|
|||||||
business_scope=req.business_scope,
|
business_scope=req.business_scope,
|
||||||
company_name=req.company_name,
|
company_name=req.company_name,
|
||||||
origin_domain=req.origin_domain,
|
origin_domain=req.origin_domain,
|
||||||
|
context={"cmp_payloads": cmp_payloads} if cmp_payloads else {},
|
||||||
)
|
)
|
||||||
await _emit(run_id, {"type": "slot_agent_running", "slot": slot})
|
await _emit(run_id, {"type": "slot_agent_running", "slot": slot})
|
||||||
try:
|
try:
|
||||||
@@ -257,6 +275,12 @@ async def _process_slot(
|
|||||||
"error": f"{type(e).__name__}: {str(e)[:160]}",
|
"error": f"{type(e).__name__}: {str(e)[:160]}",
|
||||||
})
|
})
|
||||||
return
|
return
|
||||||
|
# Wenn Fetch fail war: füge die Fehlermeldung an die notes des Output
|
||||||
|
if fetch_err and not text:
|
||||||
|
output.notes = (
|
||||||
|
(output.notes + " · " if output.notes else "")
|
||||||
|
+ f"fetch_error: {fetch_err}"
|
||||||
|
)
|
||||||
# Persist findings as JSON in vault
|
# Persist findings as JSON in vault
|
||||||
vault.put_json("finding", slot, "output.json",
|
vault.put_json("finding", slot, "output.json",
|
||||||
json.loads(output.model_dump_json()))
|
json.loads(output.model_dump_json()))
|
||||||
@@ -299,32 +323,6 @@ async def _process_slot(
|
|||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_text(url: str) -> tuple[str, str]:
|
|
||||||
"""Nutzt den consent-tester DSI-Discovery für Volltext."""
|
|
||||||
try:
|
|
||||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
|
||||||
resp = await client.post(
|
|
||||||
f"{CONSENT_TESTER_URL}/dsi-discovery",
|
|
||||||
json={"url": url, "max_documents": 5},
|
|
||||||
timeout=120.0,
|
|
||||||
)
|
|
||||||
if resp.status_code != 200:
|
|
||||||
return "", f"HTTP {resp.status_code}"
|
|
||||||
data = resp.json()
|
|
||||||
docs = data.get("documents", []) or []
|
|
||||||
if not docs:
|
|
||||||
return "", "no documents discovered"
|
|
||||||
texts: list[str] = []
|
|
||||||
for doc in docs:
|
|
||||||
t = (doc.get("full_text", "") or
|
|
||||||
doc.get("text_preview", "") or "")
|
|
||||||
if t and len(t) > 50:
|
|
||||||
texts.append(t)
|
|
||||||
return "\n\n".join(texts), ""
|
|
||||||
except Exception as e:
|
|
||||||
return "", f"{type(e).__name__}: {str(e)[:160]}"
|
|
||||||
|
|
||||||
|
|
||||||
# ── Run / Vault Queries ──────────────────────────────────────────────
|
# ── Run / Vault Queries ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user