diff --git a/backend-compliance/compliance/api/agent_check/_b17_wiring.py b/backend-compliance/compliance/api/agent_check/_b17_wiring.py
index d2de8ce1..4ff8a2fe 100644
--- a/backend-compliance/compliance/api/agent_check/_b17_wiring.py
+++ b/backend-compliance/compliance/api/agent_check/_b17_wiring.py
@@ -57,20 +57,46 @@ async def run_b17(state: dict) -> None:
return
walk: dict = {}
+ walk_error: str | None = None
try:
- async with httpx.AsyncClient(timeout=180.0) as c:
+ async with httpx.AsyncClient(timeout=300.0) as c:
r = await c.post(
f"{CONSENT_TESTER_URL}/scan-audit-walk",
json={"url": homepage, "dwell_s": 4.0, "max_links": 8},
- timeout=180.0,
+ timeout=300.0,
)
if r.status_code == 200:
walk = r.json()
+ else:
+ walk_error = f"consent-tester HTTP {r.status_code}"
except Exception as e:
- logger.warning("B17 audit-walk request failed: %s", e)
- return
+ walk_error = f"{type(e).__name__}: {str(e)[:120]}"
+ logger.warning("B17 audit-walk request failed: %s", walk_error)
if not walk or not walk.get("walk_id"):
+ # Fallback-Stub damit Audit-Report einen Hinweis bekommt
+ # statt "audit_walk: None". Reviewer sieht den Fail.
+ state["audit_walk"] = {
+ "walk_id": "",
+ "url": homepage,
+ "video": {},
+ "actions": [],
+ "annotations": [],
+ "error": walk_error or "unknown (no walk_id returned)",
+ }
+ state["audit_walk_html"] = (
+ "
"
+ "
"
+ "⚠️ Audit-Walk konnte nicht aufgezeichnet werden"
+ "
"
+ f"
"
+ f"Site: {homepage} · Ursache: "
+ f"{walk_error or 'unknown'}. Mögliche "
+ "Gründe: komplexes CMP-Banner (lange Tour-Zeit), Anti-Bot-"
+ "Protection, oder consent-tester überlastet.
"
+ "
"
+ )
return
# Stufe-5: annotierte Screenshots pro Finding. Schickt die
diff --git a/backend-compliance/compliance/api/agent_check/_phase_e_email.py b/backend-compliance/compliance/api/agent_check/_phase_e_email.py
index 409465a8..65c9e160 100644
--- a/backend-compliance/compliance/api/agent_check/_phase_e_email.py
+++ b/backend-compliance/compliance/api/agent_check/_phase_e_email.py
@@ -36,7 +36,17 @@ def run_phase_e(state: dict) -> None:
doc_count = len([r for r in results if not r.error])
url_company = _company_name_from_url(doc_entries)
domain = _extract_domain(doc_entries)
- site_name = url_company or domain or "Unbekannt"
+ # Priorität: User-Input (req.company_name) > URL-Heuristik > "Unbekannt"
+ req_company = (getattr(req, "company_name", None) or "").strip()
+ req_domain = (getattr(req, "origin_domain", None) or "").strip()
+ site_name = req_company or url_company or domain or "Unbekannt"
+    if req_domain and not domain:
+        # No domain from URLs: use user input (removeprefix; lstrip strips a char set)
+        from urllib.parse import urlparse
+        try:
+            domain = urlparse(req_domain).netloc.removeprefix("www.") or req_domain
+        except ValueError:
+            domain = req_domain
_update(check_id, "E-Mail wird versendet...", 98)
# A1: bundle cookie-evidence slices into a ZIP attachment so the
diff --git a/backend-compliance/compliance/api/agent_check/_schemas.py b/backend-compliance/compliance/api/agent_check/_schemas.py
index d4625533..94aa7591 100644
--- a/backend-compliance/compliance/api/agent_check/_schemas.py
+++ b/backend-compliance/compliance/api/agent_check/_schemas.py
@@ -28,6 +28,11 @@ class ComplianceCheckRequest(BaseModel):
# Rechtsform, Konzern, MA, Besondere Daten, Drittland). Wird im
# Snapshot persistiert und filtert die MC-Auswertung (P72).
scan_context: dict | None = None
+ # Frontend-eingegebene Firma + Origin-Domain. Priorisiert vor
+ # LLM-extracted_profile-Inferenz. Wenn leer: Fallback auf Heuristik
+ # aus URL-Domains und DSE-Text.
+ company_name: str | None = None
+ origin_domain: str | None = None
class ComplianceCheckStartResponse(BaseModel):
diff --git a/backend-compliance/compliance/services/cross_domain_doc_check.py b/backend-compliance/compliance/services/cross_domain_doc_check.py
index 773812a6..ddb042d8 100644
--- a/backend-compliance/compliance/services/cross_domain_doc_check.py
+++ b/backend-compliance/compliance/services/cross_domain_doc_check.py
@@ -87,17 +87,52 @@ def _site_origin_sld(state: dict) -> str:
return max(counter, key=counter.get)
-def check_cross_domain_docs(state: dict) -> list[dict]:
- """Emit findings for doc_entries whose URL has a different SLD
- than the site origin."""
- primary = _site_origin_sld(state)
- if not primary:
- return []
- findings: list[dict] = []
+def _collect_audit_urls(state: dict) -> list[tuple[str, str]]:
+ """Collect (doc_type, url) pairs from BOTH sources: state["doc_entries"]
+ (url and rejected_url per entry) AND req.documents (the user's original
+ input). Empty url/doc_type is skipped; duplicates are dropped. Discovery
+ can lose original URLs (PDF fetch failure, auto-reclassify), but
+ cross-domain hosting is legally independent of the file's text content.
+ """
+ seen: set[tuple[str, str]] = set()
+ out: list[tuple[str, str]] = []
for e in (state.get("doc_entries") or []):
url = (e.get("url") or "").strip()
doc_type = (e.get("doc_type") or "").lower()
- if not url or "://" not in url:
+ if url and doc_type and (doc_type, url) not in seen:
+ seen.add((doc_type, url))
+ out.append((doc_type, url))
+ # rejected_url is the original URL that discovery rejected
+ rej = (e.get("rejected_url") or "").strip()
+ if rej and doc_type and (doc_type, rej) not in seen:
+ seen.add((doc_type, rej))
+ out.append((doc_type, rej))
+ # Fallback: req.documents — entered explicitly by the user
+ req = state.get("req")
+ if req is not None:
+ for d in getattr(req, "documents", []) or []:
+ url = (getattr(d, "url", "") or "").strip()
+ doc_type = (getattr(d, "doc_type", "") or "").lower()
+ if url and doc_type and (doc_type, url) not in seen:
+ seen.add((doc_type, url))
+ out.append((doc_type, url))
+ return out
+
+
+def check_cross_domain_docs(state: dict) -> list[dict]:
+ """Emit findings for doc-URLs whose host has a different SLD
+ than the site origin."""
+ primary = _site_origin_sld(state)
+ if not primary:
+ logger.info("B22 cross-domain: kein primary SLD ermittelbar")
+ return []
+ findings: list[dict] = []
+ audit_urls = _collect_audit_urls(state)
+ logger.info("B22 cross-domain: primary=%s, prüfe %d URL(s)",
+ primary, len(audit_urls))
+ emitted_keys: set[tuple[str, str]] = set()
+ for doc_type, url in audit_urls:
+ if "://" not in url:
continue
try:
host = urlparse(url).netloc
@@ -106,6 +141,12 @@ def check_cross_domain_docs(state: dict) -> list[dict]:
continue
if not url_sld or url_sld == primary:
continue
+ # Dedup pro (doc_type, host_sld) damit rejected_url + url nicht
+ # doppelt gemeldet werden
+ e_key = (doc_type, url_sld)
+ if e_key in emitted_keys:
+ continue
+ emitted_keys.add(e_key)
# Cross-Domain detected
severity = _SEVERITY_BY_DOC.get(doc_type, "MEDIUM")
doc_label = {
diff --git a/backend-compliance/compliance/services/finding_plausibility_check.py b/backend-compliance/compliance/services/finding_plausibility_check.py
index 6789edd2..294bd374 100644
--- a/backend-compliance/compliance/services/finding_plausibility_check.py
+++ b/backend-compliance/compliance/services/finding_plausibility_check.py
@@ -50,11 +50,19 @@ import httpx
logger = logging.getLogger(__name__)
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
-MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen3:30b-a3b")
-# Reduced from 8 → 4 to fight qwen3 empty-response-on-large-prompts bug.
-# 4 items × ~500 token/item + 2000 system + 1500 excerpt = ~5500 token total,
-# well within qwen3's safe range for format='json'.
-BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "4"))
+# Default-Modell als ENV-Switch konfigurierbar. qwen3:30b-a3b ist
+# bestes Reasoning, aber gibt bei großen DSEs gerne leere Responses
+# unter format='json'. qwen2.5:7b ist 4× kleiner, deutlich
+# zuverlässiger, leicht schwächeres Reasoning aber für die einfache
+# Plausibility-Klassifikation (PASS/MODIFY/DROP) ausreichend.
+MODEL = os.getenv("PLAUSIBILITY_LLM_MODEL", "qwen2.5:7b")
+# Fallback model, tried as strategy C once the primary model has returned
+# an empty response for both strategy A (format='json') and strategy B
+# (format-free). The default is a small, robust model.
+FALLBACK_MODEL = os.getenv("PLAUSIBILITY_FALLBACK_MODEL", "llama3.2:3b")
+# A smaller model could cope with larger batches, but stay conservative
+# so that a single model failure does not take down the whole phase.
+BATCH_SIZE = int(os.getenv("PLAUSIBILITY_BATCH_SIZE", "3"))
TIMEOUT = float(os.getenv("PLAUSIBILITY_TIMEOUT_S", "45.0"))
-# Reduced excerpt 4000 → 1500 chars (same reason).
+# Reduced excerpt 4000 → 1500 chars (keeps prompts small; avoids empty LLM responses).
DOC_EXCERPT_CHARS = int(os.getenv("PLAUSIBILITY_DOC_EXCERPT", "1500"))
@@ -173,33 +181,46 @@ async def _ask_llm_batch(items: list[dict], doc_title: str,
"""Send a batch of up to BATCH_SIZE findings to the LLM.
Resilience strategy (P125 fix for empty-response bug):
- A. format='json' (strict) — current default
- B. If A returns empty: format='' (loose), extract JSON manually
- C. If B also empty AND batch >2: split batch + recurse
- D. Else: give up, return {} (callers stamp llm_skipped=true)
+ A. primary MODEL + format='json' (strict)
+ B. primary MODEL + format='' (loose), parse JSON manuell
+ C. FALLBACK_MODEL + format='json' (kleineres robusteres Modell)
+ D. If batch >2: split + recurse
+ E. Else: give up, return {} (callers stamp llm_skipped=true)
"""
user_prompt = _build_user_prompt(items, doc_title, doc_excerpt)
- base_body = {
- "model": MODEL,
- "messages": [
- {"role": "system", "content": _SYSTEM_PROMPT},
- {"role": "user", "content": user_prompt},
- ],
- "stream": False,
- "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
- }
+
+ def _body(model: str) -> dict:
+ return {
+ "model": model,
+ "messages": [
+ {"role": "system", "content": _SYSTEM_PROMPT},
+ {"role": "user", "content": user_prompt},
+ ],
+ "stream": False,
+ "options": {"temperature": 0.0, "seed": 42, "num_predict": 1500},
+ }
+
out: dict[str, dict] = {}
input_ids = [it["id"] for it in items]
try:
- # Strategy A: format='json'
- content = await _post_llm({**base_body, "format": "json"})
+ # Strategy A: primary + format='json'
+ content = await _post_llm({**_body(MODEL), "format": "json"})
if not content:
- # Strategy B: format-free, parse-on-our-side
+ # Strategy B: primary + format-free
logger.info(
"plausibility A→empty, trying B (format-free) batch=%d",
len(items),
)
- content = await _post_llm(base_body)
+ content = await _post_llm(_body(MODEL))
+ if not content and FALLBACK_MODEL and FALLBACK_MODEL != MODEL:
+ # Strategy C: fallback-model + format='json'
+ logger.info(
+ "plausibility A+B empty, trying C (fallback=%s) batch=%d",
+ FALLBACK_MODEL, len(items),
+ )
+ content = await _post_llm(
+ {**_body(FALLBACK_MODEL), "format": "json"},
+ )
if not content:
# Strategy C: split + recurse