""" Unified Compliance Check Routes — check all documents in one request. POST /compliance/agent/extract-text — extract text from a URL POST /compliance/agent/compliance-check — unified check for all documents GET /compliance/agent/compliance-check/{check_id} — poll status Phase 5 split (2026-06-06): the original 2700-line monolith is now decomposed into the `agent_check/` subpackage: - _orchestrator.py — thin run_compliance_check pipeline - _phase_a_resolve.py — TDM + Step 1 (resolve / discover / split) - _phase_b_profile_check.py — Step 2 + Step 3 (profile + doc checks) - _phase_c_banner.py — Step 3b-d (banner + cross-check + TCF) + Step 4 - _phase_d1_vendors_raw.py / _phase_d2_vendors_finalize.py — Step 5 vendor extraction + finalize - _phase_d3_blocks_top.py / mid / bot — Step 5 HTML blocks - _phase_e_email.py — Step 6 (with A1 ZIP-Anhang) - _phase_f_persist.py — Step 7 (snapshot + audit log + unified findings) - _helpers.py / _constants.py / _state.py / _schemas.py — shared External callers (saving_scan_routes, agent_migration_routes, tests) keep importing helpers from THIS module — everything is re-exported. """ from __future__ import annotations import asyncio import logging import uuid as _uuid import httpx from fastapi import APIRouter # ── Re-exports: external callers import these from THIS module ────── from .agent_check._constants import ( # noqa: F401 CONSENT_TESTER_URL, _ALL_DOC_TYPES, _COMPOUND_TLDS, _DISCOVERY_RULES, _DOC_TYPE_LABELS, _compliance_check_jobs, ) from .agent_check._discovery import _autodiscover_missing # noqa: F401 from .agent_check._fetch import _fetch_text # noqa: F401 from .agent_check._helpers import ( # noqa: F401 _apply_profile_filter, _build_profile_html, _classify_discovered_doc, _company_name_from_url, _doc_type_label, _extract_domain, _get_skip_types, _pad_results_with_missing, _result_to_dict, _update, ) from .agent_check._orchestrator import run_compliance_check as _run_compliance_check # noqa: F401 from .agent_check._schemas import ( ComplianceCheckRequest, ComplianceCheckStartResponse, ComplianceCheckStatusResponse, DocumentInput, ExtractTextRequest, ) from .agent_check._single_check import _check_single # noqa: F401 logger = logging.getLogger(__name__) router = APIRouter(prefix="/compliance/agent", tags=["agent"]) # ── Extract text endpoint ──────────────────────────────────────────── @router.post("/extract-text") async def extract_text(req: ExtractTextRequest): """Extract text from a URL via consent-tester DSI discovery. Merges all documents found on the page (sub-pages, accordions, etc.) """ try: async with httpx.AsyncClient(timeout=300.0) as client: resp = await client.post( f"{CONSENT_TESTER_URL}/dsi-discovery", json={"url": req.url, "max_documents": 5}, timeout=300.0, ) if resp.status_code != 200: return { "text": "", "word_count": 0, "title": "", "error": f"HTTP {resp.status_code} von Consent-Tester", } data = resp.json() docs = data.get("documents", []) if not docs: return { "text": "", "word_count": 0, "title": "", "error": "Kein Text extrahierbar", } # Merge all documents (handles multi-page DSIs like BMW) texts = [] for doc in docs: t = doc.get("full_text", "") or doc.get("text_preview", "") or "" if t and len(t) > 50: texts.append(t) text = "\n\n".join(texts) if texts else "" title = docs[0].get("title", "") or docs[0].get("doc_type", "") word_count = len(text.split()) return { "text": text, "word_count": word_count, "title": title, "error": "", } except Exception as e: logger.warning("extract-text failed for %s: %s", req.url, e) return { "text": "", "word_count": 0, "title": "", "error": str(e)[:200], } # ── Unified compliance check ──────────────────────────────────────── @router.post("/compliance-check") async def start_compliance_check(req: ComplianceCheckRequest): """Start async compliance check for all documents.""" check_id = str(_uuid.uuid4())[:8] _compliance_check_jobs[check_id] = { "status": "running", "progress": "Pruefung gestartet...", "progress_pct": 0, "result": None, "error": "", } asyncio.create_task(_run_compliance_check(check_id, req)) return ComplianceCheckStartResponse(check_id=check_id, status="running") @router.get("/compliance-check/{check_id}") async def get_compliance_check_status(check_id: str): """Poll compliance check status.""" job = _compliance_check_jobs.get(check_id) if not job: return {"check_id": check_id, "status": "not_found"} return ComplianceCheckStatusResponse( check_id=check_id, status=job["status"], progress=job.get("progress", ""), progress_pct=job.get("progress_pct", 0), result=job.get("result"), error=job.get("error", ""), ) # ── P80: Snapshot + Replay ─────────────────────────────────────────── @router.get("/snapshots") async def list_snapshots(domain: str = "", limit: int = 20): """P80: list recent snapshots, optionally filtered by site_domain.""" from database import SessionLocal from compliance.services.check_snapshot import list_snapshots_for_domain db = SessionLocal() try: if domain: return {"snapshots": list_snapshots_for_domain(db, domain, limit)} from sqlalchemy import text rows = db.execute( text(""" SELECT id, check_id, site_domain, site_label, created_at, replay_count, notes FROM compliance.compliance_check_snapshots ORDER BY created_at DESC LIMIT :lim """), {"lim": limit}, ).fetchall() return {"snapshots": [ {"id": str(r[0]), "check_id": r[1], "site_domain": r[2], "site_label": r[3], "created_at": str(r[4]), "replay_count": r[5], "notes": r[6]} for r in rows ]} finally: db.close() @router.get("/snapshots/{snapshot_id}") async def get_snapshot(snapshot_id: str): """P80: load full snapshot raw data.""" from fastapi import HTTPException from database import SessionLocal from compliance.services.check_snapshot import load_snapshot db = SessionLocal() try: snap = load_snapshot(db, snapshot_id) if not snap: raise HTTPException(status_code=404, detail="snapshot not found") return snap finally: db.close() @router.get("/admin/benchmark") async def benchmark( industry: str = "", sites: str = "", anonymized: bool = False, limit: int = 50, ): """P107 — Branchen-Benchmark-Cockpit Endpoint.""" from database import SessionLocal from compliance.services.benchmark_extractor import ( anonymize_kpis, build_benchmark_summary, load_snapshots_for_benchmark, ) site_list = [s.strip() for s in sites.split(",") if s.strip()] if sites else None db = SessionLocal() try: kpis = load_snapshots_for_benchmark( db, industry=industry or None, sites=site_list, limit=limit, ) finally: db.close() if anonymized: kpis = anonymize_kpis(kpis, industry=industry) return { "industry": industry or "all", "anonymized": anonymized, "sites": [k.get("site_label") for k in kpis], "kpis": kpis, "summary": build_benchmark_summary(kpis), } @router.post("/admin/tcf-ingest") async def tcf_ingest(): """P105 — IAB TCF Vendor-Liste ingestieren / refreshen.""" from database import SessionLocal from compliance.services.tcf_vendor_authority import ( fetch_and_ingest_tcf_vendors, ) db = SessionLocal() try: return await fetch_and_ingest_tcf_vendors(db) finally: db.close() @router.get("/snapshots/{snapshot_id}/pdf") async def export_snapshot_pdf(snapshot_id: str): """P88 — PDF-Export der Audit-Mail. Liefert application/pdf.""" from fastapi import HTTPException from fastapi.responses import Response from database import SessionLocal from compliance.services.mail_pdf_export import render_snapshot_as_pdf db = SessionLocal() try: pdf = render_snapshot_as_pdf(db, snapshot_id) finally: db.close() if not pdf: raise HTTPException(404, f"Snapshot {snapshot_id} nicht gefunden " "oder PDF-Render fehlgeschlagen.") fname = f"breakpilot-audit-{snapshot_id[:8]}.pdf" return Response( content=pdf, media_type="application/pdf", headers={"Content-Disposition": f'attachment; filename="{fname}"'}, ) @router.post("/snapshots/{snapshot_id}/replay") async def replay_snapshot( snapshot_id: str, recipient: str = "", dry_run: bool = True, ): """P80: replay audit mail render from snapshot. 7min->2sec test cycle. Default dry_run=true just returns rendered HTML size + section breakdown. Pass recipient + dry_run=false to actually send a [REPLAY] mail. """ from database import SessionLocal from compliance.services.check_replay import replay_from_snapshot db = SessionLocal() try: return replay_from_snapshot( db, snapshot_id=snapshot_id, recipient=(recipient if recipient else None), dry_run=dry_run, ) finally: db.close() # ── Admin: audit drill-down (A5) + trend view (A6) ────────────────── @router.get("/audit/{check_id}") async def audit_drill_down( check_id: str, doc_type: str = "", regulation: str = "", only_failed: bool = False, ): """Return scorecard + filterable MC results for a single check run. Frontend uses this to render the /sdk/agent/audit/ view. """ from compliance.services.compliance_audit_log import ( get_check_run, list_mc_results, ) run = get_check_run(check_id) if not run: return {"check_id": check_id, "found": False} rows = list_mc_results( check_id, doc_type=doc_type or None, regulation=regulation or None, only_failed=only_failed, ) return { "check_id": check_id, "found": True, "run": run, "mc_count": len(rows), "results": rows, } @router.get("/audit/tenant/{tenant_id}") async def audit_tenant_history( tenant_id: str, base_domain: str = "", limit: int = 30, ): """Tenant-level history for the trend view (A6).""" from compliance.services.compliance_audit_log import list_runs_for_tenant runs = list_runs_for_tenant( tenant_id, base_domain=base_domain or None, limit=limit, ) return {"tenant_id": tenant_id, "count": len(runs), "runs": runs}