# breakpilot-compliance/backend-compliance/compliance/api/agent_compliance_check_routes.py

"""
Unified Compliance Check Routes — check all documents in one request.

POST /compliance/agent/extract-text — extract text from a URL
POST /compliance/agent/compliance-check — unified check for all documents
GET  /compliance/agent/compliance-check/{check_id} — poll status

Phase 5 split (2026-06-06): the original 2700-line monolith is now
decomposed into the `agent_check/` subpackage:
  - _orchestrator.py — thin run_compliance_check pipeline
  - _phase_a_resolve.py — TDM + Step 1 (resolve / discover / split)
  - _phase_b_profile_check.py — Step 2 + Step 3 (profile + doc checks)
  - _phase_c_banner.py — Step 3b-d (banner + cross-check + TCF) + Step 4
  - _phase_d1_vendors_raw.py / _phase_d2_vendors_finalize.py — Step 5
    vendor extraction + finalize
  - _phase_d3_blocks_top.py / mid / bot — Step 5 HTML blocks
  - _phase_e_email.py — Step 6 (with A1 ZIP-Anhang)
  - _phase_f_persist.py — Step 7 (snapshot + audit log + unified findings)
  - _helpers.py / _constants.py / _state.py / _schemas.py — shared

External callers (saving_scan_routes, agent_migration_routes, tests)
keep importing helpers from THIS module — everything is re-exported.
"""

from __future__ import annotations

import asyncio
import logging
import uuid as _uuid

import httpx
from fastapi import APIRouter

# ── Re-exports: external callers import these from THIS module ──────
from .agent_check._constants import (  # noqa: F401
    CONSENT_TESTER_URL,
    _ALL_DOC_TYPES,
    _COMPOUND_TLDS,
    _DISCOVERY_RULES,
    _DOC_TYPE_LABELS,
    _compliance_check_jobs,
)
from .agent_check._discovery import _autodiscover_missing  # noqa: F401
from .agent_check._fetch import _fetch_text  # noqa: F401
from .agent_check._helpers import (  # noqa: F401
    _apply_profile_filter,
    _build_profile_html,
    _classify_discovered_doc,
    _company_name_from_url,
    _doc_type_label,
    _extract_domain,
    _get_skip_types,
    _pad_results_with_missing,
    _result_to_dict,
    _update,
)
from .agent_check._orchestrator import run_compliance_check as _run_compliance_check  # noqa: F401
from .agent_check._schemas import (
    ComplianceCheckRequest,
    ComplianceCheckStartResponse,
    ComplianceCheckStatusResponse,
    DocumentInput,
    ExtractTextRequest,
)
from .agent_check._single_check import _check_single  # noqa: F401

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Router that collects every endpoint declared below; all of their paths are
# prefixed with /compliance/agent and tagged "agent".
router = APIRouter(prefix="/compliance/agent", tags=["agent"])


# ── Extract text endpoint ────────────────────────────────────────────

@router.post("/extract-text")
async def extract_text(req: ExtractTextRequest):
    """Extract the text behind a URL via the consent-tester DSI discovery.

    Posts ``req.url`` to ``{CONSENT_TESTER_URL}/dsi-discovery`` (asking for at
    most 5 documents) and joins the text of every returned document with a
    blank line, so multi-page privacy notices end up in one string. For each
    document ``full_text`` is preferred over ``text_preview``; snippets of 50
    characters or fewer are dropped.

    Args:
        req: Request body; only ``req.url`` is read here.

    Returns:
        A dict with the keys ``text``, ``word_count``, ``title`` and
        ``error``. ``error`` is an empty string on success. On failure
        ``text`` is empty, ``word_count`` is 0 and ``error`` carries a
        message (exception text is cut to 200 characters).

    This handler does not raise: every exception is logged as a warning and
    reported through the ``error`` field.
    """

    def _failure(message: str, title: str = "") -> dict:
        # Single place that defines the shape of an unsuccessful response.
        return {"text": "", "word_count": 0, "title": title, "error": message}

    try:
        # The client-level timeout applies to every request sent through the
        # client, so no per-request override is needed.
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": req.url, "max_documents": 5},
            )
            if resp.status_code != 200:
                return _failure(f"HTTP {resp.status_code} von Consent-Tester")

            docs = resp.json().get("documents", [])
            if not docs:
                return _failure("Kein Text extrahierbar")

            # Merge all documents (handles multi-page DSIs like BMW).
            texts = []
            for doc in docs:
                body = doc.get("full_text", "") or doc.get("text_preview", "") or ""
                if len(body) > 50:
                    texts.append(body)

            first = docs[0]
            # Trailing `or ""` keeps the title a string even when both
            # fields are present but null.
            title = first.get("title", "") or first.get("doc_type", "") or ""

            if not texts:
                # Documents were found but none carried usable text. Report
                # that explicitly instead of returning an empty "success".
                return _failure("Kein Text extrahierbar", title=title)

            text = "\n\n".join(texts)
            return {
                "text": text,
                "word_count": len(text.split()),
                "title": title,
                "error": "",
            }

    except Exception as e:
        # Deliberate best-effort boundary: the caller always gets a JSON body.
        logger.warning("extract-text failed for %s: %s", req.url, e)
        return _failure(str(e)[:200])


# ── Unified compliance check ────────────────────────────────────────

# Strong references to in-flight check tasks. The event loop only keeps weak
# references to tasks, so a task nobody else references can be
# garbage-collected before it finishes.
_background_check_tasks: set[asyncio.Task] = set()


def _on_check_task_done(task: asyncio.Task) -> None:
    """Drop the finished task and log any exception that escaped it."""
    _background_check_tasks.discard(task)
    if task.cancelled():
        return
    exc = task.exception()
    if exc is not None:
        logger.error(
            "compliance check task %s crashed", task.get_name(), exc_info=exc,
        )


@router.post("/compliance-check")
async def start_compliance_check(req: ComplianceCheckRequest):
    """Start an asynchronous compliance check covering all documents.

    Registers a new job entry in ``_compliance_check_jobs`` with status
    ``"running"`` and schedules ``_run_compliance_check`` as a background
    task on the running event loop.

    Args:
        req: The check request, passed through unchanged to the pipeline.

    Returns:
        ``ComplianceCheckStartResponse`` carrying the new ``check_id`` and
        the status ``"running"``. Progress is read through the status
        endpoint using that id.
    """
    check_id = str(_uuid.uuid4())[:8]
    # Only 8 characters of the UUID are kept, so make sure a live job entry
    # is never overwritten by a colliding id.
    while check_id in _compliance_check_jobs:
        check_id = str(_uuid.uuid4())[:8]

    _compliance_check_jobs[check_id] = {
        "status": "running",
        "progress": "Pruefung gestartet...",
        "progress_pct": 0,
        "result": None,
        "error": "",
    }

    task = asyncio.create_task(
        _run_compliance_check(check_id, req),
        name=f"compliance-check-{check_id}",
    )
    # Hold a reference until the task completes (see the set's comment).
    _background_check_tasks.add(task)
    task.add_done_callback(_on_check_task_done)

    return ComplianceCheckStartResponse(check_id=check_id, status="running")


@router.get("/compliance-check/{check_id}")
async def get_compliance_check_status(check_id: str):
    """Report the current state of a compliance check job.

    Looks the job up in ``_compliance_check_jobs``. A missing (or empty)
    entry yields a plain dict with status ``"not_found"``; otherwise the
    stored fields are wrapped in a ``ComplianceCheckStatusResponse``, with
    defaults for any optional field the job entry lacks.
    """
    job = _compliance_check_jobs.get(check_id)
    if job:
        payload = {
            "check_id": check_id,
            "status": job["status"],
            "progress": job.get("progress", ""),
            "progress_pct": job.get("progress_pct", 0),
            "result": job.get("result"),
            "error": job.get("error", ""),
        }
        return ComplianceCheckStatusResponse(**payload)
    return {"check_id": check_id, "status": "not_found"}


# ── P80: Snapshot + Replay ───────────────────────────────────────────

@router.get("/snapshots")
async def list_snapshots(domain: str = "", limit: int = 20):
    """P80: list recent snapshots, optionally restricted to one site domain.

    With a non-empty ``domain`` the lookup is delegated to
    ``list_snapshots_for_domain``. Otherwise the newest ``limit`` rows of
    ``compliance.compliance_check_snapshots`` are read directly and returned
    as dicts, with ``id`` and ``created_at`` converted to strings.

    Returns:
        ``{"snapshots": [...]}``.
    """
    from database import SessionLocal
    from compliance.services.check_snapshot import list_snapshots_for_domain

    db = SessionLocal()
    try:
        if domain:
            return {"snapshots": list_snapshots_for_domain(db, domain, limit)}

        from sqlalchemy import text

        stmt = text("""
                SELECT id, check_id, site_domain, site_label, created_at,
                       replay_count, notes
                FROM compliance.compliance_check_snapshots
                ORDER BY created_at DESC
                LIMIT :lim
            """)
        rows = db.execute(stmt, {"lim": limit}).fetchall()

        snapshots = []
        # Column order follows the SELECT list above.
        for (snap_id, check_id, site_domain, site_label,
             created_at, replay_count, notes) in rows:
            snapshots.append({
                "id": str(snap_id),
                "check_id": check_id,
                "site_domain": site_domain,
                "site_label": site_label,
                "created_at": str(created_at),
                "replay_count": replay_count,
                "notes": notes,
            })
        return {"snapshots": snapshots}
    finally:
        db.close()


@router.get("/snapshots/{snapshot_id}")
async def get_snapshot(snapshot_id: str):
    """P80: return the stored raw data of a single snapshot.

    Raises:
        HTTPException: 404 when ``load_snapshot`` returns nothing for the id.
    """
    from fastapi import HTTPException
    from database import SessionLocal
    from compliance.services.check_snapshot import load_snapshot

    db = SessionLocal()
    try:
        snapshot = load_snapshot(db, snapshot_id)
        if snapshot:
            return snapshot
        raise HTTPException(status_code=404, detail="snapshot not found")
    finally:
        # Runs on both the return and the 404 path.
        db.close()


@router.get("/admin/benchmark")
async def benchmark(
    industry: str = "",
    sites: str = "",
    anonymized: bool = False,
    limit: int = 50,
):
    """P107: endpoint for the industry benchmark cockpit.

    Args:
        industry: Industry filter; empty means no filter.
        sites: Comma-separated site filter; empty means no filter.
        anonymized: When true, the KPIs go through ``anonymize_kpis`` before
            they are returned.
        limit: Forwarded to ``load_snapshots_for_benchmark``.

    Returns:
        A dict with ``industry`` (``"all"`` when unfiltered), ``anonymized``,
        ``sites`` (the ``site_label`` of each KPI entry), ``kpis`` and
        ``summary``.
    """
    from database import SessionLocal
    from compliance.services.benchmark_extractor import (
        anonymize_kpis,
        build_benchmark_summary,
        load_snapshots_for_benchmark,
    )

    # An empty `sites` string means "no filter" (None); a non-empty one is
    # split on commas, dropping blank entries.
    site_filter = None
    if sites:
        stripped = (chunk.strip() for chunk in sites.split(","))
        site_filter = [name for name in stripped if name]

    db = SessionLocal()
    try:
        kpis = load_snapshots_for_benchmark(
            db,
            industry=industry or None,
            sites=site_filter,
            limit=limit,
        )
    finally:
        db.close()

    if anonymized:
        kpis = anonymize_kpis(kpis, industry=industry)

    response = {
        "industry": industry or "all",
        "anonymized": anonymized,
        "sites": [entry.get("site_label") for entry in kpis],
        "kpis": kpis,
    }
    response["summary"] = build_benchmark_summary(kpis)
    return response


@router.post("/admin/tcf-ingest")
async def tcf_ingest():
    """P105: ingest / refresh the IAB TCF vendor list.

    Opens a database session, awaits ``fetch_and_ingest_tcf_vendors`` with it
    and returns whatever that call produces. The session is closed afterwards
    whether or not the call succeeds.
    """
    from database import SessionLocal
    from compliance.services.tcf_vendor_authority import (
        fetch_and_ingest_tcf_vendors,
    )

    db = SessionLocal()
    try:
        outcome = await fetch_and_ingest_tcf_vendors(db)
    finally:
        db.close()
    return outcome


@router.get("/snapshots/{snapshot_id}/pdf")
async def export_snapshot_pdf(snapshot_id: str):
    """P88: export the audit mail of a snapshot as a PDF download.

    Returns:
        A ``Response`` with media type ``application/pdf`` served as an
        attachment; the file name embeds the first 8 characters of the
        snapshot id.

    Raises:
        HTTPException: 404 when ``render_snapshot_as_pdf`` yields nothing.
    """
    from fastapi import HTTPException
    from fastapi.responses import Response
    from database import SessionLocal
    from compliance.services.mail_pdf_export import render_snapshot_as_pdf

    db = SessionLocal()
    try:
        pdf_bytes = render_snapshot_as_pdf(db, snapshot_id)
    finally:
        db.close()

    if pdf_bytes:
        download_name = f"breakpilot-audit-{snapshot_id[:8]}.pdf"
        disposition = f'attachment; filename="{download_name}"'
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={"Content-Disposition": disposition},
        )

    message = (
        f"Snapshot {snapshot_id} nicht gefunden "
        "oder PDF-Render fehlgeschlagen."
    )
    raise HTTPException(404, message)


@router.post("/snapshots/{snapshot_id}/replay")
async def replay_snapshot(
    snapshot_id: str,
    recipient: str = "",
    dry_run: bool = True,
):
    """P80: re-render the audit mail from a stored snapshot.

    Intended as a fast test cycle (seconds instead of a full check run).
    Per the original author's notes, the default ``dry_run=true`` only
    reports the rendered HTML size and section breakdown, while passing a
    ``recipient`` together with ``dry_run=false`` sends a [REPLAY] mail.

    An empty ``recipient`` is forwarded to ``replay_from_snapshot`` as
    ``None``; the function's result is returned unchanged.
    """
    from database import SessionLocal
    from compliance.services.check_replay import replay_from_snapshot

    # Empty string means "no recipient given".
    target = recipient or None

    db = SessionLocal()
    try:
        outcome = replay_from_snapshot(
            db,
            snapshot_id=snapshot_id,
            recipient=target,
            dry_run=dry_run,
        )
    finally:
        db.close()
    return outcome


# ── Admin: audit drill-down (A5) + trend view (A6) ──────────────────

@router.get("/audit/{check_id}")
async def audit_drill_down(
    check_id: str,
    doc_type: str = "",
    regulation: str = "",
    only_failed: bool = False,
):
    """Return the run record plus filterable MC results for one check run.

    The frontend renders its /sdk/agent/audit/<check_id> view from this.

    Args:
        check_id: Identifier of the check run.
        doc_type: Optional document-type filter; empty means no filter.
        regulation: Optional regulation filter; empty means no filter.
        only_failed: Forwarded to ``list_mc_results`` unchanged.

    Returns:
        ``{"check_id", "found": False}`` when the run is unknown, otherwise a
        dict with ``found: True``, the ``run``, the ``results`` and their
        count as ``mc_count``.
    """
    from compliance.services.compliance_audit_log import (
        get_check_run, list_mc_results,
    )

    run = get_check_run(check_id)
    if not run:
        return {"check_id": check_id, "found": False}

    # Empty-string filters are passed on as None.
    filters = {
        "doc_type": doc_type or None,
        "regulation": regulation or None,
        "only_failed": only_failed,
    }
    mc_rows = list_mc_results(check_id, **filters)

    return {
        "check_id": check_id,
        "found": True,
        "run": run,
        "mc_count": len(mc_rows),
        "results": mc_rows,
    }


@router.get("/audit/tenant/{tenant_id}")
async def audit_tenant_history(
    tenant_id: str,
    base_domain: str = "",
    limit: int = 30,
):
    """Return a tenant's check-run history for the trend view (A6).

    An empty ``base_domain`` is forwarded to ``list_runs_for_tenant`` as
    ``None``. The response carries the tenant id, the number of runs and the
    runs themselves.
    """
    from compliance.services.compliance_audit_log import list_runs_for_tenant

    domain_filter = base_domain or None
    history = list_runs_for_tenant(
        tenant_id,
        base_domain=domain_filter,
        limit=limit,
    )
    return {
        "tenant_id": tenant_id,
        "count": len(history),
        "runs": history,
    }