Files
breakpilot-compliance/backend-compliance/compliance/api/agent_compliance_check_routes.py
T
Benjamin Admin c2c8783fee refactor(agent-check): split routes file (2692→347 LOC) + wire B1/B3/A1 [guardrail-change]
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:

  - Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
    HTML blocks top+mid+bot / email / persist
  - Helpers: _constants, _helpers, _fetch, _discovery, _single_check
  - Schemas + State + thin _orchestrator

A1 ZIP-Anhang nativ in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py erweitert um attachments-Parameter.

B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.

B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.

Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
  audit_quality_findings before declaration, caught by surrounding
  except → block never rendered). New _phase_d3_blocks_bot.py runs
  audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
  ("Phase 5 split target" — done).

Tests: 55/55 grün (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-06 14:47:25 +02:00

348 lines
12 KiB
Python

"""
Unified Compliance Check Routes — check all documents in one request.
POST /compliance/agent/extract-text — extract text from a URL
POST /compliance/agent/compliance-check — unified check for all documents
GET /compliance/agent/compliance-check/{check_id} — poll status
Phase 5 split (2026-06-06): the original 2700-line monolith is now
decomposed into the `agent_check/` subpackage:
- _orchestrator.py — thin run_compliance_check pipeline
- _phase_a_resolve.py — TDM + Step 1 (resolve / discover / split)
- _phase_b_profile_check.py — Step 2 + Step 3 (profile + doc checks)
- _phase_c_banner.py — Step 3b-d (banner + cross-check + TCF) + Step 4
- _phase_d1_vendors_raw.py / _phase_d2_vendors_finalize.py — Step 5
vendor extraction + finalize
- _phase_d3_blocks_top.py / mid / bot — Step 5 HTML blocks
- _phase_e_email.py — Step 6 (with A1 ZIP-Anhang)
- _phase_f_persist.py — Step 7 (snapshot + audit log + unified findings)
- _helpers.py / _constants.py / _state.py / _schemas.py — shared
External callers (saving_scan_routes, agent_migration_routes, tests)
keep importing helpers from THIS module — everything is re-exported.
"""
from __future__ import annotations
import asyncio
import logging
import uuid as _uuid
import httpx
from fastapi import APIRouter
# ── Re-exports: external callers import these from THIS module ──────
from .agent_check._constants import ( # noqa: F401
CONSENT_TESTER_URL,
_ALL_DOC_TYPES,
_COMPOUND_TLDS,
_DISCOVERY_RULES,
_DOC_TYPE_LABELS,
_compliance_check_jobs,
)
from .agent_check._discovery import _autodiscover_missing # noqa: F401
from .agent_check._fetch import _fetch_text # noqa: F401
from .agent_check._helpers import ( # noqa: F401
_apply_profile_filter,
_build_profile_html,
_classify_discovered_doc,
_company_name_from_url,
_doc_type_label,
_extract_domain,
_get_skip_types,
_pad_results_with_missing,
_result_to_dict,
_update,
)
from .agent_check._orchestrator import run_compliance_check as _run_compliance_check # noqa: F401
from .agent_check._schemas import (
ComplianceCheckRequest,
ComplianceCheckStartResponse,
ComplianceCheckStatusResponse,
DocumentInput,
ExtractTextRequest,
)
from .agent_check._single_check import _check_single # noqa: F401
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Router for every endpoint defined in this file; all paths are served under
# the "/compliance/agent" prefix and tagged "agent".
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
# ── Extract text endpoint ────────────────────────────────────────────
@router.post("/extract-text")
async def extract_text(req: ExtractTextRequest):
    """Extract text from a URL via consent-tester DSI discovery.

    Posts ``req.url`` to the consent-tester ``/dsi-discovery`` endpoint and
    merges the text of all documents found on the page (sub-pages,
    accordions, etc.) into one string, separated by blank lines.

    Args:
        req: Request body; only ``req.url`` is read here.

    Returns:
        A dict with the keys ``text``, ``word_count``, ``title`` and
        ``error``. Failures are reported through a non-empty ``error`` (with
        empty ``text``) instead of being raised: any ``Exception`` is caught,
        logged as a warning and truncated to 200 characters.
    """

    def _failure(message: str) -> dict:
        # Single definition of the error payload so every failure path
        # returns exactly the same shape.
        return {"text": "", "word_count": 0, "title": "", "error": message}

    try:
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": req.url, "max_documents": 5},
                timeout=300.0,
            )
            if resp.status_code != 200:
                return _failure(f"HTTP {resp.status_code} von Consent-Tester")

            docs = resp.json().get("documents", [])
            if not docs:
                return _failure("Kein Text extrahierbar")

            # Merge all documents (handles multi-page DSIs like BMW).
            # Prefer the full text, fall back to the preview, and ignore
            # fragments of 50 characters or fewer.
            texts = []
            for doc in docs:
                candidate = (
                    doc.get("full_text", "") or doc.get("text_preview", "") or ""
                )
                if len(candidate) > 50:
                    texts.append(candidate)

            if not texts:
                # Documents were discovered but none carried usable text.
                # Report it like the "no documents" case instead of returning
                # an empty text with an empty error (a silent success).
                return _failure("Kein Text extrahierbar")

            text = "\n\n".join(texts)
            title = docs[0].get("title", "") or docs[0].get("doc_type", "")
            return {
                "text": text,
                "word_count": len(text.split()),
                "title": title,
                "error": "",
            }
    except Exception as e:
        # Deliberate best-effort boundary: callers get an error payload,
        # never an exception from this endpoint.
        logger.warning("extract-text failed for %s: %s", req.url, e)
        return _failure(str(e)[:200])
# ── Unified compliance check ────────────────────────────────────────
# Strong references to in-flight compliance-check tasks. The event loop only
# keeps weak references to tasks, so a task whose result is discarded may be
# garbage-collected before it finishes.
_compliance_check_tasks: set = set()


@router.post("/compliance-check")
async def start_compliance_check(req: ComplianceCheckRequest):
    """Start async compliance check for all documents.

    Registers a new job entry in ``_compliance_check_jobs`` under a fresh
    8-character id, schedules ``_run_compliance_check`` as a background task
    and returns immediately.

    Args:
        req: The check request, passed through unchanged to the pipeline.

    Returns:
        ``ComplianceCheckStartResponse`` carrying the new ``check_id`` and
        the status ``"running"``.
    """
    check_id = str(_uuid.uuid4())[:8]
    _compliance_check_jobs[check_id] = {
        "status": "running",
        "progress": "Pruefung gestartet...",
        "progress_pct": 0,
        "result": None,
        "error": "",
    }
    task = asyncio.create_task(_run_compliance_check(check_id, req))
    # Hold the task until it completes, then drop the reference again so the
    # set does not grow without bound.
    _compliance_check_tasks.add(task)
    task.add_done_callback(_compliance_check_tasks.discard)
    return ComplianceCheckStartResponse(check_id=check_id, status="running")
@router.get("/compliance-check/{check_id}")
async def get_compliance_check_status(check_id: str):
    """Poll compliance check status.

    Looks the job up in ``_compliance_check_jobs``. A known job is returned
    as ``ComplianceCheckStatusResponse``; an unknown (or empty) job entry
    yields a plain dict with the status ``"not_found"``.
    """
    job = _compliance_check_jobs.get(check_id)
    if job:
        return ComplianceCheckStatusResponse(
            check_id=check_id,
            status=job["status"],
            progress=job.get("progress", ""),
            progress_pct=job.get("progress_pct", 0),
            result=job.get("result"),
            error=job.get("error", ""),
        )
    return {"check_id": check_id, "status": "not_found"}
# ── P80: Snapshot + Replay ───────────────────────────────────────────
@router.get("/snapshots")
async def list_snapshots(domain: str = "", limit: int = 20):
    """P80: list recent snapshots, optionally filtered by site_domain.

    With a non-empty ``domain`` the lookup is delegated to
    ``list_snapshots_for_domain``. Otherwise the newest ``limit`` rows of
    ``compliance.compliance_check_snapshots`` are read directly.

    Returns:
        ``{"snapshots": [...]}``. In the unfiltered case each entry has the
        keys ``id``, ``check_id``, ``site_domain``, ``site_label``,
        ``created_at``, ``replay_count`` and ``notes``.
    """
    from database import SessionLocal
    from compliance.services.check_snapshot import list_snapshots_for_domain

    session = SessionLocal()
    try:
        if not domain:
            from sqlalchemy import text

            query = text("""
                SELECT id, check_id, site_domain, site_label, created_at,
                       replay_count, notes
                FROM compliance.compliance_check_snapshots
                ORDER BY created_at DESC
                LIMIT :lim
            """)
            records = session.execute(query, {"lim": limit}).fetchall()

            snapshots = []
            for (
                snap_id, check_id, site_domain, site_label,
                created_at, replay_count, notes,
            ) in records:
                snapshots.append({
                    # id and created_at are stringified for the response.
                    "id": str(snap_id),
                    "check_id": check_id,
                    "site_domain": site_domain,
                    "site_label": site_label,
                    "created_at": str(created_at),
                    "replay_count": replay_count,
                    "notes": notes,
                })
            return {"snapshots": snapshots}

        return {"snapshots": list_snapshots_for_domain(session, domain, limit)}
    finally:
        # Always release the session, including on errors.
        session.close()
@router.get("/snapshots/{snapshot_id}")
async def get_snapshot(snapshot_id: str):
    """P80: load full snapshot raw data.

    Returns whatever ``load_snapshot`` yields for ``snapshot_id``.

    Raises:
        HTTPException: 404 when ``load_snapshot`` returns a falsy value.
    """
    from fastapi import HTTPException
    from database import SessionLocal
    from compliance.services.check_snapshot import load_snapshot

    session = SessionLocal()
    try:
        snapshot = load_snapshot(session, snapshot_id)
        if snapshot:
            return snapshot
        raise HTTPException(status_code=404, detail="snapshot not found")
    finally:
        # The session is closed on both the success and the 404 path.
        session.close()
@router.get("/admin/benchmark")
async def benchmark(
    industry: str = "",
    sites: str = "",
    anonymized: bool = False,
    limit: int = 50,
):
    """P107 — industry benchmark cockpit endpoint.

    Args:
        industry: Industry filter; an empty string is passed on as ``None``
            and reported as ``"all"`` in the response.
        sites: Comma-separated site list; entries are stripped and blank
            ones dropped. An empty string is passed on as ``None``.
        anonymized: When true, the KPIs go through ``anonymize_kpis``
            before being returned.
        limit: Forwarded to ``load_snapshots_for_benchmark``.

    Returns:
        A dict with ``industry``, ``anonymized``, ``sites`` (the
        ``site_label`` of each KPI entry), ``kpis`` and ``summary``.
    """
    from database import SessionLocal
    from compliance.services.benchmark_extractor import (
        anonymize_kpis,
        build_benchmark_summary,
        load_snapshots_for_benchmark,
    )

    site_list = None
    if sites:
        site_list = [part.strip() for part in sites.split(",") if part.strip()]

    session = SessionLocal()
    try:
        kpis = load_snapshots_for_benchmark(
            session,
            industry=industry or None,
            sites=site_list,
            limit=limit,
        )
    finally:
        session.close()

    # Anonymisation and summary work on the loaded data only, so they run
    # after the session has been closed.
    if anonymized:
        kpis = anonymize_kpis(kpis, industry=industry)

    site_labels = [entry.get("site_label") for entry in kpis]
    return {
        "industry": industry or "all",
        "anonymized": anonymized,
        "sites": site_labels,
        "kpis": kpis,
        "summary": build_benchmark_summary(kpis),
    }
@router.post("/admin/tcf-ingest")
async def tcf_ingest():
    """P105 — ingest / refresh the IAB TCF vendor list.

    Awaits ``fetch_and_ingest_tcf_vendors`` with a fresh DB session and
    returns its result unchanged.
    """
    from database import SessionLocal
    from compliance.services.tcf_vendor_authority import (
        fetch_and_ingest_tcf_vendors,
    )

    session = SessionLocal()
    try:
        outcome = await fetch_and_ingest_tcf_vendors(session)
    finally:
        # Close the session whether or not the ingest succeeded.
        session.close()
    return outcome
@router.get("/snapshots/{snapshot_id}/pdf")
async def export_snapshot_pdf(snapshot_id: str):
    """P88 — PDF export of the audit mail. Responds with application/pdf.

    The file is offered as an attachment named after the first eight
    characters of ``snapshot_id``.

    Raises:
        HTTPException: 404 when ``render_snapshot_as_pdf`` returns a falsy
            value.
    """
    from fastapi import HTTPException
    from fastapi.responses import Response
    from database import SessionLocal
    from compliance.services.mail_pdf_export import render_snapshot_as_pdf

    session = SessionLocal()
    try:
        pdf_bytes = render_snapshot_as_pdf(session, snapshot_id)
    finally:
        session.close()

    if pdf_bytes:
        fname = f"breakpilot-audit-{snapshot_id[:8]}.pdf"
        disposition = f'attachment; filename="{fname}"'
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={"Content-Disposition": disposition},
        )

    raise HTTPException(
        404,
        f"Snapshot {snapshot_id} nicht gefunden "
        "oder PDF-Render fehlgeschlagen.",
    )
@router.post("/snapshots/{snapshot_id}/replay")
async def replay_snapshot(
    snapshot_id: str,
    recipient: str = "",
    dry_run: bool = True,
):
    """P80: replay audit mail render from snapshot. 7min->2sec test cycle.

    Default dry_run=true just returns rendered HTML size + section breakdown.
    Pass recipient + dry_run=false to actually send a [REPLAY] mail.

    An empty ``recipient`` is forwarded to ``replay_from_snapshot`` as
    ``None``; the function's result is returned unchanged.
    """
    from database import SessionLocal
    from compliance.services.check_replay import replay_from_snapshot

    session = SessionLocal()
    try:
        outcome = replay_from_snapshot(
            session,
            snapshot_id=snapshot_id,
            recipient=recipient or None,
            dry_run=dry_run,
        )
    finally:
        session.close()
    return outcome
# ── Admin: audit drill-down (A5) + trend view (A6) ──────────────────
@router.get("/audit/{check_id}")
async def audit_drill_down(
    check_id: str,
    doc_type: str = "",
    regulation: str = "",
    only_failed: bool = False,
):
    """Return scorecard + filterable MC results for a single check run.

    Frontend uses this to render the /sdk/agent/audit/<check_id> view.

    Empty ``doc_type`` / ``regulation`` filters are forwarded as ``None``.
    When ``get_check_run`` finds nothing, the response is
    ``{"check_id": ..., "found": False}`` and no results are queried.
    """
    from compliance.services.compliance_audit_log import (
        get_check_run, list_mc_results,
    )

    run = get_check_run(check_id)
    if not run:
        return {"check_id": check_id, "found": False}

    filters = {
        "doc_type": doc_type or None,
        "regulation": regulation or None,
        "only_failed": only_failed,
    }
    mc_rows = list_mc_results(check_id, **filters)

    return {
        "check_id": check_id,
        "found": True,
        "run": run,
        "mc_count": len(mc_rows),
        "results": mc_rows,
    }
@router.get("/audit/tenant/{tenant_id}")
async def audit_tenant_history(
    tenant_id: str,
    base_domain: str = "",
    limit: int = 30,
):
    """Tenant-level history for the trend view (A6).

    Forwards to ``list_runs_for_tenant``; an empty ``base_domain`` is passed
    as ``None``. The response carries ``tenant_id``, the number of runs as
    ``count`` and the runs themselves.
    """
    from compliance.services.compliance_audit_log import list_runs_for_tenant

    domain_filter = base_domain or None
    history = list_runs_for_tenant(
        tenant_id,
        base_domain=domain_filter,
        limit=limit,
    )
    return {
        "tenant_id": tenant_id,
        "count": len(history),
        "runs": history,
    }