c2c8783fee
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 native ZIP attachment in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py gains an attachments parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 green (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
348 lines
12 KiB
Python
348 lines
12 KiB
Python
"""
|
|
Unified Compliance Check Routes — check all documents in one request.
|
|
|
|
POST /compliance/agent/extract-text — extract text from a URL
|
|
POST /compliance/agent/compliance-check — unified check for all documents
|
|
GET /compliance/agent/compliance-check/{check_id} — poll status
|
|
|
|
Phase 5 split (2026-06-06): the original 2700-line monolith is now
|
|
decomposed into the `agent_check/` subpackage:
|
|
- _orchestrator.py — thin run_compliance_check pipeline
|
|
- _phase_a_resolve.py — TDM + Step 1 (resolve / discover / split)
|
|
- _phase_b_profile_check.py — Step 2 + Step 3 (profile + doc checks)
|
|
- _phase_c_banner.py — Step 3b-d (banner + cross-check + TCF) + Step 4
|
|
- _phase_d1_vendors_raw.py / _phase_d2_vendors_finalize.py — Step 5
|
|
vendor extraction + finalize
|
|
- _phase_d3_blocks_top.py / mid / bot — Step 5 HTML blocks
|
|
- _phase_e_email.py — Step 6 (with A1 ZIP-Anhang)
|
|
- _phase_f_persist.py — Step 7 (snapshot + audit log + unified findings)
|
|
- _helpers.py / _constants.py / _state.py / _schemas.py — shared
|
|
|
|
External callers (saving_scan_routes, agent_migration_routes, tests)
|
|
keep importing helpers from THIS module — everything is re-exported.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import uuid as _uuid
|
|
|
|
import httpx
|
|
from fastapi import APIRouter
|
|
|
|
# ── Re-exports: external callers import these from THIS module ──────
|
|
from .agent_check._constants import ( # noqa: F401
|
|
CONSENT_TESTER_URL,
|
|
_ALL_DOC_TYPES,
|
|
_COMPOUND_TLDS,
|
|
_DISCOVERY_RULES,
|
|
_DOC_TYPE_LABELS,
|
|
_compliance_check_jobs,
|
|
)
|
|
from .agent_check._discovery import _autodiscover_missing # noqa: F401
|
|
from .agent_check._fetch import _fetch_text # noqa: F401
|
|
from .agent_check._helpers import ( # noqa: F401
|
|
_apply_profile_filter,
|
|
_build_profile_html,
|
|
_classify_discovered_doc,
|
|
_company_name_from_url,
|
|
_doc_type_label,
|
|
_extract_domain,
|
|
_get_skip_types,
|
|
_pad_results_with_missing,
|
|
_result_to_dict,
|
|
_update,
|
|
)
|
|
from .agent_check._orchestrator import run_compliance_check as _run_compliance_check # noqa: F401
|
|
from .agent_check._schemas import (
|
|
ComplianceCheckRequest,
|
|
ComplianceCheckStartResponse,
|
|
ComplianceCheckStatusResponse,
|
|
DocumentInput,
|
|
ExtractTextRequest,
|
|
)
|
|
from .agent_check._single_check import _check_single # noqa: F401
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
router = APIRouter(prefix="/compliance/agent", tags=["agent"])
|
|
|
|
|
|
# ── Extract text endpoint ────────────────────────────────────────────
|
|
|
|
def _extract_text_error(message):
    """Build the uniform failure payload returned by ``extract_text``."""
    return {"text": "", "word_count": 0, "title": "", "error": message}


def _merge_document_texts(docs):
    """Join the usable text of every discovered document.

    For each document ``full_text`` is preferred and ``text_preview`` is the
    fallback. Snippets of 50 characters or fewer are dropped. The remaining
    texts are joined with a blank line; the result is ``""`` when nothing
    qualifies.
    """
    parts = []
    for doc in docs:
        body = doc.get("full_text", "") or doc.get("text_preview", "") or ""
        if len(body) > 50:
            parts.append(body)
    return "\n\n".join(parts)


@router.post("/extract-text")
async def extract_text(req: ExtractTextRequest):
    """Extract text from a URL via consent-tester DSI discovery.

    Posts ``req.url`` to ``{CONSENT_TESTER_URL}/dsi-discovery`` (at most 5
    documents) and merges all documents found on the page (sub-pages,
    accordions, etc.) into one text.

    Returns:
        A dict with the keys ``text``, ``word_count``, ``title`` and
        ``error``. On success ``error`` is ``""``. On any failure the first
        three are empty/zero and ``error`` holds a short message.

    This handler never raises: every exception is logged as a warning and
    reported through ``error`` (truncated to 200 characters).
    """
    try:
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{CONSENT_TESTER_URL}/dsi-discovery",
                json={"url": req.url, "max_documents": 5},
                timeout=300.0,
            )

        if resp.status_code != 200:
            return _extract_text_error(
                f"HTTP {resp.status_code} von Consent-Tester"
            )

        docs = resp.json().get("documents", [])
        if not docs:
            return _extract_text_error("Kein Text extrahierbar")

        # Merge all documents (handles multi-page DSIs like BMW)
        text = _merge_document_texts(docs)
        if not text:
            # Documents were found but none carried usable text. Report it
            # like the "no documents" case instead of returning an empty
            # text with an empty error, which looks like a success.
            return _extract_text_error("Kein Text extrahierbar")

        title = docs[0].get("title", "") or docs[0].get("doc_type", "")
        return {
            "text": text,
            "word_count": len(text.split()),
            "title": title,
            "error": "",
        }

    except Exception as exc:
        # Deliberate best-effort boundary: the caller always gets a payload.
        logger.warning("extract-text failed for %s: %s", req.url, exc)
        return _extract_text_error(str(exc)[:200])
|
|
|
|
|
|
# ── Unified compliance check ────────────────────────────────────────
|
|
|
|
# Strong references to in-flight check tasks. The event loop only keeps weak
# references to tasks, so a task whose handle is dropped can be
# garbage-collected before it finishes.
_background_check_tasks: set = set()


@router.post("/compliance-check")
async def start_compliance_check(req: ComplianceCheckRequest):
    """Start async compliance check for all documents.

    Registers a new job entry in ``_compliance_check_jobs`` with status
    ``"running"``, schedules ``_run_compliance_check`` as a background task
    and returns immediately.

    Returns:
        ``ComplianceCheckStartResponse`` carrying the new ``check_id`` (the
        first 8 characters of a UUID4) and status ``"running"``.
    """
    check_id = str(_uuid.uuid4())[:8]
    # The id is only 8 hex characters, so a collision is possible; never
    # overwrite the state of an existing job.
    while check_id in _compliance_check_jobs:
        check_id = str(_uuid.uuid4())[:8]

    _compliance_check_jobs[check_id] = {
        "status": "running",
        "progress": "Pruefung gestartet...",
        "progress_pct": 0,
        "result": None,
        "error": "",
    }

    # Hold the task until it completes, then drop the reference.
    task = asyncio.create_task(_run_compliance_check(check_id, req))
    _background_check_tasks.add(task)
    task.add_done_callback(_background_check_tasks.discard)

    return ComplianceCheckStartResponse(check_id=check_id, status="running")
|
|
|
|
|
|
@router.get("/compliance-check/{check_id}")
async def get_compliance_check_status(check_id: str):
    """Poll compliance check status.

    Looks ``check_id`` up in ``_compliance_check_jobs``. A missing (or empty)
    entry yields a plain dict with status ``"not_found"``. Otherwise the
    stored fields are wrapped in a ``ComplianceCheckStatusResponse``.
    """
    entry = _compliance_check_jobs.get(check_id)
    if entry:
        # "status" is mandatory; the other fields fall back to defaults.
        current_status = entry["status"]
        return ComplianceCheckStatusResponse(
            check_id=check_id,
            status=current_status,
            progress=entry.get("progress", ""),
            progress_pct=entry.get("progress_pct", 0),
            result=entry.get("result"),
            error=entry.get("error", ""),
        )
    return {"check_id": check_id, "status": "not_found"}
|
|
|
|
|
|
# ── P80: Snapshot + Replay ───────────────────────────────────────────
|
|
|
|
@router.get("/snapshots")
async def list_snapshots(domain: str = "", limit: int = 20):
    """P80: list recent snapshots, optionally filtered by site_domain.

    With a non-empty ``domain`` the lookup is delegated to
    ``list_snapshots_for_domain``. Otherwise the newest ``limit`` rows of
    ``compliance.compliance_check_snapshots`` are returned.

    Returns:
        ``{"snapshots": [...]}``. In the unfiltered case each entry has the
        keys ``id``, ``check_id``, ``site_domain``, ``site_label``,
        ``created_at``, ``replay_count`` and ``notes``.
    """
    from database import SessionLocal
    from compliance.services.check_snapshot import list_snapshots_for_domain

    session = SessionLocal()
    try:
        if not domain:
            from sqlalchemy import text

            query = text("""
                SELECT id, check_id, site_domain, site_label, created_at,
                       replay_count, notes
                FROM compliance.compliance_check_snapshots
                ORDER BY created_at DESC
                LIMIT :lim
            """)
            snapshots = []
            for (snap_id, snap_check_id, site_domain, site_label,
                 created_at, replay_count, notes) in session.execute(
                     query, {"lim": limit}).fetchall():
                snapshots.append({
                    # id and timestamp are stringified for the JSON response
                    "id": str(snap_id),
                    "check_id": snap_check_id,
                    "site_domain": site_domain,
                    "site_label": site_label,
                    "created_at": str(created_at),
                    "replay_count": replay_count,
                    "notes": notes,
                })
            return {"snapshots": snapshots}

        return {"snapshots": list_snapshots_for_domain(session, domain, limit)}
    finally:
        session.close()
|
|
|
|
|
|
@router.get("/snapshots/{snapshot_id}")
async def get_snapshot(snapshot_id: str):
    """P80: load full snapshot raw data.

    Returns whatever ``load_snapshot`` yields for ``snapshot_id``.

    Raises:
        HTTPException: 404 when ``load_snapshot`` returns a falsy value.
    """
    from fastapi import HTTPException
    from database import SessionLocal
    from compliance.services.check_snapshot import load_snapshot

    session = SessionLocal()
    try:
        snapshot = load_snapshot(session, snapshot_id)
        if snapshot:
            return snapshot
        raise HTTPException(status_code=404, detail="snapshot not found")
    finally:
        # Runs on both the return and the 404 path.
        session.close()
|
|
|
|
|
|
@router.get("/admin/benchmark")
async def benchmark(
    industry: str = "",
    sites: str = "",
    anonymized: bool = False,
    limit: int = 50,
):
    """P107 — industry benchmark cockpit endpoint.

    Args:
        industry: Industry filter; empty means no filter (reported as "all").
        sites: Comma-separated site list; blank entries are ignored.
        anonymized: When true, the KPIs are passed through ``anonymize_kpis``.
        limit: Forwarded to ``load_snapshots_for_benchmark``.

    Returns:
        A dict with ``industry``, ``anonymized``, ``sites`` (the
        ``site_label`` of every KPI entry), ``kpis`` and ``summary``.
    """
    from database import SessionLocal
    from compliance.services.benchmark_extractor import (
        anonymize_kpis,
        build_benchmark_summary,
        load_snapshots_for_benchmark,
    )

    # An empty query string means "no site filter" (None), not an empty list.
    site_list = None
    if sites:
        stripped = (raw.strip() for raw in sites.split(","))
        site_list = [name for name in stripped if name]

    session = SessionLocal()
    try:
        kpis = load_snapshots_for_benchmark(
            session,
            industry=industry or None,
            sites=site_list,
            limit=limit,
        )
    finally:
        session.close()

    if anonymized:
        kpis = anonymize_kpis(kpis, industry=industry)

    site_labels = [entry.get("site_label") for entry in kpis]
    summary = build_benchmark_summary(kpis)
    return {
        "industry": industry or "all",
        "anonymized": anonymized,
        "sites": site_labels,
        "kpis": kpis,
        "summary": summary,
    }
|
|
|
|
|
|
@router.post("/admin/tcf-ingest")
async def tcf_ingest():
    """P105 — ingest / refresh the IAB TCF vendor list.

    Opens a DB session, awaits ``fetch_and_ingest_tcf_vendors`` and returns
    its result unchanged. The session is closed in every case.
    """
    from database import SessionLocal
    from compliance.services.tcf_vendor_authority import (
        fetch_and_ingest_tcf_vendors,
    )

    session = SessionLocal()
    try:
        outcome = await fetch_and_ingest_tcf_vendors(session)
    finally:
        session.close()
    return outcome
|
|
|
|
|
|
@router.get("/snapshots/{snapshot_id}/pdf")
async def export_snapshot_pdf(snapshot_id: str):
    """P88 — PDF export of the audit mail. Serves application/pdf.

    Renders the snapshot via ``render_snapshot_as_pdf`` and returns it as a
    download named after the first 8 characters of ``snapshot_id``.

    Raises:
        HTTPException: 404 when the renderer returns a falsy value.
    """
    from fastapi import HTTPException
    from fastapi.responses import Response
    from database import SessionLocal
    from compliance.services.mail_pdf_export import render_snapshot_as_pdf

    session = SessionLocal()
    try:
        pdf_bytes = render_snapshot_as_pdf(session, snapshot_id)
    finally:
        session.close()

    if pdf_bytes:
        fname = f"breakpilot-audit-{snapshot_id[:8]}.pdf"
        disposition = {
            "Content-Disposition": f'attachment; filename="{fname}"',
        }
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers=disposition,
        )

    raise HTTPException(404, f"Snapshot {snapshot_id} nicht gefunden "
                             "oder PDF-Render fehlgeschlagen.")
|
|
|
|
|
|
@router.post("/snapshots/{snapshot_id}/replay")
async def replay_snapshot(
    snapshot_id: str,
    recipient: str = "",
    dry_run: bool = True,
):
    """P80: replay the audit mail render from a stored snapshot.

    Delegates to ``replay_from_snapshot`` and returns its result. An empty
    ``recipient`` is forwarded as ``None``.

    Per the original endpoint notes (not verifiable from this module): the
    default ``dry_run=true`` only reports rendered HTML size + section
    breakdown, while ``recipient`` + ``dry_run=false`` sends a [REPLAY] mail,
    cutting the test cycle from roughly 7 min to 2 sec.
    """
    from database import SessionLocal
    from compliance.services.check_replay import replay_from_snapshot

    mail_target = recipient or None
    session = SessionLocal()
    try:
        outcome = replay_from_snapshot(
            session,
            snapshot_id=snapshot_id,
            recipient=mail_target,
            dry_run=dry_run,
        )
    finally:
        session.close()
    return outcome
|
|
|
|
|
|
# ── Admin: audit drill-down (A5) + trend view (A6) ──────────────────
|
|
|
|
@router.get("/audit/{check_id}")
async def audit_drill_down(
    check_id: str,
    doc_type: str = "",
    regulation: str = "",
    only_failed: bool = False,
):
    """Return scorecard + filterable MC results for a single check run.

    Empty ``doc_type`` / ``regulation`` filters are forwarded as ``None``.
    When ``get_check_run`` finds no run, the response is
    ``{"check_id": ..., "found": False}``. Otherwise it contains ``run``,
    ``mc_count`` and ``results``.

    Per the original notes, the frontend renders the
    /sdk/agent/audit/<check_id> view from this payload.
    """
    from compliance.services.compliance_audit_log import (
        get_check_run,
        list_mc_results,
    )

    run_record = get_check_run(check_id)
    if run_record:
        mc_rows = list_mc_results(
            check_id,
            doc_type=doc_type or None,
            regulation=regulation or None,
            only_failed=only_failed,
        )
        return {
            "check_id": check_id,
            "found": True,
            "run": run_record,
            "mc_count": len(mc_rows),
            "results": mc_rows,
        }
    return {"check_id": check_id, "found": False}
|
|
|
|
|
|
@router.get("/audit/tenant/{tenant_id}")
async def audit_tenant_history(
    tenant_id: str,
    base_domain: str = "",
    limit: int = 30,
):
    """Tenant-level history for the trend view (A6).

    Fetches the runs via ``list_runs_for_tenant``; an empty ``base_domain``
    is forwarded as ``None``.

    Returns:
        ``{"tenant_id": ..., "count": <number of runs>, "runs": [...]}``.
    """
    from compliance.services.compliance_audit_log import list_runs_for_tenant

    domain_filter = base_domain or None
    history = list_runs_for_tenant(
        tenant_id,
        base_domain=domain_filter,
        limit=limit,
    )
    return {
        "tenant_id": tenant_id,
        "count": len(history),
        "runs": history,
    }
|