diff --git a/backend-compliance/compliance/api/agent_history_routes.py b/backend-compliance/compliance/api/agent_history_routes.py
new file mode 100644
index 0000000..ac05535
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_history_routes.py
@@ -0,0 +1,197 @@
+"""
+Agent History Routes — persist and retrieve scan results.
+
+GET /api/compliance/agent/scans — list recent scans
+GET /api/compliance/agent/scans/{id} — get single scan
+POST /api/compliance/agent/scans — save a scan result
+"""
+
+import json
+import logging
+import os
+import uuid
+from datetime import datetime, timezone
+
+from fastapi import APIRouter, Query
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/compliance/agent", tags=["agent"])
+
+DATABASE_URL = os.environ.get(
+    "COMPLIANCE_DATABASE_URL",
+    os.environ.get("DATABASE_URL", ""),
+)
+
+
+class SaveScanRequest(BaseModel):
+    url: str
+    scan_type: str = "scan"
+    analysis_mode: str = "post_launch"
+    result: dict  # Full scan result JSON
+
+
+class ScanHistoryItem(BaseModel):
+    id: str
+    url: str
+    scan_type: str
+    analysis_mode: str
+    risk_level: str | None = None
+    risk_score: float = 0
+    findings_count: int = 0
+    pages_scanned: int = 0
+    email_sent: bool = False
+    created_at: str
+
+
+class ScanDetail(BaseModel):
+    id: str
+    url: str
+    scan_type: str
+    analysis_mode: str
+    result: dict
+    created_at: str
+
+
+async def _get_pool():
+    """Create a NEW pool on every call; each caller must close it. TODO: cache/reuse."""
+    import asyncpg
+    if not DATABASE_URL:
+        return None
+    try:
+        return await asyncpg.create_pool(DATABASE_URL, min_size=1, max_size=3)
+    except Exception as e:
+        logger.warning("DB connection failed: %s", e)
+        return None
+
+
+@router.post("/scans")
+async def save_scan(req: SaveScanRequest):
+    """Save a scan result to the database."""
+    pool = await _get_pool()
+    if not pool:
+        return {"status": "skipped", "reason": "no database"}
+
+    scan_id = str(uuid.uuid4())
+    result = req.result
+
+    try:
+        async with pool.acquire() as conn:
+            await conn.execute("""
+                INSERT INTO compliance_agent_scans
+                (id, url, scan_type, analysis_mode, classification, risk_level,
+                 risk_score, escalation_level, responsible_role, services,
+                 findings, summary_html, pages_scanned, pages_list, email_sent,
+                 created_at)
+                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16)
+            """,
+                uuid.UUID(scan_id),
+                req.url,
+                req.scan_type,
+                req.analysis_mode,
+                result.get("classification", ""),
+                result.get("risk_level", ""),
+                result.get("risk_score", 0),
+                result.get("escalation_level", ""),
+                result.get("responsible_role", ""),
+                json.dumps(result.get("services", [])),
+                json.dumps(result.get("findings", [])),
+                result.get("summary", result.get("summary_html", "")),
+                result.get("pages_scanned", 0),
+                json.dumps(result.get("pages_list", [])),
+                result.get("email_status") == "sent",
+                datetime.now(timezone.utc),
+            )
+        return {"status": "saved", "id": scan_id}
+    except Exception as e:
+        logger.error("Failed to save scan: %s", e)
+        return {"status": "error", "error": str(e)}
+    finally:
+        await pool.close()
+
+
+@router.get("/scans", response_model=list[ScanHistoryItem])
+async def list_scans(
+    limit: int = Query(20, le=100),
+    scan_type: str | None = None,
+):
+    """List recent scans."""
+    pool = await _get_pool()
+    if not pool:
+        return []
+
+    try:
+        async with pool.acquire() as conn:
+            query = """
+                SELECT id, url, scan_type, analysis_mode, risk_level, risk_score,
+                       findings, pages_scanned, email_sent, created_at
+                FROM compliance_agent_scans
+            """
+            params = [limit]
+            if scan_type:
+                query += " WHERE scan_type = $2"
+                params.append(scan_type)
+            query += " ORDER BY created_at DESC LIMIT $1"
+
+            rows = await conn.fetch(query, *params)
+            return [
+                ScanHistoryItem(
+                    id=str(r["id"]),
+                    url=r["url"],
+                    scan_type=r["scan_type"],
+                    analysis_mode=r["analysis_mode"],
+                    risk_level=r["risk_level"],
+                    risk_score=r["risk_score"] or 0,
+                    findings_count=len(json.loads(r["findings"] or "[]")),
+                    pages_scanned=r["pages_scanned"] or 0,
+                    email_sent=r["email_sent"] or False,
+                    created_at=r["created_at"].isoformat() if r["created_at"] else "",
+                )
+                for r in rows
+            ]
+    except Exception as e:
+        logger.error("Failed to list scans: %s", e)
+        return []
+    finally:
+        await pool.close()
+
+
+@router.get("/scans/{scan_id}", response_model=ScanDetail)
+async def get_scan(scan_id: str):
+    """Get a single scan result."""
+    pool = await _get_pool()
+    if not pool:
+        return ScanDetail(id=scan_id, url="", scan_type="", analysis_mode="", result={}, created_at="")
+
+    try:
+        async with pool.acquire() as conn:
+            row = await conn.fetchrow("""
+                SELECT * FROM compliance_agent_scans WHERE id = $1
+            """, uuid.UUID(scan_id))
+
+            if not row:
+                return ScanDetail(id=scan_id, url="", scan_type="", analysis_mode="", result={}, created_at="")
+
+            return ScanDetail(
+                id=str(row["id"]),
+                url=row["url"],
+                scan_type=row["scan_type"],
+                analysis_mode=row["analysis_mode"],
+                result={
+                    "classification": row["classification"],
+                    "risk_level": row["risk_level"],
+                    "risk_score": row["risk_score"],
+                    "services": json.loads(row["services"] or "[]"),
+                    "findings": json.loads(row["findings"] or "[]"),
+                    "summary": row["summary_html"],
+                    "pages_scanned": row["pages_scanned"],
+                    "pages_list": json.loads(row["pages_list"] or "[]"),
+                },
+                created_at=row["created_at"].isoformat() if row["created_at"] else "",
+            )
+    except Exception as e:
+        logger.error("Failed to get scan: %s", e)
+        return ScanDetail(id=scan_id, url="", scan_type="", analysis_mode="", result={}, created_at="")
+    finally:
+        await pool.close()
diff --git a/backend-compliance/main.py b/backend-compliance/main.py
index e60d92d..671f833 100644
--- a/backend-compliance/main.py
+++ b/backend-compliance/main.py
@@ -45,6 +45,7 @@ from compliance.api.company_profile_routes import router as company_profile_rout
 from compliance.api.agent_notification_routes import router as agent_notify_router
 from compliance.api.agent_analyze_routes
import router as agent_analyze_router
 from compliance.api.agent_scan_routes import router as agent_scan_router
+from compliance.api.agent_history_routes import router as agent_history_router
 
 # Middleware
 from middleware import (
@@ -144,6 +145,7 @@ app.include_router(company_profile_router, prefix="/api")
 app.include_router(agent_notify_router, prefix="/api")
 app.include_router(agent_analyze_router, prefix="/api")
 app.include_router(agent_scan_router, prefix="/api")
+app.include_router(agent_history_router, prefix="/api")
 
 
 if __name__ == "__main__":
diff --git a/backend-compliance/migrations/086_agent_scans.sql b/backend-compliance/migrations/086_agent_scans.sql
new file mode 100644
index 0000000..9281735
--- /dev/null
+++ b/backend-compliance/migrations/086_agent_scans.sql
@@ -0,0 +1,33 @@
+-- Migration 086: Agent Scan Results persistence
+-- Stores scan results so they survive page reloads and can be reviewed later
+
+CREATE TABLE IF NOT EXISTS compliance_agent_scans (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    tenant_id UUID NOT NULL DEFAULT '9282a473-5c95-4b3a-bf78-0ecc0ec71d3e', -- TODO: hardcoded single-tenant default; writers should supply the real tenant_id
+    user_id TEXT NOT NULL DEFAULT 'default',
+    url TEXT NOT NULL,
+    scan_type TEXT NOT NULL, -- 'quick', 'scan', 'consent_test'
+    analysis_mode TEXT NOT NULL DEFAULT 'post_launch', -- 'pre_launch', 'post_launch'
+    classification TEXT,
+    risk_level TEXT,
+    risk_score FLOAT DEFAULT 0,
+    escalation_level TEXT,
+    responsible_role TEXT,
+    services JSONB DEFAULT '[]',
+    findings JSONB DEFAULT '[]',
+    corrections JSONB DEFAULT '[]',
+    consent_test JSONB,
+    text_references JSONB DEFAULT '[]',
+    mandatory_checks JSONB DEFAULT '[]',
+    summary_html TEXT,
+    pages_scanned INT DEFAULT 0,
+    pages_list JSONB DEFAULT '[]',
+    email_sent BOOLEAN DEFAULT FALSE,
+    email_recipient TEXT,
+    created_at TIMESTAMPTZ DEFAULT now()
+);
+
+CREATE INDEX IF NOT EXISTS idx_agent_scans_tenant ON compliance_agent_scans(tenant_id);
+CREATE INDEX IF NOT EXISTS idx_agent_scans_url ON compliance_agent_scans(url);
+CREATE INDEX IF NOT EXISTS idx_agent_scans_created ON compliance_agent_scans(created_at DESC);
+CREATE INDEX IF NOT EXISTS idx_agent_scans_type ON compliance_agent_scans(scan_type);
diff --git a/zeroclaw/PLAN-compliance-agent-product.md b/zeroclaw/PLAN-compliance-agent-product.md
index 166a163..2a523ac 100644
--- a/zeroclaw/PLAN-compliance-agent-product.md
+++ b/zeroclaw/PLAN-compliance-agent-product.md
@@ -509,6 +509,36 @@ und prueft den Kundenbereich auf Pflichtfunktionen:
 | `consent-tester/main.py` | +30 | Neuer /authenticated-scan Endpoint |
 | Frontend: AuthenticatedTestTab | ~150 | Credential-Eingabe + Ergebnis |
 
+---
+
+## Phase 10: Website-Scan auf Playwright umstellen (P3, 1-2 Tage)
+
+### Problem
+
+Der Website-Scan nutzt httpx (wie curl) — bekommt nur initiales HTML.
+SPAs (React, Angular, Vue) die Inhalte per JavaScript nachladen werden
+unvollstaendig gescannt. Opodo-Stil Seiten liefern nur Shell-HTML.
+
+### Loesung
+
+Website-Scanner auf Playwright umstellen — gleicher Headless Browser
+wie der Consent-Tester. Dann sieht der Scan ALLES was der Browser sieht.
+
+| Technologie | Aktuell (httpx) | Nach Phase 10 (Playwright) |
+|------------|-----------------|---------------------------|
+| Statisches HTML | ✓ | ✓ |
+| WordPress | ✓ | ✓ |
+| React/Vue SPA | ✗ (nur Shell) | ✓ (rendert JS) |
+| Angular SSR | ✗/✓ | ✓ |
+| JS-heavy (Opodo) | ✗ | ✓ |
+
+### Implementierung
+
+- `consent-tester` Service um `/website-scan` Endpoint erweitern
+- Playwright navigiert zu jeder Seite, wartet auf JS, extrahiert HTML
+- Backend-Scanner ruft consent-tester statt httpx auf
+- Gleicher Output (DetectedService, ScanResult) — nur bessere Eingabedaten
+
 ## Investoren-Demo Szenario
 
 Nach Phase 2 (Woche 2) koennen wir folgende Demo zeigen: