feat: add compliance modules 2-5 (dashboard, security templates, process manager, evidence collector)
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 32s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 23s
CI/CD / test-python-dsms-gateway (push) Successful in 21s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Successful in 2s

Module 2: Extended Compliance Dashboard with roadmap, module-status, next-actions, snapshots, score-history
Module 3: 7 German security document templates (IT-Sicherheitskonzept, Datenschutz, Backup, Logging, Incident-Response, Zugriff, Risikomanagement)
Module 4: Compliance Process Manager with CRUD, complete/skip/seed, ~50 seed tasks, 3-tab UI
Module 5: Evidence Collector Extended with automated checks, control-mapping, coverage report, 4-tab UI

Also includes: canonical control library enhancements (verification method, categories, dedup), control generator improvements, RAG client extensions

52 tests pass, frontend builds clean.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-14 21:03:04 +01:00
parent 13d13c8226
commit 49ce417428
35 changed files with 8741 additions and 422 deletions

View File

@@ -53,6 +53,8 @@ _ROUTER_MODULES = [
"wiki_routes",
"canonical_control_routes",
"control_generator_routes",
"process_task_routes",
"evidence_check_routes",
]
_loaded_count = 0

View File

@@ -78,6 +78,7 @@ class ControlResponse(BaseModel):
customer_visible: Optional[bool] = None
verification_method: Optional[str] = None
category: Optional[str] = None
target_audience: Optional[str] = None
generation_metadata: Optional[dict] = None
created_at: str
updated_at: str
@@ -106,6 +107,7 @@ class ControlCreateRequest(BaseModel):
customer_visible: Optional[bool] = True
verification_method: Optional[str] = None
category: Optional[str] = None
target_audience: Optional[str] = None
generation_metadata: Optional[dict] = None
@@ -130,6 +132,7 @@ class ControlUpdateRequest(BaseModel):
customer_visible: Optional[bool] = None
verification_method: Optional[str] = None
category: Optional[str] = None
target_audience: Optional[str] = None
generation_metadata: Optional[dict] = None
@@ -158,7 +161,7 @@ _CONTROL_COLS = """id, framework_id, control_id, title, objective, rationale,
evidence_confidence, open_anchors, release_state, tags,
license_rule, source_original_text, source_citation,
customer_visible, verification_method, category,
generation_metadata,
target_audience, generation_metadata,
created_at, updated_at"""
@@ -241,6 +244,7 @@ async def list_framework_controls(
release_state: Optional[str] = Query(None),
verification_method: Optional[str] = Query(None),
category: Optional[str] = Query(None),
target_audience: Optional[str] = Query(None),
):
"""List controls belonging to a framework."""
with SessionLocal() as db:
@@ -271,6 +275,9 @@ async def list_framework_controls(
if category:
query += " AND category = :cat"
params["cat"] = category
if target_audience:
query += " AND target_audience = :ta"
params["ta"] = target_audience
query += " ORDER BY control_id"
rows = db.execute(text(query), params).fetchall()
@@ -289,6 +296,7 @@ async def list_controls(
release_state: Optional[str] = Query(None),
verification_method: Optional[str] = Query(None),
category: Optional[str] = Query(None),
target_audience: Optional[str] = Query(None),
):
"""List all canonical controls, with optional filters."""
query = f"""
@@ -313,6 +321,9 @@ async def list_controls(
if category:
query += " AND category = :cat"
params["cat"] = category
if target_audience:
query += " AND target_audience = :ta"
params["ta"] = target_audience
query += " ORDER BY control_id"
@@ -384,7 +395,7 @@ async def create_control(body: ControlCreateRequest):
open_anchors, release_state, tags,
license_rule, source_original_text, source_citation,
customer_visible, verification_method, category,
generation_metadata
target_audience, generation_metadata
) VALUES (
:fw_id, :cid, :title, :objective, :rationale,
CAST(:scope AS jsonb), CAST(:requirements AS jsonb),
@@ -394,7 +405,7 @@ async def create_control(body: ControlCreateRequest):
:license_rule, :source_original_text,
CAST(:source_citation AS jsonb),
:customer_visible, :verification_method, :category,
CAST(:generation_metadata AS jsonb)
:target_audience, CAST(:generation_metadata AS jsonb)
)
RETURNING {_CONTROL_COLS}
"""),
@@ -421,6 +432,7 @@ async def create_control(body: ControlCreateRequest):
"customer_visible": body.customer_visible,
"verification_method": body.verification_method,
"category": body.category,
"target_audience": body.target_audience,
"generation_metadata": _json.dumps(body.generation_metadata) if body.generation_metadata else None,
},
).fetchone()
@@ -647,6 +659,7 @@ def _control_row(r) -> dict:
"customer_visible": r.customer_visible,
"verification_method": r.verification_method,
"category": r.category,
"target_audience": r.target_audience,
"generation_metadata": r.generation_metadata,
"created_at": r.created_at.isoformat() if r.created_at else None,
"updated_at": r.updated_at.isoformat() if r.updated_at else None,

View File

@@ -12,6 +12,7 @@ Endpoints:
POST /v1/canonical/blocked-sources/cleanup — Start cleanup workflow
"""
import asyncio
import json
import logging
from typing import Optional, List
@@ -89,9 +90,42 @@ class BlockedSourceResponse(BaseModel):
# ENDPOINTS
# =============================================================================
async def _run_pipeline_background(config: GeneratorConfig, job_id: str):
    """Run the generation pipeline in the background.

    Uses its own DB session because the request-scoped session that created
    the job row is closed by the time this task executes.

    Args:
        config: Generator configuration. ``existing_job_id`` is set here so
            the pipeline updates the pre-created job row instead of
            creating a new one.
        job_id: UUID (as string) of the job row created by the request
            handler.

    On pipeline failure the job row is marked ``failed`` (best effort) so
    that pollers of the status endpoint see a terminal state instead of
    "running" forever.
    """
    db = SessionLocal()
    try:
        config.existing_job_id = job_id
        pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
        result = await pipeline.run(config)
        logger.info(
            "Background generation job %s completed: %d controls from %d chunks",
            job_id, result.controls_generated, result.total_chunks_scanned,
        )
    except Exception as e:
        logger.error("Background generation job %s failed: %s", job_id, e)
        # Update job as failed
        try:
            db.execute(
                text("""
                    UPDATE canonical_generation_jobs
                    SET status = 'failed', errors = :errors, completed_at = NOW()
                    WHERE id = CAST(:job_id AS uuid)
                """),
                {"job_id": job_id, "errors": json.dumps([str(e)])},
            )
            db.commit()
        except Exception:
            # Previously swallowed silently; at least record that the failed
            # state could not be persisted so the stuck job is explainable.
            logger.warning(
                "Could not mark generation job %s as failed", job_id, exc_info=True
            )
    finally:
        db.close()
@router.post("/generate", response_model=GenerateResponse)
async def start_generation(req: GenerateRequest):
"""Start a control generation run."""
"""Start a control generation run (runs in background).
Returns immediately with job_id. Use GET /generate/status/{job_id} to poll progress.
"""
config = GeneratorConfig(
collections=req.collections,
domain=req.domain,
@@ -101,30 +135,63 @@ async def start_generation(req: GenerateRequest):
dry_run=req.dry_run,
)
if req.dry_run:
# Dry run: execute synchronously and return controls
db = SessionLocal()
try:
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
result = await pipeline.run(config)
return GenerateResponse(
job_id=result.job_id,
status=result.status,
message=f"Dry run: {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
total_chunks_scanned=result.total_chunks_scanned,
controls_generated=result.controls_generated,
controls_verified=result.controls_verified,
controls_needs_review=result.controls_needs_review,
controls_too_close=result.controls_too_close,
controls_duplicates_found=result.controls_duplicates_found,
errors=result.errors,
controls=result.controls,
)
except Exception as e:
logger.error("Dry run failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
# Create job record first so we can return the ID
db = SessionLocal()
try:
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
result = await pipeline.run(config)
return GenerateResponse(
job_id=result.job_id,
status=result.status,
message=f"Generated {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
total_chunks_scanned=result.total_chunks_scanned,
controls_generated=result.controls_generated,
controls_verified=result.controls_verified,
controls_needs_review=result.controls_needs_review,
controls_too_close=result.controls_too_close,
controls_duplicates_found=result.controls_duplicates_found,
errors=result.errors,
controls=result.controls if req.dry_run else [],
result = db.execute(
text("""
INSERT INTO canonical_generation_jobs (status, config)
VALUES ('running', :config)
RETURNING id
"""),
{"config": json.dumps(config.model_dump())},
)
db.commit()
row = result.fetchone()
job_id = str(row[0]) if row else None
except Exception as e:
logger.error("Generation failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
logger.error("Failed to create job: %s", e)
raise HTTPException(status_code=500, detail=f"Failed to create job: {e}")
finally:
db.close()
if not job_id:
raise HTTPException(status_code=500, detail="Failed to create job record")
# Launch pipeline in background
asyncio.create_task(_run_pipeline_background(config, job_id))
return GenerateResponse(
job_id=job_id,
status="running",
message="Generation started in background. Poll /generate/status/{job_id} for progress.",
)
@router.get("/generate/status/{job_id}")
async def get_job_status(job_id: str):

View File

@@ -5,16 +5,23 @@ Endpoints:
- /dashboard: Main compliance dashboard
- /dashboard/executive: Executive summary for managers
- /dashboard/trend: Compliance score trend over time
- /dashboard/roadmap: Prioritised controls in 4 buckets
- /dashboard/module-status: Completion status of each SDK module
- /dashboard/next-actions: Top 5 most important actions
- /dashboard/snapshot: Save / query compliance score snapshots
- /score: Quick compliance score
- /reports: Report generation
"""
import logging
from datetime import datetime, timedelta
from datetime import datetime, date, timedelta
from calendar import month_abbr
from typing import Optional
from typing import Optional, Dict, Any, List
from decimal import Decimal
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import Session
from classroom_engine.database import get_db
@@ -34,6 +41,8 @@ from .schemas import (
DeadlineItem,
TeamWorkloadItem,
)
from .tenant_utils import get_tenant_id as _get_tenant_id
from .db_utils import row_to_dict as _row_to_dict
logger = logging.getLogger(__name__)
router = APIRouter(tags=["compliance-dashboard"])
@@ -322,6 +331,272 @@ async def get_compliance_trend(
}
# ============================================================================
# Dashboard Extended — Roadmap, Module-Status, Next-Actions, Snapshots
# ============================================================================
# Weight map for control prioritisation — higher weight = more important.
# Used by the roadmap and next-actions endpoints below; controls without a
# known category fall back to weight 1 via dict.get(..., 1).
_PRIORITY_WEIGHTS = {"legal": 5, "security": 3, "best_practice": 1, "operational": 2}

# SDK module definitions → DB table used for counting completion.
# The module-status endpoint issues "SELECT COUNT(*) ... WHERE tenant_id = :tid"
# against each table, so every backing table is expected to have a tenant_id
# column — TODO confirm for training_modules, which lacks the compliance_ prefix.
_MODULE_DEFS: List[Dict[str, str]] = [
    {"key": "vvt", "label": "VVT", "table": "compliance_vvt_activities"},
    {"key": "tom", "label": "TOM", "table": "compliance_toms"},
    {"key": "dsfa", "label": "DSFA", "table": "compliance_dsfa_assessments"},
    {"key": "loeschfristen", "label": "Loeschfristen", "table": "compliance_loeschfristen"},
    {"key": "risks", "label": "Risiken", "table": "compliance_risks"},
    {"key": "controls", "label": "Controls", "table": "compliance_controls"},
    {"key": "evidence", "label": "Nachweise", "table": "compliance_evidence"},
    {"key": "obligations", "label": "Pflichten", "table": "compliance_obligations"},
    {"key": "incidents", "label": "Vorfaelle", "table": "compliance_notfallplan_incidents"},
    {"key": "vendor", "label": "Auftragsverarbeiter", "table": "compliance_vendor_assessments"},
    {"key": "legal_templates", "label": "Rechtl. Dokumente", "table": "compliance_legal_templates"},
    {"key": "training", "label": "Schulungen", "table": "training_modules"},
    {"key": "audit", "label": "Audit", "table": "compliance_audit_sessions"},
    {"key": "security_backlog", "label": "Security-Backlog", "table": "compliance_security_backlog"},
    {"key": "quality", "label": "Qualitaet", "table": "compliance_quality_items"},
]
@router.get("/dashboard/roadmap")
async def get_dashboard_roadmap(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Prioritised controls in 4 buckets: Quick Wins, Must Have, Should Have, Nice to Have.

    Bucketing is driven by the control-category weight (_PRIORITY_WEIGHTS)
    and whether the control's next review date lies in the past:

    * quick_wins  — top-weight ("legal") controls that are already overdue
    * must_have   — remaining top-weight controls
    * should_have — medium-weight controls (weight >= 2)
    * nice_to_have — everything else

    Controls with status "pass" are excluded.  Each bucket is sorted by
    days overdue, most overdue first.
    """
    ctrl_repo = ControlRepository(db)
    controls = ctrl_repo.get_all()
    today = datetime.utcnow().date()

    buckets: Dict[str, list] = {
        "quick_wins": [],
        "must_have": [],
        "should_have": [],
        "nice_to_have": [],
    }

    for ctrl in controls:
        status = ctrl.status.value if ctrl.status else "planned"
        if status == "pass":
            continue  # already done

        # Category weight; controls without a category default to the
        # lowest, "best_practice"-level weight.
        weight = _PRIORITY_WEIGHTS.get(ctrl.category if hasattr(ctrl, "category") else "best_practice", 1)

        days_overdue = 0
        if ctrl.next_review_at:
            # next_review_at may be a datetime or a plain date depending on
            # the ORM column — normalise to date before subtracting.
            review_date = ctrl.next_review_at.date() if hasattr(ctrl.next_review_at, "date") else ctrl.next_review_at
            days_overdue = (today - review_date).days

        item = {
            "id": str(ctrl.id),
            "control_id": ctrl.control_id,
            "title": ctrl.title,
            "status": status,
            "domain": ctrl.domain.value if ctrl.domain else "unknown",
            "owner": ctrl.owner,
            "next_review_at": ctrl.next_review_at.isoformat() if ctrl.next_review_at else None,
            "days_overdue": max(0, days_overdue),
            "weight": weight,
        }
        if weight >= 5 and days_overdue > 0:
            buckets["quick_wins"].append(item)
        elif weight >= 4:
            buckets["must_have"].append(item)
        elif weight >= 2:
            buckets["should_have"].append(item)
        else:
            buckets["nice_to_have"].append(item)

    # Sort each bucket by days overdue, most overdue first.  (A previously
    # computed but never-used "urgency" score was removed; the effective
    # sort key has always been days_overdue.)
    for key in buckets:
        buckets[key].sort(key=lambda x: x["days_overdue"], reverse=True)

    return {
        "buckets": buckets,
        "counts": {k: len(v) for k, v in buckets.items()},
        "generated_at": datetime.utcnow().isoformat(),
    }
@router.get("/dashboard/module-status")
async def get_module_status(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Completion status for each SDK module based on DB record counts.

    For every module in ``_MODULE_DEFS`` the tenant's row count in the
    module's backing table is mapped onto a coarse status:
    0 rows -> not_started, 1-2 rows -> in_progress, 3+ rows -> complete.
    A failing count query (e.g. missing table) is treated as 0 rows.
    """
    def _count_rows(table: str) -> int:
        # Table names come exclusively from the hard-coded _MODULE_DEFS
        # whitelist, so interpolating them into the SQL is safe; the
        # tenant_id stays a bound parameter.
        try:
            first = db.execute(
                text(f"SELECT COUNT(*) FROM {table} WHERE tenant_id = :tid"),
                {"tid": tenant_id},
            ).fetchone()
            return int(first[0]) if first else 0
        except Exception:
            return 0

    def _classify(n: int):
        # Simple heuristic: 0 = not started, 1-2 = in progress, 3+ = complete.
        if n == 0:
            return "not_started", 0
        if n < 3:
            return "in_progress", min(60, n * 30)
        return "complete", 100

    modules = []
    for definition in _MODULE_DEFS:
        n = _count_rows(definition["table"])
        state, pct = _classify(n)
        modules.append({
            "key": definition["key"],
            "label": definition["label"],
            "count": n,
            "status": state,
            "progress": pct,
        })

    complete = sum(1 for m in modules if m["status"] == "complete")
    started = sum(1 for m in modules if m["status"] != "not_started")
    return {
        "modules": modules,
        "total": len(modules),
        "started": started,
        "complete": complete,
        "overall_progress": round((complete / len(modules)) * 100, 1) if modules else 0,
    }
@router.get("/dashboard/next-actions")
async def get_next_actions(
    limit: int = Query(5, ge=1, le=20),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Top N most important actions sorted by urgency*impact."""
    repo = ControlRepository(db)
    today = datetime.utcnow().date()

    candidates = []
    for control in repo.get_all():
        state = control.status.value if control.status else "planned"
        if state == "pass":
            continue  # nothing left to do for passing controls

        overdue_days = 0
        if control.next_review_at:
            # next_review_at may be datetime or date — normalise to date.
            due = control.next_review_at.date() if hasattr(control.next_review_at, "date") else control.next_review_at
            overdue_days = max(0, (today - due).days)

        # Impact comes from the category weight; urgency grows with overdue days.
        impact = _PRIORITY_WEIGHTS.get(control.category if hasattr(control, "category") else "best_practice", 1)
        candidates.append({
            "id": str(control.id),
            "control_id": control.control_id,
            "title": control.title,
            "status": state,
            "domain": control.domain.value if control.domain else "unknown",
            "owner": control.owner,
            "days_overdue": overdue_days,
            "urgency_score": impact * 10 + overdue_days,
            "reason": "Ueberfaellig" if overdue_days > 0 else "Offen",
        })

    candidates.sort(key=lambda entry: entry["urgency_score"], reverse=True)
    return {"actions": candidates[:limit]}
@router.post("/dashboard/snapshot")
async def create_score_snapshot(
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Save current compliance score as a historical snapshot.

    Score formula: (pass + 0.5 * partial) / total controls, as a percentage
    rounded to 2 decimals (0 when there are no controls).  One row per
    (tenant, day) is upserted, so calling this twice on the same day
    overwrites that day's snapshot.
    """
    ctrl_repo = ControlRepository(db)
    evidence_repo = EvidenceRepository(db)
    risk_repo = RiskRepository(db)
    ctrl_stats = ctrl_repo.get_statistics()
    evidence_stats = evidence_repo.get_statistics()
    risks = risk_repo.get_all()

    total = ctrl_stats.get("total", 0)
    passing = ctrl_stats.get("pass", 0)
    partial = ctrl_stats.get("partial", 0)
    # Partial controls count half toward the score.
    score = round(((passing + partial * 0.5) / total) * 100, 2) if total > 0 else 0
    # Risks with high/critical inherent rating; missing rating counts as "low".
    risks_high = sum(1 for r in risks if (r.inherent_risk.value if r.inherent_risk else "low") in ("high", "critical"))
    today = date.today()

    # NOTE(review): the ON CONFLICT target includes project_id, but the INSERT
    # never sets it — presumably the column has a DB default; confirm against
    # the migration, otherwise the upsert will only match default-project rows.
    row = db.execute(text("""
        INSERT INTO compliance_score_snapshots (
            tenant_id, score, controls_total, controls_pass, controls_partial,
            evidence_total, evidence_valid, risks_total, risks_high, snapshot_date
        ) VALUES (
            :tenant_id, :score, :controls_total, :controls_pass, :controls_partial,
            :evidence_total, :evidence_valid, :risks_total, :risks_high, :snapshot_date
        )
        ON CONFLICT (tenant_id, project_id, snapshot_date) DO UPDATE SET
            score = EXCLUDED.score,
            controls_total = EXCLUDED.controls_total,
            controls_pass = EXCLUDED.controls_pass,
            controls_partial = EXCLUDED.controls_partial,
            evidence_total = EXCLUDED.evidence_total,
            evidence_valid = EXCLUDED.evidence_valid,
            risks_total = EXCLUDED.risks_total,
            risks_high = EXCLUDED.risks_high
        RETURNING *
    """), {
        "tenant_id": tenant_id,
        "score": score,
        "controls_total": total,
        "controls_pass": passing,
        "controls_partial": partial,
        "evidence_total": evidence_stats.get("total", 0),
        "evidence_valid": evidence_stats.get("by_status", {}).get("valid", 0),
        "risks_total": len(risks),
        "risks_high": risks_high,
        "snapshot_date": today,
    }).fetchone()
    db.commit()
    return _row_to_dict(row)
@router.get("/dashboard/score-history")
async def get_score_history(
    months: int = Query(12, ge=1, le=36),
    db: Session = Depends(get_db),
    tenant_id: str = Depends(_get_tenant_id),
):
    """Get compliance score history from snapshots.

    Returns all snapshots for the tenant within roughly the last *months*
    months (approximated as months * 30 days), oldest first.
    """
    cutoff = date.today() - timedelta(days=months * 30)
    rows = db.execute(text("""
        SELECT * FROM compliance_score_snapshots
        WHERE tenant_id = :tenant_id AND snapshot_date >= :since
        ORDER BY snapshot_date ASC
    """), {"tenant_id": tenant_id, "since": cutoff}).fetchall()

    history = []
    for row in rows:
        entry = _row_to_dict(row)
        score_value = entry.get("score")
        if isinstance(score_value, Decimal):
            # Decimal is not JSON-serialisable by default — coerce to float.
            entry["score"] = float(score_value)
        history.append(entry)

    return {
        "snapshots": history,
        "total": len(history),
        "period_months": months,
    }
# ============================================================================
# Reports
# ============================================================================

File diff suppressed because it is too large Load Diff

View File

@@ -50,6 +50,14 @@ VALID_DOCUMENT_TYPES = {
"cookie_banner",
"agb",
"clause",
# Security document templates (Migration 051)
"it_security_concept",
"data_protection_concept",
"backup_recovery_concept",
"logging_concept",
"incident_response_plan",
"access_control_concept",
"risk_management_concept",
}
VALID_STATUSES = {"published", "draft", "archived"}

File diff suppressed because it is too large Load Diff

View File

@@ -46,7 +46,7 @@ EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3:30b-a3b")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "120"))
HARMONIZATION_THRESHOLD = 0.85 # Cosine similarity above this = duplicate
@@ -157,6 +157,9 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
"edpb_transfers_07_2020":{"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Transfers 07/2020"},
"edpb_video_03_2019": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Video Surveillance"},
"edps_dpia_list": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPS DPIA Liste"},
"edpb_certification_01_2018": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Certification 01/2018"},
"edpb_certification_01_2019": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Certification 01/2019"},
"eaa": {"license": "EU_LAW", "rule": 1, "name": "European Accessibility Act"},
# WP29 (pre-EDPB) Guidelines
"wp244_profiling": {"license": "EU_PUBLIC", "rule": 1, "name": "WP29 Profiling"},
"wp251_profiling": {"license": "EU_PUBLIC", "rule": 1, "name": "WP29 Data Portability"},
@@ -240,6 +243,83 @@ DOMAIN_KEYWORDS = {
}
CATEGORY_KEYWORDS = {
    "encryption": ["encryption", "cryptography", "tls", "ssl", "certificate", "hashing",
                   "aes", "rsa", "verschlüsselung", "kryptographie", "zertifikat", "cipher"],
    "authentication": ["authentication", "login", "password", "credential", "mfa", "2fa",
                       "session", "oauth", "authentifizierung", "anmeldung", "passwort"],
    "network": ["network", "firewall", "dns", "vpn", "proxy", "segmentation",
                "netzwerk", "routing", "port", "intrusion", "ids", "ips"],
    "data_protection": ["data protection", "privacy", "personal data", "datenschutz",
                        "personenbezogen", "dsgvo", "gdpr", "löschung", "verarbeitung", "einwilligung"],
    "logging": ["logging", "monitoring", "audit trail", "siem", "alert", "anomaly",
                "protokollierung", "überwachung", "nachvollziehbar"],
    "incident": ["incident", "response", "breach", "recovery", "vorfall", "sicherheitsvorfall"],
    "continuity": ["backup", "disaster recovery", "notfall", "wiederherstellung", "notfallplan",
                   "business continuity", "ausfallsicherheit"],
    "compliance": ["compliance", "audit", "regulation", "certification", "konformität",
                   "prüfung", "zertifizierung", "nachweis"],
    "supply_chain": ["supplier", "vendor", "third party", "lieferant", "auftragnehmer",
                     "unterauftragnehmer", "supply chain", "dienstleister"],
    "physical": ["physical", "building", "access zone", "physisch", "gebäude", "zutritt",
                 "schließsystem", "rechenzentrum"],
    "personnel": ["training", "awareness", "employee", "schulung", "mitarbeiter",
                  "sensibilisierung", "personal", "unterweisung"],
    "application": ["application", "software", "code review", "sdlc", "secure coding",
                    "anwendung", "entwicklung", "software-entwicklung", "api"],
    "system": ["hardening", "patch", "configuration", "update", "härtung", "konfiguration",
               "betriebssystem", "system", "server"],
    "risk": ["risk assessment", "risk management", "risiko", "bewertung", "risikobewertung",
             "risikoanalyse", "bedrohung", "threat"],
    "governance": ["governance", "policy", "organization", "isms", "sicherheitsorganisation",
                   "richtlinie", "verantwortlichkeit", "rolle"],
    "hardware": ["hardware", "platform", "firmware", "bios", "tpm", "chip",
                 "plattform", "geräte"],
    "identity": ["identity", "iam", "directory", "ldap", "sso", "provisioning",
                 "identität", "identitätsmanagement", "benutzerverzeichnis"],
}


VERIFICATION_KEYWORDS = {
    "code_review": ["source code", "code review", "static analysis", "sast", "dast",
                    "dependency check", "quellcode", "codeanalyse", "secure coding",
                    "software development", "api", "input validation", "output encoding"],
    "document": ["policy", "procedure", "documentation", "training", "awareness",
                 "richtlinie", "dokumentation", "schulung", "nachweis", "vertrag",
                 "organizational", "process", "role", "responsibility"],
    "tool": ["scanner", "monitoring", "siem", "ids", "ips", "firewall", "antivirus",
             "vulnerability scan", "penetration test", "tool", "automated"],
    "hybrid": [],  # Assigned when multiple methods match equally
}


def _detect_category(text: str) -> Optional[str]:
    """Detect the most likely category from text content.

    Counts case-insensitive substring hits for every CATEGORY_KEYWORDS
    entry and returns the category with the most hits, or None when no
    keyword matches at all.  Ties go to the first category in dict order.
    """
    lowered = text.lower()
    hit_counts = {
        cat: sum(kw in lowered for kw in kws)
        for cat, kws in CATEGORY_KEYWORDS.items()
    }
    best_cat, best_hits = max(
        hit_counts.items(), key=lambda item: item[1], default=(None, 0)
    )
    return best_cat if best_hits > 0 else None


def _detect_verification_method(text: str) -> Optional[str]:
    """Detect verification method from text content.

    Scores each non-"hybrid" method by keyword hits; returns None when
    nothing matches.  When the runner-up scores at least 70% of the leader
    (and both are non-zero), the result is "hybrid".
    """
    lowered = text.lower()
    counts = {
        method: sum(kw in lowered for kw in kws)
        for method, kws in VERIFICATION_KEYWORDS.items()
        if method != "hybrid"
    }
    if not counts or max(counts.values()) == 0:
        return None
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    (first_method, first_hits), *rest = ranked
    if rest:
        second_hits = rest[0][1]
        # If the top two methods are close, classify as hybrid.
        if second_hits > 0 and second_hits >= first_hits * 0.7:
            return "hybrid"
    return first_method
def _detect_domain(text: str) -> str:
"""Detect the most likely domain from text content."""
text_lower = text.lower()
@@ -259,10 +339,11 @@ class GeneratorConfig(BaseModel):
collections: Optional[List[str]] = None
domain: Optional[str] = None
batch_size: int = 5
max_controls: int = 50
max_controls: int = 0 # 0 = unlimited (process ALL chunks)
skip_processed: bool = True
skip_web_search: bool = False
dry_run: bool = False
existing_job_id: Optional[str] = None # If set, reuse this job instead of creating a new one
@dataclass
@@ -287,6 +368,9 @@ class GeneratedControl:
source_citation: Optional[dict] = None
customer_visible: bool = True
generation_metadata: dict = field(default_factory=dict)
# Classification fields
verification_method: Optional[str] = None # code_review, document, tool, hybrid
category: Optional[str] = None # one of 17 categories
@dataclass
@@ -299,6 +383,7 @@ class GeneratorResult:
controls_needs_review: int = 0
controls_too_close: int = 0
controls_duplicates_found: int = 0
chunks_skipped_prefilter: int = 0
errors: list = field(default_factory=list)
controls: list = field(default_factory=list)
@@ -310,14 +395,68 @@ class GeneratorResult:
async def _llm_chat(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Dispatch a chat prompt: Anthropic Claude first (when an API key is
    configured and it returns a non-empty answer), otherwise Ollama."""
    if not ANTHROPIC_API_KEY:
        # No key configured — go straight to the local model.
        logger.info("Calling Ollama (model=%s)...", OLLAMA_MODEL)
        return await _llm_ollama(prompt, system_prompt)

    logger.info("Calling Anthropic API (model=%s)...", ANTHROPIC_MODEL)
    answer = await _llm_anthropic(prompt, system_prompt)
    if answer:
        logger.info("Anthropic API success (%d chars)", len(answer))
        return answer

    logger.warning("Anthropic failed, falling back to Ollama")
    logger.info("Calling Ollama (model=%s)...", OLLAMA_MODEL)
    return await _llm_ollama(prompt, system_prompt)
async def _llm_local(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call local Ollama LLM only (for pre-filtering and classification tasks).

    Deliberately bypasses the Anthropic-first chain in _llm_chat so that
    high-volume, low-stakes calls never hit the Anthropic API.
    """
    return await _llm_ollama(prompt, system_prompt)
# System prompt for the cheap local-LLM relevance pre-filter used by
# _prefilter_chunk.  Intentionally German (the scanned corpora are German
# regulations) and demands a strict JSON answer.  Runtime string — do not
# translate or reformat.
PREFILTER_SYSTEM_PROMPT = """Du bist ein Compliance-Analyst. Deine Aufgabe: Prüfe ob ein Textabschnitt eine konkrete Sicherheitsanforderung, Datenschutzpflicht, oder technische/organisatorische Maßnahme enthält.
Antworte NUR mit einem JSON-Objekt: {"relevant": true/false, "reason": "kurze Begründung"}
Relevant = true wenn der Text mindestens EINE der folgenden enthält:
- Konkrete Pflicht/Anforderung ("muss", "soll", "ist sicherzustellen")
- Technische Sicherheitsmaßnahme (Verschlüsselung, Zugriffskontrolle, Logging)
- Organisatorische Maßnahme (Schulung, Dokumentation, Audit)
- Datenschutz-Vorgabe (Löschpflicht, Einwilligung, Zweckbindung)
- Risikomanagement-Anforderung
Relevant = false wenn der Text NUR enthält:
- Definitionen ohne Pflichten
- Inhaltsverzeichnisse oder Verweise
- Reine Begriffsbestimmungen
- Übergangsvorschriften ohne Substanz
- Adressaten/Geltungsbereich ohne Anforderung"""
async def _prefilter_chunk(chunk_text: str) -> tuple[bool, str]:
    """Use local LLM to check if a chunk contains an actionable requirement.

    Returns (is_relevant, reason).
    Much cheaper than sending every chunk to Anthropic.

    Fails open: on any error (LLM failure, unparseable JSON) the chunk is
    treated as relevant, so pre-filtering can only skip work, never silently
    drop content.
    """
    # Truncate to 1500 chars — enough context for a yes/no relevance call.
    prompt = f"""Prüfe ob dieser Textabschnitt eine konkrete Sicherheitsanforderung oder Compliance-Pflicht enthält.
Text:
---
{chunk_text[:1500]}
---
Antworte NUR mit JSON: {{"relevant": true/false, "reason": "kurze Begründung"}}"""
    try:
        raw = await _llm_local(prompt, PREFILTER_SYSTEM_PROMPT)
        data = _parse_llm_json(raw)
        if data:
            # A missing "relevant" key defaults to True (fail open).
            return data.get("relevant", True), data.get("reason", "")
        # If parsing fails, assume relevant (don't skip)
        return True, "parse_failed"
    except Exception as e:
        logger.warning("Prefilter failed: %s — treating as relevant", e)
        return True, f"error: {e}"
async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> str:
"""Call Anthropic Messages API."""
headers = {
@@ -364,6 +503,8 @@ async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"options": {"num_predict": 512}, # Limit response length for speed
"think": False, # Disable thinking for faster responses
}
try:
@@ -397,6 +538,26 @@ async def _get_embedding(text: str) -> list[float]:
return []
async def _get_embeddings_batch(texts: list[str], batch_size: int = 32) -> list[list[float]]:
    """Get embedding vectors for multiple texts in batches.

    Args:
        texts: Texts to embed, in order.
        batch_size: Number of texts sent per request to the embedding service.

    Returns:
        One embedding per input text, positionally aligned with *texts*.
        Texts in a failed batch get an empty list instead of raising, so
        callers can always zip results back onto their inputs.
    """
    all_embeddings: list[list[float]] = []
    # Reuse a single HTTP client (and its connection pool) for all batches
    # instead of opening and tearing down a new client per batch.
    async with httpx.AsyncClient(timeout=30.0) as client:
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            try:
                resp = await client.post(
                    f"{EMBEDDING_URL}/embed",
                    json={"texts": batch},
                )
                resp.raise_for_status()
                all_embeddings.extend(resp.json().get("embeddings", []))
            except Exception as e:
                logger.warning("Batch embedding failed for %d texts: %s", len(batch), e)
                # Keep positional alignment: one empty vector per failed text.
                all_embeddings.extend([[] for _ in batch])
    return all_embeddings
def _cosine_sim(a: list[float], b: list[float]) -> float:
"""Compute cosine similarity between two vectors."""
if not a or not b or len(a) != len(b):
@@ -464,50 +625,96 @@ class ControlGeneratorPipeline:
# ── Stage 1: RAG Scan ──────────────────────────────────────────────
async def _scan_rag(self, config: GeneratorConfig) -> list[RAGSearchResult]:
"""Load unprocessed chunks from RAG collections."""
"""Scroll through ALL chunks in RAG collections.
Uses the scroll endpoint to iterate over every chunk (not just top-K search).
Filters out already-processed chunks by hash.
"""
collections = config.collections or ALL_COLLECTIONS
all_results: list[RAGSearchResult] = []
queries = [
"security requirement control measure",
"Sicherheitsanforderung Maßnahme Prüfaspekt",
"compliance requirement audit criterion",
"data protection privacy obligation",
"access control authentication authorization",
]
# Pre-load all processed hashes for fast filtering
processed_hashes: set[str] = set()
if config.skip_processed:
try:
result = self.db.execute(
text("SELECT chunk_hash FROM canonical_processed_chunks")
)
processed_hashes = {row[0] for row in result}
logger.info("Loaded %d processed chunk hashes", len(processed_hashes))
except Exception as e:
logger.warning("Error loading processed hashes: %s", e)
if config.domain:
domain_kw = DOMAIN_KEYWORDS.get(config.domain, [])
if domain_kw:
queries.append(" ".join(domain_kw[:5]))
seen_hashes: set[str] = set()
for collection in collections:
for query in queries:
results = await self.rag.search(
query=query,
offset = None
page = 0
collection_total = 0
collection_new = 0
seen_offsets: set[str] = set() # Detect scroll loops
while True:
chunks, next_offset = await self.rag.scroll(
collection=collection,
top_k=20,
offset=offset,
limit=200,
)
all_results.extend(results)
# Deduplicate by text hash
seen_hashes: set[str] = set()
unique: list[RAGSearchResult] = []
for r in all_results:
h = hashlib.sha256(r.text.encode()).hexdigest()
if h not in seen_hashes:
seen_hashes.add(h)
unique.append(r)
if not chunks:
break
# Filter out already-processed chunks
if config.skip_processed and unique:
hashes = [hashlib.sha256(r.text.encode()).hexdigest() for r in unique]
processed = self._get_processed_hashes(hashes)
unique = [r for r, h in zip(unique, hashes) if h not in processed]
collection_total += len(chunks)
logger.info("RAG scan: %d unique chunks (%d after filtering processed)",
len(seen_hashes), len(unique))
return unique[:config.max_controls * 3] # Over-fetch to account for duplicates
for chunk in chunks:
if not chunk.text or len(chunk.text.strip()) < 50:
continue # Skip empty/tiny chunks
h = hashlib.sha256(chunk.text.encode()).hexdigest()
# Skip duplicates (same text in multiple collections)
if h in seen_hashes:
continue
seen_hashes.add(h)
# Skip already-processed
if h in processed_hashes:
continue
all_results.append(chunk)
collection_new += 1
page += 1
if page % 50 == 0:
logger.info(
"Scrolling %s: page %d, %d total chunks, %d new unprocessed",
collection, page, collection_total, collection_new,
)
# Stop conditions
if not next_offset:
break
# Detect infinite scroll loops (Qdrant mixed ID types)
if next_offset in seen_offsets:
logger.warning(
"Scroll loop detected in %s at offset %s (page %d) — stopping",
collection, next_offset, page,
)
break
seen_offsets.add(next_offset)
offset = next_offset
logger.info(
"Collection %s: %d total chunks scrolled, %d new unprocessed",
collection, collection_total, collection_new,
)
logger.info(
"RAG scroll complete: %d total unique seen, %d new unprocessed to process",
len(seen_hashes), len(all_results),
)
return all_results
def _get_processed_hashes(self, hashes: list[str]) -> set[str]:
"""Check which chunk hashes are already processed."""
@@ -568,6 +775,8 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
"url": chunk.source_url or "",
}
control.customer_visible = True
control.verification_method = _detect_verification_method(chunk.text)
control.category = _detect_category(chunk.text)
control.generation_metadata = {
"processing_path": "structured",
"license_rule": 1,
@@ -617,6 +826,8 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
"url": chunk.source_url or "",
}
control.customer_visible = True
control.verification_method = _detect_verification_method(chunk.text)
control.category = _detect_category(chunk.text)
control.generation_metadata = {
"processing_path": "structured",
"license_rule": 2,
@@ -661,6 +872,8 @@ Gib JSON zurück mit diesen Feldern:
control.source_original_text = None # NEVER store original
control.source_citation = None # NEVER cite source
control.customer_visible = False # Only our formulation
control.verification_method = _detect_verification_method(chunk.text)
control.category = _detect_category(chunk.text)
# generation_metadata: NO source names, NO original texts
control.generation_metadata = {
"processing_path": "llm_reform",
@@ -676,6 +889,10 @@ Gib JSON zurück mit diesen Feldern:
if not existing:
return None
# Pre-load all existing embeddings in batch (once per pipeline run)
if not self._existing_embeddings:
await self._preload_embeddings(existing)
new_text = f"{new_control.title} {new_control.objective}"
new_emb = await _get_embedding(new_text)
if not new_emb:
@@ -684,14 +901,7 @@ Gib JSON zurück mit diesen Feldern:
similar = []
for ex in existing:
ex_key = ex.get("control_id", "")
ex_text = f"{ex.get('title', '')} {ex.get('objective', '')}"
# Get or compute embedding for existing control
if ex_key not in self._existing_embeddings:
emb = await _get_embedding(ex_text)
self._existing_embeddings[ex_key] = emb
ex_emb = self._existing_embeddings.get(ex_key, [])
if not ex_emb:
continue
@@ -705,6 +915,20 @@ Gib JSON zurück mit diesen Feldern:
return similar if similar else None
async def _preload_embeddings(self, existing: list[dict]):
    """Batch-fetch and cache embeddings for all existing controls.

    Populates ``self._existing_embeddings`` (keyed by ``control_id``) in one
    batched call so the later dedup pass never embeds controls one at a time.
    Entries whose embedding request failed are stored as falsy values.
    """
    keys: list[str] = []
    texts: list[str] = []
    for ctrl in existing:
        keys.append(ctrl.get("control_id", ""))
        texts.append(f"{ctrl.get('title', '')} {ctrl.get('objective', '')}")
    logger.info("Pre-loading embeddings for %d existing controls...", len(texts))
    vectors = await _get_embeddings_batch(texts)
    self._existing_embeddings.update(zip(keys, vectors))
    # Count only non-empty vectors so the log reflects actual successes.
    ok = sum(1 for vec in vectors if vec)
    logger.info("Pre-loaded %d/%d embeddings", ok, len(texts))
def _load_existing_controls(self) -> list[dict]:
"""Load existing controls from DB (cached per pipeline run)."""
if self._existing_controls is not None:
@@ -799,10 +1023,11 @@ Gib JSON zurück mit diesen Feldern:
return str(uuid.uuid4())
def _update_job(self, job_id: str, result: GeneratorResult):
"""Update job with final stats."""
"""Update job with current stats. Sets completed_at only when status is final."""
is_final = result.status in ("completed", "failed")
try:
self.db.execute(
text("""
text(f"""
UPDATE canonical_generation_jobs
SET status = :status,
total_chunks_scanned = :scanned,
@@ -811,8 +1036,8 @@ Gib JSON zurück mit diesen Feldern:
controls_needs_review = :needs_review,
controls_too_close = :too_close,
controls_duplicates_found = :duplicates,
errors = :errors,
completed_at = NOW()
errors = :errors
{"" if not is_final else ", completed_at = NOW()"}
WHERE id = CAST(:job_id AS uuid)
"""),
{
@@ -857,14 +1082,16 @@ Gib JSON zurück mit diesen Feldern:
severity, risk_score, implementation_effort,
open_anchors, release_state, tags,
license_rule, source_original_text, source_citation,
customer_visible, generation_metadata
customer_visible, generation_metadata,
verification_method, category
) VALUES (
:framework_id, :control_id, :title, :objective, :rationale,
:scope, :requirements, :test_procedure, :evidence,
:severity, :risk_score, :implementation_effort,
:open_anchors, :release_state, :tags,
:license_rule, :source_original_text, :source_citation,
:customer_visible, :generation_metadata
:customer_visible, :generation_metadata,
:verification_method, :category
)
ON CONFLICT (framework_id, control_id) DO NOTHING
RETURNING id
@@ -890,6 +1117,8 @@ Gib JSON zurück mit diesen Feldern:
"source_citation": json.dumps(control.source_citation) if control.source_citation else None,
"customer_visible": control.customer_visible,
"generation_metadata": json.dumps(control.generation_metadata) if control.generation_metadata else None,
"verification_method": control.verification_method,
"category": control.category,
},
)
self.db.commit()
@@ -926,7 +1155,7 @@ Gib JSON zurück mit diesen Feldern:
"""),
{
"hash": chunk_hash,
"collection": "bp_compliance_ce", # Default, we don't track collection per result
"collection": chunk.collection or "bp_compliance_ce",
"regulation_code": chunk.regulation_code,
"doc_version": "1.0",
"license": license_info.get("license", ""),
@@ -946,8 +1175,11 @@ Gib JSON zurück mit diesen Feldern:
"""Execute the full 7-stage pipeline."""
result = GeneratorResult()
# Create job
job_id = self._create_job(config)
# Create or reuse job
if config.existing_job_id:
job_id = config.existing_job_id
else:
job_id = self._create_job(config)
result.job_id = job_id
try:
@@ -962,13 +1194,37 @@ Gib JSON zurück mit diesen Feldern:
# Process chunks
controls_count = 0
for chunk in chunks:
if controls_count >= config.max_controls:
break
chunks_skipped_prefilter = 0
for i, chunk in enumerate(chunks):
try:
# Progress logging every 50 chunks
if i > 0 and i % 50 == 0:
logger.info(
"Progress: %d/%d chunks processed, %d controls generated, %d skipped by prefilter",
i, len(chunks), controls_count, chunks_skipped_prefilter,
)
self._update_job(job_id, result)
# Stage 1.5: Local LLM pre-filter — skip chunks without requirements
if not config.dry_run:
is_relevant, prefilter_reason = await _prefilter_chunk(chunk.text)
if not is_relevant:
chunks_skipped_prefilter += 1
# Mark as processed so we don't re-check next time
license_info = self._classify_license(chunk)
self._mark_chunk_processed(
chunk, license_info, "prefilter_skip", [], job_id
)
continue
control = await self._process_single_chunk(chunk, config, job_id)
if control is None:
# No control generated — still mark as processed
if not config.dry_run:
license_info = self._classify_license(chunk)
self._mark_chunk_processed(
chunk, license_info, "no_control", [], job_id
)
continue
# Count by state
@@ -989,6 +1245,12 @@ Gib JSON zurück mit diesen Feldern:
license_info = self._classify_license(chunk)
path = "llm_reform" if license_info["rule"] == 3 else "structured"
self._mark_chunk_processed(chunk, license_info, path, [ctrl_uuid], job_id)
else:
# Store failed — still mark as processed
license_info = self._classify_license(chunk)
self._mark_chunk_processed(
chunk, license_info, "store_failed", [], job_id
)
result.controls_generated += 1
result.controls.append(asdict(control))
@@ -1006,6 +1268,21 @@ Gib JSON zurück mit diesen Feldern:
error_msg = f"Error processing chunk {chunk.regulation_code}/{chunk.article}: {e}"
logger.error(error_msg)
result.errors.append(error_msg)
# Mark failed chunks as processed too (so we don't retry endlessly)
try:
if not config.dry_run:
license_info = self._classify_license(chunk)
self._mark_chunk_processed(
chunk, license_info, "error", [], job_id
)
except Exception:
pass
result.chunks_skipped_prefilter = chunks_skipped_prefilter
logger.info(
"Pipeline complete: %d controls generated, %d chunks skipped by prefilter, %d total chunks",
controls_count, chunks_skipped_prefilter, len(chunks),
)
result.status = "completed"

View File

@@ -33,6 +33,7 @@ class RAGSearchResult:
paragraph: str
source_url: str
score: float
collection: str = ""
class ComplianceRAGClient:
@@ -91,6 +92,7 @@ class ComplianceRAGClient:
paragraph=r.get("paragraph", ""),
source_url=r.get("source_url", ""),
score=r.get("score", 0.0),
collection=collection,
))
return results
@@ -98,6 +100,54 @@ class ComplianceRAGClient:
logger.warning("RAG search failed: %s", e)
return []
async def scroll(
    self,
    collection: str,
    offset: Optional[str] = None,
    limit: int = 100,
) -> tuple[List[RAGSearchResult], Optional[str]]:
    """Page through ALL chunks stored in *collection*.

    Returns a ``(chunks, next_offset)`` pair. ``next_offset`` is ``None``
    when the collection is exhausted or on any transport/HTTP error, so
    callers can use it directly as a loop-termination signal.
    """
    # The gateway exposes /scroll alongside /search on the same base path.
    endpoint = self._search_url.replace("/search", "/scroll")
    query = {"collection": collection, "limit": str(limit)}
    if offset:
        query["offset"] = offset
    try:
        async with httpx.AsyncClient(timeout=30.0) as http:
            response = await http.get(endpoint, params=query)
            if response.status_code != 200:
                logger.warning(
                    "RAG scroll returned %d: %s", response.status_code, response.text[:200]
                )
                return [], None
            payload = response.json()
            chunks = [
                RAGSearchResult(
                    text=item.get("text", ""),
                    regulation_code=item.get("regulation_code", ""),
                    regulation_name=item.get("regulation_name", ""),
                    regulation_short=item.get("regulation_short", ""),
                    category=item.get("category", ""),
                    article=item.get("article", ""),
                    paragraph=item.get("paragraph", ""),
                    source_url=item.get("source_url", ""),
                    # Scroll is unranked; scores only apply to /search hits.
                    score=0.0,
                    collection=collection,
                )
                for item in payload.get("chunks", [])
            ]
            # Normalize empty-string/absent offsets to None (end of scroll).
            return chunks, payload.get("next_offset") or None
    except Exception as e:
        logger.warning("RAG scroll failed: %s", e)
        return [], None
def format_for_prompt(
self, results: List[RAGSearchResult], max_results: int = 5
) -> str: