feat: add compliance modules 2-5 (dashboard, security templates, process manager, evidence collector)
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 32s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 23s
CI/CD / test-python-dsms-gateway (push) Successful in 21s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Successful in 2s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 32s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 23s
CI/CD / test-python-dsms-gateway (push) Successful in 21s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Successful in 2s
Module 2: Extended Compliance Dashboard with roadmap, module-status, next-actions, snapshots, score-history Module 3: 7 German security document templates (IT-Sicherheitskonzept, Datenschutz, Backup, Logging, Incident-Response, Zugriff, Risikomanagement) Module 4: Compliance Process Manager with CRUD, complete/skip/seed, ~50 seed tasks, 3-tab UI Module 5: Evidence Collector Extended with automated checks, control-mapping, coverage report, 4-tab UI Also includes: canonical control library enhancements (verification method, categories, dedup), control generator improvements, RAG client extensions 52 tests pass, frontend builds clean. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -53,6 +53,8 @@ _ROUTER_MODULES = [
|
||||
"wiki_routes",
|
||||
"canonical_control_routes",
|
||||
"control_generator_routes",
|
||||
"process_task_routes",
|
||||
"evidence_check_routes",
|
||||
]
|
||||
|
||||
_loaded_count = 0
|
||||
|
||||
@@ -78,6 +78,7 @@ class ControlResponse(BaseModel):
|
||||
customer_visible: Optional[bool] = None
|
||||
verification_method: Optional[str] = None
|
||||
category: Optional[str] = None
|
||||
target_audience: Optional[str] = None
|
||||
generation_metadata: Optional[dict] = None
|
||||
created_at: str
|
||||
updated_at: str
|
||||
@@ -106,6 +107,7 @@ class ControlCreateRequest(BaseModel):
|
||||
customer_visible: Optional[bool] = True
|
||||
verification_method: Optional[str] = None
|
||||
category: Optional[str] = None
|
||||
target_audience: Optional[str] = None
|
||||
generation_metadata: Optional[dict] = None
|
||||
|
||||
|
||||
@@ -130,6 +132,7 @@ class ControlUpdateRequest(BaseModel):
|
||||
customer_visible: Optional[bool] = None
|
||||
verification_method: Optional[str] = None
|
||||
category: Optional[str] = None
|
||||
target_audience: Optional[str] = None
|
||||
generation_metadata: Optional[dict] = None
|
||||
|
||||
|
||||
@@ -158,7 +161,7 @@ _CONTROL_COLS = """id, framework_id, control_id, title, objective, rationale,
|
||||
evidence_confidence, open_anchors, release_state, tags,
|
||||
license_rule, source_original_text, source_citation,
|
||||
customer_visible, verification_method, category,
|
||||
generation_metadata,
|
||||
target_audience, generation_metadata,
|
||||
created_at, updated_at"""
|
||||
|
||||
|
||||
@@ -241,6 +244,7 @@ async def list_framework_controls(
|
||||
release_state: Optional[str] = Query(None),
|
||||
verification_method: Optional[str] = Query(None),
|
||||
category: Optional[str] = Query(None),
|
||||
target_audience: Optional[str] = Query(None),
|
||||
):
|
||||
"""List controls belonging to a framework."""
|
||||
with SessionLocal() as db:
|
||||
@@ -271,6 +275,9 @@ async def list_framework_controls(
|
||||
if category:
|
||||
query += " AND category = :cat"
|
||||
params["cat"] = category
|
||||
if target_audience:
|
||||
query += " AND target_audience = :ta"
|
||||
params["ta"] = target_audience
|
||||
|
||||
query += " ORDER BY control_id"
|
||||
rows = db.execute(text(query), params).fetchall()
|
||||
@@ -289,6 +296,7 @@ async def list_controls(
|
||||
release_state: Optional[str] = Query(None),
|
||||
verification_method: Optional[str] = Query(None),
|
||||
category: Optional[str] = Query(None),
|
||||
target_audience: Optional[str] = Query(None),
|
||||
):
|
||||
"""List all canonical controls, with optional filters."""
|
||||
query = f"""
|
||||
@@ -313,6 +321,9 @@ async def list_controls(
|
||||
if category:
|
||||
query += " AND category = :cat"
|
||||
params["cat"] = category
|
||||
if target_audience:
|
||||
query += " AND target_audience = :ta"
|
||||
params["ta"] = target_audience
|
||||
|
||||
query += " ORDER BY control_id"
|
||||
|
||||
@@ -384,7 +395,7 @@ async def create_control(body: ControlCreateRequest):
|
||||
open_anchors, release_state, tags,
|
||||
license_rule, source_original_text, source_citation,
|
||||
customer_visible, verification_method, category,
|
||||
generation_metadata
|
||||
target_audience, generation_metadata
|
||||
) VALUES (
|
||||
:fw_id, :cid, :title, :objective, :rationale,
|
||||
CAST(:scope AS jsonb), CAST(:requirements AS jsonb),
|
||||
@@ -394,7 +405,7 @@ async def create_control(body: ControlCreateRequest):
|
||||
:license_rule, :source_original_text,
|
||||
CAST(:source_citation AS jsonb),
|
||||
:customer_visible, :verification_method, :category,
|
||||
CAST(:generation_metadata AS jsonb)
|
||||
:target_audience, CAST(:generation_metadata AS jsonb)
|
||||
)
|
||||
RETURNING {_CONTROL_COLS}
|
||||
"""),
|
||||
@@ -421,6 +432,7 @@ async def create_control(body: ControlCreateRequest):
|
||||
"customer_visible": body.customer_visible,
|
||||
"verification_method": body.verification_method,
|
||||
"category": body.category,
|
||||
"target_audience": body.target_audience,
|
||||
"generation_metadata": _json.dumps(body.generation_metadata) if body.generation_metadata else None,
|
||||
},
|
||||
).fetchone()
|
||||
@@ -647,6 +659,7 @@ def _control_row(r) -> dict:
|
||||
"customer_visible": r.customer_visible,
|
||||
"verification_method": r.verification_method,
|
||||
"category": r.category,
|
||||
"target_audience": r.target_audience,
|
||||
"generation_metadata": r.generation_metadata,
|
||||
"created_at": r.created_at.isoformat() if r.created_at else None,
|
||||
"updated_at": r.updated_at.isoformat() if r.updated_at else None,
|
||||
|
||||
@@ -12,6 +12,7 @@ Endpoints:
|
||||
POST /v1/canonical/blocked-sources/cleanup — Start cleanup workflow
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
@@ -89,9 +90,42 @@ class BlockedSourceResponse(BaseModel):
|
||||
# ENDPOINTS
|
||||
# =============================================================================
|
||||
|
||||
async def _run_pipeline_background(config: GeneratorConfig, job_id: str):
|
||||
"""Run the pipeline in the background. Uses its own DB session."""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
config.existing_job_id = job_id
|
||||
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
|
||||
result = await pipeline.run(config)
|
||||
logger.info(
|
||||
"Background generation job %s completed: %d controls from %d chunks",
|
||||
job_id, result.controls_generated, result.total_chunks_scanned,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Background generation job %s failed: %s", job_id, e)
|
||||
# Update job as failed
|
||||
try:
|
||||
db.execute(
|
||||
text("""
|
||||
UPDATE canonical_generation_jobs
|
||||
SET status = 'failed', errors = :errors, completed_at = NOW()
|
||||
WHERE id = CAST(:job_id AS uuid)
|
||||
"""),
|
||||
{"job_id": job_id, "errors": json.dumps([str(e)])},
|
||||
)
|
||||
db.commit()
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
@router.post("/generate", response_model=GenerateResponse)
|
||||
async def start_generation(req: GenerateRequest):
|
||||
"""Start a control generation run."""
|
||||
"""Start a control generation run (runs in background).
|
||||
|
||||
Returns immediately with job_id. Use GET /generate/status/{job_id} to poll progress.
|
||||
"""
|
||||
config = GeneratorConfig(
|
||||
collections=req.collections,
|
||||
domain=req.domain,
|
||||
@@ -101,30 +135,63 @@ async def start_generation(req: GenerateRequest):
|
||||
dry_run=req.dry_run,
|
||||
)
|
||||
|
||||
if req.dry_run:
|
||||
# Dry run: execute synchronously and return controls
|
||||
db = SessionLocal()
|
||||
try:
|
||||
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
|
||||
result = await pipeline.run(config)
|
||||
return GenerateResponse(
|
||||
job_id=result.job_id,
|
||||
status=result.status,
|
||||
message=f"Dry run: {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
|
||||
total_chunks_scanned=result.total_chunks_scanned,
|
||||
controls_generated=result.controls_generated,
|
||||
controls_verified=result.controls_verified,
|
||||
controls_needs_review=result.controls_needs_review,
|
||||
controls_too_close=result.controls_too_close,
|
||||
controls_duplicates_found=result.controls_duplicates_found,
|
||||
errors=result.errors,
|
||||
controls=result.controls,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Dry run failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
# Create job record first so we can return the ID
|
||||
db = SessionLocal()
|
||||
try:
|
||||
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
|
||||
result = await pipeline.run(config)
|
||||
|
||||
return GenerateResponse(
|
||||
job_id=result.job_id,
|
||||
status=result.status,
|
||||
message=f"Generated {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
|
||||
total_chunks_scanned=result.total_chunks_scanned,
|
||||
controls_generated=result.controls_generated,
|
||||
controls_verified=result.controls_verified,
|
||||
controls_needs_review=result.controls_needs_review,
|
||||
controls_too_close=result.controls_too_close,
|
||||
controls_duplicates_found=result.controls_duplicates_found,
|
||||
errors=result.errors,
|
||||
controls=result.controls if req.dry_run else [],
|
||||
result = db.execute(
|
||||
text("""
|
||||
INSERT INTO canonical_generation_jobs (status, config)
|
||||
VALUES ('running', :config)
|
||||
RETURNING id
|
||||
"""),
|
||||
{"config": json.dumps(config.model_dump())},
|
||||
)
|
||||
db.commit()
|
||||
row = result.fetchone()
|
||||
job_id = str(row[0]) if row else None
|
||||
except Exception as e:
|
||||
logger.error("Generation failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
logger.error("Failed to create job: %s", e)
|
||||
raise HTTPException(status_code=500, detail=f"Failed to create job: {e}")
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
if not job_id:
|
||||
raise HTTPException(status_code=500, detail="Failed to create job record")
|
||||
|
||||
# Launch pipeline in background
|
||||
asyncio.create_task(_run_pipeline_background(config, job_id))
|
||||
|
||||
return GenerateResponse(
|
||||
job_id=job_id,
|
||||
status="running",
|
||||
message="Generation started in background. Poll /generate/status/{job_id} for progress.",
|
||||
)
|
||||
|
||||
|
||||
@router.get("/generate/status/{job_id}")
|
||||
async def get_job_status(job_id: str):
|
||||
|
||||
@@ -5,16 +5,23 @@ Endpoints:
|
||||
- /dashboard: Main compliance dashboard
|
||||
- /dashboard/executive: Executive summary for managers
|
||||
- /dashboard/trend: Compliance score trend over time
|
||||
- /dashboard/roadmap: Prioritised controls in 4 buckets
|
||||
- /dashboard/module-status: Completion status of each SDK module
|
||||
- /dashboard/next-actions: Top 5 most important actions
|
||||
- /dashboard/snapshot: Save / query compliance score snapshots
|
||||
- /score: Quick compliance score
|
||||
- /reports: Report generation
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from datetime import datetime, date, timedelta
|
||||
from calendar import month_abbr
|
||||
from typing import Optional
|
||||
from typing import Optional, Dict, Any, List
|
||||
from decimal import Decimal
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from classroom_engine.database import get_db
|
||||
@@ -34,6 +41,8 @@ from .schemas import (
|
||||
DeadlineItem,
|
||||
TeamWorkloadItem,
|
||||
)
|
||||
from .tenant_utils import get_tenant_id as _get_tenant_id
|
||||
from .db_utils import row_to_dict as _row_to_dict
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(tags=["compliance-dashboard"])
|
||||
@@ -322,6 +331,272 @@ async def get_compliance_trend(
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Dashboard Extended — Roadmap, Module-Status, Next-Actions, Snapshots
|
||||
# ============================================================================
|
||||
|
||||
# Weight map for control prioritisation
|
||||
_PRIORITY_WEIGHTS = {"legal": 5, "security": 3, "best_practice": 1, "operational": 2}
|
||||
|
||||
# SDK module definitions → DB table used for counting completion
|
||||
_MODULE_DEFS: List[Dict[str, str]] = [
|
||||
{"key": "vvt", "label": "VVT", "table": "compliance_vvt_activities"},
|
||||
{"key": "tom", "label": "TOM", "table": "compliance_toms"},
|
||||
{"key": "dsfa", "label": "DSFA", "table": "compliance_dsfa_assessments"},
|
||||
{"key": "loeschfristen", "label": "Loeschfristen", "table": "compliance_loeschfristen"},
|
||||
{"key": "risks", "label": "Risiken", "table": "compliance_risks"},
|
||||
{"key": "controls", "label": "Controls", "table": "compliance_controls"},
|
||||
{"key": "evidence", "label": "Nachweise", "table": "compliance_evidence"},
|
||||
{"key": "obligations", "label": "Pflichten", "table": "compliance_obligations"},
|
||||
{"key": "incidents", "label": "Vorfaelle", "table": "compliance_notfallplan_incidents"},
|
||||
{"key": "vendor", "label": "Auftragsverarbeiter", "table": "compliance_vendor_assessments"},
|
||||
{"key": "legal_templates", "label": "Rechtl. Dokumente", "table": "compliance_legal_templates"},
|
||||
{"key": "training", "label": "Schulungen", "table": "training_modules"},
|
||||
{"key": "audit", "label": "Audit", "table": "compliance_audit_sessions"},
|
||||
{"key": "security_backlog", "label": "Security-Backlog", "table": "compliance_security_backlog"},
|
||||
{"key": "quality", "label": "Qualitaet", "table": "compliance_quality_items"},
|
||||
]
|
||||
|
||||
|
||||
@router.get("/dashboard/roadmap")
|
||||
async def get_dashboard_roadmap(
|
||||
db: Session = Depends(get_db),
|
||||
tenant_id: str = Depends(_get_tenant_id),
|
||||
):
|
||||
"""Prioritised controls in 4 buckets: Quick Wins, Must Have, Should Have, Nice to Have."""
|
||||
ctrl_repo = ControlRepository(db)
|
||||
controls = ctrl_repo.get_all()
|
||||
today = datetime.utcnow().date()
|
||||
|
||||
buckets: Dict[str, list] = {
|
||||
"quick_wins": [],
|
||||
"must_have": [],
|
||||
"should_have": [],
|
||||
"nice_to_have": [],
|
||||
}
|
||||
|
||||
for ctrl in controls:
|
||||
status = ctrl.status.value if ctrl.status else "planned"
|
||||
if status == "pass":
|
||||
continue # already done
|
||||
|
||||
weight = _PRIORITY_WEIGHTS.get(ctrl.category if hasattr(ctrl, "category") else "best_practice", 1)
|
||||
days_overdue = 0
|
||||
if ctrl.next_review_at:
|
||||
review_date = ctrl.next_review_at.date() if hasattr(ctrl.next_review_at, "date") else ctrl.next_review_at
|
||||
days_overdue = (today - review_date).days
|
||||
|
||||
urgency = weight * 2 + (1 if days_overdue > 0 else 0)
|
||||
|
||||
item = {
|
||||
"id": str(ctrl.id),
|
||||
"control_id": ctrl.control_id,
|
||||
"title": ctrl.title,
|
||||
"status": status,
|
||||
"domain": ctrl.domain.value if ctrl.domain else "unknown",
|
||||
"owner": ctrl.owner,
|
||||
"next_review_at": ctrl.next_review_at.isoformat() if ctrl.next_review_at else None,
|
||||
"days_overdue": max(0, days_overdue),
|
||||
"weight": weight,
|
||||
}
|
||||
|
||||
if weight >= 5 and days_overdue > 0:
|
||||
buckets["quick_wins"].append(item)
|
||||
elif weight >= 4:
|
||||
buckets["must_have"].append(item)
|
||||
elif weight >= 2:
|
||||
buckets["should_have"].append(item)
|
||||
else:
|
||||
buckets["nice_to_have"].append(item)
|
||||
|
||||
# Sort each bucket by urgency desc
|
||||
for key in buckets:
|
||||
buckets[key].sort(key=lambda x: x["days_overdue"], reverse=True)
|
||||
|
||||
return {
|
||||
"buckets": buckets,
|
||||
"counts": {k: len(v) for k, v in buckets.items()},
|
||||
"generated_at": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
|
||||
@router.get("/dashboard/module-status")
|
||||
async def get_module_status(
|
||||
db: Session = Depends(get_db),
|
||||
tenant_id: str = Depends(_get_tenant_id),
|
||||
):
|
||||
"""Completion status for each SDK module based on DB record counts."""
|
||||
modules = []
|
||||
for mod in _MODULE_DEFS:
|
||||
try:
|
||||
row = db.execute(
|
||||
text(f"SELECT COUNT(*) FROM {mod['table']} WHERE tenant_id = :tid"),
|
||||
{"tid": tenant_id},
|
||||
).fetchone()
|
||||
count = int(row[0]) if row else 0
|
||||
except Exception:
|
||||
count = 0
|
||||
|
||||
# Simple heuristic: 0 = not started, 1-2 = in progress, 3+ = complete
|
||||
if count == 0:
|
||||
status = "not_started"
|
||||
progress = 0
|
||||
elif count < 3:
|
||||
status = "in_progress"
|
||||
progress = min(60, count * 30)
|
||||
else:
|
||||
status = "complete"
|
||||
progress = 100
|
||||
|
||||
modules.append({
|
||||
"key": mod["key"],
|
||||
"label": mod["label"],
|
||||
"count": count,
|
||||
"status": status,
|
||||
"progress": progress,
|
||||
})
|
||||
|
||||
started = sum(1 for m in modules if m["status"] != "not_started")
|
||||
complete = sum(1 for m in modules if m["status"] == "complete")
|
||||
|
||||
return {
|
||||
"modules": modules,
|
||||
"total": len(modules),
|
||||
"started": started,
|
||||
"complete": complete,
|
||||
"overall_progress": round((complete / len(modules)) * 100, 1) if modules else 0,
|
||||
}
|
||||
|
||||
|
||||
@router.get("/dashboard/next-actions")
|
||||
async def get_next_actions(
|
||||
limit: int = Query(5, ge=1, le=20),
|
||||
db: Session = Depends(get_db),
|
||||
tenant_id: str = Depends(_get_tenant_id),
|
||||
):
|
||||
"""Top N most important actions sorted by urgency*impact."""
|
||||
ctrl_repo = ControlRepository(db)
|
||||
controls = ctrl_repo.get_all()
|
||||
today = datetime.utcnow().date()
|
||||
|
||||
actions = []
|
||||
for ctrl in controls:
|
||||
status = ctrl.status.value if ctrl.status else "planned"
|
||||
if status == "pass":
|
||||
continue
|
||||
|
||||
days_overdue = 0
|
||||
if ctrl.next_review_at:
|
||||
review_date = ctrl.next_review_at.date() if hasattr(ctrl.next_review_at, "date") else ctrl.next_review_at
|
||||
days_overdue = max(0, (today - review_date).days)
|
||||
|
||||
weight = _PRIORITY_WEIGHTS.get(ctrl.category if hasattr(ctrl, "category") else "best_practice", 1)
|
||||
urgency_score = weight * 10 + days_overdue
|
||||
|
||||
actions.append({
|
||||
"id": str(ctrl.id),
|
||||
"control_id": ctrl.control_id,
|
||||
"title": ctrl.title,
|
||||
"status": status,
|
||||
"domain": ctrl.domain.value if ctrl.domain else "unknown",
|
||||
"owner": ctrl.owner,
|
||||
"days_overdue": days_overdue,
|
||||
"urgency_score": urgency_score,
|
||||
"reason": "Ueberfaellig" if days_overdue > 0 else "Offen",
|
||||
})
|
||||
|
||||
actions.sort(key=lambda x: x["urgency_score"], reverse=True)
|
||||
return {"actions": actions[:limit]}
|
||||
|
||||
|
||||
@router.post("/dashboard/snapshot")
|
||||
async def create_score_snapshot(
|
||||
db: Session = Depends(get_db),
|
||||
tenant_id: str = Depends(_get_tenant_id),
|
||||
):
|
||||
"""Save current compliance score as a historical snapshot."""
|
||||
ctrl_repo = ControlRepository(db)
|
||||
evidence_repo = EvidenceRepository(db)
|
||||
risk_repo = RiskRepository(db)
|
||||
|
||||
ctrl_stats = ctrl_repo.get_statistics()
|
||||
evidence_stats = evidence_repo.get_statistics()
|
||||
risks = risk_repo.get_all()
|
||||
|
||||
total = ctrl_stats.get("total", 0)
|
||||
passing = ctrl_stats.get("pass", 0)
|
||||
partial = ctrl_stats.get("partial", 0)
|
||||
score = round(((passing + partial * 0.5) / total) * 100, 2) if total > 0 else 0
|
||||
|
||||
risks_high = sum(1 for r in risks if (r.inherent_risk.value if r.inherent_risk else "low") in ("high", "critical"))
|
||||
|
||||
today = date.today()
|
||||
|
||||
row = db.execute(text("""
|
||||
INSERT INTO compliance_score_snapshots (
|
||||
tenant_id, score, controls_total, controls_pass, controls_partial,
|
||||
evidence_total, evidence_valid, risks_total, risks_high, snapshot_date
|
||||
) VALUES (
|
||||
:tenant_id, :score, :controls_total, :controls_pass, :controls_partial,
|
||||
:evidence_total, :evidence_valid, :risks_total, :risks_high, :snapshot_date
|
||||
)
|
||||
ON CONFLICT (tenant_id, project_id, snapshot_date) DO UPDATE SET
|
||||
score = EXCLUDED.score,
|
||||
controls_total = EXCLUDED.controls_total,
|
||||
controls_pass = EXCLUDED.controls_pass,
|
||||
controls_partial = EXCLUDED.controls_partial,
|
||||
evidence_total = EXCLUDED.evidence_total,
|
||||
evidence_valid = EXCLUDED.evidence_valid,
|
||||
risks_total = EXCLUDED.risks_total,
|
||||
risks_high = EXCLUDED.risks_high
|
||||
RETURNING *
|
||||
"""), {
|
||||
"tenant_id": tenant_id,
|
||||
"score": score,
|
||||
"controls_total": total,
|
||||
"controls_pass": passing,
|
||||
"controls_partial": partial,
|
||||
"evidence_total": evidence_stats.get("total", 0),
|
||||
"evidence_valid": evidence_stats.get("by_status", {}).get("valid", 0),
|
||||
"risks_total": len(risks),
|
||||
"risks_high": risks_high,
|
||||
"snapshot_date": today,
|
||||
}).fetchone()
|
||||
db.commit()
|
||||
|
||||
return _row_to_dict(row)
|
||||
|
||||
|
||||
@router.get("/dashboard/score-history")
|
||||
async def get_score_history(
|
||||
months: int = Query(12, ge=1, le=36),
|
||||
db: Session = Depends(get_db),
|
||||
tenant_id: str = Depends(_get_tenant_id),
|
||||
):
|
||||
"""Get compliance score history from snapshots."""
|
||||
since = date.today() - timedelta(days=months * 30)
|
||||
|
||||
rows = db.execute(text("""
|
||||
SELECT * FROM compliance_score_snapshots
|
||||
WHERE tenant_id = :tenant_id AND snapshot_date >= :since
|
||||
ORDER BY snapshot_date ASC
|
||||
"""), {"tenant_id": tenant_id, "since": since}).fetchall()
|
||||
|
||||
snapshots = []
|
||||
for r in rows:
|
||||
d = _row_to_dict(r)
|
||||
# Convert Decimal to float for JSON
|
||||
if isinstance(d.get("score"), Decimal):
|
||||
d["score"] = float(d["score"])
|
||||
snapshots.append(d)
|
||||
|
||||
return {
|
||||
"snapshots": snapshots,
|
||||
"total": len(snapshots),
|
||||
"period_months": months,
|
||||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Reports
|
||||
# ============================================================================
|
||||
|
||||
1151
backend-compliance/compliance/api/evidence_check_routes.py
Normal file
1151
backend-compliance/compliance/api/evidence_check_routes.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -50,6 +50,14 @@ VALID_DOCUMENT_TYPES = {
|
||||
"cookie_banner",
|
||||
"agb",
|
||||
"clause",
|
||||
# Security document templates (Migration 051)
|
||||
"it_security_concept",
|
||||
"data_protection_concept",
|
||||
"backup_recovery_concept",
|
||||
"logging_concept",
|
||||
"incident_response_plan",
|
||||
"access_control_concept",
|
||||
"risk_management_concept",
|
||||
}
|
||||
VALID_STATUSES = {"published", "draft", "archived"}
|
||||
|
||||
|
||||
1072
backend-compliance/compliance/api/process_task_routes.py
Normal file
1072
backend-compliance/compliance/api/process_task_routes.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -46,7 +46,7 @@ EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
|
||||
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
|
||||
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3:30b-a3b")
|
||||
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
|
||||
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "120"))
|
||||
|
||||
HARMONIZATION_THRESHOLD = 0.85 # Cosine similarity above this = duplicate
|
||||
@@ -157,6 +157,9 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
|
||||
"edpb_transfers_07_2020":{"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Transfers 07/2020"},
|
||||
"edpb_video_03_2019": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Video Surveillance"},
|
||||
"edps_dpia_list": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPS DPIA Liste"},
|
||||
"edpb_certification_01_2018": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Certification 01/2018"},
|
||||
"edpb_certification_01_2019": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Certification 01/2019"},
|
||||
"eaa": {"license": "EU_LAW", "rule": 1, "name": "European Accessibility Act"},
|
||||
# WP29 (pre-EDPB) Guidelines
|
||||
"wp244_profiling": {"license": "EU_PUBLIC", "rule": 1, "name": "WP29 Profiling"},
|
||||
"wp251_profiling": {"license": "EU_PUBLIC", "rule": 1, "name": "WP29 Data Portability"},
|
||||
@@ -240,6 +243,83 @@ DOMAIN_KEYWORDS = {
|
||||
}
|
||||
|
||||
|
||||
CATEGORY_KEYWORDS = {
|
||||
"encryption": ["encryption", "cryptography", "tls", "ssl", "certificate", "hashing",
|
||||
"aes", "rsa", "verschlüsselung", "kryptographie", "zertifikat", "cipher"],
|
||||
"authentication": ["authentication", "login", "password", "credential", "mfa", "2fa",
|
||||
"session", "oauth", "authentifizierung", "anmeldung", "passwort"],
|
||||
"network": ["network", "firewall", "dns", "vpn", "proxy", "segmentation",
|
||||
"netzwerk", "routing", "port", "intrusion", "ids", "ips"],
|
||||
"data_protection": ["data protection", "privacy", "personal data", "datenschutz",
|
||||
"personenbezogen", "dsgvo", "gdpr", "löschung", "verarbeitung", "einwilligung"],
|
||||
"logging": ["logging", "monitoring", "audit trail", "siem", "alert", "anomaly",
|
||||
"protokollierung", "überwachung", "nachvollziehbar"],
|
||||
"incident": ["incident", "response", "breach", "recovery", "vorfall", "sicherheitsvorfall"],
|
||||
"continuity": ["backup", "disaster recovery", "notfall", "wiederherstellung", "notfallplan",
|
||||
"business continuity", "ausfallsicherheit"],
|
||||
"compliance": ["compliance", "audit", "regulation", "certification", "konformität",
|
||||
"prüfung", "zertifizierung", "nachweis"],
|
||||
"supply_chain": ["supplier", "vendor", "third party", "lieferant", "auftragnehmer",
|
||||
"unterauftragnehmer", "supply chain", "dienstleister"],
|
||||
"physical": ["physical", "building", "access zone", "physisch", "gebäude", "zutritt",
|
||||
"schließsystem", "rechenzentrum"],
|
||||
"personnel": ["training", "awareness", "employee", "schulung", "mitarbeiter",
|
||||
"sensibilisierung", "personal", "unterweisung"],
|
||||
"application": ["application", "software", "code review", "sdlc", "secure coding",
|
||||
"anwendung", "entwicklung", "software-entwicklung", "api"],
|
||||
"system": ["hardening", "patch", "configuration", "update", "härtung", "konfiguration",
|
||||
"betriebssystem", "system", "server"],
|
||||
"risk": ["risk assessment", "risk management", "risiko", "bewertung", "risikobewertung",
|
||||
"risikoanalyse", "bedrohung", "threat"],
|
||||
"governance": ["governance", "policy", "organization", "isms", "sicherheitsorganisation",
|
||||
"richtlinie", "verantwortlichkeit", "rolle"],
|
||||
"hardware": ["hardware", "platform", "firmware", "bios", "tpm", "chip",
|
||||
"plattform", "geräte"],
|
||||
"identity": ["identity", "iam", "directory", "ldap", "sso", "provisioning",
|
||||
"identität", "identitätsmanagement", "benutzerverzeichnis"],
|
||||
}
|
||||
|
||||
VERIFICATION_KEYWORDS = {
|
||||
"code_review": ["source code", "code review", "static analysis", "sast", "dast",
|
||||
"dependency check", "quellcode", "codeanalyse", "secure coding",
|
||||
"software development", "api", "input validation", "output encoding"],
|
||||
"document": ["policy", "procedure", "documentation", "training", "awareness",
|
||||
"richtlinie", "dokumentation", "schulung", "nachweis", "vertrag",
|
||||
"organizational", "process", "role", "responsibility"],
|
||||
"tool": ["scanner", "monitoring", "siem", "ids", "ips", "firewall", "antivirus",
|
||||
"vulnerability scan", "penetration test", "tool", "automated"],
|
||||
"hybrid": [], # Assigned when multiple methods match equally
|
||||
}
|
||||
|
||||
|
||||
def _detect_category(text: str) -> Optional[str]:
|
||||
"""Detect the most likely category from text content."""
|
||||
text_lower = text.lower()
|
||||
scores: dict[str, int] = {}
|
||||
for cat, keywords in CATEGORY_KEYWORDS.items():
|
||||
scores[cat] = sum(1 for kw in keywords if kw in text_lower)
|
||||
if not scores or max(scores.values()) == 0:
|
||||
return None
|
||||
return max(scores, key=scores.get)
|
||||
|
||||
|
||||
def _detect_verification_method(text: str) -> Optional[str]:
|
||||
"""Detect verification method from text content."""
|
||||
text_lower = text.lower()
|
||||
scores: dict[str, int] = {}
|
||||
for method, keywords in VERIFICATION_KEYWORDS.items():
|
||||
if method == "hybrid":
|
||||
continue
|
||||
scores[method] = sum(1 for kw in keywords if kw in text_lower)
|
||||
if not scores or max(scores.values()) == 0:
|
||||
return None
|
||||
top = sorted(scores.items(), key=lambda x: -x[1])
|
||||
# If top two are close, it's hybrid
|
||||
if len(top) >= 2 and top[0][1] > 0 and top[1][1] > 0 and top[1][1] >= top[0][1] * 0.7:
|
||||
return "hybrid"
|
||||
return top[0][0] if top[0][1] > 0 else None
|
||||
|
||||
|
||||
def _detect_domain(text: str) -> str:
|
||||
"""Detect the most likely domain from text content."""
|
||||
text_lower = text.lower()
|
||||
@@ -259,10 +339,11 @@ class GeneratorConfig(BaseModel):
|
||||
collections: Optional[List[str]] = None
|
||||
domain: Optional[str] = None
|
||||
batch_size: int = 5
|
||||
max_controls: int = 50
|
||||
max_controls: int = 0 # 0 = unlimited (process ALL chunks)
|
||||
skip_processed: bool = True
|
||||
skip_web_search: bool = False
|
||||
dry_run: bool = False
|
||||
existing_job_id: Optional[str] = None # If set, reuse this job instead of creating a new one
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -287,6 +368,9 @@ class GeneratedControl:
|
||||
source_citation: Optional[dict] = None
|
||||
customer_visible: bool = True
|
||||
generation_metadata: dict = field(default_factory=dict)
|
||||
# Classification fields
|
||||
verification_method: Optional[str] = None # code_review, document, tool, hybrid
|
||||
category: Optional[str] = None # one of 17 categories
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -299,6 +383,7 @@ class GeneratorResult:
|
||||
controls_needs_review: int = 0
|
||||
controls_too_close: int = 0
|
||||
controls_duplicates_found: int = 0
|
||||
chunks_skipped_prefilter: int = 0
|
||||
errors: list = field(default_factory=list)
|
||||
controls: list = field(default_factory=list)
|
||||
|
||||
@@ -310,14 +395,68 @@ class GeneratorResult:
|
||||
async def _llm_chat(prompt: str, system_prompt: Optional[str] = None) -> str:
|
||||
"""Call LLM — Anthropic Claude (primary) or Ollama (fallback)."""
|
||||
if ANTHROPIC_API_KEY:
|
||||
logger.info("Calling Anthropic API (model=%s)...", ANTHROPIC_MODEL)
|
||||
result = await _llm_anthropic(prompt, system_prompt)
|
||||
if result:
|
||||
logger.info("Anthropic API success (%d chars)", len(result))
|
||||
return result
|
||||
logger.warning("Anthropic failed, falling back to Ollama")
|
||||
|
||||
logger.info("Calling Ollama (model=%s)...", OLLAMA_MODEL)
|
||||
return await _llm_ollama(prompt, system_prompt)
|
||||
|
||||
|
||||
async def _llm_local(prompt: str, system_prompt: Optional[str] = None) -> str:
|
||||
"""Call local Ollama LLM only (for pre-filtering and classification tasks)."""
|
||||
return await _llm_ollama(prompt, system_prompt)
|
||||
|
||||
|
||||
PREFILTER_SYSTEM_PROMPT = """Du bist ein Compliance-Analyst. Deine Aufgabe: Prüfe ob ein Textabschnitt eine konkrete Sicherheitsanforderung, Datenschutzpflicht, oder technische/organisatorische Maßnahme enthält.
|
||||
|
||||
Antworte NUR mit einem JSON-Objekt: {"relevant": true/false, "reason": "kurze Begründung"}
|
||||
|
||||
Relevant = true wenn der Text mindestens EINE der folgenden enthält:
|
||||
- Konkrete Pflicht/Anforderung ("muss", "soll", "ist sicherzustellen")
|
||||
- Technische Sicherheitsmaßnahme (Verschlüsselung, Zugriffskontrolle, Logging)
|
||||
- Organisatorische Maßnahme (Schulung, Dokumentation, Audit)
|
||||
- Datenschutz-Vorgabe (Löschpflicht, Einwilligung, Zweckbindung)
|
||||
- Risikomanagement-Anforderung
|
||||
|
||||
Relevant = false wenn der Text NUR enthält:
|
||||
- Definitionen ohne Pflichten
|
||||
- Inhaltsverzeichnisse oder Verweise
|
||||
- Reine Begriffsbestimmungen
|
||||
- Übergangsvorschriften ohne Substanz
|
||||
- Adressaten/Geltungsbereich ohne Anforderung"""
|
||||
|
||||
|
||||
async def _prefilter_chunk(chunk_text: str) -> tuple[bool, str]:
|
||||
"""Use local LLM to check if a chunk contains an actionable requirement.
|
||||
|
||||
Returns (is_relevant, reason).
|
||||
Much cheaper than sending every chunk to Anthropic.
|
||||
"""
|
||||
prompt = f"""Prüfe ob dieser Textabschnitt eine konkrete Sicherheitsanforderung oder Compliance-Pflicht enthält.
|
||||
|
||||
Text:
|
||||
---
|
||||
{chunk_text[:1500]}
|
||||
---
|
||||
|
||||
Antworte NUR mit JSON: {{"relevant": true/false, "reason": "kurze Begründung"}}"""
|
||||
|
||||
try:
|
||||
raw = await _llm_local(prompt, PREFILTER_SYSTEM_PROMPT)
|
||||
data = _parse_llm_json(raw)
|
||||
if data:
|
||||
return data.get("relevant", True), data.get("reason", "")
|
||||
# If parsing fails, assume relevant (don't skip)
|
||||
return True, "parse_failed"
|
||||
except Exception as e:
|
||||
logger.warning("Prefilter failed: %s — treating as relevant", e)
|
||||
return True, f"error: {e}"
|
||||
|
||||
|
||||
async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> str:
|
||||
"""Call Anthropic Messages API."""
|
||||
headers = {
|
||||
@@ -364,6 +503,8 @@ async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
|
||||
"model": OLLAMA_MODEL,
|
||||
"messages": messages,
|
||||
"stream": False,
|
||||
"options": {"num_predict": 512}, # Limit response length for speed
|
||||
"think": False, # Disable thinking for faster responses
|
||||
}
|
||||
|
||||
try:
|
||||
@@ -397,6 +538,26 @@ async def _get_embedding(text: str) -> list[float]:
|
||||
return []
|
||||
|
||||
|
||||
async def _get_embeddings_batch(texts: list[str], batch_size: int = 32) -> list[list[float]]:
|
||||
"""Get embedding vectors for multiple texts in batches."""
|
||||
all_embeddings: list[list[float]] = []
|
||||
for i in range(0, len(texts), batch_size):
|
||||
batch = texts[i:i + batch_size]
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
resp = await client.post(
|
||||
f"{EMBEDDING_URL}/embed",
|
||||
json={"texts": batch},
|
||||
)
|
||||
resp.raise_for_status()
|
||||
embeddings = resp.json().get("embeddings", [])
|
||||
all_embeddings.extend(embeddings)
|
||||
except Exception as e:
|
||||
logger.warning("Batch embedding failed for %d texts: %s", len(batch), e)
|
||||
all_embeddings.extend([[] for _ in batch])
|
||||
return all_embeddings
|
||||
|
||||
|
||||
def _cosine_sim(a: list[float], b: list[float]) -> float:
|
||||
"""Compute cosine similarity between two vectors."""
|
||||
if not a or not b or len(a) != len(b):
|
||||
@@ -464,50 +625,96 @@ class ControlGeneratorPipeline:
|
||||
# ── Stage 1: RAG Scan ──────────────────────────────────────────────
|
||||
|
||||
async def _scan_rag(self, config: GeneratorConfig) -> list[RAGSearchResult]:
|
||||
"""Load unprocessed chunks from RAG collections."""
|
||||
"""Scroll through ALL chunks in RAG collections.
|
||||
|
||||
Uses the scroll endpoint to iterate over every chunk (not just top-K search).
|
||||
Filters out already-processed chunks by hash.
|
||||
"""
|
||||
collections = config.collections or ALL_COLLECTIONS
|
||||
all_results: list[RAGSearchResult] = []
|
||||
|
||||
queries = [
|
||||
"security requirement control measure",
|
||||
"Sicherheitsanforderung Maßnahme Prüfaspekt",
|
||||
"compliance requirement audit criterion",
|
||||
"data protection privacy obligation",
|
||||
"access control authentication authorization",
|
||||
]
|
||||
# Pre-load all processed hashes for fast filtering
|
||||
processed_hashes: set[str] = set()
|
||||
if config.skip_processed:
|
||||
try:
|
||||
result = self.db.execute(
|
||||
text("SELECT chunk_hash FROM canonical_processed_chunks")
|
||||
)
|
||||
processed_hashes = {row[0] for row in result}
|
||||
logger.info("Loaded %d processed chunk hashes", len(processed_hashes))
|
||||
except Exception as e:
|
||||
logger.warning("Error loading processed hashes: %s", e)
|
||||
|
||||
if config.domain:
|
||||
domain_kw = DOMAIN_KEYWORDS.get(config.domain, [])
|
||||
if domain_kw:
|
||||
queries.append(" ".join(domain_kw[:5]))
|
||||
seen_hashes: set[str] = set()
|
||||
|
||||
for collection in collections:
|
||||
for query in queries:
|
||||
results = await self.rag.search(
|
||||
query=query,
|
||||
offset = None
|
||||
page = 0
|
||||
collection_total = 0
|
||||
collection_new = 0
|
||||
seen_offsets: set[str] = set() # Detect scroll loops
|
||||
|
||||
while True:
|
||||
chunks, next_offset = await self.rag.scroll(
|
||||
collection=collection,
|
||||
top_k=20,
|
||||
offset=offset,
|
||||
limit=200,
|
||||
)
|
||||
all_results.extend(results)
|
||||
|
||||
# Deduplicate by text hash
|
||||
seen_hashes: set[str] = set()
|
||||
unique: list[RAGSearchResult] = []
|
||||
for r in all_results:
|
||||
h = hashlib.sha256(r.text.encode()).hexdigest()
|
||||
if h not in seen_hashes:
|
||||
seen_hashes.add(h)
|
||||
unique.append(r)
|
||||
if not chunks:
|
||||
break
|
||||
|
||||
# Filter out already-processed chunks
|
||||
if config.skip_processed and unique:
|
||||
hashes = [hashlib.sha256(r.text.encode()).hexdigest() for r in unique]
|
||||
processed = self._get_processed_hashes(hashes)
|
||||
unique = [r for r, h in zip(unique, hashes) if h not in processed]
|
||||
collection_total += len(chunks)
|
||||
|
||||
logger.info("RAG scan: %d unique chunks (%d after filtering processed)",
|
||||
len(seen_hashes), len(unique))
|
||||
return unique[:config.max_controls * 3] # Over-fetch to account for duplicates
|
||||
for chunk in chunks:
|
||||
if not chunk.text or len(chunk.text.strip()) < 50:
|
||||
continue # Skip empty/tiny chunks
|
||||
|
||||
h = hashlib.sha256(chunk.text.encode()).hexdigest()
|
||||
|
||||
# Skip duplicates (same text in multiple collections)
|
||||
if h in seen_hashes:
|
||||
continue
|
||||
seen_hashes.add(h)
|
||||
|
||||
# Skip already-processed
|
||||
if h in processed_hashes:
|
||||
continue
|
||||
|
||||
all_results.append(chunk)
|
||||
collection_new += 1
|
||||
|
||||
page += 1
|
||||
if page % 50 == 0:
|
||||
logger.info(
|
||||
"Scrolling %s: page %d, %d total chunks, %d new unprocessed",
|
||||
collection, page, collection_total, collection_new,
|
||||
)
|
||||
|
||||
# Stop conditions
|
||||
if not next_offset:
|
||||
break
|
||||
# Detect infinite scroll loops (Qdrant mixed ID types)
|
||||
if next_offset in seen_offsets:
|
||||
logger.warning(
|
||||
"Scroll loop detected in %s at offset %s (page %d) — stopping",
|
||||
collection, next_offset, page,
|
||||
)
|
||||
break
|
||||
seen_offsets.add(next_offset)
|
||||
|
||||
offset = next_offset
|
||||
|
||||
logger.info(
|
||||
"Collection %s: %d total chunks scrolled, %d new unprocessed",
|
||||
collection, collection_total, collection_new,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"RAG scroll complete: %d total unique seen, %d new unprocessed to process",
|
||||
len(seen_hashes), len(all_results),
|
||||
)
|
||||
return all_results
|
||||
|
||||
def _get_processed_hashes(self, hashes: list[str]) -> set[str]:
|
||||
"""Check which chunk hashes are already processed."""
|
||||
@@ -568,6 +775,8 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
|
||||
"url": chunk.source_url or "",
|
||||
}
|
||||
control.customer_visible = True
|
||||
control.verification_method = _detect_verification_method(chunk.text)
|
||||
control.category = _detect_category(chunk.text)
|
||||
control.generation_metadata = {
|
||||
"processing_path": "structured",
|
||||
"license_rule": 1,
|
||||
@@ -617,6 +826,8 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
|
||||
"url": chunk.source_url or "",
|
||||
}
|
||||
control.customer_visible = True
|
||||
control.verification_method = _detect_verification_method(chunk.text)
|
||||
control.category = _detect_category(chunk.text)
|
||||
control.generation_metadata = {
|
||||
"processing_path": "structured",
|
||||
"license_rule": 2,
|
||||
@@ -661,6 +872,8 @@ Gib JSON zurück mit diesen Feldern:
|
||||
control.source_original_text = None # NEVER store original
|
||||
control.source_citation = None # NEVER cite source
|
||||
control.customer_visible = False # Only our formulation
|
||||
control.verification_method = _detect_verification_method(chunk.text)
|
||||
control.category = _detect_category(chunk.text)
|
||||
# generation_metadata: NO source names, NO original texts
|
||||
control.generation_metadata = {
|
||||
"processing_path": "llm_reform",
|
||||
@@ -676,6 +889,10 @@ Gib JSON zurück mit diesen Feldern:
|
||||
if not existing:
|
||||
return None
|
||||
|
||||
# Pre-load all existing embeddings in batch (once per pipeline run)
|
||||
if not self._existing_embeddings:
|
||||
await self._preload_embeddings(existing)
|
||||
|
||||
new_text = f"{new_control.title} {new_control.objective}"
|
||||
new_emb = await _get_embedding(new_text)
|
||||
if not new_emb:
|
||||
@@ -684,14 +901,7 @@ Gib JSON zurück mit diesen Feldern:
|
||||
similar = []
|
||||
for ex in existing:
|
||||
ex_key = ex.get("control_id", "")
|
||||
ex_text = f"{ex.get('title', '')} {ex.get('objective', '')}"
|
||||
|
||||
# Get or compute embedding for existing control
|
||||
if ex_key not in self._existing_embeddings:
|
||||
emb = await _get_embedding(ex_text)
|
||||
self._existing_embeddings[ex_key] = emb
|
||||
ex_emb = self._existing_embeddings.get(ex_key, [])
|
||||
|
||||
if not ex_emb:
|
||||
continue
|
||||
|
||||
@@ -705,6 +915,20 @@ Gib JSON zurück mit diesen Feldern:
|
||||
|
||||
return similar if similar else None
|
||||
|
||||
async def _preload_embeddings(self, existing: list[dict]):
|
||||
"""Pre-load embeddings for all existing controls in batches."""
|
||||
texts = [f"{ex.get('title', '')} {ex.get('objective', '')}" for ex in existing]
|
||||
keys = [ex.get("control_id", "") for ex in existing]
|
||||
|
||||
logger.info("Pre-loading embeddings for %d existing controls...", len(texts))
|
||||
embeddings = await _get_embeddings_batch(texts)
|
||||
|
||||
for key, emb in zip(keys, embeddings):
|
||||
self._existing_embeddings[key] = emb
|
||||
|
||||
loaded = sum(1 for emb in embeddings if emb)
|
||||
logger.info("Pre-loaded %d/%d embeddings", loaded, len(texts))
|
||||
|
||||
def _load_existing_controls(self) -> list[dict]:
|
||||
"""Load existing controls from DB (cached per pipeline run)."""
|
||||
if self._existing_controls is not None:
|
||||
@@ -799,10 +1023,11 @@ Gib JSON zurück mit diesen Feldern:
|
||||
return str(uuid.uuid4())
|
||||
|
||||
def _update_job(self, job_id: str, result: GeneratorResult):
|
||||
"""Update job with final stats."""
|
||||
"""Update job with current stats. Sets completed_at only when status is final."""
|
||||
is_final = result.status in ("completed", "failed")
|
||||
try:
|
||||
self.db.execute(
|
||||
text("""
|
||||
text(f"""
|
||||
UPDATE canonical_generation_jobs
|
||||
SET status = :status,
|
||||
total_chunks_scanned = :scanned,
|
||||
@@ -811,8 +1036,8 @@ Gib JSON zurück mit diesen Feldern:
|
||||
controls_needs_review = :needs_review,
|
||||
controls_too_close = :too_close,
|
||||
controls_duplicates_found = :duplicates,
|
||||
errors = :errors,
|
||||
completed_at = NOW()
|
||||
errors = :errors
|
||||
{"" if not is_final else ", completed_at = NOW()"}
|
||||
WHERE id = CAST(:job_id AS uuid)
|
||||
"""),
|
||||
{
|
||||
@@ -857,14 +1082,16 @@ Gib JSON zurück mit diesen Feldern:
|
||||
severity, risk_score, implementation_effort,
|
||||
open_anchors, release_state, tags,
|
||||
license_rule, source_original_text, source_citation,
|
||||
customer_visible, generation_metadata
|
||||
customer_visible, generation_metadata,
|
||||
verification_method, category
|
||||
) VALUES (
|
||||
:framework_id, :control_id, :title, :objective, :rationale,
|
||||
:scope, :requirements, :test_procedure, :evidence,
|
||||
:severity, :risk_score, :implementation_effort,
|
||||
:open_anchors, :release_state, :tags,
|
||||
:license_rule, :source_original_text, :source_citation,
|
||||
:customer_visible, :generation_metadata
|
||||
:customer_visible, :generation_metadata,
|
||||
:verification_method, :category
|
||||
)
|
||||
ON CONFLICT (framework_id, control_id) DO NOTHING
|
||||
RETURNING id
|
||||
@@ -890,6 +1117,8 @@ Gib JSON zurück mit diesen Feldern:
|
||||
"source_citation": json.dumps(control.source_citation) if control.source_citation else None,
|
||||
"customer_visible": control.customer_visible,
|
||||
"generation_metadata": json.dumps(control.generation_metadata) if control.generation_metadata else None,
|
||||
"verification_method": control.verification_method,
|
||||
"category": control.category,
|
||||
},
|
||||
)
|
||||
self.db.commit()
|
||||
@@ -926,7 +1155,7 @@ Gib JSON zurück mit diesen Feldern:
|
||||
"""),
|
||||
{
|
||||
"hash": chunk_hash,
|
||||
"collection": "bp_compliance_ce", # Default, we don't track collection per result
|
||||
"collection": chunk.collection or "bp_compliance_ce",
|
||||
"regulation_code": chunk.regulation_code,
|
||||
"doc_version": "1.0",
|
||||
"license": license_info.get("license", ""),
|
||||
@@ -946,8 +1175,11 @@ Gib JSON zurück mit diesen Feldern:
|
||||
"""Execute the full 7-stage pipeline."""
|
||||
result = GeneratorResult()
|
||||
|
||||
# Create job
|
||||
job_id = self._create_job(config)
|
||||
# Create or reuse job
|
||||
if config.existing_job_id:
|
||||
job_id = config.existing_job_id
|
||||
else:
|
||||
job_id = self._create_job(config)
|
||||
result.job_id = job_id
|
||||
|
||||
try:
|
||||
@@ -962,13 +1194,37 @@ Gib JSON zurück mit diesen Feldern:
|
||||
|
||||
# Process chunks
|
||||
controls_count = 0
|
||||
for chunk in chunks:
|
||||
if controls_count >= config.max_controls:
|
||||
break
|
||||
|
||||
chunks_skipped_prefilter = 0
|
||||
for i, chunk in enumerate(chunks):
|
||||
try:
|
||||
# Progress logging every 50 chunks
|
||||
if i > 0 and i % 50 == 0:
|
||||
logger.info(
|
||||
"Progress: %d/%d chunks processed, %d controls generated, %d skipped by prefilter",
|
||||
i, len(chunks), controls_count, chunks_skipped_prefilter,
|
||||
)
|
||||
self._update_job(job_id, result)
|
||||
|
||||
# Stage 1.5: Local LLM pre-filter — skip chunks without requirements
|
||||
if not config.dry_run:
|
||||
is_relevant, prefilter_reason = await _prefilter_chunk(chunk.text)
|
||||
if not is_relevant:
|
||||
chunks_skipped_prefilter += 1
|
||||
# Mark as processed so we don't re-check next time
|
||||
license_info = self._classify_license(chunk)
|
||||
self._mark_chunk_processed(
|
||||
chunk, license_info, "prefilter_skip", [], job_id
|
||||
)
|
||||
continue
|
||||
|
||||
control = await self._process_single_chunk(chunk, config, job_id)
|
||||
if control is None:
|
||||
# No control generated — still mark as processed
|
||||
if not config.dry_run:
|
||||
license_info = self._classify_license(chunk)
|
||||
self._mark_chunk_processed(
|
||||
chunk, license_info, "no_control", [], job_id
|
||||
)
|
||||
continue
|
||||
|
||||
# Count by state
|
||||
@@ -989,6 +1245,12 @@ Gib JSON zurück mit diesen Feldern:
|
||||
license_info = self._classify_license(chunk)
|
||||
path = "llm_reform" if license_info["rule"] == 3 else "structured"
|
||||
self._mark_chunk_processed(chunk, license_info, path, [ctrl_uuid], job_id)
|
||||
else:
|
||||
# Store failed — still mark as processed
|
||||
license_info = self._classify_license(chunk)
|
||||
self._mark_chunk_processed(
|
||||
chunk, license_info, "store_failed", [], job_id
|
||||
)
|
||||
|
||||
result.controls_generated += 1
|
||||
result.controls.append(asdict(control))
|
||||
@@ -1006,6 +1268,21 @@ Gib JSON zurück mit diesen Feldern:
|
||||
error_msg = f"Error processing chunk {chunk.regulation_code}/{chunk.article}: {e}"
|
||||
logger.error(error_msg)
|
||||
result.errors.append(error_msg)
|
||||
# Mark failed chunks as processed too (so we don't retry endlessly)
|
||||
try:
|
||||
if not config.dry_run:
|
||||
license_info = self._classify_license(chunk)
|
||||
self._mark_chunk_processed(
|
||||
chunk, license_info, "error", [], job_id
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
result.chunks_skipped_prefilter = chunks_skipped_prefilter
|
||||
logger.info(
|
||||
"Pipeline complete: %d controls generated, %d chunks skipped by prefilter, %d total chunks",
|
||||
controls_count, chunks_skipped_prefilter, len(chunks),
|
||||
)
|
||||
|
||||
result.status = "completed"
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ class RAGSearchResult:
|
||||
paragraph: str
|
||||
source_url: str
|
||||
score: float
|
||||
collection: str = ""
|
||||
|
||||
|
||||
class ComplianceRAGClient:
|
||||
@@ -91,6 +92,7 @@ class ComplianceRAGClient:
|
||||
paragraph=r.get("paragraph", ""),
|
||||
source_url=r.get("source_url", ""),
|
||||
score=r.get("score", 0.0),
|
||||
collection=collection,
|
||||
))
|
||||
return results
|
||||
|
||||
@@ -98,6 +100,54 @@ class ComplianceRAGClient:
|
||||
logger.warning("RAG search failed: %s", e)
|
||||
return []
|
||||
|
||||
async def scroll(
|
||||
self,
|
||||
collection: str,
|
||||
offset: Optional[str] = None,
|
||||
limit: int = 100,
|
||||
) -> tuple[List[RAGSearchResult], Optional[str]]:
|
||||
"""
|
||||
Scroll through ALL chunks in a collection (paginated).
|
||||
|
||||
Returns (chunks, next_offset). next_offset is None when done.
|
||||
"""
|
||||
scroll_url = self._search_url.replace("/search", "/scroll")
|
||||
params = {"collection": collection, "limit": str(limit)}
|
||||
if offset:
|
||||
params["offset"] = offset
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
resp = await client.get(scroll_url, params=params)
|
||||
|
||||
if resp.status_code != 200:
|
||||
logger.warning(
|
||||
"RAG scroll returned %d: %s", resp.status_code, resp.text[:200]
|
||||
)
|
||||
return [], None
|
||||
|
||||
data = resp.json()
|
||||
results = []
|
||||
for r in data.get("chunks", []):
|
||||
results.append(RAGSearchResult(
|
||||
text=r.get("text", ""),
|
||||
regulation_code=r.get("regulation_code", ""),
|
||||
regulation_name=r.get("regulation_name", ""),
|
||||
regulation_short=r.get("regulation_short", ""),
|
||||
category=r.get("category", ""),
|
||||
article=r.get("article", ""),
|
||||
paragraph=r.get("paragraph", ""),
|
||||
source_url=r.get("source_url", ""),
|
||||
score=0.0,
|
||||
collection=collection,
|
||||
))
|
||||
next_offset = data.get("next_offset") or None
|
||||
return results, next_offset
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("RAG scroll failed: %s", e)
|
||||
return [], None
|
||||
|
||||
def format_for_prompt(
|
||||
self, results: List[RAGSearchResult], max_results: int = 5
|
||||
) -> str:
|
||||
|
||||
Reference in New Issue
Block a user