Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 35s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
- Add Anthropic API support to decomposition Pass 0a/0b (prompt caching, content batching)
- Add Anthropic Batch API (50% cost reduction, async 24h processing)
- Add source_filter (ILIKE on source_citation) for regulation-based filtering
- Add category_filter to Pass 0a for selective decomposition
- Add regulation_filter to control_generator for RAG scan phase filtering (prefix match on regulation_code — enables CE + Code Review focus)
- New API endpoints: batch-submit-0a, batch-submit-0b, batch-status, batch-process
- 83 new tests (all passing)

Cost reduction: $2,525 → ~$600-700 with all optimizations combined.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
979 lines
36 KiB
Python
"""
|
|
FastAPI routes for the Control Generator Pipeline.
|
|
|
|
Endpoints:
|
|
POST /v1/canonical/generate — Start generation run
|
|
GET /v1/canonical/generate/status/{job_id} — Job status
|
|
GET /v1/canonical/generate/jobs — All jobs
|
|
GET /v1/canonical/generate/review-queue — Controls needing review
|
|
POST /v1/canonical/generate/review/{control_id} — Complete review
|
|
GET /v1/canonical/generate/processed-stats — Processing stats per collection
|
|
GET /v1/canonical/blocked-sources — Blocked sources list
|
|
POST /v1/canonical/blocked-sources/cleanup — Start cleanup workflow
|
|
"""

import asyncio
import json
import logging
from typing import Optional, List

from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text

from database import SessionLocal
from compliance.services.control_generator import (
    ControlGeneratorPipeline,
    GeneratorConfig,
    ALL_COLLECTIONS,
    VALID_CATEGORIES,
    VALID_DOMAINS,
    _detect_category,
    _detect_domain,
    _llm_local,
    _parse_llm_json,
    CATEGORY_LIST_STR,
)
from compliance.services.citation_backfill import CitationBackfill, BackfillResult
from compliance.services.rag_client import get_rag_client

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/v1/canonical", tags=["control-generator"])


# =============================================================================
# REQUEST / RESPONSE MODELS
# =============================================================================

class GenerateRequest(BaseModel):
    domain: Optional[str] = None
    collections: Optional[List[str]] = None
    max_controls: int = 50
    max_chunks: int = 1000  # Default: process max 1000 chunks per job (respects document boundaries)
    batch_size: int = 5
    skip_web_search: bool = False
    dry_run: bool = False
    regulation_filter: Optional[List[str]] = None  # Only process these regulation_code prefixes


class GenerateResponse(BaseModel):
    job_id: str
    status: str
    message: str
    total_chunks_scanned: int = 0
    controls_generated: int = 0
    controls_verified: int = 0
    controls_needs_review: int = 0
    controls_too_close: int = 0
    controls_duplicates_found: int = 0
    controls_qa_fixed: int = 0
    errors: list = []
    controls: list = []


class ReviewRequest(BaseModel):
    action: str  # "approve", "reject", "needs_rework"
    release_state: Optional[str] = None  # Override release_state
    notes: Optional[str] = None


class ProcessedStats(BaseModel):
    collection: str
    total_chunks_estimated: int
    processed_chunks: int
    pending_chunks: int
    direct_adopted: int
    llm_reformed: int
    skipped: int


class BlockedSourceResponse(BaseModel):
    id: str
    regulation_code: str
    document_title: str
    reason: str
    deletion_status: str
    qdrant_collection: Optional[str] = None
    marked_at: str


# =============================================================================
# ENDPOINTS
# =============================================================================

async def _run_pipeline_background(config: GeneratorConfig, job_id: str):
    """Run the pipeline in the background. Uses its own DB session."""
    db = SessionLocal()
    try:
        config.existing_job_id = job_id
        pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
        result = await pipeline.run(config)
        logger.info(
            "Background generation job %s completed: %d controls from %d chunks",
            job_id, result.controls_generated, result.total_chunks_scanned,
        )
    except Exception as e:
        logger.error("Background generation job %s failed: %s", job_id, e)
        # Update job as failed
        try:
            db.execute(
                text("""
                    UPDATE canonical_generation_jobs
                    SET status = 'failed', errors = :errors, completed_at = NOW()
                    WHERE id = CAST(:job_id AS uuid)
                """),
                {"job_id": job_id, "errors": json.dumps([str(e)])},
            )
            db.commit()
        except Exception:
            pass
    finally:
        db.close()


@router.post("/generate", response_model=GenerateResponse)
|
|
async def start_generation(req: GenerateRequest):
|
|
"""Start a control generation run (runs in background).
|
|
|
|
Returns immediately with job_id. Use GET /generate/status/{job_id} to poll progress.
|
|
"""
|
|
config = GeneratorConfig(
|
|
collections=req.collections,
|
|
domain=req.domain,
|
|
batch_size=req.batch_size,
|
|
max_controls=req.max_controls,
|
|
max_chunks=req.max_chunks,
|
|
skip_web_search=req.skip_web_search,
|
|
dry_run=req.dry_run,
|
|
regulation_filter=req.regulation_filter,
|
|
)
|
|
|
|
if req.dry_run:
|
|
# Dry run: execute synchronously and return controls
|
|
db = SessionLocal()
|
|
try:
|
|
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
|
|
result = await pipeline.run(config)
|
|
return GenerateResponse(
|
|
job_id=result.job_id,
|
|
status=result.status,
|
|
message=f"Dry run: {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
|
|
total_chunks_scanned=result.total_chunks_scanned,
|
|
controls_generated=result.controls_generated,
|
|
controls_verified=result.controls_verified,
|
|
controls_needs_review=result.controls_needs_review,
|
|
controls_too_close=result.controls_too_close,
|
|
controls_duplicates_found=result.controls_duplicates_found,
|
|
errors=result.errors,
|
|
controls=result.controls,
|
|
)
|
|
except Exception as e:
|
|
logger.error("Dry run failed: %s", e)
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
finally:
|
|
db.close()
|
|
|
|
# Create job record first so we can return the ID
|
|
db = SessionLocal()
|
|
try:
|
|
result = db.execute(
|
|
text("""
|
|
INSERT INTO canonical_generation_jobs (status, config)
|
|
VALUES ('running', :config)
|
|
RETURNING id
|
|
"""),
|
|
{"config": json.dumps(config.model_dump())},
|
|
)
|
|
db.commit()
|
|
row = result.fetchone()
|
|
job_id = str(row[0]) if row else None
|
|
except Exception as e:
|
|
logger.error("Failed to create job: %s", e)
|
|
raise HTTPException(status_code=500, detail=f"Failed to create job: {e}")
|
|
finally:
|
|
db.close()
|
|
|
|
if not job_id:
|
|
raise HTTPException(status_code=500, detail="Failed to create job record")
|
|
|
|
# Launch pipeline in background
|
|
asyncio.create_task(_run_pipeline_background(config, job_id))
|
|
|
|
return GenerateResponse(
|
|
job_id=job_id,
|
|
status="running",
|
|
message="Generation started in background. Poll /generate/status/{job_id} for progress.",
|
|
)
|
|
|
|
|
|
@router.get("/generate/status/{job_id}")
|
|
async def get_job_status(job_id: str):
|
|
"""Get status of a generation job."""
|
|
db = SessionLocal()
|
|
try:
|
|
result = db.execute(
|
|
text("SELECT * FROM canonical_generation_jobs WHERE id = CAST(:id AS uuid)"),
|
|
{"id": job_id},
|
|
)
|
|
row = result.fetchone()
|
|
if not row:
|
|
raise HTTPException(status_code=404, detail="Job not found")
|
|
|
|
cols = result.keys()
|
|
job = dict(zip(cols, row))
|
|
# Serialize datetime fields
|
|
for key in ("started_at", "completed_at", "created_at"):
|
|
if job.get(key):
|
|
job[key] = str(job[key])
|
|
job["id"] = str(job["id"])
|
|
return job
|
|
finally:
|
|
db.close()
|
|
|
|
|
|
@router.get("/generate/jobs")
|
|
async def list_jobs(
|
|
limit: int = Query(20, ge=1, le=100),
|
|
offset: int = Query(0, ge=0),
|
|
):
|
|
"""List all generation jobs."""
|
|
db = SessionLocal()
|
|
try:
|
|
result = db.execute(
|
|
text("""
|
|
SELECT id, status, total_chunks_scanned, controls_generated,
|
|
controls_verified, controls_needs_review, controls_too_close,
|
|
controls_duplicates_found, created_at, completed_at
|
|
FROM canonical_generation_jobs
|
|
ORDER BY created_at DESC
|
|
LIMIT :limit OFFSET :offset
|
|
"""),
|
|
{"limit": limit, "offset": offset},
|
|
)
|
|
jobs = []
|
|
cols = result.keys()
|
|
for row in result:
|
|
job = dict(zip(cols, row))
|
|
job["id"] = str(job["id"])
|
|
for key in ("created_at", "completed_at"):
|
|
if job.get(key):
|
|
job[key] = str(job[key])
|
|
jobs.append(job)
|
|
return {"jobs": jobs, "total": len(jobs)}
|
|
finally:
|
|
db.close()
|
|
|
|
|
|
@router.get("/generate/review-queue")
|
|
async def get_review_queue(
|
|
release_state: str = Query("needs_review", regex="^(needs_review|too_close|duplicate)$"),
|
|
limit: int = Query(50, ge=1, le=200),
|
|
):
|
|
"""Get controls that need manual review."""
|
|
db = SessionLocal()
|
|
try:
|
|
result = db.execute(
|
|
text("""
|
|
SELECT c.id, c.control_id, c.title, c.objective, c.severity,
|
|
c.release_state, c.license_rule, c.customer_visible,
|
|
c.generation_metadata, c.open_anchors, c.tags,
|
|
c.created_at
|
|
FROM canonical_controls c
|
|
WHERE c.release_state = :state
|
|
ORDER BY c.created_at DESC
|
|
LIMIT :limit
|
|
"""),
|
|
{"state": release_state, "limit": limit},
|
|
)
|
|
controls = []
|
|
cols = result.keys()
|
|
for row in result:
|
|
ctrl = dict(zip(cols, row))
|
|
ctrl["id"] = str(ctrl["id"])
|
|
ctrl["created_at"] = str(ctrl["created_at"])
|
|
# Parse JSON fields
|
|
for jf in ("generation_metadata", "open_anchors", "tags"):
|
|
if isinstance(ctrl.get(jf), str):
|
|
try:
|
|
ctrl[jf] = json.loads(ctrl[jf])
|
|
except (json.JSONDecodeError, TypeError):
|
|
pass
|
|
controls.append(ctrl)
|
|
return {"controls": controls, "total": len(controls)}
|
|
finally:
|
|
db.close()
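

# Illustrative query (not executed; values are placeholders, and release_state
# must match the regex on the query parameter above):
#   GET /v1/canonical/generate/review-queue?release_state=too_close&limit=20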


@router.post("/generate/review/{control_id}")
async def review_control(control_id: str, req: ReviewRequest):
    """Complete review of a generated control."""
    db = SessionLocal()
    try:
        # Validate control exists and is in reviewable state
        result = db.execute(
            text("SELECT id, release_state FROM canonical_controls WHERE control_id = :cid"),
            {"cid": control_id},
        )
        row = result.fetchone()
        if not row:
            raise HTTPException(status_code=404, detail="Control not found")

        current_state = row[1]
        if current_state not in ("needs_review", "too_close", "duplicate"):
            raise HTTPException(status_code=400, detail=f"Control is in state '{current_state}', not reviewable")

        # Determine new state
        if req.action == "approve":
            new_state = req.release_state or "draft"
        elif req.action == "reject":
            new_state = "deprecated"
        elif req.action == "needs_rework":
            new_state = "needs_review"
        else:
            raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")

        if new_state not in ("draft", "review", "approved", "deprecated", "needs_review", "too_close", "duplicate"):
            raise HTTPException(status_code=400, detail=f"Invalid release_state: {new_state}")

        db.execute(
            text("""
                UPDATE canonical_controls
                SET release_state = :state, updated_at = NOW()
                WHERE control_id = :cid
            """),
            {"state": new_state, "cid": control_id},
        )
        db.commit()

        return {"control_id": control_id, "release_state": new_state, "action": req.action}
    finally:
        db.close()


class BulkReviewRequest(BaseModel):
    release_state: str  # Filter: which controls to bulk-review
    action: str  # "approve" or "reject"
    new_state: Optional[str] = None  # Override target state


@router.post("/generate/bulk-review")
async def bulk_review(req: BulkReviewRequest):
    """Bulk review all controls matching a release_state filter.

    Example: reject all needs_review → sets them to deprecated.
    """
    if req.release_state not in ("needs_review", "too_close", "duplicate"):
        raise HTTPException(status_code=400, detail=f"Invalid filter state: {req.release_state}")

    if req.action == "approve":
        target = req.new_state or "draft"
    elif req.action == "reject":
        target = "deprecated"
    else:
        raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")

    if target not in ("draft", "review", "approved", "deprecated", "needs_review"):
        raise HTTPException(status_code=400, detail=f"Invalid target state: {target}")

    db = SessionLocal()
    try:
        result = db.execute(
            text("""
                UPDATE canonical_controls
                SET release_state = :target, updated_at = NOW()
                WHERE release_state = :source
                RETURNING control_id
            """),
            {"source": req.release_state, "target": target},
        )
        affected = [row[0] for row in result]
        db.commit()

        return {
            "action": req.action,
            "source_state": req.release_state,
            "target_state": target,
            "affected_count": len(affected),
        }
    finally:
        db.close()
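

# Illustrative payloads (not executed; the control_id and values are
# placeholders drawn from the actions and states validated above):
#   POST /v1/canonical/generate/review/AUTH-001
#        {"action": "approve", "release_state": "approved"}
#   POST /v1/canonical/generate/bulk-review
#        {"release_state": "needs_review", "action": "reject"}   # -> all set to 'deprecated'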


class QAReclassifyRequest(BaseModel):
    limit: int = 100  # How many controls to reclassify per run
    dry_run: bool = True  # Preview only by default
    filter_category: Optional[str] = None  # Only reclassify controls of this category
    filter_domain_prefix: Optional[str] = None  # Only reclassify controls with this prefix


@router.post("/generate/qa-reclassify")
async def qa_reclassify(req: QAReclassifyRequest):
    """Run QA reclassification on existing controls using local LLM.

    Finds controls where keyword-detection disagrees with current category/domain,
    then uses Ollama to determine the correct classification.
    """
    db = SessionLocal()
    try:
        # Load controls to check
        where_clauses = ["release_state NOT IN ('deprecated')"]
        params = {"limit": req.limit}
        if req.filter_category:
            where_clauses.append("category = :cat")
            params["cat"] = req.filter_category
        if req.filter_domain_prefix:
            where_clauses.append("control_id LIKE :prefix")
            params["prefix"] = f"{req.filter_domain_prefix}-%"

        where_sql = " AND ".join(where_clauses)
        rows = db.execute(
            text(f"""
                SELECT id, control_id, title, objective, category,
                       COALESCE(requirements::text, '[]') as requirements,
                       COALESCE(source_original_text, '') as source_text
                FROM canonical_controls
                WHERE {where_sql}
                ORDER BY created_at DESC
                LIMIT :limit
            """),
            params,
        ).fetchall()

        results = {"checked": 0, "mismatches": 0, "fixes": [], "errors": []}

        for row in rows:
            results["checked"] += 1
            control_id = row[1]
            title = row[2]
            objective = row[3] or ""
            current_category = row[4]
            source_text = row[6] or objective

            # Keyword detection on source text
            kw_category = _detect_category(source_text) or _detect_category(objective)
            kw_domain = _detect_domain(source_text)
            current_prefix = control_id.split("-")[0] if "-" in control_id else ""

            # Skip if keyword detection agrees with current classification
            if kw_category == current_category and kw_domain == current_prefix:
                continue

            results["mismatches"] += 1

            # Ask Ollama to arbitrate
            try:
                reqs_text = ""
                try:
                    reqs = json.loads(row[5])
                    if isinstance(reqs, list):
                        reqs_text = ", ".join(str(r) for r in reqs[:3])
                except Exception:
                    pass

                prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.

Titel: {title[:100]}
Ziel: {objective[:200]}
Anforderungen: {reqs_text[:200]}

Aktuelle Zuordnung: domain={current_prefix}, category={current_category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}

Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}

Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""

                raw = await _llm_local(prompt)
                data = _parse_llm_json(raw)
                if not data:
                    continue

                qa_domain = data.get("domain", "").upper()
                qa_category = data.get("category", "")
                reason = data.get("reason", "")

                fix_entry = {
                    "control_id": control_id,
                    "title": title[:80],
                    "old_category": current_category,
                    "old_domain": current_prefix,
                    "new_category": qa_category if qa_category in VALID_CATEGORIES else current_category,
                    "new_domain": qa_domain if qa_domain in VALID_DOMAINS else current_prefix,
                    "reason": reason,
                }

                category_changed = qa_category in VALID_CATEGORIES and qa_category != current_category

                if category_changed and not req.dry_run:
                    db.execute(
                        text("""
                            UPDATE canonical_controls
                            SET category = :category, updated_at = NOW()
                            WHERE id = :id
                        """),
                        {"id": row[0], "category": qa_category},
                    )
                    fix_entry["applied"] = True
                else:
                    fix_entry["applied"] = False

                results["fixes"].append(fix_entry)

            except Exception as e:
                results["errors"].append({"control_id": control_id, "error": str(e)})

        if not req.dry_run:
            db.commit()

        return results
    finally:
        db.close()
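

# Illustrative call (not executed; dry_run defaults to True, so nothing is
# written unless dry_run=false is sent). The response mirrors the `results`
# dict built above:
#   POST /v1/canonical/generate/qa-reclassify
#        {"limit": 50, "dry_run": true}
#   ->   {"checked": ..., "mismatches": ..., "fixes": [...], "errors": [...]}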


@router.get("/generate/processed-stats")
async def get_processed_stats():
    """Get processing statistics per collection."""
    db = SessionLocal()
    try:
        result = db.execute(
            text("""
                SELECT
                    collection,
                    COUNT(*) as processed_chunks,
                    COUNT(*) FILTER (WHERE processing_path = 'structured') as direct_adopted,
                    COUNT(*) FILTER (WHERE processing_path = 'llm_reform') as llm_reformed,
                    COUNT(*) FILTER (WHERE processing_path = 'skipped') as skipped
                FROM canonical_processed_chunks
                GROUP BY collection
                ORDER BY collection
            """)
        )
        stats = []
        cols = result.keys()
        for row in result:
            stat = dict(zip(cols, row))
            stat["total_chunks_estimated"] = 0  # Would need Qdrant API to get total
            stat["pending_chunks"] = 0
            stats.append(stat)
        return {"stats": stats}
    finally:
        db.close()


# =============================================================================
# BLOCKED SOURCES
# =============================================================================

@router.get("/blocked-sources")
async def list_blocked_sources():
    """List all blocked (Rule 3) sources."""
    db = SessionLocal()
    try:
        result = db.execute(
            text("""
                SELECT id, regulation_code, document_title, reason,
                       deletion_status, qdrant_collection, marked_at
                FROM canonical_blocked_sources
                ORDER BY marked_at DESC
            """)
        )
        sources = []
        cols = result.keys()
        for row in result:
            src = dict(zip(cols, row))
            src["id"] = str(src["id"])
            src["marked_at"] = str(src["marked_at"])
            sources.append(src)
        return {"sources": sources}
    finally:
        db.close()


@router.post("/blocked-sources/cleanup")
async def start_cleanup():
    """Start cleanup workflow for blocked sources.

    This marks all pending blocked sources for deletion.
    Actual RAG chunk deletion and file removal is a separate manual step.
    """
    db = SessionLocal()
    try:
        result = db.execute(
            text("""
                UPDATE canonical_blocked_sources
                SET deletion_status = 'marked_for_deletion'
                WHERE deletion_status = 'pending'
                RETURNING regulation_code
            """)
        )
        marked = [row[0] for row in result]
        db.commit()

        return {
            "status": "marked_for_deletion",
            "marked_count": len(marked),
            "regulation_codes": marked,
            "message": "Sources marked for deletion. Run manual cleanup to remove RAG chunks and files.",
        }
    finally:
        db.close()


# =============================================================================
# CUSTOMER VIEW FILTER
# =============================================================================

@router.get("/controls-customer")
async def get_controls_customer_view(
    severity: Optional[str] = Query(None),
    domain: Optional[str] = Query(None),
):
    """Get controls filtered for customer visibility.

    Rule 3 controls have source_citation and source_original_text hidden.
    generation_metadata is NEVER shown to customers.
    """
    db = SessionLocal()
    try:
        query = """
            SELECT c.id, c.control_id, c.title, c.objective, c.rationale,
                   c.scope, c.requirements, c.test_procedure, c.evidence,
                   c.severity, c.risk_score, c.implementation_effort,
                   c.open_anchors, c.release_state, c.tags,
                   c.license_rule, c.customer_visible,
                   c.source_original_text, c.source_citation,
                   c.created_at, c.updated_at
            FROM canonical_controls c
            WHERE c.release_state IN ('draft', 'approved')
        """
        params: dict = {}

        if severity:
            query += " AND c.severity = :severity"
            params["severity"] = severity
        if domain:
            query += " AND c.control_id LIKE :domain"
            params["domain"] = f"{domain.upper()}-%"

        query += " ORDER BY c.control_id"

        result = db.execute(text(query), params)
        controls = []
        cols = result.keys()
        for row in result:
            ctrl = dict(zip(cols, row))
            ctrl["id"] = str(ctrl["id"])
            for key in ("created_at", "updated_at"):
                if ctrl.get(key):
                    ctrl[key] = str(ctrl[key])

            # Parse JSON fields
            for jf in ("scope", "requirements", "test_procedure", "evidence",
                       "open_anchors", "tags", "source_citation"):
                if isinstance(ctrl.get(jf), str):
                    try:
                        ctrl[jf] = json.loads(ctrl[jf])
                    except (json.JSONDecodeError, TypeError):
                        pass

            # Customer visibility rules:
            # - NEVER show generation_metadata
            # - Rule 3: NEVER show source_citation or source_original_text
            ctrl.pop("generation_metadata", None)
            if not ctrl.get("customer_visible", True):
                ctrl["source_citation"] = None
                ctrl["source_original_text"] = None

            controls.append(ctrl)

        return {"controls": controls, "total": len(controls)}
    finally:
        db.close()
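

# Illustrative query (not executed; filter values are placeholders. `domain`
# is matched as a control_id prefix and upper-cased, e.g. auth -> AUTH-%):
#   GET /v1/canonical/controls-customer?severity=high&domain=auth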


# =============================================================================
# CITATION BACKFILL
# =============================================================================

class BackfillRequest(BaseModel):
    dry_run: bool = True  # Default to dry_run for safety
    limit: int = 0  # 0 = all controls


class BackfillResponse(BaseModel):
    status: str
    total_controls: int = 0
    matched_hash: int = 0
    matched_regex: int = 0
    matched_llm: int = 0
    unmatched: int = 0
    updated: int = 0
    errors: list = []


_backfill_status: dict = {}


async def _run_backfill_background(dry_run: bool, limit: int, backfill_id: str):
    """Run backfill in background with own DB session."""
    db = SessionLocal()
    try:
        backfill = CitationBackfill(db=db, rag_client=get_rag_client())
        result = await backfill.run(dry_run=dry_run, limit=limit)
        _backfill_status[backfill_id] = {
            "status": "completed",
            "total_controls": result.total_controls,
            "matched_hash": result.matched_hash,
            "matched_regex": result.matched_regex,
            "matched_llm": result.matched_llm,
            "unmatched": result.unmatched,
            "updated": result.updated,
            "errors": result.errors[:50],
        }
        logger.info("Backfill %s completed: %d updated", backfill_id, result.updated)
    except Exception as e:
        logger.error("Backfill %s failed: %s", backfill_id, e)
        _backfill_status[backfill_id] = {"status": "failed", "errors": [str(e)]}
    finally:
        db.close()


@router.post("/generate/backfill-citations", response_model=BackfillResponse)
async def start_backfill(req: BackfillRequest):
    """Backfill article/paragraph into existing control source_citations.

    Uses 3-tier matching: hash lookup → regex parse → Ollama LLM.
    Default is dry_run=True (preview only, no DB changes).
    """
    import uuid
    backfill_id = str(uuid.uuid4())[:8]
    _backfill_status[backfill_id] = {"status": "running"}

    # Always run in background (RAG index build takes minutes)
    asyncio.create_task(_run_backfill_background(req.dry_run, req.limit, backfill_id))
    return BackfillResponse(
        status=f"running (id={backfill_id})",
    )


@router.get("/generate/backfill-status/{backfill_id}")
async def get_backfill_status(backfill_id: str):
    """Get status of a backfill job."""
    status = _backfill_status.get(backfill_id)
    if not status:
        raise HTTPException(status_code=404, detail="Backfill job not found")
    return status
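

# Illustrative flow (not executed; dry_run defaults to True, so no DB changes
# until dry_run=false is sent):
#   POST /v1/canonical/generate/backfill-citations   {"dry_run": true, "limit": 0}
#   ->   {"status": "running (id=<short-id>)", ...}
#   GET  /v1/canonical/generate/backfill-status/<short-id>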


# =============================================================================
# DOMAIN + TARGET AUDIENCE BACKFILL
# =============================================================================

class DomainBackfillRequest(BaseModel):
    dry_run: bool = True
    job_id: Optional[str] = None  # Only backfill controls from this job
    limit: int = 0  # 0 = all

_domain_backfill_status: dict = {}


async def _run_domain_backfill(req: DomainBackfillRequest, backfill_id: str):
    """Backfill domain, category, and target_audience for existing controls using Anthropic."""
    import os
    import httpx

    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
    ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")

    if not ANTHROPIC_API_KEY:
        _domain_backfill_status[backfill_id] = {
            "status": "failed", "error": "ANTHROPIC_API_KEY not set"
        }
        return

    db = SessionLocal()
    try:
        # Find controls needing backfill
        where_clauses = ["(target_audience IS NULL OR target_audience = '[]' OR target_audience = 'null')"]
        params: dict = {}
        if req.job_id:
            where_clauses.append("generation_metadata->>'job_id' = :job_id")
            params["job_id"] = req.job_id

        query = f"""
            SELECT id, control_id, title, objective, category, source_original_text, tags
            FROM canonical_controls
            WHERE {' AND '.join(where_clauses)}
            ORDER BY control_id
        """
        if req.limit > 0:
            query += f" LIMIT {req.limit}"

        result = db.execute(text(query), params)
        controls = [dict(zip(result.keys(), row)) for row in result]

        total = len(controls)
        updated = 0
        errors = []

        _domain_backfill_status[backfill_id] = {
            "status": "running", "total": total, "updated": 0, "errors": []
        }

        # Process in batches of 10
        BATCH_SIZE = 10
        for batch_start in range(0, total, BATCH_SIZE):
            batch = controls[batch_start:batch_start + BATCH_SIZE]

            entries = []
            for idx, ctrl in enumerate(batch):
                text_for_analysis = ctrl.get("objective") or ctrl.get("title") or ""
                original = ctrl.get("source_original_text") or ""
                if original:
                    text_for_analysis += f"\n\nQuelltext-Auszug: {original[:500]}"
                entries.append(
                    f"--- CONTROL {idx + 1}: {ctrl['control_id']} ---\n"
                    f"Titel: {ctrl.get('title', '')}\n"
                    f"Objective: {text_for_analysis[:800]}\n"
                    f"Tags: {json.dumps(ctrl.get('tags', []))}"
                )

            prompt = f"""Analysiere die folgenden {len(batch)} Controls und bestimme fuer jedes:
1. domain: Das Fachgebiet (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT)
2. category: Die Kategorie (encryption, authentication, network, data_protection, logging, incident, continuity, compliance, supply_chain, physical, personnel, application, system, risk, governance, hardware, identity, public_administration, labor_law, finance, trade_regulation, environmental, health)
3. target_audience: Liste der Zielgruppen (moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")

Antworte mit einem JSON-Array mit {len(batch)} Objekten. Jedes Objekt hat:
- control_index: 1-basierter Index
- domain: Fachgebiet-Kuerzel
- category: Kategorie
- target_audience: Liste der Zielgruppen

{"".join(entries)}"""

            try:
                headers = {
                    "x-api-key": ANTHROPIC_API_KEY,
                    "anthropic-version": "2023-06-01",
                    "content-type": "application/json",
                }
                payload = {
                    "model": ANTHROPIC_MODEL,
                    "max_tokens": 4096,
                    "system": "Du bist ein Compliance-Experte. Klassifiziere Controls nach Fachgebiet und Zielgruppe. Antworte NUR mit validem JSON.",
                    "messages": [{"role": "user", "content": prompt}],
                }

                async with httpx.AsyncClient(timeout=60.0) as client:
                    resp = await client.post(
                        "https://api.anthropic.com/v1/messages",
                        headers=headers,
                        json=payload,
                    )
                if resp.status_code != 200:
                    errors.append(f"Anthropic API {resp.status_code} at batch {batch_start}")
                    continue

                raw = resp.json().get("content", [{}])[0].get("text", "")

                # Parse response
                import re
                bracket_match = re.search(r"\[.*\]", raw, re.DOTALL)
                if not bracket_match:
                    errors.append(f"No JSON array in response at batch {batch_start}")
                    continue

                results_list = json.loads(bracket_match.group(0))

                for item in results_list:
                    idx = item.get("control_index", 0) - 1
                    if idx < 0 or idx >= len(batch):
                        continue
                    ctrl = batch[idx]
                    ctrl_id = str(ctrl["id"])

                    new_domain = item.get("domain", "")
                    new_category = item.get("category", "")
                    new_audience = item.get("target_audience", [])

                    if not isinstance(new_audience, list):
                        new_audience = []

                    # Build new control_id from domain if domain changed
                    old_prefix = ctrl["control_id"].split("-")[0] if ctrl["control_id"] else ""
                    new_prefix = new_domain.upper()[:4] if new_domain else old_prefix

                    if not req.dry_run:
                        update_parts = []
                        update_params: dict = {"ctrl_id": ctrl_id}

                        if new_category:
                            update_parts.append("category = :category")
                            update_params["category"] = new_category

                        if new_audience:
                            update_parts.append("target_audience = :target_audience")
                            update_params["target_audience"] = json.dumps(new_audience)

                        # Note: We do NOT rename control_ids here — that would
                        # break references and cause unique constraint violations.

                        if update_parts:
                            update_parts.append("updated_at = NOW()")
                            db.execute(
                                text(f"UPDATE canonical_controls SET {', '.join(update_parts)} WHERE id = CAST(:ctrl_id AS uuid)"),
                                update_params,
                            )
                            updated += 1

                if not req.dry_run:
                    db.commit()

            except Exception as e:
                errors.append(f"Batch {batch_start}: {str(e)}")
                db.rollback()

            _domain_backfill_status[backfill_id] = {
                "status": "running", "total": total, "updated": updated,
                "progress": f"{min(batch_start + BATCH_SIZE, total)}/{total}",
                "errors": errors[-10:],
            }

        _domain_backfill_status[backfill_id] = {
            "status": "completed", "total": total, "updated": updated,
            "errors": errors[-50:],
        }
        logger.info("Domain backfill %s completed: %d/%d updated", backfill_id, updated, total)

    except Exception as e:
        logger.error("Domain backfill %s failed: %s", backfill_id, e)
        _domain_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
    finally:
        db.close()


@router.post("/generate/backfill-domain")
|
|
async def start_domain_backfill(req: DomainBackfillRequest):
|
|
"""Backfill domain, category, and target_audience for controls using Anthropic API.
|
|
|
|
Finds controls where target_audience is NULL and enriches them.
|
|
Default is dry_run=True (preview only).
|
|
"""
|
|
import uuid
|
|
backfill_id = str(uuid.uuid4())[:8]
|
|
_domain_backfill_status[backfill_id] = {"status": "starting"}
|
|
asyncio.create_task(_run_domain_backfill(req, backfill_id))
|
|
return {"status": "running", "backfill_id": backfill_id,
|
|
"message": f"Domain backfill started. Poll /generate/backfill-status/{backfill_id}"}
|
|
|
|
|
|
@router.get("/generate/domain-backfill-status/{backfill_id}")
|
|
async def get_domain_backfill_status(backfill_id: str):
|
|
"""Get status of a domain backfill job."""
|
|
status = _domain_backfill_status.get(backfill_id)
|
|
if not status:
|
|
raise HTTPException(status_code=404, detail="Domain backfill job not found")
|
|
return status
|
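

# Illustrative flow (not executed; dry_run defaults to True, and the job is
# marked failed immediately if ANTHROPIC_API_KEY is not set):
#   POST /v1/canonical/generate/backfill-domain   {"dry_run": true, "limit": 20}
#   ->   {"status": "running", "backfill_id": "<short-id>", "message": "..."}
#   GET  /v1/canonical/generate/domain-backfill-status/<short-id>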