""" FastAPI routes for the Control Generator Pipeline. Endpoints: POST /v1/canonical/generate — Start generation run GET /v1/canonical/generate/status/{job_id} — Job status GET /v1/canonical/generate/jobs — All jobs GET /v1/canonical/generate/review-queue — Controls needing review POST /v1/canonical/generate/review/{control_id} — Complete review GET /v1/canonical/generate/processed-stats — Processing stats per collection GET /v1/canonical/blocked-sources — Blocked sources list POST /v1/canonical/blocked-sources/cleanup — Start cleanup workflow """ import asyncio import json import logging from typing import Optional, List from fastapi import APIRouter, HTTPException, Query from pydantic import BaseModel from sqlalchemy import text from database import SessionLocal from compliance.services.control_generator import ( ControlGeneratorPipeline, GeneratorConfig, ALL_COLLECTIONS, VALID_CATEGORIES, VALID_DOMAINS, _detect_category, _detect_domain, _llm_local, _parse_llm_json, CATEGORY_LIST_STR, ) from compliance.services.citation_backfill import CitationBackfill, BackfillResult from compliance.services.rag_client import get_rag_client logger = logging.getLogger(__name__) router = APIRouter(prefix="/v1/canonical", tags=["control-generator"]) # ============================================================================= # REQUEST / RESPONSE MODELS # ============================================================================= class GenerateRequest(BaseModel): domain: Optional[str] = None collections: Optional[List[str]] = None max_controls: int = 50 max_chunks: int = 1000 # Default: process max 1000 chunks per job (respects document boundaries) batch_size: int = 5 skip_web_search: bool = False dry_run: bool = False class GenerateResponse(BaseModel): job_id: str status: str message: str total_chunks_scanned: int = 0 controls_generated: int = 0 controls_verified: int = 0 controls_needs_review: int = 0 controls_too_close: int = 0 controls_duplicates_found: int = 0 controls_qa_fixed: int = 0 errors: list = [] controls: list = [] class ReviewRequest(BaseModel): action: str # "approve", "reject", "needs_rework" release_state: Optional[str] = None # Override release_state notes: Optional[str] = None class ProcessedStats(BaseModel): collection: str total_chunks_estimated: int processed_chunks: int pending_chunks: int direct_adopted: int llm_reformed: int skipped: int class BlockedSourceResponse(BaseModel): id: str regulation_code: str document_title: str reason: str deletion_status: str qdrant_collection: Optional[str] = None marked_at: str # ============================================================================= # ENDPOINTS # ============================================================================= async def _run_pipeline_background(config: GeneratorConfig, job_id: str): """Run the pipeline in the background. Uses its own DB session.""" db = SessionLocal() try: config.existing_job_id = job_id pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client()) result = await pipeline.run(config) logger.info( "Background generation job %s completed: %d controls from %d chunks", job_id, result.controls_generated, result.total_chunks_scanned, ) except Exception as e: logger.error("Background generation job %s failed: %s", job_id, e) # Update job as failed try: db.execute( text(""" UPDATE canonical_generation_jobs SET status = 'failed', errors = :errors, completed_at = NOW() WHERE id = CAST(:job_id AS uuid) """), {"job_id": job_id, "errors": json.dumps([str(e)])}, ) db.commit() except Exception: pass finally: db.close() @router.post("/generate", response_model=GenerateResponse) async def start_generation(req: GenerateRequest): """Start a control generation run (runs in background). Returns immediately with job_id. Use GET /generate/status/{job_id} to poll progress. """ config = GeneratorConfig( collections=req.collections, domain=req.domain, batch_size=req.batch_size, max_controls=req.max_controls, max_chunks=req.max_chunks, skip_web_search=req.skip_web_search, dry_run=req.dry_run, ) if req.dry_run: # Dry run: execute synchronously and return controls db = SessionLocal() try: pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client()) result = await pipeline.run(config) return GenerateResponse( job_id=result.job_id, status=result.status, message=f"Dry run: {result.controls_generated} controls from {result.total_chunks_scanned} chunks", total_chunks_scanned=result.total_chunks_scanned, controls_generated=result.controls_generated, controls_verified=result.controls_verified, controls_needs_review=result.controls_needs_review, controls_too_close=result.controls_too_close, controls_duplicates_found=result.controls_duplicates_found, errors=result.errors, controls=result.controls, ) except Exception as e: logger.error("Dry run failed: %s", e) raise HTTPException(status_code=500, detail=str(e)) finally: db.close() # Create job record first so we can return the ID db = SessionLocal() try: result = db.execute( text(""" INSERT INTO canonical_generation_jobs (status, config) VALUES ('running', :config) RETURNING id """), {"config": json.dumps(config.model_dump())}, ) db.commit() row = result.fetchone() job_id = str(row[0]) if row else None except Exception as e: logger.error("Failed to create job: %s", e) raise HTTPException(status_code=500, detail=f"Failed to create job: {e}") finally: db.close() if not job_id: raise HTTPException(status_code=500, detail="Failed to create job record") # Launch pipeline in background asyncio.create_task(_run_pipeline_background(config, job_id)) return GenerateResponse( job_id=job_id, status="running", message="Generation started in background. Poll /generate/status/{job_id} for progress.", ) @router.get("/generate/status/{job_id}") async def get_job_status(job_id: str): """Get status of a generation job.""" db = SessionLocal() try: result = db.execute( text("SELECT * FROM canonical_generation_jobs WHERE id = CAST(:id AS uuid)"), {"id": job_id}, ) row = result.fetchone() if not row: raise HTTPException(status_code=404, detail="Job not found") cols = result.keys() job = dict(zip(cols, row)) # Serialize datetime fields for key in ("started_at", "completed_at", "created_at"): if job.get(key): job[key] = str(job[key]) job["id"] = str(job["id"]) return job finally: db.close() @router.get("/generate/jobs") async def list_jobs( limit: int = Query(20, ge=1, le=100), offset: int = Query(0, ge=0), ): """List all generation jobs.""" db = SessionLocal() try: result = db.execute( text(""" SELECT id, status, total_chunks_scanned, controls_generated, controls_verified, controls_needs_review, controls_too_close, controls_duplicates_found, created_at, completed_at FROM canonical_generation_jobs ORDER BY created_at DESC LIMIT :limit OFFSET :offset """), {"limit": limit, "offset": offset}, ) jobs = [] cols = result.keys() for row in result: job = dict(zip(cols, row)) job["id"] = str(job["id"]) for key in ("created_at", "completed_at"): if job.get(key): job[key] = str(job[key]) jobs.append(job) return {"jobs": jobs, "total": len(jobs)} finally: db.close() @router.get("/generate/review-queue") async def get_review_queue( release_state: str = Query("needs_review", regex="^(needs_review|too_close|duplicate)$"), limit: int = Query(50, ge=1, le=200), ): """Get controls that need manual review.""" db = SessionLocal() try: result = db.execute( text(""" SELECT c.id, c.control_id, c.title, c.objective, c.severity, c.release_state, c.license_rule, c.customer_visible, c.generation_metadata, c.open_anchors, c.tags, c.created_at FROM canonical_controls c WHERE c.release_state = :state ORDER BY c.created_at DESC LIMIT :limit """), {"state": release_state, "limit": limit}, ) controls = [] cols = result.keys() for row in result: ctrl = dict(zip(cols, row)) ctrl["id"] = str(ctrl["id"]) ctrl["created_at"] = str(ctrl["created_at"]) # Parse JSON fields for jf in ("generation_metadata", "open_anchors", "tags"): if isinstance(ctrl.get(jf), str): try: ctrl[jf] = json.loads(ctrl[jf]) except (json.JSONDecodeError, TypeError): pass controls.append(ctrl) return {"controls": controls, "total": len(controls)} finally: db.close() @router.post("/generate/review/{control_id}") async def review_control(control_id: str, req: ReviewRequest): """Complete review of a generated control.""" db = SessionLocal() try: # Validate control exists and is in reviewable state result = db.execute( text("SELECT id, release_state FROM canonical_controls WHERE control_id = :cid"), {"cid": control_id}, ) row = result.fetchone() if not row: raise HTTPException(status_code=404, detail="Control not found") current_state = row[1] if current_state not in ("needs_review", "too_close", "duplicate"): raise HTTPException(status_code=400, detail=f"Control is in state '{current_state}', not reviewable") # Determine new state if req.action == "approve": new_state = req.release_state or "draft" elif req.action == "reject": new_state = "deprecated" elif req.action == "needs_rework": new_state = "needs_review" else: raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}") if new_state not in ("draft", "review", "approved", "deprecated", "needs_review", "too_close", "duplicate"): raise HTTPException(status_code=400, detail=f"Invalid release_state: {new_state}") db.execute( text(""" UPDATE canonical_controls SET release_state = :state, updated_at = NOW() WHERE control_id = :cid """), {"state": new_state, "cid": control_id}, ) db.commit() return {"control_id": control_id, "release_state": new_state, "action": req.action} finally: db.close() class BulkReviewRequest(BaseModel): release_state: str # Filter: which controls to bulk-review action: str # "approve" or "reject" new_state: Optional[str] = None # Override target state @router.post("/generate/bulk-review") async def bulk_review(req: BulkReviewRequest): """Bulk review all controls matching a release_state filter. Example: reject all needs_review → sets them to deprecated. """ if req.release_state not in ("needs_review", "too_close", "duplicate"): raise HTTPException(status_code=400, detail=f"Invalid filter state: {req.release_state}") if req.action == "approve": target = req.new_state or "draft" elif req.action == "reject": target = "deprecated" else: raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}") if target not in ("draft", "review", "approved", "deprecated", "needs_review"): raise HTTPException(status_code=400, detail=f"Invalid target state: {target}") db = SessionLocal() try: result = db.execute( text(""" UPDATE canonical_controls SET release_state = :target, updated_at = NOW() WHERE release_state = :source RETURNING control_id """), {"source": req.release_state, "target": target}, ) affected = [row[0] for row in result] db.commit() return { "action": req.action, "source_state": req.release_state, "target_state": target, "affected_count": len(affected), } finally: db.close() class QAReclassifyRequest(BaseModel): limit: int = 100 # How many controls to reclassify per run dry_run: bool = True # Preview only by default filter_category: Optional[str] = None # Only reclassify controls of this category filter_domain_prefix: Optional[str] = None # Only reclassify controls with this prefix @router.post("/generate/qa-reclassify") async def qa_reclassify(req: QAReclassifyRequest): """Run QA reclassification on existing controls using local LLM. Finds controls where keyword-detection disagrees with current category/domain, then uses Ollama to determine the correct classification. """ db = SessionLocal() try: # Load controls to check where_clauses = ["release_state NOT IN ('deprecated')"] params = {"limit": req.limit} if req.filter_category: where_clauses.append("category = :cat") params["cat"] = req.filter_category if req.filter_domain_prefix: where_clauses.append("control_id LIKE :prefix") params["prefix"] = f"{req.filter_domain_prefix}-%" where_sql = " AND ".join(where_clauses) rows = db.execute( text(f""" SELECT id, control_id, title, objective, category, COALESCE(requirements::text, '[]') as requirements, COALESCE(source_original_text, '') as source_text FROM canonical_controls WHERE {where_sql} ORDER BY created_at DESC LIMIT :limit """), params, ).fetchall() results = {"checked": 0, "mismatches": 0, "fixes": [], "errors": []} for row in rows: results["checked"] += 1 control_id = row[1] title = row[2] objective = row[3] or "" current_category = row[4] source_text = row[6] or objective # Keyword detection on source text kw_category = _detect_category(source_text) or _detect_category(objective) kw_domain = _detect_domain(source_text) current_prefix = control_id.split("-")[0] if "-" in control_id else "" # Skip if keyword detection agrees with current classification if kw_category == current_category and kw_domain == current_prefix: continue results["mismatches"] += 1 # Ask Ollama to arbitrate try: reqs_text = "" try: reqs = json.loads(row[5]) if isinstance(reqs, list): reqs_text = ", ".join(str(r) for r in reqs[:3]) except Exception: pass prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung. Titel: {title[:100]} Ziel: {objective[:200]} Anforderungen: {reqs_text[:200]} Aktuelle Zuordnung: domain={current_prefix}, category={current_category} Keyword-Erkennung: domain={kw_domain}, category={kw_category} Welche Zuordnung ist korrekt? Antworte NUR als JSON: {{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}} Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit Kategorien: {CATEGORY_LIST_STR}""" raw = await _llm_local(prompt) data = _parse_llm_json(raw) if not data: continue qa_domain = data.get("domain", "").upper() qa_category = data.get("category", "") reason = data.get("reason", "") fix_entry = { "control_id": control_id, "title": title[:80], "old_category": current_category, "old_domain": current_prefix, "new_category": qa_category if qa_category in VALID_CATEGORIES else current_category, "new_domain": qa_domain if qa_domain in VALID_DOMAINS else current_prefix, "reason": reason, } category_changed = qa_category in VALID_CATEGORIES and qa_category != current_category if category_changed and not req.dry_run: db.execute( text(""" UPDATE canonical_controls SET category = :category, updated_at = NOW() WHERE id = :id """), {"id": row[0], "category": qa_category}, ) fix_entry["applied"] = True else: fix_entry["applied"] = False results["fixes"].append(fix_entry) except Exception as e: results["errors"].append({"control_id": control_id, "error": str(e)}) if not req.dry_run: db.commit() return results finally: db.close() @router.get("/generate/processed-stats") async def get_processed_stats(): """Get processing statistics per collection.""" db = SessionLocal() try: result = db.execute( text(""" SELECT collection, COUNT(*) as processed_chunks, COUNT(*) FILTER (WHERE processing_path = 'structured') as direct_adopted, COUNT(*) FILTER (WHERE processing_path = 'llm_reform') as llm_reformed, COUNT(*) FILTER (WHERE processing_path = 'skipped') as skipped FROM canonical_processed_chunks GROUP BY collection ORDER BY collection """) ) stats = [] cols = result.keys() for row in result: stat = dict(zip(cols, row)) stat["total_chunks_estimated"] = 0 # Would need Qdrant API to get total stat["pending_chunks"] = 0 stats.append(stat) return {"stats": stats} finally: db.close() # ============================================================================= # BLOCKED SOURCES # ============================================================================= @router.get("/blocked-sources") async def list_blocked_sources(): """List all blocked (Rule 3) sources.""" db = SessionLocal() try: result = db.execute( text(""" SELECT id, regulation_code, document_title, reason, deletion_status, qdrant_collection, marked_at FROM canonical_blocked_sources ORDER BY marked_at DESC """) ) sources = [] cols = result.keys() for row in result: src = dict(zip(cols, row)) src["id"] = str(src["id"]) src["marked_at"] = str(src["marked_at"]) sources.append(src) return {"sources": sources} finally: db.close() @router.post("/blocked-sources/cleanup") async def start_cleanup(): """Start cleanup workflow for blocked sources. This marks all pending blocked sources for deletion. Actual RAG chunk deletion and file removal is a separate manual step. """ db = SessionLocal() try: result = db.execute( text(""" UPDATE canonical_blocked_sources SET deletion_status = 'marked_for_deletion' WHERE deletion_status = 'pending' RETURNING regulation_code """) ) marked = [row[0] for row in result] db.commit() return { "status": "marked_for_deletion", "marked_count": len(marked), "regulation_codes": marked, "message": "Sources marked for deletion. Run manual cleanup to remove RAG chunks and files.", } finally: db.close() # ============================================================================= # CUSTOMER VIEW FILTER # ============================================================================= @router.get("/controls-customer") async def get_controls_customer_view( severity: Optional[str] = Query(None), domain: Optional[str] = Query(None), ): """Get controls filtered for customer visibility. Rule 3 controls have source_citation and source_original_text hidden. generation_metadata is NEVER shown to customers. """ db = SessionLocal() try: query = """ SELECT c.id, c.control_id, c.title, c.objective, c.rationale, c.scope, c.requirements, c.test_procedure, c.evidence, c.severity, c.risk_score, c.implementation_effort, c.open_anchors, c.release_state, c.tags, c.license_rule, c.customer_visible, c.source_original_text, c.source_citation, c.created_at, c.updated_at FROM canonical_controls c WHERE c.release_state IN ('draft', 'approved') """ params: dict = {} if severity: query += " AND c.severity = :severity" params["severity"] = severity if domain: query += " AND c.control_id LIKE :domain" params["domain"] = f"{domain.upper()}-%" query += " ORDER BY c.control_id" result = db.execute(text(query), params) controls = [] cols = result.keys() for row in result: ctrl = dict(zip(cols, row)) ctrl["id"] = str(ctrl["id"]) for key in ("created_at", "updated_at"): if ctrl.get(key): ctrl[key] = str(ctrl[key]) # Parse JSON fields for jf in ("scope", "requirements", "test_procedure", "evidence", "open_anchors", "tags", "source_citation"): if isinstance(ctrl.get(jf), str): try: ctrl[jf] = json.loads(ctrl[jf]) except (json.JSONDecodeError, TypeError): pass # Customer visibility rules: # - NEVER show generation_metadata # - Rule 3: NEVER show source_citation or source_original_text ctrl.pop("generation_metadata", None) if not ctrl.get("customer_visible", True): ctrl["source_citation"] = None ctrl["source_original_text"] = None controls.append(ctrl) return {"controls": controls, "total": len(controls)} finally: db.close() # ============================================================================= # CITATION BACKFILL # ============================================================================= class BackfillRequest(BaseModel): dry_run: bool = True # Default to dry_run for safety limit: int = 0 # 0 = all controls class BackfillResponse(BaseModel): status: str total_controls: int = 0 matched_hash: int = 0 matched_regex: int = 0 matched_llm: int = 0 unmatched: int = 0 updated: int = 0 errors: list = [] _backfill_status: dict = {} async def _run_backfill_background(dry_run: bool, limit: int, backfill_id: str): """Run backfill in background with own DB session.""" db = SessionLocal() try: backfill = CitationBackfill(db=db, rag_client=get_rag_client()) result = await backfill.run(dry_run=dry_run, limit=limit) _backfill_status[backfill_id] = { "status": "completed", "total_controls": result.total_controls, "matched_hash": result.matched_hash, "matched_regex": result.matched_regex, "matched_llm": result.matched_llm, "unmatched": result.unmatched, "updated": result.updated, "errors": result.errors[:50], } logger.info("Backfill %s completed: %d updated", backfill_id, result.updated) except Exception as e: logger.error("Backfill %s failed: %s", backfill_id, e) _backfill_status[backfill_id] = {"status": "failed", "errors": [str(e)]} finally: db.close() @router.post("/generate/backfill-citations", response_model=BackfillResponse) async def start_backfill(req: BackfillRequest): """Backfill article/paragraph into existing control source_citations. Uses 3-tier matching: hash lookup → regex parse → Ollama LLM. Default is dry_run=True (preview only, no DB changes). """ import uuid backfill_id = str(uuid.uuid4())[:8] _backfill_status[backfill_id] = {"status": "running"} # Always run in background (RAG index build takes minutes) asyncio.create_task(_run_backfill_background(req.dry_run, req.limit, backfill_id)) return BackfillResponse( status=f"running (id={backfill_id})", ) @router.get("/generate/backfill-status/{backfill_id}") async def get_backfill_status(backfill_id: str): """Get status of a backfill job.""" status = _backfill_status.get(backfill_id) if not status: raise HTTPException(status_code=404, detail="Backfill job not found") return status # ============================================================================= # DOMAIN + TARGET AUDIENCE BACKFILL # ============================================================================= class DomainBackfillRequest(BaseModel): dry_run: bool = True job_id: Optional[str] = None # Only backfill controls from this job limit: int = 0 # 0 = all _domain_backfill_status: dict = {} async def _run_domain_backfill(req: DomainBackfillRequest, backfill_id: str): """Backfill domain, category, and target_audience for existing controls using Anthropic.""" import os import httpx ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "") ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6") if not ANTHROPIC_API_KEY: _domain_backfill_status[backfill_id] = { "status": "failed", "error": "ANTHROPIC_API_KEY not set" } return db = SessionLocal() try: # Find controls needing backfill where_clauses = ["(target_audience IS NULL OR target_audience = '[]' OR target_audience = 'null')"] params: dict = {} if req.job_id: where_clauses.append("generation_metadata->>'job_id' = :job_id") params["job_id"] = req.job_id query = f""" SELECT id, control_id, title, objective, category, source_original_text, tags FROM canonical_controls WHERE {' AND '.join(where_clauses)} ORDER BY control_id """ if req.limit > 0: query += f" LIMIT {req.limit}" result = db.execute(text(query), params) controls = [dict(zip(result.keys(), row)) for row in result] total = len(controls) updated = 0 errors = [] _domain_backfill_status[backfill_id] = { "status": "running", "total": total, "updated": 0, "errors": [] } # Process in batches of 10 BATCH_SIZE = 10 for batch_start in range(0, total, BATCH_SIZE): batch = controls[batch_start:batch_start + BATCH_SIZE] entries = [] for idx, ctrl in enumerate(batch): text_for_analysis = ctrl.get("objective") or ctrl.get("title") or "" original = ctrl.get("source_original_text") or "" if original: text_for_analysis += f"\n\nQuelltext-Auszug: {original[:500]}" entries.append( f"--- CONTROL {idx + 1}: {ctrl['control_id']} ---\n" f"Titel: {ctrl.get('title', '')}\n" f"Objective: {text_for_analysis[:800]}\n" f"Tags: {json.dumps(ctrl.get('tags', []))}" ) prompt = f"""Analysiere die folgenden {len(batch)} Controls und bestimme fuer jedes: 1. domain: Das Fachgebiet (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT) 2. category: Die Kategorie (encryption, authentication, network, data_protection, logging, incident, continuity, compliance, supply_chain, physical, personnel, application, system, risk, governance, hardware, identity, public_administration, labor_law, finance, trade_regulation, environmental, health) 3. target_audience: Liste der Zielgruppen (moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst") Antworte mit einem JSON-Array mit {len(batch)} Objekten. Jedes Objekt hat: - control_index: 1-basierter Index - domain: Fachgebiet-Kuerzel - category: Kategorie - target_audience: Liste der Zielgruppen {"".join(entries)}""" try: headers = { "x-api-key": ANTHROPIC_API_KEY, "anthropic-version": "2023-06-01", "content-type": "application/json", } payload = { "model": ANTHROPIC_MODEL, "max_tokens": 4096, "system": "Du bist ein Compliance-Experte. Klassifiziere Controls nach Fachgebiet und Zielgruppe. Antworte NUR mit validem JSON.", "messages": [{"role": "user", "content": prompt}], } async with httpx.AsyncClient(timeout=60.0) as client: resp = await client.post( "https://api.anthropic.com/v1/messages", headers=headers, json=payload, ) if resp.status_code != 200: errors.append(f"Anthropic API {resp.status_code} at batch {batch_start}") continue raw = resp.json().get("content", [{}])[0].get("text", "") # Parse response import re bracket_match = re.search(r"\[.*\]", raw, re.DOTALL) if not bracket_match: errors.append(f"No JSON array in response at batch {batch_start}") continue results_list = json.loads(bracket_match.group(0)) for item in results_list: idx = item.get("control_index", 0) - 1 if idx < 0 or idx >= len(batch): continue ctrl = batch[idx] ctrl_id = str(ctrl["id"]) new_domain = item.get("domain", "") new_category = item.get("category", "") new_audience = item.get("target_audience", []) if not isinstance(new_audience, list): new_audience = [] # Build new control_id from domain if domain changed old_prefix = ctrl["control_id"].split("-")[0] if ctrl["control_id"] else "" new_prefix = new_domain.upper()[:4] if new_domain else old_prefix if not req.dry_run: update_parts = [] update_params: dict = {"ctrl_id": ctrl_id} if new_category: update_parts.append("category = :category") update_params["category"] = new_category if new_audience: update_parts.append("target_audience = :target_audience") update_params["target_audience"] = json.dumps(new_audience) # Note: We do NOT rename control_ids here — that would # break references and cause unique constraint violations. if update_parts: update_parts.append("updated_at = NOW()") db.execute( text(f"UPDATE canonical_controls SET {', '.join(update_parts)} WHERE id = CAST(:ctrl_id AS uuid)"), update_params, ) updated += 1 if not req.dry_run: db.commit() except Exception as e: errors.append(f"Batch {batch_start}: {str(e)}") db.rollback() _domain_backfill_status[backfill_id] = { "status": "running", "total": total, "updated": updated, "progress": f"{min(batch_start + BATCH_SIZE, total)}/{total}", "errors": errors[-10:], } _domain_backfill_status[backfill_id] = { "status": "completed", "total": total, "updated": updated, "errors": errors[-50:], } logger.info("Domain backfill %s completed: %d/%d updated", backfill_id, updated, total) except Exception as e: logger.error("Domain backfill %s failed: %s", backfill_id, e) _domain_backfill_status[backfill_id] = {"status": "failed", "error": str(e)} finally: db.close() @router.post("/generate/backfill-domain") async def start_domain_backfill(req: DomainBackfillRequest): """Backfill domain, category, and target_audience for controls using Anthropic API. Finds controls where target_audience is NULL and enriches them. Default is dry_run=True (preview only). """ import uuid backfill_id = str(uuid.uuid4())[:8] _domain_backfill_status[backfill_id] = {"status": "starting"} asyncio.create_task(_run_domain_backfill(req, backfill_id)) return {"status": "running", "backfill_id": backfill_id, "message": f"Domain backfill started. Poll /generate/backfill-status/{backfill_id}"} @router.get("/generate/domain-backfill-status/{backfill_id}") async def get_domain_backfill_status(backfill_id: str): """Get status of a domain backfill job.""" status = _domain_backfill_status.get(backfill_id) if not status: raise HTTPException(status_code=404, detail="Domain backfill job not found") return status