feat(control-pipeline): add applicability backfill endpoint (Phase 5/C3)
POST /v1/canonical/generate/backfill-applicability enriches controls with applicable_industries, applicable_company_size, scope_conditions via Anthropic API. Targets ~26k controls from pipeline version < 3. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -985,6 +985,216 @@ async def get_domain_backfill_status(backfill_id: str):
|
|||||||
return status
|
return status
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# APPLICABILITY BACKFILL — Industry, company size, scope conditions
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class ApplicabilityBackfillRequest(BaseModel):
    """Request body accepted by the applicability backfill endpoint."""

    # Preview switch: while true the job classifies controls but issues no
    # UPDATE statements and never commits.
    dry_run: bool = True

    # Upper bound on the number of controls selected; 0 selects all of them.
    limit: int = 0

    # Number of controls bundled into a single model request.
    batch_size: int = 10
|
||||||
|
|
||||||
|
|
||||||
|
# In-process registry of applicability backfill jobs, keyed by backfill id.
# Each value is the most recent status dict written for that job; the status
# endpoint returns it unchanged.
_applicability_backfill_status: dict = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_applicability_prompt(batch) -> str:
    """Render the German classification prompt for one batch of control rows.

    Every row becomes a numbered section (1-based) so the model can refer to
    it through ``control_index`` in its JSON answer. Objectives are truncated
    to 400 characters.
    """
    entries = [
        f"--- CONTROL {position}: {row.control_id} ---\n"
        f"Titel: {row.title or ''}\n"
        f"Objective: {(row.objective or '')[:400]}\n"
        f"Kategorie: {row.category or ''}\n"
        f"Quelle: {row.source_name or ''} ({row.regulation_code or ''})"
        for position, row in enumerate(batch, start=1)
    ]

    return f"""Analysiere die folgenden {len(batch)} Compliance-Controls und bestimme fuer jedes:

1. applicable_industries: Liste der Branchen fuer die dieser Control relevant ist.
Verwende "all" wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie/IT", "Finanzdienstleistungen", "Versicherungen",
"Gesundheitswesen", "Pharma", "Telekommunikation", "Energie", "Produktion/Industrie",
"Logistik/Transport", "E-Commerce/Handel", "Oeffentlicher Dienst", "Bildung",
"Beratung/Consulting", "Immobilien", "Bau", "Automobil", "Maschinenbau",
"Luft-/Raumfahrt", "Medien/Verlage", "Gastronomie/Hotellerie", "Recht/Kanzlei",
"Agrar", "Chemie", "Verteidigung"
Beispiel DSGVO: ["all"], Beispiel TKG: ["Telekommunikation"], Beispiel NIS2: ["Energie", "Gesundheitswesen", "Transport", "Finanzdienstleistungen"]

2. applicable_company_size: Ab welcher Unternehmensgroesse gilt dieser Control?
Verwende "all" wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
Beispiel NIS2 Art.21: ["medium", "large", "enterprise"], Beispiel DSGVO Art.5: ["all"]

3. scope_conditions: Optionale Bedingungen. Null wenn keine besonderen Bedingungen.
Sonst JSON-Objekt mit requires_any und description.
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
"processes_minors_data", "automated_decisions", "employee_monitoring",
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services"
Beispiel AI Act: {{"requires_any": ["uses_ai"], "description": "Nur bei KI-Einsatz"}}
Beispiel DSGVO Art.32: null (gilt immer)

Antworte mit einem JSON-Array. Jedes Objekt hat:
- control_index: 1-basierter Index
- applicable_industries: Liste
- applicable_company_size: Liste
- scope_conditions: Objekt oder null

{chr(10).join(entries)}"""


async def _request_applicability_items(client, headers, model, batch):
    """Send one batch to the Anthropic Messages API and parse the answer.

    Returns a ``(items, error)`` pair: on success ``items`` is a list of
    parsed JSON values and ``error`` is ``None``; otherwise ``items`` is
    ``None`` and ``error`` is a short description for the job's error list.
    """
    payload = {
        "model": model,
        "max_tokens": 4096,
        "system": "Du bist ein Compliance-Experte. Klassifiziere Controls nach Branche, Unternehmensgroesse und Scope-Bedingungen. Antworte NUR mit validem JSON.",
        "messages": [{"role": "user", "content": _build_applicability_prompt(batch)}],
    }
    resp = await client.post(
        "https://api.anthropic.com/v1/messages",
        headers=headers,
        json=payload,
    )
    if resp.status_code != 200:
        return None, f"API {resp.status_code}"

    # An empty or missing "content" list is treated like an empty answer so it
    # is reported as a parse failure instead of raising IndexError.
    blocks = resp.json().get("content") or [{}]
    parsed = _parse_llm_json(blocks[0].get("text", ""))
    if not parsed:
        return None, "JSON parse failed"

    return (parsed if isinstance(parsed, list) else [parsed]), None


def _store_applicability_items(db, batch, items, dry_run: bool) -> int:
    """Match model items to batch rows and write them unless ``dry_run``.

    Returns the number of distinct controls matched. The caller owns the
    transaction: nothing is committed or rolled back here.
    """
    handled = set()
    for item in items:
        # One malformed element must not abort (and roll back) the whole batch.
        if not isinstance(item, dict):
            continue
        try:
            idx = int(item.get("control_index", 0)) - 1
        except (TypeError, ValueError):
            continue
        if idx < 0 or idx >= len(batch) or idx in handled:
            continue
        handled.add(idx)

        # dict.get() only applies its default when the key is absent. An
        # explicit null from the model would be stored as JSON null, which the
        # selection query treats as "still missing" on every later run.
        industries = item.get("applicable_industries")
        if industries is None:
            industries = ["all"]
        company_size = item.get("applicable_company_size")
        if company_size is None:
            company_size = ["all"]
        scope = item.get("scope_conditions")

        if not dry_run:
            db.execute(text("""
                UPDATE compliance.canonical_controls
                SET applicable_industries = CAST(:ind AS jsonb),
                    applicable_company_size = CAST(:size AS jsonb),
                    scope_conditions = CAST(:scope AS jsonb),
                    updated_at = NOW()
                WHERE id = CAST(:cid AS uuid)
            """), {
                "ind": json.dumps(industries),
                "size": json.dumps(company_size),
                "scope": json.dumps(scope) if scope else None,
                "cid": str(batch[idx].id),
            })

    return len(handled)


async def _run_applicability_backfill(req: ApplicabilityBackfillRequest, backfill_id: str):
    """Backfill applicable_industries, applicable_company_size, scope_conditions.

    Selects draft controls whose ``applicable_industries`` is missing, sends
    them to the Anthropic Messages API in batches and, unless ``req.dry_run``
    is set, writes the returned classification back and commits once per
    batch.

    Progress and the final result are published in
    ``_applicability_backfill_status[backfill_id]``; the function itself
    returns nothing and does not raise. A failing batch is recorded in the
    error list and rolled back, and the job continues with the next batch.

    Args:
        req: Job options (dry run, row limit, batch size).
        backfill_id: Key under which status snapshots are stored.
    """
    import os

    import httpx

    api_key = os.getenv("ANTHROPIC_API_KEY", "")
    model = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")

    if not api_key:
        _applicability_backfill_status[backfill_id] = {
            "status": "failed", "error": "ANTHROPIC_API_KEY not set"
        }
        return

    # range() raises ValueError for a step of 0 and yields nothing for a
    # negative step, so non-positive batch sizes are clamped to 1.
    batch_size = max(1, req.batch_size)
    headers = {
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }

    db = SessionLocal()
    try:
        query = """
            SELECT id, control_id, title, objective, category,
                   source_citation->>'source' as source_name,
                   source_citation->>'regulation_code' as regulation_code
            FROM compliance.canonical_controls
            WHERE release_state = 'draft'
              AND (applicable_industries IS NULL OR applicable_industries::text = 'null')
            ORDER BY control_id
        """
        params = {}
        if req.limit > 0:
            # Bound parameter instead of interpolating the value into the SQL.
            query += " LIMIT :row_limit"
            params["row_limit"] = req.limit
        rows = db.execute(text(query), params).fetchall()

        total = len(rows)
        updated = 0
        errors = []

        _applicability_backfill_status[backfill_id] = {
            "status": "running", "total": total, "updated": 0,
            "dry_run": req.dry_run, "errors": [],
        }

        # One client for the whole job so connections are reused across batches.
        async with httpx.AsyncClient(timeout=120.0) as client:
            for batch_start in range(0, total, batch_size):
                batch = rows[batch_start:batch_start + batch_size]

                try:
                    items, batch_error = await _request_applicability_items(
                        client, headers, model, batch
                    )
                    if batch_error is not None:
                        errors.append(f"Batch {batch_start}: {batch_error}")
                    else:
                        batch_updated = _store_applicability_items(
                            db, batch, items, req.dry_run
                        )
                        if not req.dry_run:
                            db.commit()
                        # Counted only after the commit succeeded, so a
                        # rolled-back batch never inflates the total.
                        updated += batch_updated
                except Exception as e:
                    errors.append(f"Batch {batch_start}: {str(e)[:200]}")
                    logger.warning("Applicability backfill batch %d error: %s", batch_start, e)
                    db.rollback()

                # Published after every batch, including failed ones, so
                # pollers see progress and the latest errors.
                _applicability_backfill_status[backfill_id] = {
                    "status": "running", "total": total, "updated": updated,
                    "progress": f"{min(batch_start + batch_size, total)}/{total}",
                    "dry_run": req.dry_run, "errors": errors[-10:],
                }

        _applicability_backfill_status[backfill_id] = {
            "status": "completed", "total": total, "updated": updated,
            "dry_run": req.dry_run, "errors": errors[-50:],
        }
        logger.info("Applicability backfill %s completed: %d/%d updated",
                    backfill_id, updated, total)

    except Exception as e:
        logger.error("Applicability backfill %s failed: %s", backfill_id, e)
        _applicability_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
    finally:
        db.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Strong references to in-flight applicability backfill tasks. The event loop
# keeps only weak references to tasks, so a task that nothing else holds may be
# garbage-collected before it finishes.
_applicability_backfill_tasks: set = set()


@router.post("/generate/backfill-applicability")
async def start_applicability_backfill(req: ApplicabilityBackfillRequest):
    """Backfill applicable_industries, applicable_company_size, scope_conditions
    for controls missing these fields. Uses Anthropic API.

    Default is dry_run=True (preview only).

    The job runs as a background task on the current event loop. The response
    carries the ``backfill_id`` under which its status can be polled.
    """
    import uuid

    backfill_id = str(uuid.uuid4())[:8]
    _applicability_backfill_status[backfill_id] = {"status": "starting"}

    task = asyncio.create_task(_run_applicability_backfill(req, backfill_id))
    # Hold the task until it is done, then drop the reference automatically.
    _applicability_backfill_tasks.add(task)
    task.add_done_callback(_applicability_backfill_tasks.discard)

    return {
        "status": "running",
        "backfill_id": backfill_id,
        "message": f"Applicability backfill started. Poll /generate/applicability-backfill-status/{backfill_id}",
    }
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/generate/applicability-backfill-status/{backfill_id}")
async def get_applicability_backfill_status(backfill_id: str):
    """Return the latest status snapshot recorded for an applicability backfill job.

    Raises:
        HTTPException: 404 when no (non-empty) status is stored under
            ``backfill_id``.
    """
    job_status = _applicability_backfill_status.get(backfill_id)
    if job_status:
        return job_status
    raise HTTPException(status_code=404, detail="Applicability backfill job not found")
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Source-Type Backfill — Classify law vs guideline vs standard vs restricted
|
# Source-Type Backfill — Classify law vs guideline vs standard vs restricted
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
Reference in New Issue
Block a user