feat: LLM-basierter Rationale-Backfill fuer atomare Controls

POST /controls/backfill-rationale — ersetzt Placeholder "Aus Obligation
abgeleitet." durch LLM-generierte Begruendungen (Ollama/qwen3.5).
Optimierung: gruppiert ~86k Controls nach ~7k Parents, ein LLM-Call pro Parent.
Paginierung via batch_size/offset fuer kontrollierte Ausfuehrung.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-25 23:01:49 +01:00
parent 81ce9dde07
commit 23dd5116b3
2 changed files with 423 additions and 0 deletions

View File

@@ -1112,6 +1112,170 @@ async def backfill_evidence_type(
}
# =============================================================================
# RATIONALE BACKFILL (LLM)
# =============================================================================
@router.post("/controls/backfill-rationale")
async def backfill_rationale(
    dry_run: bool = Query(True, description="Nur zaehlen, nicht aendern"),
    batch_size: int = Query(50, ge=1, description="Parent-Controls pro Durchlauf"),
    offset: int = Query(0, ge=0, description="Offset fuer Paginierung (Parent-Index)"),
):
    """Replace the placeholder rationale of atomic controls with LLM text.

    Child controls whose ``rationale`` still equals the placeholder
    ``'Aus Obligation abgeleitet.'`` are grouped by their parent control.
    For every parent in the selected batch ONE LLM call is made and the
    resulting text is written to all placeholder children of that parent.

    Pagination note: the parent list is rebuilt on every request and only
    contains parents that still have placeholder children. Parents that
    were processed successfully therefore disappear from the list, and the
    remaining ones move up. Always continue with the ``next_offset`` value
    from the response instead of adding ``batch_size`` to the previous
    offset, otherwise unprocessed parents are skipped.

    Workflow:
        1. ``dry_run=true``: show statistics only, nothing is changed.
        2. ``dry_run=false&batch_size=50&offset=0``: process the first batch.
        3. Repeat with the returned ``next_offset`` until it is ``None``.

    Args:
        dry_run: When true, only counts and a sample of parents are returned.
        batch_size: Number of parent controls handled per request (>= 1).
        offset: Index into the current parent list where the batch starts
            (>= 0).

    Returns:
        A dict with statistics. For a real run it contains the number of
        processed parents and updated children, up to 10 errors, up to 5
        sample rationales and ``next_offset`` (``None`` when nothing is left
        at or after the current position).
    """
    from compliance.services.llm_provider import get_llm_provider

    with SessionLocal() as db:
        # 1. Load every parent that still has at least one child carrying the
        #    placeholder rationale (rejected/merged children are ignored).
        parents = db.execute(text("""
            SELECT p.id AS parent_uuid, p.control_id, p.title, p.category,
                   p.source_citation->>'source' AS source_name,
                   COUNT(c.id) AS child_count
            FROM canonical_controls p
            JOIN canonical_controls c ON c.parent_control_uuid = p.id
            WHERE c.rationale = 'Aus Obligation abgeleitet.'
              AND c.release_state NOT IN ('rejected', 'merged')
            GROUP BY p.id, p.control_id, p.title, p.category,
                     p.source_citation->>'source'
            ORDER BY p.control_id
        """)).fetchall()

        total_parents = len(parents)
        total_children = sum(p.child_count for p in parents)

        if dry_run:
            return {
                "dry_run": True,
                "total_parents": total_parents,
                "total_children": total_children,
                "estimated_llm_calls": total_parents,
                "sample_parents": [
                    {
                        "control_id": p.control_id,
                        "title": p.title,
                        "source": p.source_name,
                        "child_count": p.child_count,
                    }
                    for p in parents[:10]
                ],
            }

        # 2. Select the batch.
        batch = parents[offset : offset + batch_size]
        if not batch:
            # Nothing at or after `offset`; `total_parents` in the response
            # tells the caller whether parents before the offset remain.
            return {
                "dry_run": False,
                "message": "Kein weiterer Batch — alle Parents verarbeitet.",
                "total_parents": total_parents,
                "offset": offset,
                "processed": 0,
            }

        provider = get_llm_provider()
        processed = 0
        children_updated = 0
        errors = []
        sample_rationales = []

        for parent in batch:
            parent_uuid = str(parent.parent_uuid)
            source = parent.source_name or "Regulierung"

            # LLM prompt. Control id and title are separated explicitly so
            # they do not run together into one token.
            prompt = (
                f"Du bist Compliance-Experte. Erklaere in 1-2 Saetzen auf Deutsch, "
                f"WARUM aus dem uebergeordneten Control atomare Teilmassnahmen "
                f"abgeleitet wurden.\n\n"
                f"Uebergeordnetes Control: {parent.control_id} - {parent.title}\n"
                f"Regulierung: {source}\n"
                f"Kategorie: {parent.category or 'k.A.'}\n"
                f"Anzahl atomarer Controls: {parent.child_count}\n\n"
                f"Schreibe NUR die Begruendung (1-2 Saetze). Kein Markdown, "
                f"keine Aufzaehlung, kein Praefix. "
                f"Erklaere den regulatorischen Hintergrund und warum die "
                f"Zerlegung in atomare, testbare Massnahmen notwendig ist."
            )

            try:
                response = await provider.complete(
                    prompt=prompt,
                    max_tokens=256,
                    temperature=0.3,
                )
                rationale = response.content.strip()

                # Clean up: surrounding quotes and a leading label.
                rationale = rationale.strip('"').strip("'").strip()
                if rationale.startswith(("Begründung:", "Begruendung:")):
                    rationale = rationale.split(":", 1)[1].strip()

                # Limit the length to at most 500 characters.
                if len(rationale) > 500:
                    rationale = rationale[:497] + "..."

                # Also covers the empty string.
                if len(rationale) < 10:
                    errors.append({
                        "control_id": parent.control_id,
                        "error": "LLM-Antwort zu kurz oder leer",
                    })
                    continue

                # Update all placeholder children of this parent.
                result = db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET rationale = :rationale
                        WHERE parent_control_uuid = CAST(:pid AS uuid)
                          AND rationale = 'Aus Obligation abgeleitet.'
                          AND release_state NOT IN ('rejected', 'merged')
                    """),
                    {"rationale": rationale, "pid": parent_uuid},
                )
                # Commit per parent: a later failure must not discard updates
                # that were already counted as done.
                db.commit()
            except Exception as e:
                # Best effort per parent: undo any partial work of this parent
                # so the session stays usable, record the error and go on.
                db.rollback()
                logger.error("LLM error for %s: %s", parent.control_id, e)
                errors.append({
                    "control_id": parent.control_id,
                    "error": str(e)[:200],
                })
                continue

            children_updated += result.rowcount
            processed += 1
            if len(sample_rationales) < 5:
                sample_rationales.append({
                    "parent": parent.control_id,
                    "title": parent.title,
                    "rationale": rationale,
                    "children_updated": result.rowcount,
                })

        # Successfully processed parents drop out of the selection query on
        # the next request; only the failed ones of this batch stay in front
        # of the next unprocessed parent.
        failed_in_batch = len(batch) - processed
        remaining_parents = total_parents - processed
        candidate_offset = offset + failed_in_batch
        next_offset = candidate_offset if candidate_offset < remaining_parents else None

        return {
            "dry_run": False,
            "offset": offset,
            "batch_size": batch_size,
            "next_offset": next_offset,
            "processed_parents": processed,
            "children_updated": children_updated,
            "total_parents": total_parents,
            "total_children": total_children,
            "errors": errors[:10],
            "sample_rationales": sample_rationales,
        }
# =============================================================================
# CONTROL CRUD (CREATE / UPDATE / DELETE)
# =============================================================================