fix: Pipeline-Skalierung — 8 Optimierungen für 80k+ Controls

1. control_generator: GeneratorResult.status Default "completed" → "running" (Bug)
2. control_generator: Anthropic API mit Phase-Timeouts + Retry bei Disconnect
3. control_generator: regulation_exclude Filter + Harmonization via Qdrant statt In-Memory
4. decomposition_pass: Enrich Pass Batch-UPDATEs (400k → ~400 DB-Calls)
5. decomposition_pass: Merge Pass single Query statt N+1
6. batch_dedup_runner: Cross-Group Dedup parallelisiert (asyncio.gather)
7. canonical_control_routes: Framework Controls API Pagination (limit/offset)
8. DB-Indizes: idx_oc_parent_release, idx_oc_trigger_null, idx_cc_framework

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-11 14:09:32 +02:00
parent fc71117bf2
commit f89ce46631
5 changed files with 291 additions and 141 deletions

View File

@@ -17,6 +17,7 @@ Usage:
stats = await runner.run(hint_filter="implement:multi_factor_auth:none")
"""
import asyncio
import json
import logging
import time
@@ -342,80 +343,92 @@ class BatchDedupRunner:
cross_linked = 0
cross_review = 0
for i, r in enumerate(rows):
uuid = r[0]
# Process in parallel batches for embedding + Qdrant search
PARALLEL_BATCH = 10
async def _embed_and_search(r):
"""Embed one control and search Qdrant — safe for asyncio.gather."""
hint = r[3] or ""
parts = hint.split(":", 2)
action = parts[0] if len(parts) > 0 else ""
obj = parts[1] if len(parts) > 1 else ""
canonical = canonicalize_text(action, obj, r[2])
embedding = await get_embedding(canonical)
if not embedding:
continue
return None
results = await qdrant_search_cross_regulation(
embedding, top_k=5, collection=self.collection,
)
if not results:
continue
return (r, results)
# Find best match from a DIFFERENT hint group
for match in results:
match_score = match.get("score", 0.0)
match_payload = match.get("payload", {})
match_uuid = match_payload.get("control_uuid", "")
for batch_start in range(0, len(rows), PARALLEL_BATCH):
batch = rows[batch_start:batch_start + PARALLEL_BATCH]
tasks = [_embed_and_search(r) for r in batch]
results_batch = await asyncio.gather(*tasks, return_exceptions=True)
# Skip self-match
if match_uuid == uuid:
for res in results_batch:
if res is None or isinstance(res, Exception):
if isinstance(res, Exception):
logger.error("BatchDedup embed/search error: %s", res)
self.stats["errors"] += 1
continue
# Must be a different hint group (otherwise already handled in Phase 1)
match_action = match_payload.get("action_normalized", "")
match_object = match_payload.get("object_normalized", "")
# Simple check: different control UUID is enough
if match_score > LINK_THRESHOLD:
# Mark the worse one as duplicate
try:
self.db.execute(text("""
UPDATE canonical_controls
SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
WHERE id = CAST(:dup AS uuid)
AND release_state != 'duplicate'
"""), {"master": match_uuid, "dup": uuid})
r, results = res
ctrl_uuid = r[0]
hint = r[3] or ""
self.db.execute(text("""
INSERT INTO control_parent_links
(control_uuid, parent_control_uuid, link_type, confidence)
VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), 'cross_regulation', :conf)
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
"""), {"cu": match_uuid, "pu": uuid, "conf": match_score})
if not results:
continue
# Transfer parent links
transferred = self._transfer_parent_links(match_uuid, uuid)
self.stats["parent_links_transferred"] += transferred
for match in results:
match_score = match.get("score", 0.0)
match_payload = match.get("payload", {})
match_uuid = match_payload.get("control_uuid", "")
self.db.commit()
cross_linked += 1
except Exception as e:
logger.error("BatchDedup cross-group link error %s%s: %s",
uuid, match_uuid, e)
self.db.rollback()
self.stats["errors"] += 1
break # Only one cross-link per control
elif match_score > REVIEW_THRESHOLD:
self._write_review(
{"control_id": r[1], "title": r[2], "objective": "",
"merge_group_hint": hint, "pattern_id": None},
match_payload, match_score,
)
cross_review += 1
break
if match_uuid == ctrl_uuid:
continue
self._progress_count = i + 1
if (i + 1) % 500 == 0:
if match_score > LINK_THRESHOLD:
try:
self.db.execute(text("""
UPDATE canonical_controls
SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
WHERE id = CAST(:dup AS uuid)
AND release_state != 'duplicate'
"""), {"master": match_uuid, "dup": ctrl_uuid})
self.db.execute(text("""
INSERT INTO control_parent_links
(control_uuid, parent_control_uuid, link_type, confidence)
VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), 'cross_regulation', :conf)
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
"""), {"cu": match_uuid, "pu": ctrl_uuid, "conf": match_score})
transferred = self._transfer_parent_links(match_uuid, ctrl_uuid)
self.stats["parent_links_transferred"] += transferred
self.db.commit()
cross_linked += 1
except Exception as e:
logger.error("BatchDedup cross-group link error %s%s: %s",
ctrl_uuid, match_uuid, e)
self.db.rollback()
self.stats["errors"] += 1
break
elif match_score > REVIEW_THRESHOLD:
self._write_review(
{"control_id": r[1], "title": r[2], "objective": "",
"merge_group_hint": hint, "pattern_id": None},
match_payload, match_score,
)
cross_review += 1
break
processed = min(batch_start + PARALLEL_BATCH, len(rows))
self._progress_count = processed
if processed % 500 < PARALLEL_BATCH:
logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review",
i + 1, len(rows), cross_linked, cross_review)
processed, len(rows), cross_linked, cross_review)
self.stats["cross_group_linked"] = cross_linked
self.stats["cross_group_review"] = cross_review