fix(pipeline): make dedup Phase 2 resilient — paginated, timeout, per-control error handling

- Paginated DB queries (keyset pagination on control_id, 100 rows/page) instead of loading all 166k rows at once
- Individual 30s timeout on each embedding call and on each Qdrant search call
- Per-control try/except — one failure doesn't kill the job
- Sequential processing (no asyncio.gather) for stability
- Progress logging every 500 controls

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-28 15:31:28 +02:00
parent 2e2e81b3e1
commit b151951448

View File

@@ -330,12 +330,31 @@ class BatchDedupRunner:
async def _run_cross_group_pass(self): async def _run_cross_group_pass(self):
"""Phase 2: Find cross-group duplicates among surviving masters. """Phase 2: Find cross-group duplicates among surviving masters.
After Phase 1, ~52k masters remain. Many have similar semantics Paginated DB queries + individual error handling per control.
despite different merge_group_hints (e.g. different German spellings). Never loads all rows into memory at once.
This pass embeds all masters and finds near-duplicates via Qdrant.
""" """
logger.info("BatchDedup Phase 2: Cross-group pass starting...") logger.info("BatchDedup Phase 2: Cross-group pass starting...")
# Count total
total_row = self.db.execute(text("""
SELECT COUNT(*) FROM canonical_controls
WHERE decomposition_method = 'pass0b'
AND release_state != 'duplicate'
AND release_state != 'deprecated'
""")).fetchone()
total = total_row[0] if total_row else 0
self._progress_total = total
self._progress_count = 0
logger.info("BatchDedup Cross-group: %d masters to check", total)
cross_linked = 0
cross_review = 0
# Paginated processing — 100 rows per DB query
DB_PAGE = 100
last_control_id = ""
while True:
rows = self.db.execute(text(""" rows = self.db.execute(text("""
SELECT id::text, control_id, title, SELECT id::text, control_id, title,
generation_metadata->>'merge_group_hint' as merge_group_hint generation_metadata->>'merge_group_hint' as merge_group_hint
@@ -343,53 +362,50 @@ class BatchDedupRunner:
WHERE decomposition_method = 'pass0b' WHERE decomposition_method = 'pass0b'
AND release_state != 'duplicate' AND release_state != 'duplicate'
AND release_state != 'deprecated' AND release_state != 'deprecated'
AND control_id > :last_id
ORDER BY control_id ORDER BY control_id
""")).fetchall() LIMIT :page_size
"""), {"last_id": last_control_id, "page_size": DB_PAGE}).fetchall()
self._progress_total = len(rows) if not rows:
self._progress_count = 0 break
logger.info("BatchDedup Cross-group: %d masters to check", len(rows))
cross_linked = 0
cross_review = 0
# Process in parallel batches for embedding + Qdrant search last_control_id = rows[-1][1]
PARALLEL_BATCH = 10
async def _embed_and_search(r): # Process each control individually (no asyncio.gather — more stable)
"""Embed one control and search Qdrant — safe for asyncio.gather.""" for r in rows:
try:
hint = r[3] or "" hint = r[3] or ""
parts = hint.split(":", 2) parts = hint.split(":", 2)
action = parts[0] if len(parts) > 0 else "" action = parts[0] if len(parts) > 0 else ""
obj = parts[1] if len(parts) > 1 else "" obj = parts[1] if len(parts) > 1 else ""
canonical = canonicalize_text(action, obj, r[2]) canonical = canonicalize_text(action, obj, r[2])
embedding = await get_embedding(canonical)
if not embedding: # Timeout per embedding call
return None try:
results = await qdrant_search_cross_regulation( embedding = await asyncio.wait_for(
embedding, top_k=5, collection=self.collection, get_embedding(canonical), timeout=30.0
) )
return (r, results) except asyncio.TimeoutError:
for batch_start in range(0, len(rows), PARALLEL_BATCH):
batch = rows[batch_start:batch_start + PARALLEL_BATCH]
tasks = [_embed_and_search(r) for r in batch]
results_batch = await asyncio.gather(*tasks, return_exceptions=True)
for res in results_batch:
if res is None or isinstance(res, Exception):
if isinstance(res, Exception):
logger.error("BatchDedup embed/search error: %s", res)
self.stats["errors"] += 1 self.stats["errors"] += 1
continue continue
r, results = res if not embedding:
ctrl_uuid = r[0]
hint = r[3] or ""
if not results:
continue continue
for match in results: try:
results = await asyncio.wait_for(
qdrant_search_cross_regulation(
embedding, top_k=5, collection=self.collection,
), timeout=30.0
)
except asyncio.TimeoutError:
self.stats["errors"] += 1
continue
ctrl_uuid = r[0]
for match in (results or []):
match_score = match.get("score", 0.0) match_score = match.get("score", 0.0)
match_payload = match.get("payload", {}) match_payload = match.get("payload", {})
match_uuid = match_payload.get("control_uuid", "") match_uuid = match_payload.get("control_uuid", "")
@@ -415,13 +431,15 @@ class BatchDedupRunner:
transferred = self._transfer_parent_links(match_uuid, ctrl_uuid) transferred = self._transfer_parent_links(match_uuid, ctrl_uuid)
self.stats["parent_links_transferred"] += transferred self.stats["parent_links_transferred"] += transferred
self.db.commit() self.db.commit()
cross_linked += 1 cross_linked += 1
except Exception as e: except Exception as e:
logger.error("BatchDedup cross-group link error %s%s: %s", logger.error("BatchDedup cross-group link error %s%s: %s",
ctrl_uuid, match_uuid, e) ctrl_uuid, match_uuid, e)
try:
self.db.rollback() self.db.rollback()
except Exception:
pass
self.stats["errors"] += 1 self.stats["errors"] += 1
break break
elif match_score > REVIEW_THRESHOLD: elif match_score > REVIEW_THRESHOLD:
@@ -433,9 +451,19 @@ class BatchDedupRunner:
cross_review += 1 cross_review += 1
break break
processed = min(batch_start + PARALLEL_BATCH, len(rows)) except Exception as e:
self._progress_count = processed logger.error("BatchDedup cross-group control %s error: %s", r[1], e)
if processed % 500 < PARALLEL_BATCH: self.stats["errors"] += 1
try:
self.db.rollback()
except Exception:
pass
self._progress_count += 1
# Log progress every page
processed = self._progress_count
if processed % 500 < DB_PAGE:
logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review", logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review",
processed, len(rows), cross_linked, cross_review) processed, len(rows), cross_linked, cross_review)