perf(pipeline): skip singleton groups in dedup Phase 1

153k of 160k merge groups contain only 1 control, so no intra-group
dedup is possible. Skip them in Phase 1; they become masters automatically.
Phase 2 (cross-group) still checks them via Qdrant embeddings.

Reduces Phase 1 from ~96h to ~2h.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-28 00:31:22 +02:00
parent e6e2688b56
commit 9dc16674e2

View File

@@ -131,11 +131,20 @@ class BatchDedupRunner:
await ensure_qdrant_collection(collection=self.collection)
# Phase 1: Intra-group dedup (same merge_group_hint)
# Optimization: skip singleton groups (they're automatically masters)
self._progress_phase = "phase1"
groups = self._load_merge_groups(hint_filter)
self._progress_total = self.stats["total_controls"]
for hint, controls in groups:
multi_groups = [(h, c) for h, c in groups if len(c) > 1]
singleton_count = len(groups) - len(multi_groups)
self.stats["singleton_groups_skipped"] = singleton_count
logger.info(
"BatchDedup Phase 1: %d multi-control groups to process, %d singletons skipped",
len(multi_groups), singleton_count,
)
for hint, controls in multi_groups:
try:
await self._process_hint_group(hint, controls, dry_run)
self.stats["phase1_groups_processed"] += 1
@@ -148,8 +157,8 @@ class BatchDedupRunner:
pass
logger.info(
"BatchDedup Phase 1 done: %d masters, %d linked, %d review",
self.stats["masters"], self.stats["linked"], self.stats["review"],
"BatchDedup Phase 1 done: %d masters, %d linked, %d review (skipped %d singletons)",
self.stats["masters"], self.stats["linked"], self.stats["review"], singleton_count,
)
# Phase 2: Cross-group dedup via embeddings