perf(pipeline): skip singleton groups in dedup Phase 1
153k of 160k merge groups have only 1 control, so no intra-group dedup is possible. Skip them in Phase 1; they become masters automatically. Phase 2 (cross-group) still checks them via Qdrant embeddings. Reduces Phase 1 from ~96h to ~2h. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -131,11 +131,20 @@ class BatchDedupRunner:
|
||||
await ensure_qdrant_collection(collection=self.collection)
|
||||
|
||||
# Phase 1: Intra-group dedup (same merge_group_hint)
|
||||
# Optimization: skip singleton groups (they're automatically masters)
|
||||
self._progress_phase = "phase1"
|
||||
groups = self._load_merge_groups(hint_filter)
|
||||
self._progress_total = self.stats["total_controls"]
|
||||
|
||||
for hint, controls in groups:
|
||||
multi_groups = [(h, c) for h, c in groups if len(c) > 1]
|
||||
singleton_count = len(groups) - len(multi_groups)
|
||||
self.stats["singleton_groups_skipped"] = singleton_count
|
||||
logger.info(
|
||||
"BatchDedup Phase 1: %d multi-control groups to process, %d singletons skipped",
|
||||
len(multi_groups), singleton_count,
|
||||
)
|
||||
|
||||
for hint, controls in multi_groups:
|
||||
try:
|
||||
await self._process_hint_group(hint, controls, dry_run)
|
||||
self.stats["phase1_groups_processed"] += 1
|
||||
@@ -148,8 +157,8 @@ class BatchDedupRunner:
|
||||
pass
|
||||
|
||||
logger.info(
|
||||
"BatchDedup Phase 1 done: %d masters, %d linked, %d review",
|
||||
self.stats["masters"], self.stats["linked"], self.stats["review"],
|
||||
"BatchDedup Phase 1 done: %d masters, %d linked, %d review (skipped %d singletons)",
|
||||
self.stats["masters"], self.stats["linked"], self.stats["review"], singleton_count,
|
||||
)
|
||||
|
||||
# Phase 2: Cross-group dedup via embeddings
|
||||
|
||||
Reference in New Issue
Block a user