perf(pipeline): skip singleton groups in dedup Phase 1
153k of 160k merge groups contain only one control, so no intra-group dedup is possible. Skip them in Phase 1; they become masters automatically. Phase 2 (cross-group) still checks them via Qdrant embeddings. This reduces Phase 1 from ~96h to ~2h.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -131,11 +131,20 @@ class BatchDedupRunner:
|
|||||||
await ensure_qdrant_collection(collection=self.collection)
|
await ensure_qdrant_collection(collection=self.collection)
|
||||||
|
|
||||||
# Phase 1: Intra-group dedup (same merge_group_hint)
|
# Phase 1: Intra-group dedup (same merge_group_hint)
|
||||||
|
# Optimization: skip singleton groups (they're automatically masters)
|
||||||
self._progress_phase = "phase1"
|
self._progress_phase = "phase1"
|
||||||
groups = self._load_merge_groups(hint_filter)
|
groups = self._load_merge_groups(hint_filter)
|
||||||
self._progress_total = self.stats["total_controls"]
|
self._progress_total = self.stats["total_controls"]
|
||||||
|
|
||||||
for hint, controls in groups:
|
multi_groups = [(h, c) for h, c in groups if len(c) > 1]
|
||||||
|
singleton_count = len(groups) - len(multi_groups)
|
||||||
|
self.stats["singleton_groups_skipped"] = singleton_count
|
||||||
|
logger.info(
|
||||||
|
"BatchDedup Phase 1: %d multi-control groups to process, %d singletons skipped",
|
||||||
|
len(multi_groups), singleton_count,
|
||||||
|
)
|
||||||
|
|
||||||
|
for hint, controls in multi_groups:
|
||||||
try:
|
try:
|
||||||
await self._process_hint_group(hint, controls, dry_run)
|
await self._process_hint_group(hint, controls, dry_run)
|
||||||
self.stats["phase1_groups_processed"] += 1
|
self.stats["phase1_groups_processed"] += 1
|
||||||
@@ -148,8 +157,8 @@ class BatchDedupRunner:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"BatchDedup Phase 1 done: %d masters, %d linked, %d review",
|
"BatchDedup Phase 1 done: %d masters, %d linked, %d review (skipped %d singletons)",
|
||||||
self.stats["masters"], self.stats["linked"], self.stats["review"],
|
self.stats["masters"], self.stats["linked"], self.stats["review"], singleton_count,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Phase 2: Cross-group dedup via embeddings
|
# Phase 2: Cross-group dedup via embeddings
|
||||||
|
|||||||
Reference in New Issue
Block a user