From 9dc16674e2287cd8573ef443ee4f3ad567b750fd Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 28 Apr 2026 00:31:22 +0200 Subject: [PATCH] perf(pipeline): skip singleton groups in dedup Phase 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 153k of 160k merge groups have only 1 control — no intra-group dedup possible. Skip them in Phase 1, they become masters automatically. Phase 2 (cross-group) still checks them via Qdrant embeddings. Reduces Phase 1 from ~96h to ~2h. Co-Authored-By: Claude Opus 4.6 (1M context) --- control-pipeline/services/batch_dedup_runner.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/control-pipeline/services/batch_dedup_runner.py b/control-pipeline/services/batch_dedup_runner.py index e305e2a..bd73696 100644 --- a/control-pipeline/services/batch_dedup_runner.py +++ b/control-pipeline/services/batch_dedup_runner.py @@ -131,11 +131,20 @@ class BatchDedupRunner: await ensure_qdrant_collection(collection=self.collection) # Phase 1: Intra-group dedup (same merge_group_hint) + # Optimization: skip singleton groups (they're automatically masters) self._progress_phase = "phase1" groups = self._load_merge_groups(hint_filter) self._progress_total = self.stats["total_controls"] - for hint, controls in groups: + multi_groups = [(h, c) for h, c in groups if len(c) > 1] + singleton_count = len(groups) - len(multi_groups) + self.stats["singleton_groups_skipped"] = singleton_count + logger.info( + "BatchDedup Phase 1: %d multi-control groups to process, %d singletons skipped", + len(multi_groups), singleton_count, + ) + + for hint, controls in multi_groups: try: await self._process_hint_group(hint, controls, dry_run) self.stats["phase1_groups_processed"] += 1 @@ -148,8 +157,8 @@ class BatchDedupRunner: pass logger.info( - "BatchDedup Phase 1 done: %d masters, %d linked, %d review", - self.stats["masters"], self.stats["linked"], self.stats["review"], + "BatchDedup Phase 1 done: %d masters, %d linked, %d review (skipped %d singletons)", + self.stats["masters"], self.stats["linked"], self.stats["review"], singleton_count, ) # Phase 2: Cross-group dedup via embeddings