feat(pipeline): add checkpoint to dedup Phase 2 — survives container restart
Stores last_control_id in canonical_generation_jobs after each page. On restart, resumes from checkpoint instead of starting over. Checkpoint is deleted on completion. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -346,13 +346,40 @@ class BatchDedupRunner:
|
|||||||
|
|
||||||
self._progress_total = total
|
self._progress_total = total
|
||||||
self._progress_count = 0
|
self._progress_count = 0
|
||||||
logger.info("BatchDedup Cross-group: %d masters to check", total)
|
|
||||||
cross_linked = 0
|
cross_linked = 0
|
||||||
cross_review = 0
|
cross_review = 0
|
||||||
|
|
||||||
# Paginated processing — 100 rows per DB query
|
# Checkpoint: resume from last processed control_id
|
||||||
DB_PAGE = 100
|
DB_PAGE = 100
|
||||||
last_control_id = ""
|
# Checkpoint: resume from last processed control_id (survives container restart)
|
||||||
|
checkpoint_row = self.db.execute(text("""
|
||||||
|
SELECT config FROM canonical_generation_jobs
|
||||||
|
WHERE status = 'dedup_phase2_checkpoint'
|
||||||
|
LIMIT 1
|
||||||
|
""")).fetchone()
|
||||||
|
last_control_id = checkpoint_row[0] if checkpoint_row else ""
|
||||||
|
|
||||||
|
if last_control_id:
|
||||||
|
skip_row = self.db.execute(text("""
|
||||||
|
SELECT COUNT(*) FROM canonical_controls
|
||||||
|
WHERE decomposition_method = 'pass0b'
|
||||||
|
AND release_state != 'duplicate'
|
||||||
|
AND release_state != 'deprecated'
|
||||||
|
AND control_id <= :last_id
|
||||||
|
"""), {"last_id": last_control_id}).fetchone()
|
||||||
|
skipped = skip_row[0] if skip_row else 0
|
||||||
|
self._progress_count = skipped
|
||||||
|
logger.info("BatchDedup Cross-group: RESUMING from %s (skipping %d already processed)",
|
||||||
|
last_control_id, skipped)
|
||||||
|
else:
|
||||||
|
self.db.execute(text("""
|
||||||
|
INSERT INTO canonical_generation_jobs (id, status, config)
|
||||||
|
VALUES (gen_random_uuid(), 'dedup_phase2_checkpoint', '')
|
||||||
|
"""))
|
||||||
|
self.db.commit()
|
||||||
|
|
||||||
|
logger.info("BatchDedup Cross-group: %d masters to check (starting from %s)",
|
||||||
|
total, last_control_id or "beginning")
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
rows = self.db.execute(text("""
|
rows = self.db.execute(text("""
|
||||||
@@ -461,11 +488,34 @@ class BatchDedupRunner:
|
|||||||
|
|
||||||
self._progress_count += 1
|
self._progress_count += 1
|
||||||
|
|
||||||
# Log progress every page
|
# Save checkpoint + log progress every page
|
||||||
|
try:
|
||||||
|
self.db.execute(text("""
|
||||||
|
UPDATE canonical_generation_jobs
|
||||||
|
SET config = :cid
|
||||||
|
WHERE status = 'dedup_phase2_checkpoint'
|
||||||
|
"""), {"cid": last_control_id})
|
||||||
|
self.db.commit()
|
||||||
|
except Exception:
|
||||||
|
try:
|
||||||
|
self.db.rollback()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
processed = self._progress_count
|
processed = self._progress_count
|
||||||
if processed % 500 < DB_PAGE:
|
if processed % 500 < DB_PAGE:
|
||||||
logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review",
|
logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review (checkpoint: %s)",
|
||||||
processed, len(rows), cross_linked, cross_review)
|
processed, total, cross_linked, cross_review, last_control_id)
|
||||||
|
|
||||||
|
# Clear checkpoint on completion
|
||||||
|
try:
|
||||||
|
self.db.execute(text("""
|
||||||
|
DELETE FROM canonical_generation_jobs
|
||||||
|
WHERE status = 'dedup_phase2_checkpoint'
|
||||||
|
"""))
|
||||||
|
self.db.commit()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
self.stats["cross_group_linked"] = cross_linked
|
self.stats["cross_group_linked"] = cross_linked
|
||||||
self.stats["cross_group_review"] = cross_review
|
self.stats["cross_group_review"] = cross_review
|
||||||
|
|||||||
Reference in New Issue
Block a user