SQLAlchemy's text() parser doesn't properly handle :param::type syntax — it fails to recognize :dd as a bind parameter when followed by ::jsonb. Using CAST(:dd AS jsonb) instead. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
619 lines
24 KiB
Python
"""Batch Dedup Runner — Orchestrates deduplication of ~85k atomare Controls.
|
|
|
|
Reduces Pass 0b controls from ~85k to ~18-25k unique Master Controls via:
|
|
Phase 1: Intra-Group Dedup — same merge_group_hint → pick best, link rest
|
|
(85k → ~52k, mostly title-identical short-circuit, no embeddings)
|
|
Phase 2: Cross-Group Dedup — embed masters, search Qdrant for similar
|
|
masters with different hints (52k → ~18-25k)
|
|
|
|
All Pass 0b controls have pattern_id=NULL. The primary grouping key is
|
|
merge_group_hint (format: "action_type:norm_obj:trigger_key"), which
|
|
encodes the normalized action, object, and trigger.
|
|
|
|
Usage:
|
|
runner = BatchDedupRunner(db)
|
|
stats = await runner.run(dry_run=True) # preview
|
|
stats = await runner.run(dry_run=False) # execute
|
|
stats = await runner.run(hint_filter="implement:multi_factor_auth:none")
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
from collections import defaultdict
|
|
|
|
from sqlalchemy import text
|
|
|
|
from compliance.services.control_dedup import (
|
|
canonicalize_text,
|
|
ensure_qdrant_collection,
|
|
get_embedding,
|
|
normalize_action,
|
|
normalize_object,
|
|
qdrant_search_cross_regulation,
|
|
qdrant_upsert,
|
|
LINK_THRESHOLD,
|
|
REVIEW_THRESHOLD,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
DEDUP_COLLECTION = "atomic_controls_dedup"
|
|
|
|
|
|
# ── Quality Score ────────────────────────────────────────────────────────
|
|
|
|
|
|
def _parse_json_list(value) -> list:
    """Return *value* as a list, decoding a JSON string if necessary.

    Falls back to an empty list on invalid JSON so a malformed column
    never aborts scoring. Non-string inputs are returned unchanged.
    """
    if isinstance(value, str):
        try:
            return json.loads(value)
        except (json.JSONDecodeError, TypeError):
            return []
    return value


def quality_score(control: dict) -> float:
    """Score a control by richness of requirements, tests, evidence, and objective.

    Weights favor actionable detail: each requirement counts 2.0, each test
    procedure 1.5, each evidence item 1.0; the objective length contributes
    up to 3.0 points (cap reached at 600 characters).

    Higher score = better candidate for master control.
    """
    score = 0.0

    # Each JSONB column may arrive as a JSON string or an already-decoded list.
    score += len(_parse_json_list(control.get("requirements") or "[]")) * 2.0
    score += len(_parse_json_list(control.get("test_procedure") or "[]")) * 1.5
    score += len(_parse_json_list(control.get("evidence") or "[]")) * 1.0

    objective = control.get("objective") or ""
    score += min(len(objective) / 200, 3.0)

    return score
|
|
|
|
|
|
# ── Batch Dedup Runner ───────────────────────────────────────────────────
|
|
|
|
|
|
class BatchDedupRunner:
    """Batch dedup orchestrator for existing Pass 0b atomic controls."""

    def __init__(self, db, collection: str = DEDUP_COLLECTION):
        self.db = db
        self.collection = collection
        # Counters accumulated across both dedup phases; all start at zero.
        counter_names = (
            "total_controls",
            "unique_hints",
            "phase1_groups_processed",
            "masters",
            "linked",
            "review",
            "new_controls",
            "parent_links_transferred",
            "cross_group_linked",
            "cross_group_review",
            "errors",
            "skipped_title_identical",
        )
        self.stats = {name: 0 for name in counter_names}
        # Lightweight progress state surfaced via get_status().
        self._progress_phase = ""
        self._progress_count = 0
        self._progress_total = 0
|
|
async def run(
|
|
self,
|
|
dry_run: bool = False,
|
|
hint_filter: str = None,
|
|
) -> dict:
|
|
"""Run the full batch dedup pipeline.
|
|
|
|
Args:
|
|
dry_run: If True, compute stats but don't modify DB/Qdrant.
|
|
hint_filter: If set, only process groups matching this hint prefix.
|
|
|
|
Returns:
|
|
Stats dict with counts.
|
|
"""
|
|
start = time.monotonic()
|
|
logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s)",
|
|
dry_run, hint_filter)
|
|
|
|
if not dry_run:
|
|
await ensure_qdrant_collection(collection=self.collection)
|
|
|
|
# Phase 1: Intra-group dedup (same merge_group_hint)
|
|
self._progress_phase = "phase1"
|
|
groups = self._load_merge_groups(hint_filter)
|
|
self._progress_total = self.stats["total_controls"]
|
|
|
|
for hint, controls in groups:
|
|
try:
|
|
await self._process_hint_group(hint, controls, dry_run)
|
|
self.stats["phase1_groups_processed"] += 1
|
|
except Exception as e:
|
|
logger.error("BatchDedup Phase 1 error on hint %s: %s", hint, e)
|
|
self.stats["errors"] += 1
|
|
try:
|
|
self.db.rollback()
|
|
except Exception:
|
|
pass
|
|
|
|
logger.info(
|
|
"BatchDedup Phase 1 done: %d masters, %d linked, %d review",
|
|
self.stats["masters"], self.stats["linked"], self.stats["review"],
|
|
)
|
|
|
|
# Phase 2: Cross-group dedup via embeddings
|
|
if not dry_run:
|
|
self._progress_phase = "phase2"
|
|
await self._run_cross_group_pass()
|
|
|
|
elapsed = time.monotonic() - start
|
|
self.stats["elapsed_seconds"] = round(elapsed, 1)
|
|
logger.info("BatchDedup completed in %.1fs: %s", elapsed, self.stats)
|
|
return self.stats
|
|
|
|
def _load_merge_groups(self, hint_filter: str = None) -> list:
|
|
"""Load all Pass 0b controls grouped by merge_group_hint, largest first."""
|
|
conditions = [
|
|
"decomposition_method = 'pass0b'",
|
|
"release_state != 'deprecated'",
|
|
"release_state != 'duplicate'",
|
|
]
|
|
params = {}
|
|
|
|
if hint_filter:
|
|
conditions.append("generation_metadata->>'merge_group_hint' LIKE :hf")
|
|
params["hf"] = f"{hint_filter}%"
|
|
|
|
where = " AND ".join(conditions)
|
|
rows = self.db.execute(text(f"""
|
|
SELECT id::text, control_id, title, objective,
|
|
pattern_id, requirements::text, test_procedure::text,
|
|
evidence::text, release_state,
|
|
generation_metadata->>'merge_group_hint' as merge_group_hint,
|
|
generation_metadata->>'action_object_class' as action_object_class
|
|
FROM canonical_controls
|
|
WHERE {where}
|
|
ORDER BY control_id
|
|
"""), params).fetchall()
|
|
|
|
by_hint = defaultdict(list)
|
|
for r in rows:
|
|
by_hint[r[9] or ""].append({
|
|
"uuid": r[0],
|
|
"control_id": r[1],
|
|
"title": r[2],
|
|
"objective": r[3],
|
|
"pattern_id": r[4],
|
|
"requirements": r[5],
|
|
"test_procedure": r[6],
|
|
"evidence": r[7],
|
|
"release_state": r[8],
|
|
"merge_group_hint": r[9] or "",
|
|
"action_object_class": r[10] or "",
|
|
})
|
|
|
|
self.stats["total_controls"] = len(rows)
|
|
self.stats["unique_hints"] = len(by_hint)
|
|
|
|
sorted_groups = sorted(by_hint.items(), key=lambda x: len(x[1]), reverse=True)
|
|
logger.info("BatchDedup loaded %d controls in %d hint groups",
|
|
len(rows), len(sorted_groups))
|
|
return sorted_groups
|
|
|
|
def _sub_group_by_merge_hint(self, controls: list) -> dict:
|
|
"""Group controls by merge_group_hint composite key."""
|
|
groups = defaultdict(list)
|
|
for c in controls:
|
|
hint = c["merge_group_hint"]
|
|
if hint:
|
|
groups[hint].append(c)
|
|
else:
|
|
groups[f"__no_hint_{c['uuid']}"].append(c)
|
|
return dict(groups)
|
|
|
|
async def _process_hint_group(
|
|
self,
|
|
hint: str,
|
|
controls: list,
|
|
dry_run: bool,
|
|
):
|
|
"""Process all controls sharing the same merge_group_hint.
|
|
|
|
Within a hint group, all controls share action+object+trigger.
|
|
The best-quality control becomes master, rest are linked as duplicates.
|
|
"""
|
|
if len(controls) < 2:
|
|
# Singleton → always master
|
|
self.stats["masters"] += 1
|
|
if not dry_run:
|
|
await self._embed_and_index(controls[0])
|
|
self._progress_count += 1
|
|
self._log_progress(hint)
|
|
return
|
|
|
|
# Sort by quality score (best first)
|
|
sorted_group = sorted(controls, key=quality_score, reverse=True)
|
|
master = sorted_group[0]
|
|
self.stats["masters"] += 1
|
|
|
|
if not dry_run:
|
|
await self._embed_and_index(master)
|
|
|
|
for candidate in sorted_group[1:]:
|
|
# All share the same hint → check title similarity
|
|
if candidate["title"].strip().lower() == master["title"].strip().lower():
|
|
# Identical title → direct link (no embedding needed)
|
|
self.stats["linked"] += 1
|
|
self.stats["skipped_title_identical"] += 1
|
|
if not dry_run:
|
|
await self._mark_duplicate(master, candidate, confidence=1.0)
|
|
else:
|
|
# Different title within same hint → still likely duplicate
|
|
# Use embedding to verify
|
|
await self._check_and_link_within_group(master, candidate, dry_run)
|
|
|
|
self._progress_count += 1
|
|
self._log_progress(hint)
|
|
|
|
    async def _check_and_link_within_group(
        self,
        master: dict,
        candidate: dict,
        dry_run: bool,
    ):
        """Check if candidate (same hint group) is duplicate of master via embedding.

        Outcome cascade:
          * embedding unavailable → link to master at fixed confidence 0.90
          * no Qdrant results     → link to master at fixed confidence 0.90
          * best score > LINK_THRESHOLD   → link to the *matched* control
          * best score > REVIEW_THRESHOLD → write a review entry
          * otherwise → treat as a new master and index it

        NOTE: stats counters are incremented even in dry_run mode; only the
        DB/Qdrant writes are gated on `not dry_run`.
        """
        # Hint format is "action:object:trigger" — take the first two parts.
        parts = candidate["merge_group_hint"].split(":", 2)
        action = parts[0] if len(parts) > 0 else ""
        obj = parts[1] if len(parts) > 1 else ""

        canonical = canonicalize_text(action, obj, candidate["title"])
        embedding = await get_embedding(canonical)

        if not embedding:
            # Can't embed → link anyway (same hint = same action+object).
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate(master, candidate, confidence=0.90)
            return

        # Search the dedup collection (unfiltered — pattern_id is NULL).
        results = await qdrant_search_cross_regulation(
            embedding, top_k=3, collection=self.collection,
        )

        if not results:
            # No Qdrant matches yet (master might not be indexed yet) → link to master.
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate(master, candidate, confidence=0.90)
            return

        best = results[0]
        best_score = best.get("score", 0.0)
        best_payload = best.get("payload", {})
        best_uuid = best_payload.get("control_uuid", "")

        if best_score > LINK_THRESHOLD:
            # Strong match — link to whichever control Qdrant matched
            # (may differ from this group's master).
            self.stats["linked"] += 1
            if not dry_run:
                await self._mark_duplicate_to(best_uuid, candidate, confidence=best_score)
        elif best_score > REVIEW_THRESHOLD:
            # Borderline — queue for human review instead of auto-linking.
            self.stats["review"] += 1
            if not dry_run:
                self._write_review(candidate, best_payload, best_score)
        else:
            # Very different despite same hint → new master; reuse the
            # embedding we already computed.
            self.stats["new_controls"] += 1
            if not dry_run:
                await self._index_with_embedding(candidate, embedding)
|
async def _run_cross_group_pass(self):
|
|
"""Phase 2: Find cross-group duplicates among surviving masters.
|
|
|
|
After Phase 1, ~52k masters remain. Many have similar semantics
|
|
despite different merge_group_hints (e.g. different German spellings).
|
|
This pass embeds all masters and finds near-duplicates via Qdrant.
|
|
"""
|
|
logger.info("BatchDedup Phase 2: Cross-group pass starting...")
|
|
|
|
rows = self.db.execute(text("""
|
|
SELECT id::text, control_id, title,
|
|
generation_metadata->>'merge_group_hint' as merge_group_hint
|
|
FROM canonical_controls
|
|
WHERE decomposition_method = 'pass0b'
|
|
AND release_state != 'duplicate'
|
|
AND release_state != 'deprecated'
|
|
ORDER BY control_id
|
|
""")).fetchall()
|
|
|
|
self._progress_total = len(rows)
|
|
self._progress_count = 0
|
|
logger.info("BatchDedup Cross-group: %d masters to check", len(rows))
|
|
cross_linked = 0
|
|
cross_review = 0
|
|
|
|
for i, r in enumerate(rows):
|
|
uuid = r[0]
|
|
hint = r[3] or ""
|
|
parts = hint.split(":", 2)
|
|
action = parts[0] if len(parts) > 0 else ""
|
|
obj = parts[1] if len(parts) > 1 else ""
|
|
|
|
canonical = canonicalize_text(action, obj, r[2])
|
|
embedding = await get_embedding(canonical)
|
|
if not embedding:
|
|
continue
|
|
|
|
results = await qdrant_search_cross_regulation(
|
|
embedding, top_k=5, collection=self.collection,
|
|
)
|
|
if not results:
|
|
continue
|
|
|
|
# Find best match from a DIFFERENT hint group
|
|
for match in results:
|
|
match_score = match.get("score", 0.0)
|
|
match_payload = match.get("payload", {})
|
|
match_uuid = match_payload.get("control_uuid", "")
|
|
|
|
# Skip self-match
|
|
if match_uuid == uuid:
|
|
continue
|
|
|
|
# Must be a different hint group (otherwise already handled in Phase 1)
|
|
match_action = match_payload.get("action_normalized", "")
|
|
match_object = match_payload.get("object_normalized", "")
|
|
# Simple check: different control UUID is enough
|
|
if match_score > LINK_THRESHOLD:
|
|
# Mark the worse one as duplicate
|
|
try:
|
|
self.db.execute(text("""
|
|
UPDATE canonical_controls
|
|
SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
|
|
WHERE id = CAST(:dup AS uuid)
|
|
AND release_state != 'duplicate'
|
|
"""), {"master": match_uuid, "dup": uuid})
|
|
|
|
self.db.execute(text("""
|
|
INSERT INTO control_parent_links
|
|
(control_uuid, parent_control_uuid, link_type, confidence)
|
|
VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), 'cross_regulation', :conf)
|
|
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
|
"""), {"cu": match_uuid, "pu": uuid, "conf": match_score})
|
|
|
|
# Transfer parent links
|
|
transferred = self._transfer_parent_links(match_uuid, uuid)
|
|
self.stats["parent_links_transferred"] += transferred
|
|
|
|
self.db.commit()
|
|
cross_linked += 1
|
|
except Exception as e:
|
|
logger.error("BatchDedup cross-group link error %s→%s: %s",
|
|
uuid, match_uuid, e)
|
|
self.db.rollback()
|
|
self.stats["errors"] += 1
|
|
break # Only one cross-link per control
|
|
elif match_score > REVIEW_THRESHOLD:
|
|
self._write_review(
|
|
{"control_id": r[1], "title": r[2], "objective": "",
|
|
"merge_group_hint": hint, "pattern_id": None},
|
|
match_payload, match_score,
|
|
)
|
|
cross_review += 1
|
|
break
|
|
|
|
self._progress_count = i + 1
|
|
if (i + 1) % 500 == 0:
|
|
logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review",
|
|
i + 1, len(rows), cross_linked, cross_review)
|
|
|
|
self.stats["cross_group_linked"] = cross_linked
|
|
self.stats["cross_group_review"] = cross_review
|
|
logger.info("BatchDedup Cross-group complete: %d linked, %d review",
|
|
cross_linked, cross_review)
|
|
|
|
# ── Qdrant Helpers ───────────────────────────────────────────────────
|
|
|
|
async def _embed_and_index(self, control: dict):
|
|
"""Compute embedding and index a control in the dedup Qdrant collection."""
|
|
parts = control["merge_group_hint"].split(":", 2)
|
|
action = parts[0] if len(parts) > 0 else ""
|
|
obj = parts[1] if len(parts) > 1 else ""
|
|
|
|
norm_action = normalize_action(action)
|
|
norm_object = normalize_object(obj)
|
|
canonical = canonicalize_text(action, obj, control["title"])
|
|
embedding = await get_embedding(canonical)
|
|
|
|
if not embedding:
|
|
return
|
|
|
|
await qdrant_upsert(
|
|
point_id=control["uuid"],
|
|
embedding=embedding,
|
|
payload={
|
|
"control_uuid": control["uuid"],
|
|
"control_id": control["control_id"],
|
|
"title": control["title"],
|
|
"pattern_id": control.get("pattern_id"),
|
|
"action_normalized": norm_action,
|
|
"object_normalized": norm_object,
|
|
"canonical_text": canonical,
|
|
"merge_group_hint": control["merge_group_hint"],
|
|
},
|
|
collection=self.collection,
|
|
)
|
|
|
|
async def _index_with_embedding(self, control: dict, embedding: list):
|
|
"""Index a control with a pre-computed embedding."""
|
|
parts = control["merge_group_hint"].split(":", 2)
|
|
action = parts[0] if len(parts) > 0 else ""
|
|
obj = parts[1] if len(parts) > 1 else ""
|
|
|
|
norm_action = normalize_action(action)
|
|
norm_object = normalize_object(obj)
|
|
canonical = canonicalize_text(action, obj, control["title"])
|
|
|
|
await qdrant_upsert(
|
|
point_id=control["uuid"],
|
|
embedding=embedding,
|
|
payload={
|
|
"control_uuid": control["uuid"],
|
|
"control_id": control["control_id"],
|
|
"title": control["title"],
|
|
"pattern_id": control.get("pattern_id"),
|
|
"action_normalized": norm_action,
|
|
"object_normalized": norm_object,
|
|
"canonical_text": canonical,
|
|
"merge_group_hint": control["merge_group_hint"],
|
|
},
|
|
collection=self.collection,
|
|
)
|
|
|
|
# ── DB Write Helpers ─────────────────────────────────────────────────
|
|
|
|
async def _mark_duplicate(self, master: dict, candidate: dict, confidence: float):
|
|
"""Mark candidate as duplicate of master, transfer parent links."""
|
|
try:
|
|
self.db.execute(text("""
|
|
UPDATE canonical_controls
|
|
SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
|
|
WHERE id = CAST(:cand AS uuid)
|
|
"""), {"master": master["uuid"], "cand": candidate["uuid"]})
|
|
|
|
self.db.execute(text("""
|
|
INSERT INTO control_parent_links
|
|
(control_uuid, parent_control_uuid, link_type, confidence)
|
|
VALUES (CAST(:master AS uuid), CAST(:cand_parent AS uuid), 'dedup_merge', :conf)
|
|
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
|
"""), {"master": master["uuid"], "cand_parent": candidate["uuid"], "conf": confidence})
|
|
|
|
transferred = self._transfer_parent_links(master["uuid"], candidate["uuid"])
|
|
self.stats["parent_links_transferred"] += transferred
|
|
|
|
self.db.commit()
|
|
except Exception as e:
|
|
logger.error("BatchDedup _mark_duplicate error %s→%s: %s",
|
|
candidate["uuid"], master["uuid"], e)
|
|
self.db.rollback()
|
|
raise
|
|
|
|
    async def _mark_duplicate_to(self, master_uuid: str, candidate: dict, confidence: float):
        """Mark candidate as duplicate of a Qdrant-matched master.

        Performs, as one unit of work:
          1. UPDATE the candidate row to release_state='duplicate' with
             merged_into_uuid pointing at the master.
          2. INSERT a 'dedup_merge' parent link (master → candidate) at the
             given confidence; pre-existing links ignored via ON CONFLICT.
          3. Transfer the candidate's 'decomposition' parent links to the master.

        Commits on success; on any error logs, rolls back, and re-raises so
        the caller can count the failure.
        """
        try:
            self.db.execute(text("""
                UPDATE canonical_controls
                SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
                WHERE id = CAST(:cand AS uuid)
            """), {"master": master_uuid, "cand": candidate["uuid"]})

            self.db.execute(text("""
                INSERT INTO control_parent_links
                    (control_uuid, parent_control_uuid, link_type, confidence)
                VALUES (CAST(:master AS uuid), CAST(:cand_parent AS uuid), 'dedup_merge', :conf)
                ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
            """), {"master": master_uuid, "cand_parent": candidate["uuid"], "conf": confidence})

            transferred = self._transfer_parent_links(master_uuid, candidate["uuid"])
            self.stats["parent_links_transferred"] += transferred

            self.db.commit()
        except Exception as e:
            logger.error("BatchDedup _mark_duplicate_to error %s→%s: %s",
                         candidate["uuid"], master_uuid, e)
            self.db.rollback()
            raise
|
def _transfer_parent_links(self, master_uuid: str, duplicate_uuid: str) -> int:
|
|
"""Move existing parent links from duplicate to master."""
|
|
rows = self.db.execute(text("""
|
|
SELECT parent_control_uuid::text, link_type, confidence,
|
|
source_regulation, source_article, obligation_candidate_id::text
|
|
FROM control_parent_links
|
|
WHERE control_uuid = CAST(:dup AS uuid)
|
|
AND link_type = 'decomposition'
|
|
"""), {"dup": duplicate_uuid}).fetchall()
|
|
|
|
transferred = 0
|
|
for r in rows:
|
|
parent_uuid = r[0]
|
|
if parent_uuid == master_uuid:
|
|
continue
|
|
self.db.execute(text("""
|
|
INSERT INTO control_parent_links
|
|
(control_uuid, parent_control_uuid, link_type, confidence,
|
|
source_regulation, source_article, obligation_candidate_id)
|
|
VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), :lt, :conf,
|
|
:sr, :sa, CAST(:oci AS uuid))
|
|
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
|
"""), {
|
|
"cu": master_uuid,
|
|
"pu": parent_uuid,
|
|
"lt": r[1],
|
|
"conf": float(r[2]) if r[2] else 1.0,
|
|
"sr": r[3],
|
|
"sa": r[4],
|
|
"oci": r[5],
|
|
})
|
|
transferred += 1
|
|
|
|
return transferred
|
|
|
|
def _write_review(self, candidate: dict, matched_payload: dict, score: float):
|
|
"""Write a dedup review entry for borderline matches."""
|
|
try:
|
|
self.db.execute(text("""
|
|
INSERT INTO control_dedup_reviews
|
|
(candidate_control_id, candidate_title, candidate_objective,
|
|
matched_control_uuid, matched_control_id,
|
|
similarity_score, dedup_stage, dedup_details)
|
|
VALUES (:ccid, :ct, :co, CAST(:mcu AS uuid), :mci,
|
|
:ss, 'batch_dedup', CAST(:dd AS jsonb))
|
|
"""), {
|
|
"ccid": candidate["control_id"],
|
|
"ct": candidate["title"],
|
|
"co": candidate.get("objective", ""),
|
|
"mcu": matched_payload.get("control_uuid"),
|
|
"mci": matched_payload.get("control_id"),
|
|
"ss": score,
|
|
"dd": json.dumps({
|
|
"merge_group_hint": candidate.get("merge_group_hint", ""),
|
|
"pattern_id": candidate.get("pattern_id"),
|
|
}),
|
|
})
|
|
self.db.commit()
|
|
except Exception as e:
|
|
logger.error("BatchDedup _write_review error: %s", e)
|
|
self.db.rollback()
|
|
raise
|
|
|
|
# ── Progress ─────────────────────────────────────────────────────────
|
|
|
|
def _log_progress(self, hint: str):
|
|
"""Log progress every 500 controls."""
|
|
if self._progress_count > 0 and self._progress_count % 500 == 0:
|
|
logger.info(
|
|
"BatchDedup [%s] %d/%d — masters=%d, linked=%d, review=%d",
|
|
self._progress_phase, self._progress_count, self._progress_total,
|
|
self.stats["masters"], self.stats["linked"], self.stats["review"],
|
|
)
|
|
|
|
def get_status(self) -> dict:
|
|
"""Return current progress stats (for status endpoint)."""
|
|
return {
|
|
"phase": self._progress_phase,
|
|
"progress": self._progress_count,
|
|
"total": self._progress_total,
|
|
**self.stats,
|
|
}
|