feat: Batch Dedup Runner — 85k→~18-25k Master Controls
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 32s
CI/CD / test-python-backend-compliance (push) Successful in 30s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 1s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 32s
CI/CD / test-python-backend-compliance (push) Successful in 30s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 1s
Adds batch orchestration for deduplicating ~85k Pass 0b atomic controls into ~18-25k unique masters with M:N parent linking.

New files:
- migrations/078_batch_dedup.sql: merged_into_uuid column, perf indexes, link_type CHECK extended for cross_regulation
- batch_dedup_runner.py: BatchDedupRunner with quality scoring, merge-hint grouping, title-identical short-circuit, parent-link transfer, and cross-regulation pass
- tests/test_batch_dedup_runner.py: 21 tests (all passing)

Modified:
- control_dedup.py: optional collection param on Qdrant functions
- crosswalk_routes.py: POST/GET batch-dedup endpoints

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -764,6 +764,75 @@ async def decomposition_status():
|
||||
db.close()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# BATCH DEDUP ENDPOINTS
|
||||
# =============================================================================
|
||||
|
||||
|
||||
# Module-level reference to the batch dedup runner, used for status polling.
# migrate_batch_dedup assigns the active runner here while a run is in
# progress and resets it to None when the run ends; batch_dedup_status reads
# it to decide between live progress and DB-derived counts.
|
||||
_batch_dedup_runner = None
|
||||
|
||||
|
||||
@router.post("/migrate/batch-dedup", response_model=MigrationResponse)
async def migrate_batch_dedup(
    dry_run: bool = Query(False, description="Preview mode — no DB changes"),
    pattern_id: Optional[str] = Query(None, description="Only process this pattern"),
):
    """Batch dedup: reduce ~85k Pass 0b controls to ~18-25k masters.

    Groups controls by pattern_id + merge_group_hint, picks the best
    quality master, and links duplicates via control_parent_links.

    Args:
        dry_run: Preview mode; forwarded to ``BatchDedupRunner.run`` as
            ``dry_run``.
        pattern_id: When given, forwarded to ``BatchDedupRunner.run`` as
            ``pattern_filter`` so only that pattern is processed.

    Returns:
        A ``MigrationResponse`` with ``status="completed"`` and the stats
        returned by the runner.

    Raises:
        HTTPException: Status 500 carrying the error text when the run (or
            the runner construction) fails.
    """
    global _batch_dedup_runner
    # Imported inside the handler, as the original code does.
    from compliance.services.batch_dedup_runner import BatchDedupRunner

    db = SessionLocal()
    runner = None
    try:
        runner = BatchDedupRunner(db=db)
        # Publish the runner so the status endpoint can report live progress.
        _batch_dedup_runner = runner
        stats = await runner.run(dry_run=dry_run, pattern_filter=pattern_id)
        return MigrationResponse(status="completed", stats=stats)
    except Exception as e:
        # logger.exception keeps the traceback; the plain error message alone
        # is rarely enough to diagnose a failed multi-step batch run.
        logger.exception("Batch dedup failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Only clear the published reference if it is still ours. Clearing it
        # unconditionally would make the status endpoint report "not running"
        # while a different, overlapping request's runner is still active.
        if _batch_dedup_runner is runner:
            _batch_dedup_runner = None
        db.close()
|
||||
|
||||
|
||||
@router.get("/migrate/batch-dedup/status")
async def batch_dedup_status():
    """Get current batch dedup progress (while running).

    If a runner is currently registered at module level, the response is
    ``{"running": True}`` merged with that runner's ``get_status()`` output.
    Otherwise the response is built from two count queries: pass0b totals,
    duplicates and masters from ``canonical_controls``, plus the number of
    pending rows in ``control_dedup_reviews``.
    """
    active_runner = _batch_dedup_runner
    if active_runner is not None:
        # Live run: runner-provided fields are layered over the running flag.
        live_status = {"running": True}
        live_status.update(active_runner.get_status())
        return live_status

    # Idle: fall back to aggregate numbers read from the database.
    counts_stmt = text("""
        SELECT
            count(*) FILTER (WHERE decomposition_method = 'pass0b') AS total_pass0b,
            count(*) FILTER (WHERE decomposition_method = 'pass0b'
                AND release_state = 'duplicate') AS duplicates,
            count(*) FILTER (WHERE decomposition_method = 'pass0b'
                AND release_state != 'duplicate'
                AND release_state != 'deprecated') AS masters
        FROM canonical_controls
    """)
    pending_stmt = text(
        "SELECT count(*) FROM control_dedup_reviews WHERE review_status = 'pending'"
    )

    session = SessionLocal()
    try:
        total_pass0b, duplicates, masters = session.execute(counts_stmt).fetchone()
        pending_reviews = session.execute(pending_stmt).fetchone()[0]
        return {
            "running": False,
            "total_pass0b": total_pass0b,
            "duplicates": duplicates,
            "masters": masters,
            "pending_reviews": pending_reviews,
        }
    finally:
        session.close()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# HELPERS
|
||||
# =============================================================================
|
||||
|
||||
Reference in New Issue
Block a user