feat: Batch Dedup Runner — 85k→~18-25k Master Controls
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 32s
CI/CD / test-python-backend-compliance (push) Successful in 30s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 1s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 32s
CI/CD / test-python-backend-compliance (push) Successful in 30s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 1s
Adds batch orchestration for deduplicating ~85k Pass 0b atomic controls into ~18-25k unique masters with M:N parent linking.

New files:
- migrations/078_batch_dedup.sql: merged_into_uuid column, perf indexes, link_type CHECK extended for cross_regulation
- batch_dedup_runner.py: BatchDedupRunner with quality scoring, merge-hint grouping, title-identical short-circuit, parent-link transfer, and cross-regulation pass
- tests/test_batch_dedup_runner.py: 21 tests (all passing)

Modified:
- control_dedup.py: optional collection param on Qdrant functions
- crosswalk_routes.py: POST/GET batch-dedup endpoints

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
42
backend-compliance/migrations/078_batch_dedup.sql
Normal file
42
backend-compliance/migrations/078_batch_dedup.sql
Normal file
@@ -0,0 +1,42 @@
|
||||
-- Migration 078: Batch Dedup — schema extensions for the 85k→~18-25k reduction.
--
-- Runs as a single transaction and makes three changes:
--   1. Adds canonical_controls.merged_into_uuid (self-referencing FK) and a
--      partial index on it.
--   2. Adds two partial indexes on canonical_controls for pass0b rows.
--   3. Rebuilds the link_type CHECK on control_parent_links so that it also
--      accepts 'cross_regulation'.
--
-- Every statement is guarded (IF NOT EXISTS / IF EXISTS, or drop-then-add),
-- so the migration can be re-run without error.

BEGIN;

-- =============================================================================
-- 1. merged_into_uuid: Track which master a duplicate was merged into
-- =============================================================================

-- NULL means "not merged". The FK is named explicitly; the name is the one
-- PostgreSQL would generate by default, so the resulting schema is unchanged.
-- No ON DELETE action is given, so the default (NO ACTION) applies: a row
-- cannot be deleted while other rows still reference it via merged_into_uuid.
ALTER TABLE canonical_controls
    ADD COLUMN IF NOT EXISTS merged_into_uuid UUID
        CONSTRAINT canonical_controls_merged_into_uuid_fkey
        REFERENCES canonical_controls(id);

-- Partial index: only rows that have actually been merged are indexed.
CREATE INDEX IF NOT EXISTS idx_cc_merged_into
    ON canonical_controls(merged_into_uuid)
    WHERE merged_into_uuid IS NOT NULL;

-- =============================================================================
-- 2. Performance indexes for batch dedup queries
-- =============================================================================

-- These are plain (non-CONCURRENTLY) index builds because CONCURRENTLY cannot
-- run inside a transaction block; writes to canonical_controls are blocked
-- while each index is built.

-- Index on merge_group_hint inside generation_metadata (for sub-grouping).
-- Queries must use the same expression and the pass0b predicate for the
-- planner to consider this index.
CREATE INDEX IF NOT EXISTS idx_cc_merge_group_hint
    ON canonical_controls ((generation_metadata->>'merge_group_hint'))
    WHERE decomposition_method = 'pass0b';

-- Composite index for pattern-based dedup loading
CREATE INDEX IF NOT EXISTS idx_cc_pattern_dedup
    ON canonical_controls (pattern_id, release_state)
    WHERE decomposition_method = 'pass0b';

-- =============================================================================
-- 3. Extend link_type CHECK to include 'cross_regulation'
-- =============================================================================

-- A CHECK constraint cannot be altered in place, so it is dropped and re-added.
-- NOTE(review): DROP ... IF EXISTS is a silent no-op when the existing
-- constraint has a different name; confirm the current CHECK really is called
-- control_parent_links_link_type_check, otherwise the old constraint stays in
-- place and keeps rejecting 'cross_regulation'.
ALTER TABLE control_parent_links
    DROP CONSTRAINT IF EXISTS control_parent_links_link_type_check;

-- Adding the constraint validates all existing rows against the new list.
ALTER TABLE control_parent_links
    ADD CONSTRAINT control_parent_links_link_type_check
    CHECK (link_type IN ('decomposition', 'dedup_merge', 'manual', 'crosswalk', 'cross_regulation'));

COMMIT;
|
||||
Reference in New Issue
Block a user