""" Task 1: Remove obvious duplicate controls. Strategy: Within each (regulation, article, paragraph) group, compare titles using word overlap (Jaccard). If >60% similar → duplicate. Keep the oldest control (first created), mark others as 'rejected'. """ import json import os import re import sys from collections import defaultdict from sqlalchemy import create_engine, text as sql_text DB_URL = os.environ['DATABASE_URL'] engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"}) DRY_RUN = '--dry-run' in sys.argv JACCARD_THRESHOLD = 0.45 # Title word overlap threshold for dedup def tokenize(text): """Simple word tokenizer for German/English text.""" if not text: return set() words = re.findall(r'\b[a-zA-ZäöüÄÖÜß]{3,}\b', text.lower()) # Remove common stopwords stops = {'und', 'der', 'die', 'das', 'für', 'von', 'mit', 'bei', 'zur', 'zum', 'den', 'des', 'dem', 'ein', 'eine', 'einer', 'eines', 'the', 'and', 'for', 'with', 'nicht', 'oder', 'auf', 'als', 'nach', 'über', 'aus', 'ist', 'sind', 'werden', 'wird', 'durch', 'unter', 'vor', 'dass'} return set(words) - stops def jaccard(set_a, set_b): if not set_a or not set_b: return 0.0 intersection = set_a & set_b union = set_a | set_b return len(intersection) / len(union) if union else 0.0 print("=" * 60) print("TASK 1: DEDUPLICATE CONTROLS (Jaccard title similarity)") print(f" Threshold: {JACCARD_THRESHOLD}") print("=" * 60) with engine.begin() as conn: # Load all duplicate groups with open("/tmp/dedup_plan.json") as f: dup_groups = json.load(f) print(f" Duplicate groups from plan: {len(dup_groups)}") # For each group, load full control data and compare titles total_rejected = 0 total_kept = 0 groups_with_dupes = 0 for group in dup_groups: reg = group["reg"] article = group["article"] paragraph = group["paragraph"] ids = group["ids"] if len(ids) < 2: continue # Load controls rows = conn.execute(sql_text(""" SELECT id, title, objective, created_at, release_state, control_id FROM compliance.canonical_controls WHERE id = ANY(CAST(:ids AS uuid[])) ORDER BY created_at ASC """), {"ids": ids}).fetchall() if len(rows) < 2: continue # Compare: keep first (oldest), check others against it and each other kept = [rows[0]] to_reject = [] for candidate in rows[1:]: cand_tokens = tokenize(candidate[1]) is_dup = False # Check against all kept controls for keeper in kept: keep_tokens = tokenize(keeper[1]) sim = jaccard(cand_tokens, keep_tokens) if sim >= JACCARD_THRESHOLD: is_dup = True break if is_dup: to_reject.append(candidate) else: kept.append(candidate) if to_reject: groups_with_dupes += 1 total_rejected += len(to_reject) total_kept += len(kept) if groups_with_dupes <= 5: print(f"\n {reg} {article} {paragraph}: {len(rows)} controls → keep {len(kept)}, reject {len(to_reject)}") for k in kept[:2]: print(f" [KEEP] {k[1][:70]}") for r in to_reject[:3]: print(f" [REJ ] {r[1][:70]}") if len(to_reject) > 3: print(f" ... +{len(to_reject) - 3} more rejected") if not DRY_RUN: reject_ids = [r[0] for r in to_reject] conn.execute(sql_text(""" UPDATE compliance.canonical_controls SET release_state = 'duplicate', customer_visible = false, generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || '{"dedup_reason": "title_jaccard_qa", "dedup_date": "2026-03-19"}'::jsonb, updated_at = NOW() WHERE id = ANY(CAST(:ids AS uuid[])) """), {"ids": reject_ids}) print(f"\n{'=' * 60}") print(f"DEDUP RESULTS") print(f"{'=' * 60}") print(f" Groups processed: {len(dup_groups)}") print(f" Groups with dupes: {groups_with_dupes}") print(f" Controls rejected: {total_rejected}") print(f" Controls kept: {total_kept}") print(f" Dry run: {DRY_RUN}") # Verify final counts if not DRY_RUN: r = conn.execute(sql_text(""" SELECT release_state, count(*) FROM compliance.canonical_controls GROUP BY release_state ORDER BY count(*) DESC """)) print(f"\n === Final control state distribution ===") for row in r.fetchall(): print(f" {str(row[0]):20s} {row[1]:6d}") # Active controls (not rejected/too_close) r2 = conn.execute(sql_text(""" SELECT count(*) FROM compliance.canonical_controls WHERE release_state NOT IN ('duplicate', 'too_close', 'deprecated') """)) active = r2.scalar() print(f"\n Active controls (draft/verified/needs_review): {active}")