Files
breakpilot-compliance/scripts/qa/qa_dedup_controls.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

155 lines
5.3 KiB
Python

"""
Task 1: Remove obvious duplicate controls.
Strategy: Within each (regulation, article, paragraph) group,
compare titles using word overlap (Jaccard). If >60% similar → duplicate.
Keep the oldest control (first created), mark others as 'rejected'.
"""
import json
import os
import re
import sys
from collections import defaultdict
from sqlalchemy import create_engine, text as sql_text
DB_URL = os.environ['DATABASE_URL']
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
DRY_RUN = '--dry-run' in sys.argv
JACCARD_THRESHOLD = 0.45 # Title word overlap threshold for dedup
# Stopwords (German + a few English function words) shared by every
# tokenize() call.  Hoisted to module level so the set is built once instead
# of on every invocation — tokenize() runs inside the innermost comparison
# loop of the dedup pass.
_STOPWORDS = frozenset({
    'und', 'der', 'die', 'das', 'für', 'von', 'mit', 'bei', 'zur', 'zum',
    'den', 'des', 'dem', 'ein', 'eine', 'einer', 'eines', 'the', 'and',
    'for', 'with', 'nicht', 'oder', 'auf', 'als', 'nach', 'über', 'aus',
    'ist', 'sind', 'werden', 'wird', 'durch', 'unter', 'vor', 'dass',
})

# Words of 3+ letters (ASCII letters plus German umlauts/ß); compiled once.
_WORD_RE = re.compile(r'\b[a-zA-ZäöüÄÖÜß]{3,}\b')

def tokenize(text):
    """Simple word tokenizer for German/English text.

    Lowercases *text*, extracts words of at least three letters, and removes
    common stopwords.  Returns a set of tokens; empty set for None/empty input.
    """
    if not text:
        return set()
    return set(_WORD_RE.findall(text.lower())) - _STOPWORDS
def jaccard(set_a, set_b):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

    An empty operand yields 0.0 (two empty sets are NOT considered similar).
    """
    if not set_a or not set_b:
        return 0.0
    overlap = len(set_a & set_b)
    combined = len(set_a | set_b)
    return overlap / combined if combined else 0.0
print("=" * 60)
print("TASK 1: DEDUPLICATE CONTROLS (Jaccard title similarity)")
print(f" Threshold: {JACCARD_THRESHOLD}")
print("=" * 60)
with engine.begin() as conn:
# Load all duplicate groups
with open("/tmp/dedup_plan.json") as f:
dup_groups = json.load(f)
print(f" Duplicate groups from plan: {len(dup_groups)}")
# For each group, load full control data and compare titles
total_rejected = 0
total_kept = 0
groups_with_dupes = 0
for group in dup_groups:
reg = group["reg"]
article = group["article"]
paragraph = group["paragraph"]
ids = group["ids"]
if len(ids) < 2:
continue
# Load controls
rows = conn.execute(sql_text("""
SELECT id, title, objective, created_at, release_state, control_id
FROM compliance.canonical_controls
WHERE id = ANY(CAST(:ids AS uuid[]))
ORDER BY created_at ASC
"""), {"ids": ids}).fetchall()
if len(rows) < 2:
continue
# Compare: keep first (oldest), check others against it and each other
kept = [rows[0]]
to_reject = []
for candidate in rows[1:]:
cand_tokens = tokenize(candidate[1])
is_dup = False
# Check against all kept controls
for keeper in kept:
keep_tokens = tokenize(keeper[1])
sim = jaccard(cand_tokens, keep_tokens)
if sim >= JACCARD_THRESHOLD:
is_dup = True
break
if is_dup:
to_reject.append(candidate)
else:
kept.append(candidate)
if to_reject:
groups_with_dupes += 1
total_rejected += len(to_reject)
total_kept += len(kept)
if groups_with_dupes <= 5:
print(f"\n {reg} {article} {paragraph}: {len(rows)} controls → keep {len(kept)}, reject {len(to_reject)}")
for k in kept[:2]:
print(f" [KEEP] {k[1][:70]}")
for r in to_reject[:3]:
print(f" [REJ ] {r[1][:70]}")
if len(to_reject) > 3:
print(f" ... +{len(to_reject) - 3} more rejected")
if not DRY_RUN:
reject_ids = [r[0] for r in to_reject]
conn.execute(sql_text("""
UPDATE compliance.canonical_controls
SET release_state = 'duplicate',
customer_visible = false,
generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
|| '{"dedup_reason": "title_jaccard_qa", "dedup_date": "2026-03-19"}'::jsonb,
updated_at = NOW()
WHERE id = ANY(CAST(:ids AS uuid[]))
"""), {"ids": reject_ids})
print(f"\n{'=' * 60}")
print(f"DEDUP RESULTS")
print(f"{'=' * 60}")
print(f" Groups processed: {len(dup_groups)}")
print(f" Groups with dupes: {groups_with_dupes}")
print(f" Controls rejected: {total_rejected}")
print(f" Controls kept: {total_kept}")
print(f" Dry run: {DRY_RUN}")
# Verify final counts
if not DRY_RUN:
r = conn.execute(sql_text("""
SELECT release_state, count(*)
FROM compliance.canonical_controls
GROUP BY release_state
ORDER BY count(*) DESC
"""))
print(f"\n === Final control state distribution ===")
for row in r.fetchall():
print(f" {str(row[0]):20s} {row[1]:6d}")
# Active controls (not rejected/too_close)
r2 = conn.execute(sql_text("""
SELECT count(*) FROM compliance.canonical_controls
WHERE release_state NOT IN ('duplicate', 'too_close', 'deprecated')
"""))
active = r2.scalar()
print(f"\n Active controls (draft/verified/needs_review): {active}")