Files
breakpilot-compliance/scripts/qa/qa_dedup_controls.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

155 lines
5.3 KiB
Python

"""
Task 1: Remove obvious duplicate controls.
Strategy: Within each (regulation, article, paragraph) group,
compare titles using word overlap (Jaccard). If >60% similar → duplicate.
Keep the oldest control (first created), mark others as 'rejected'.
"""
import json
import os
import re
import sys
from collections import defaultdict
from sqlalchemy import create_engine, text as sql_text
DB_URL = os.environ['DATABASE_URL']
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
DRY_RUN = '--dry-run' in sys.argv
JACCARD_THRESHOLD = 0.45 # Title word overlap threshold for dedup
# Stopwords (German + a few English function words) shared by every
# tokenize() call.  Hoisted to module level so the set is built once instead
# of on every invocation — tokenize() runs inside the innermost comparison
# loop of the dedup pass.
_STOPWORDS = frozenset({
    'und', 'der', 'die', 'das', 'für', 'von', 'mit', 'bei', 'zur', 'zum',
    'den', 'des', 'dem', 'ein', 'eine', 'einer', 'eines', 'the', 'and',
    'for', 'with', 'nicht', 'oder', 'auf', 'als', 'nach', 'über', 'aus',
    'ist', 'sind', 'werden', 'wird', 'durch', 'unter', 'vor', 'dass',
})

# Words of 3+ letters (ASCII letters plus German umlauts/ß); compiled once.
_WORD_RE = re.compile(r'\b[a-zA-ZäöüÄÖÜß]{3,}\b')

def tokenize(text):
    """Simple word tokenizer for German/English text.

    Lowercases *text*, extracts words of at least three letters, and removes
    common stopwords.  Returns a set of tokens; empty set for None/empty input.
    """
    if not text:
        return set()
    return set(_WORD_RE.findall(text.lower())) - _STOPWORDS
def jaccard(set_a, set_b):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

    An empty operand yields 0.0 (two empty sets are NOT considered similar).
    """
    if not set_a or not set_b:
        return 0.0
    overlap = len(set_a & set_b)
    combined = len(set_a | set_b)
    return overlap / combined if combined else 0.0
print("=" * 60)
print("TASK 1: DEDUPLICATE CONTROLS (Jaccard title similarity)")
print(f" Threshold: {JACCARD_THRESHOLD}")
print("=" * 60)
with engine.begin() as conn:
# Load all duplicate groups
with open("/tmp/dedup_plan.json") as f:
dup_groups = json.load(f)
print(f" Duplicate groups from plan: {len(dup_groups)}")
# For each group, load full control data and compare titles
total_rejected = 0
total_kept = 0
groups_with_dupes = 0
for group in dup_groups:
reg = group["reg"]
article = group["article"]
paragraph = group["paragraph"]
ids = group["ids"]
if len(ids) < 2:
continue
# Load controls
rows = conn.execute(sql_text("""
SELECT id, title, objective, created_at, release_state, control_id
FROM compliance.canonical_controls
WHERE id = ANY(CAST(:ids AS uuid[]))
ORDER BY created_at ASC
"""), {"ids": ids}).fetchall()
if len(rows) < 2:
continue
# Compare: keep first (oldest), check others against it and each other
kept = [rows[0]]
to_reject = []
for candidate in rows[1:]:
cand_tokens = tokenize(candidate[1])
is_dup = False
# Check against all kept controls
for keeper in kept:
keep_tokens = tokenize(keeper[1])
sim = jaccard(cand_tokens, keep_tokens)
if sim >= JACCARD_THRESHOLD:
is_dup = True
break
if is_dup:
to_reject.append(candidate)
else:
kept.append(candidate)
if to_reject:
groups_with_dupes += 1
total_rejected += len(to_reject)
total_kept += len(kept)
if groups_with_dupes <= 5:
print(f"\n {reg} {article} {paragraph}: {len(rows)} controls → keep {len(kept)}, reject {len(to_reject)}")
for k in kept[:2]:
print(f" [KEEP] {k[1][:70]}")
for r in to_reject[:3]:
print(f" [REJ ] {r[1][:70]}")
if len(to_reject) > 3:
print(f" ... +{len(to_reject) - 3} more rejected")
if not DRY_RUN:
reject_ids = [r[0] for r in to_reject]
conn.execute(sql_text("""
UPDATE compliance.canonical_controls
SET release_state = 'duplicate',
customer_visible = false,
generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
|| '{"dedup_reason": "title_jaccard_qa", "dedup_date": "2026-03-19"}'::jsonb,
updated_at = NOW()
WHERE id = ANY(CAST(:ids AS uuid[]))
"""), {"ids": reject_ids})
print(f"\n{'=' * 60}")
print(f"DEDUP RESULTS")
print(f"{'=' * 60}")
print(f" Groups processed: {len(dup_groups)}")
print(f" Groups with dupes: {groups_with_dupes}")
print(f" Controls rejected: {total_rejected}")
print(f" Controls kept: {total_kept}")
print(f" Dry run: {DRY_RUN}")
# Verify final counts
if not DRY_RUN:
r = conn.execute(sql_text("""
SELECT release_state, count(*)
FROM compliance.canonical_controls
GROUP BY release_state
ORDER BY count(*) DESC
"""))
print(f"\n === Final control state distribution ===")
for row in r.fetchall():
print(f" {str(row[0]):20s} {row[1]:6d}")
# Active controls (not rejected/too_close)
r2 = conn.execute(sql_text("""
SELECT count(*) FROM compliance.canonical_controls
WHERE release_state NOT IN ('duplicate', 'too_close', 'deprecated')
"""))
active = r2.scalar()
print(f"\n Active controls (draft/verified/needs_review): {active}")