Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync. Key results (2026-03-20): - 4,110/7,943 controls matched to PDF (100% for major EU regs) - 3,366 article corrections, 705 new assignments - 1,290 controls from Erwägungsgründe (preamble) identified - 779 controls from Anhänge (annexes) identified Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
155 lines
5.3 KiB
Python
155 lines
5.3 KiB
Python
"""
|
|
Task 1: Remove obvious duplicate controls.
|
|
Strategy: Within each (regulation, article, paragraph) group,
|
|
compare titles using word overlap (Jaccard). If similarity >= JACCARD_THRESHOLD (0.45) → duplicate.
|
|
Keep the oldest control (first created), mark others as 'rejected'.
|
|
"""
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from collections import defaultdict
|
|
|
|
from sqlalchemy import create_engine, text as sql_text
|
|
|
|
# Postgres connection string; the script fails fast (KeyError) if it is unset.
DB_URL = os.environ['DATABASE_URL']

# search_path puts the `compliance` schema first so unqualified table names
# resolve there before falling back to `public`.
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})

# With --dry-run the script reports what it would change but issues no UPDATEs.
DRY_RUN = '--dry-run' in sys.argv

JACCARD_THRESHOLD = 0.45  # Title word overlap threshold for dedup
|
|
|
|
|
|
# Common German/English stopwords stripped from titles before comparison.
# Hoisted to module level (frozenset) so it is built once, not on every call —
# tokenize() runs inside a nested candidate×keeper loop.
_STOPWORDS = frozenset({
    'und', 'der', 'die', 'das', 'für', 'von', 'mit', 'bei', 'zur', 'zum',
    'den', 'des', 'dem', 'ein', 'eine', 'einer', 'eines', 'the', 'and',
    'for', 'with', 'nicht', 'oder', 'auf', 'als', 'nach', 'über', 'aus',
    'ist', 'sind', 'werden', 'wird', 'durch', 'unter', 'vor', 'dass',
})

# Runs of >=3 Latin/umlaut letters; compiled once for the same hot-loop reason.
_WORD_RE = re.compile(r'\b[a-zA-ZäöüÄÖÜß]{3,}\b')


def tokenize(text):
    """Return the set of significant lowercase words in *text*.

    Simple word tokenizer for German/English text: words shorter than three
    letters and common stopwords are dropped. Returns an empty set for
    None/empty input.
    """
    if not text:
        return set()
    return set(_WORD_RE.findall(text.lower())) - _STOPWORDS
|
|
|
|
|
|
def jaccard(set_a, set_b):
    """Return the Jaccard similarity |A∩B| / |A∪B| of two sets.

    Yields 0.0 when either set is empty, so untitled controls never
    count as similar to anything.
    """
    if not (set_a and set_b):
        return 0.0
    shared = len(set_a & set_b)
    combined = len(set_a | set_b)
    return shared / combined if combined else 0.0
|
|
|
|
|
|
print("=" * 60)
|
|
print("TASK 1: DEDUPLICATE CONTROLS (Jaccard title similarity)")
|
|
print(f" Threshold: {JACCARD_THRESHOLD}")
|
|
print("=" * 60)
|
|
|
|
with engine.begin() as conn:
|
|
# Load all duplicate groups
|
|
with open("/tmp/dedup_plan.json") as f:
|
|
dup_groups = json.load(f)
|
|
|
|
print(f" Duplicate groups from plan: {len(dup_groups)}")
|
|
|
|
# For each group, load full control data and compare titles
|
|
total_rejected = 0
|
|
total_kept = 0
|
|
groups_with_dupes = 0
|
|
|
|
for group in dup_groups:
|
|
reg = group["reg"]
|
|
article = group["article"]
|
|
paragraph = group["paragraph"]
|
|
ids = group["ids"]
|
|
|
|
if len(ids) < 2:
|
|
continue
|
|
|
|
# Load controls
|
|
rows = conn.execute(sql_text("""
|
|
SELECT id, title, objective, created_at, release_state, control_id
|
|
FROM compliance.canonical_controls
|
|
WHERE id = ANY(CAST(:ids AS uuid[]))
|
|
ORDER BY created_at ASC
|
|
"""), {"ids": ids}).fetchall()
|
|
|
|
if len(rows) < 2:
|
|
continue
|
|
|
|
# Compare: keep first (oldest), check others against it and each other
|
|
kept = [rows[0]]
|
|
to_reject = []
|
|
|
|
for candidate in rows[1:]:
|
|
cand_tokens = tokenize(candidate[1])
|
|
is_dup = False
|
|
|
|
# Check against all kept controls
|
|
for keeper in kept:
|
|
keep_tokens = tokenize(keeper[1])
|
|
sim = jaccard(cand_tokens, keep_tokens)
|
|
if sim >= JACCARD_THRESHOLD:
|
|
is_dup = True
|
|
break
|
|
|
|
if is_dup:
|
|
to_reject.append(candidate)
|
|
else:
|
|
kept.append(candidate)
|
|
|
|
if to_reject:
|
|
groups_with_dupes += 1
|
|
total_rejected += len(to_reject)
|
|
total_kept += len(kept)
|
|
|
|
if groups_with_dupes <= 5:
|
|
print(f"\n {reg} {article} {paragraph}: {len(rows)} controls → keep {len(kept)}, reject {len(to_reject)}")
|
|
for k in kept[:2]:
|
|
print(f" [KEEP] {k[1][:70]}")
|
|
for r in to_reject[:3]:
|
|
print(f" [REJ ] {r[1][:70]}")
|
|
if len(to_reject) > 3:
|
|
print(f" ... +{len(to_reject) - 3} more rejected")
|
|
|
|
if not DRY_RUN:
|
|
reject_ids = [r[0] for r in to_reject]
|
|
conn.execute(sql_text("""
|
|
UPDATE compliance.canonical_controls
|
|
SET release_state = 'duplicate',
|
|
customer_visible = false,
|
|
generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
|
|
|| '{"dedup_reason": "title_jaccard_qa", "dedup_date": "2026-03-19"}'::jsonb,
|
|
updated_at = NOW()
|
|
WHERE id = ANY(CAST(:ids AS uuid[]))
|
|
"""), {"ids": reject_ids})
|
|
|
|
print(f"\n{'=' * 60}")
|
|
print(f"DEDUP RESULTS")
|
|
print(f"{'=' * 60}")
|
|
print(f" Groups processed: {len(dup_groups)}")
|
|
print(f" Groups with dupes: {groups_with_dupes}")
|
|
print(f" Controls rejected: {total_rejected}")
|
|
print(f" Controls kept: {total_kept}")
|
|
print(f" Dry run: {DRY_RUN}")
|
|
|
|
# Verify final counts
|
|
if not DRY_RUN:
|
|
r = conn.execute(sql_text("""
|
|
SELECT release_state, count(*)
|
|
FROM compliance.canonical_controls
|
|
GROUP BY release_state
|
|
ORDER BY count(*) DESC
|
|
"""))
|
|
print(f"\n === Final control state distribution ===")
|
|
for row in r.fetchall():
|
|
print(f" {str(row[0]):20s} {row[1]:6d}")
|
|
|
|
# Active controls (not rejected/too_close)
|
|
r2 = conn.execute(sql_text("""
|
|
SELECT count(*) FROM compliance.canonical_controls
|
|
WHERE release_state NOT IN ('duplicate', 'too_close', 'deprecated')
|
|
"""))
|
|
active = r2.scalar()
|
|
print(f"\n Active controls (draft/verified/needs_review): {active}")
|