chore(qa): add PDF-based control QA scripts and results
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync. Key results (2026-03-20): - 4,110/7,943 controls matched to PDF (100% for major EU regs) - 3,366 article corrections, 705 new assignments - 1,290 controls from Erwägungsgründe (preamble) identified - 779 controls from Anhänge (annexes) identified Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
154
scripts/qa/qa_dedup_controls.py
Normal file
154
scripts/qa/qa_dedup_controls.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""
|
||||
Task 1: Remove obvious duplicate controls.
|
||||
Strategy: Within each (regulation, article, paragraph) group,
|
||||
compare titles using word overlap (Jaccard). If >60% similar → duplicate.
|
||||
Keep the oldest control (first created), mark others as 'rejected'.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
|
||||
from sqlalchemy import create_engine, text as sql_text
|
||||
|
||||
DB_URL = os.environ['DATABASE_URL']
|
||||
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
|
||||
DRY_RUN = '--dry-run' in sys.argv
|
||||
|
||||
JACCARD_THRESHOLD = 0.45 # Title word overlap threshold for dedup
|
||||
|
||||
|
||||
def tokenize(text):
    """Return the set of significant words (3+ letters, lowercased) in *text*.

    None/empty input yields an empty set. Common German and English
    stopwords are filtered out so only content-bearing words remain,
    which keeps the Jaccard comparison focused on distinctive terms.
    """
    if not text:
        return set()
    stopwords = frozenset({
        'und', 'der', 'die', 'das', 'für', 'von', 'mit', 'bei', 'zur', 'zum',
        'den', 'des', 'dem', 'ein', 'eine', 'einer', 'eines', 'the', 'and',
        'for', 'with', 'nicht', 'oder', 'auf', 'als', 'nach', 'über', 'aus',
        'ist', 'sind', 'werden', 'wird', 'durch', 'unter', 'vor', 'dass',
    })
    return {
        word
        for word in re.findall(r'\b[a-zA-ZäöüÄÖÜß]{3,}\b', text.lower())
        if word not in stopwords
    }
|
||||
|
||||
|
||||
def jaccard(set_a, set_b):
    """Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

    Returns 0.0 when either set is empty (an empty title has no
    meaningful similarity to anything).
    """
    if not set_a or not set_b:
        return 0.0
    overlap = len(set_a & set_b)
    # Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
    combined = len(set_a) + len(set_b) - overlap
    return overlap / combined if combined else 0.0
|
||||
|
||||
|
||||
print("=" * 60)
|
||||
print("TASK 1: DEDUPLICATE CONTROLS (Jaccard title similarity)")
|
||||
print(f" Threshold: {JACCARD_THRESHOLD}")
|
||||
print("=" * 60)
|
||||
|
||||
with engine.begin() as conn:
|
||||
# Load all duplicate groups
|
||||
with open("/tmp/dedup_plan.json") as f:
|
||||
dup_groups = json.load(f)
|
||||
|
||||
print(f" Duplicate groups from plan: {len(dup_groups)}")
|
||||
|
||||
# For each group, load full control data and compare titles
|
||||
total_rejected = 0
|
||||
total_kept = 0
|
||||
groups_with_dupes = 0
|
||||
|
||||
for group in dup_groups:
|
||||
reg = group["reg"]
|
||||
article = group["article"]
|
||||
paragraph = group["paragraph"]
|
||||
ids = group["ids"]
|
||||
|
||||
if len(ids) < 2:
|
||||
continue
|
||||
|
||||
# Load controls
|
||||
rows = conn.execute(sql_text("""
|
||||
SELECT id, title, objective, created_at, release_state, control_id
|
||||
FROM compliance.canonical_controls
|
||||
WHERE id = ANY(CAST(:ids AS uuid[]))
|
||||
ORDER BY created_at ASC
|
||||
"""), {"ids": ids}).fetchall()
|
||||
|
||||
if len(rows) < 2:
|
||||
continue
|
||||
|
||||
# Compare: keep first (oldest), check others against it and each other
|
||||
kept = [rows[0]]
|
||||
to_reject = []
|
||||
|
||||
for candidate in rows[1:]:
|
||||
cand_tokens = tokenize(candidate[1])
|
||||
is_dup = False
|
||||
|
||||
# Check against all kept controls
|
||||
for keeper in kept:
|
||||
keep_tokens = tokenize(keeper[1])
|
||||
sim = jaccard(cand_tokens, keep_tokens)
|
||||
if sim >= JACCARD_THRESHOLD:
|
||||
is_dup = True
|
||||
break
|
||||
|
||||
if is_dup:
|
||||
to_reject.append(candidate)
|
||||
else:
|
||||
kept.append(candidate)
|
||||
|
||||
if to_reject:
|
||||
groups_with_dupes += 1
|
||||
total_rejected += len(to_reject)
|
||||
total_kept += len(kept)
|
||||
|
||||
if groups_with_dupes <= 5:
|
||||
print(f"\n {reg} {article} {paragraph}: {len(rows)} controls → keep {len(kept)}, reject {len(to_reject)}")
|
||||
for k in kept[:2]:
|
||||
print(f" [KEEP] {k[1][:70]}")
|
||||
for r in to_reject[:3]:
|
||||
print(f" [REJ ] {r[1][:70]}")
|
||||
if len(to_reject) > 3:
|
||||
print(f" ... +{len(to_reject) - 3} more rejected")
|
||||
|
||||
if not DRY_RUN:
|
||||
reject_ids = [r[0] for r in to_reject]
|
||||
conn.execute(sql_text("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET release_state = 'duplicate',
|
||||
customer_visible = false,
|
||||
generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
|
||||
|| '{"dedup_reason": "title_jaccard_qa", "dedup_date": "2026-03-19"}'::jsonb,
|
||||
updated_at = NOW()
|
||||
WHERE id = ANY(CAST(:ids AS uuid[]))
|
||||
"""), {"ids": reject_ids})
|
||||
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"DEDUP RESULTS")
|
||||
print(f"{'=' * 60}")
|
||||
print(f" Groups processed: {len(dup_groups)}")
|
||||
print(f" Groups with dupes: {groups_with_dupes}")
|
||||
print(f" Controls rejected: {total_rejected}")
|
||||
print(f" Controls kept: {total_kept}")
|
||||
print(f" Dry run: {DRY_RUN}")
|
||||
|
||||
# Verify final counts
|
||||
if not DRY_RUN:
|
||||
r = conn.execute(sql_text("""
|
||||
SELECT release_state, count(*)
|
||||
FROM compliance.canonical_controls
|
||||
GROUP BY release_state
|
||||
ORDER BY count(*) DESC
|
||||
"""))
|
||||
print(f"\n === Final control state distribution ===")
|
||||
for row in r.fetchall():
|
||||
print(f" {str(row[0]):20s} {row[1]:6d}")
|
||||
|
||||
# Active controls (not rejected/too_close)
|
||||
r2 = conn.execute(sql_text("""
|
||||
SELECT count(*) FROM compliance.canonical_controls
|
||||
WHERE release_state NOT IN ('duplicate', 'too_close', 'deprecated')
|
||||
"""))
|
||||
active = r2.scalar()
|
||||
print(f"\n Active controls (draft/verified/needs_review): {active}")
|
||||
Reference in New Issue
Block a user