chore(qa): add PDF-based control QA scripts and results

QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync. Key results (2026-03-20): - 4,110/7,943 controls matched to PDF (100% for major EU regs) - 3,366 article corrections, 705 new assignments - 1,290 controls from Erwägungsgründe (preamble) identified - 779 controls from Anhänge (annexes) identified Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00
parent 1cc34c23d9
commit 9b0f25c105
12 changed files with 30839 additions and 0 deletions
@@ -0,0 +1,190 @@
+"""
+Step 3: Apply article mappings to all controls + detect duplicates.
+1. Update source_citation article/paragraph for controls that have a better mapping
+2. Identify duplicate controls (same regulation + article + paragraph)
+"""
+import json
+import os
+import sys
+from collections import defaultdict
+
+from sqlalchemy import create_engine, text as sql_text
+
+DB_URL = os.environ['DATABASE_URL']
+engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
+DRY_RUN = '--dry-run' in sys.argv
+
+# Load mappings
+with open("/tmp/all_article_mappings.json") as f:
+    article_mapping = json.load(f)
+print(f"Loaded {len(article_mapping)} article mappings")
+
+print(f"\n{'=' * 70}")
+print("STEP 3a: UPDATE CONTROLS WITH IMPROVED ARTICLE MAPPINGS")
+print(f"{'=' * 70}")
+
+with engine.begin() as conn:
+    # Fast approach: load all chunk→control mappings at once
+    print("  Loading chunk→control mappings...")
+    chunk_rows = conn.execute(sql_text("""
+        SELECT chunk_hash, jsonb_array_elements_text(generated_control_ids) as control_id
+        FROM compliance.canonical_processed_chunks
+        WHERE jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0
+    """)).fetchall()
+
+    control_to_hash = {}
+    for row in chunk_rows:
+        control_to_hash[row[1]] = row[0]
+    print(f"  Unique controls with chunk: {len(control_to_hash)}")
+
+    # Get current article info for controls with citations (skip v1/v2 without citation)
+    print("  Loading control article data...")
+    ctrl_rows = conn.execute(sql_text("""
+        SELECT id,
+               source_citation->>'article' as current_article,
+               source_citation->>'paragraph' as current_paragraph
+        FROM compliance.canonical_controls
+        WHERE source_citation IS NOT NULL
+          AND release_state NOT IN ('rejected')
+    """)).fetchall()
+    print(f"  Controls with citation: {len(ctrl_rows)}")
+
+    updated = 0
+    improved = 0
+    changed = 0
+
+    for row in ctrl_rows:
+        ctrl_id = str(row[0])
+        current_art = row[1] or ""
+        current_para = row[2] or ""
+        chunk_hash = control_to_hash.get(ctrl_id)
+
+        if not chunk_hash:
+            continue
+
+        mapping = article_mapping.get(chunk_hash)
+        if not mapping or not mapping["article"]:
+            continue
+
+        new_art = mapping["article"]
+        new_para = mapping["paragraph"]
+
+        # Only update if it's an improvement
+        if current_art == new_art and current_para == new_para:
+            continue
+
+        if not current_art and new_art:
+            improved += 1
+        elif current_art != new_art:
+            changed += 1
+
+        if not DRY_RUN:
+            citation_patch = json.dumps({"article": new_art, "paragraph": new_para})
+            meta_patch = json.dumps({"source_article": new_art, "source_paragraph": new_para})
+            conn.execute(sql_text("""
+                UPDATE compliance.canonical_controls
+                SET source_citation = COALESCE(source_citation, '{}'::jsonb) || CAST(:citation AS jsonb),
+                    generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta AS jsonb)
+                WHERE id = :id
+            """), {"id": row[0], "citation": citation_patch, "meta": meta_patch})
+
+        updated += 1
+
+    print(f"\n  Updated:  {updated}")
+    print(f"    New article (was empty): {improved}")
+    print(f"    Changed article:         {changed}")
+    print(f"  Dry run: {DRY_RUN}")
+
+    # ── Step 3b: Verification — article coverage after update ─────────
+    print(f"\n{'=' * 70}")
+    print("STEP 3b: ARTICLE COVERAGE AFTER UPDATE")
+    print(f"{'=' * 70}")
+
+    r = conn.execute(sql_text("""
+        SELECT
+            generation_metadata->>'source_regulation' as reg,
+            count(*) as total,
+            count(*) FILTER (WHERE source_citation->>'article' != '' AND source_citation->>'article' IS NOT NULL) as with_art,
+            count(*) FILTER (WHERE source_citation IS NULL) as no_cit
+        FROM compliance.canonical_controls
+        WHERE release_state NOT IN ('rejected')
+        GROUP BY generation_metadata->>'source_regulation'
+        HAVING count(*) >= 3
+        ORDER BY count(*) DESC
+    """))
+    print(f"\n  {'Regulation':35s} {'Total':>6s} {'WithArt':>7s} {'%':>5s}")
+    print(f"  {'-' * 60}")
+    grand_total = 0
+    grand_art = 0
+    for row in r.fetchall():
+        reg = str(row[0])[:35] if row[0] else "(none/v1v2)"
+        pct = f"{row[2]/row[1]*100:.0f}%" if row[1] > 0 else ""
+        print(f"  {reg:35s} {row[1]:6d} {row[2]:7d} {pct:>5s}")
+        grand_total += row[1]
+        grand_art += row[2]
+    print(f"\n  TOTAL: {grand_total} controls, {grand_art} with article ({grand_art/grand_total*100:.0f}%)")
+
+    # ── Step 3c: Duplicate analysis ──────────────────────────────────
+    print(f"\n{'=' * 70}")
+    print("STEP 3c: DUPLICATE CONTROLS (same reg + article + paragraph, >1)")
+    print(f"{'=' * 70}")
+
+    r2 = conn.execute(sql_text("""
+        SELECT
+            generation_metadata->>'source_regulation' as reg,
+            source_citation->>'article' as article,
+            source_citation->>'paragraph' as paragraph,
+            count(*) as cnt,
+            array_agg(id ORDER BY created_at) as ids,
+            array_agg(title ORDER BY created_at) as titles,
+            array_agg(release_state ORDER BY created_at) as states
+        FROM compliance.canonical_controls
+        WHERE release_state NOT IN ('rejected', 'too_close')
+          AND source_citation->>'article' IS NOT NULL
+          AND source_citation->>'article' != ''
+        GROUP BY
+            generation_metadata->>'source_regulation',
+            source_citation->>'article',
+            source_citation->>'paragraph'
+        HAVING count(*) > 1
+        ORDER BY count(*) DESC
+    """))
+
+    dup_groups = []
+    total_dup_controls = 0
+    total_removable = 0
+
+    for row in r2.fetchall():
+        group = {
+            "reg": row[0],
+            "article": row[1],
+            "paragraph": row[2],
+            "count": row[3],
+            "ids": [str(i) for i in row[4]],
+            "titles": row[5],
+            "states": row[6],
+        }
+        dup_groups.append(group)
+        total_dup_controls += row[3]
+        total_removable += row[3] - 1  # Keep the oldest
+
+    print(f"\n  Duplicate groups: {len(dup_groups)}")
+    print(f"  Controls in groups: {total_dup_controls}")
+    print(f"  Removable (keep oldest): {total_removable}")
+
+    # Show top 20
+    print(f"\n  {'Reg':25s} {'Article':15s} {'Para':10s} {'Count':>5s}")
+    print(f"  {'-' * 60}")
+    for g in dup_groups[:30]:
+        print(f"  {str(g['reg']):25s} {str(g['article']):15s} {str(g['paragraph']):10s} {g['count']:5d}")
+        for i, title in enumerate(g['titles'][:3]):
+            state = g['states'][i] if i < len(g['states']) else '?'
+            marker = "KEEP" if i == 0 else "DUP "
+            print(f"    [{marker}][{state:6s}] {title[:70]}")
+        if g['count'] > 3:
+            print(f"    ... +{g['count'] - 3} more")
+
+    # Save dedup plan
+    with open("/tmp/dedup_plan.json", "w") as f:
+        json.dump(dup_groups, f, indent=2, default=str)
+    print(f"\n  Saved dedup plan to /tmp/dedup_plan.json")