feat: Control Library UI, dedup migration, QA tooling, docs

- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00
parent c52dbdb8f1
commit 643b26618f
28 changed files with 5781 additions and 75 deletions
@@ -1,11 +1,29 @@
-"""Apply PDF QA results: update source_citation with correct article + article_type."""
+"""
+Apply PDF QA results: update source_citation with correct article_type + article.
+
+Safety modes:
+  safe (default, no flag): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
+  --force-article:  Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
+  --dry-run:        Show what would change without writing.
+
+Usage:
+    python3 apply_pdf_qa_results.py                    # safe mode (apply article_type + empty articles)
+    python3 apply_pdf_qa_results.py --dry-run          # show changes without writing
+    python3 apply_pdf_qa_results.py --force-article    # also overwrite existing articles
+"""
 import os
+import sys
 import json
 import psycopg2
 import urllib.parse
+from collections import Counter

 RESULTS_FILE = "/tmp/pdf_qa_results.json"

+# Parse args
+dry_run = "--dry-run" in sys.argv
+force_article = "--force-article" in sys.argv
+
 # Load results
 with open(RESULTS_FILE) as f:
    results = json.load(f)
@@ -21,35 +39,101 @@ conn = psycopg2.connect(
    options="-c search_path=compliance,public"
 )

-# Update in batches
+# Load current DB state for all affected controls
 cur = conn.cursor()
-updated = 0
+ctrl_ids = [r["ctrl_id"] for r in results]
+cur.execute("""
+    SELECT id,
+           source_citation->>'article' as article,
+           source_citation->>'article_type' as article_type,
+           source_citation->>'source' as source
+    FROM compliance.canonical_controls
+    WHERE id = ANY(%s::uuid[])
+""", (ctrl_ids,))
+db_state = {}
+for row in cur.fetchall():
+    db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
+
+# Counters
+stats = Counter()
+updated_type = 0
+updated_article = 0
+updated_recital = 0
 errors = 0
-unchanged = 0

 for i, r in enumerate(results):
    ctrl_id = r["ctrl_id"]
-    article_label = r["article_label"]
-    article_type = r["article_type"]  # preamble, article, annex, section, unknown
+    new_article = r["article_label"]
+    new_type = r["article_type"]
+    db = db_state.get(ctrl_id, {})
+
+    if not db:
+        stats["missing_in_db"] += 1
+        continue
+
+    old_type = db.get("article_type")
+    old_article = db.get("article", "").strip()
+
+    # Decide what to update
+    set_type = (old_type != new_type)
+    set_article = (not old_article) or (force_article and old_article != new_article)
+    set_recital = (new_type == "preamble")
+
+    if set_type:
+        stats["type_" + ("new" if not old_type else "changed")] += 1
+    else:
+        stats["type_unchanged"] += 1
+
+    if not old_article and set_article:
+        stats["article_new"] += 1
+    elif old_article and old_article != new_article:
+        if force_article:
+            stats["article_force_changed"] += 1
+        else:
+            stats["article_skipped"] += 1
+    else:
+        stats["article_unchanged"] += 1
+
+    if set_recital:
+        stats["recital"] += 1
+
+    if dry_run:
+        continue

    try:
-        # Update source_citation: set article and article_type
-        cur.execute("""
-            UPDATE compliance.canonical_controls
-            SET source_citation = source_citation
-                || jsonb_build_object('article', %s, 'article_type', %s),
-                updated_at = now()
-            WHERE id = %s::uuid
-            AND (
-                source_citation->>'article' IS DISTINCT FROM %s
-                OR source_citation->>'article_type' IS DISTINCT FROM %s
-            )
-        """, (article_label, article_type, ctrl_id, article_label, article_type))
+        # Build JSONB update
+        updates = {}
+        if set_type:
+            updates["article_type"] = new_type
+        if set_article:
+            updates["article"] = new_article

-        if cur.rowcount > 0:
-            updated += 1
-        else:
-            unchanged += 1
+        if updates:
+            # Merge into source_citation
+            cur.execute("""
+                UPDATE compliance.canonical_controls
+                SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
+                    updated_at = now()
+                WHERE id = %s::uuid
+            """, (json.dumps(updates), ctrl_id))
+            if set_type:
+                updated_type += 1
+            if set_article:
+                updated_article += 1
+
+        # Mark preamble as recital_suspect
+        if set_recital:
+            cur.execute("""
+                UPDATE compliance.canonical_controls
+                SET generation_metadata = jsonb_set(
+                    COALESCE(generation_metadata, '{}'::jsonb),
+                    '{recital_suspect}',
+                    'true'::jsonb
+                ),
+                updated_at = now()
+                WHERE id = %s::uuid
+            """, (ctrl_id,))
+            updated_recital += 1

    except Exception as e:
        errors += 1
@@ -58,12 +142,37 @@ for i, r in enumerate(results):
        conn.rollback()
        continue

-    if (i + 1) % 500 == 0:
+    if (i + 1) % 1000 == 0:
        conn.commit()
-        print(f"  Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")
+        print(f"  Progress: {i+1}/{len(results)}")

-conn.commit()
-print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")
+if not dry_run:
+    conn.commit()
+
+mode = "DRY-RUN" if dry_run else "APPLIED"
+print(f"\n{'='*60}")
+print(f"  Mode: {mode}")
+print(f"{'='*60}")
+print(f"\n  article_type:")
+print(f"    New (was NULL):    {stats['type_new']:5d}")
+print(f"    Changed:           {stats['type_changed']:5d}")
+print(f"    Unchanged:         {stats['type_unchanged']:5d}")
+print(f"\n  article:")
+print(f"    New (was empty):   {stats['article_new']:5d}")
+if force_article:
+    print(f"    Force-changed:     {stats['article_force_changed']:5d}")
+else:
+    print(f"    Differs (SKIPPED): {stats['article_skipped']:5d}")
+print(f"    Unchanged:         {stats['article_unchanged']:5d}")
+print(f"\n  Preamble/Recital:    {stats['recital']:5d}")
+print(f"  Missing in DB:       {stats['missing_in_db']:5d}")
+
+if not dry_run:
+    print(f"\n  Updates written:")
+    print(f"    article_type:      {updated_type:5d}")
+    print(f"    article:           {updated_article:5d}")
+    print(f"    recital_suspect:   {updated_recital:5d}")
+    print(f"    Errors:            {errors:5d}")

 # Verify: count by article_type
 cur.execute("""