feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped

- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 11:56:08 +01:00
parent c52dbdb8f1
commit 643b26618f
28 changed files with 5781 additions and 75 deletions

View File

@@ -1,11 +1,29 @@
"""Apply PDF QA results: update source_citation with correct article + article_type."""
"""
Apply PDF QA results: update source_citation with correct article_type + article.
Safety modes:
default (no flag, "safe" mode): Only set article_type. Set article only when it is currently empty. Mark preamble as recital_suspect.
--force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
--dry-run: Show what would change without writing.
Usage:
python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles)
python3 apply_pdf_qa_results.py --dry-run # show changes without writing
python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles
"""
import os
import sys
import json
import psycopg2
import urllib.parse
from collections import Counter
RESULTS_FILE = "/tmp/pdf_qa_results.json"
# Parse args
dry_run = "--dry-run" in sys.argv
force_article = "--force-article" in sys.argv
# Load results
with open(RESULTS_FILE) as f:
results = json.load(f)
@@ -21,35 +39,101 @@ conn = psycopg2.connect(
options="-c search_path=compliance,public"
)
# Update in batches
# Load current DB state for all affected controls
cur = conn.cursor()
updated = 0
ctrl_ids = [r["ctrl_id"] for r in results]
cur.execute("""
SELECT id,
source_citation->>'article' as article,
source_citation->>'article_type' as article_type,
source_citation->>'source' as source
FROM compliance.canonical_controls
WHERE id = ANY(%s::uuid[])
""", (ctrl_ids,))
db_state = {}
for row in cur.fetchall():
db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
# Counters
stats = Counter()
updated_type = 0
updated_article = 0
updated_recital = 0
errors = 0
unchanged = 0
for i, r in enumerate(results):
ctrl_id = r["ctrl_id"]
article_label = r["article_label"]
article_type = r["article_type"] # preamble, article, annex, section, unknown
new_article = r["article_label"]
new_type = r["article_type"]
db = db_state.get(ctrl_id, {})
if not db:
stats["missing_in_db"] += 1
continue
old_type = db.get("article_type")
old_article = db.get("article", "").strip()
# Decide what to update
set_type = (old_type != new_type)
set_article = (not old_article) or (force_article and old_article != new_article)
set_recital = (new_type == "preamble")
if set_type:
stats["type_" + ("new" if not old_type else "changed")] += 1
else:
stats["type_unchanged"] += 1
if not old_article and set_article:
stats["article_new"] += 1
elif old_article and old_article != new_article:
if force_article:
stats["article_force_changed"] += 1
else:
stats["article_skipped"] += 1
else:
stats["article_unchanged"] += 1
if set_recital:
stats["recital"] += 1
if dry_run:
continue
try:
# Update source_citation: set article and article_type
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = source_citation
|| jsonb_build_object('article', %s, 'article_type', %s),
updated_at = now()
WHERE id = %s::uuid
AND (
source_citation->>'article' IS DISTINCT FROM %s
OR source_citation->>'article_type' IS DISTINCT FROM %s
)
""", (article_label, article_type, ctrl_id, article_label, article_type))
# Build JSONB update
updates = {}
if set_type:
updates["article_type"] = new_type
if set_article:
updates["article"] = new_article
if cur.rowcount > 0:
updated += 1
else:
unchanged += 1
if updates:
# Merge into source_citation
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
updated_at = now()
WHERE id = %s::uuid
""", (json.dumps(updates), ctrl_id))
if set_type:
updated_type += 1
if set_article:
updated_article += 1
# Mark preamble as recital_suspect
if set_recital:
cur.execute("""
UPDATE compliance.canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata, '{}'::jsonb),
'{recital_suspect}',
'true'::jsonb
),
updated_at = now()
WHERE id = %s::uuid
""", (ctrl_id,))
updated_recital += 1
except Exception as e:
errors += 1
@@ -58,12 +142,37 @@ for i, r in enumerate(results):
conn.rollback()
continue
if (i + 1) % 500 == 0:
if (i + 1) % 1000 == 0:
conn.commit()
print(f" Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")
print(f" Progress: {i+1}/{len(results)}")
conn.commit()
print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")
if not dry_run:
conn.commit()
mode = "DRY-RUN" if dry_run else "APPLIED"
print(f"\n{'='*60}")
print(f" Mode: {mode}")
print(f"{'='*60}")
print(f"\n article_type:")
print(f" New (was NULL): {stats['type_new']:5d}")
print(f" Changed: {stats['type_changed']:5d}")
print(f" Unchanged: {stats['type_unchanged']:5d}")
print(f"\n article:")
print(f" New (was empty): {stats['article_new']:5d}")
if force_article:
print(f" Force-changed: {stats['article_force_changed']:5d}")
else:
print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
print(f" Unchanged: {stats['article_unchanged']:5d}")
print(f"\n Preamble/Recital: {stats['recital']:5d}")
print(f" Missing in DB: {stats['missing_in_db']:5d}")
if not dry_run:
print(f"\n Updates written:")
print(f" article_type: {updated_type:5d}")
print(f" article: {updated_article:5d}")
print(f" recital_suspect: {updated_recital:5d}")
print(f" Errors: {errors:5d}")
# Verify: count by article_type
cur.execute("""