feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,29 @@
|
||||
"""Apply PDF QA results: update source_citation with correct article + article_type."""
|
||||
"""
|
||||
Apply PDF QA results: update source_citation with correct article_type + article.
|
||||
|
||||
Safety modes:
|
||||
--safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
|
||||
--force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
|
||||
--dry-run: Show what would change without writing.
|
||||
|
||||
Usage:
|
||||
python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles)
|
||||
python3 apply_pdf_qa_results.py --dry-run # show changes without writing
|
||||
python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
from collections import Counter
|
||||
|
||||
RESULTS_FILE = "/tmp/pdf_qa_results.json"
|
||||
|
||||
# Parse args
|
||||
dry_run = "--dry-run" in sys.argv
|
||||
force_article = "--force-article" in sys.argv
|
||||
|
||||
# Load results
|
||||
with open(RESULTS_FILE) as f:
|
||||
results = json.load(f)
|
||||
@@ -21,35 +39,101 @@ conn = psycopg2.connect(
|
||||
options="-c search_path=compliance,public"
|
||||
)
|
||||
|
||||
# Update in batches
|
||||
# Load current DB state for all affected controls
|
||||
cur = conn.cursor()
|
||||
updated = 0
|
||||
ctrl_ids = [r["ctrl_id"] for r in results]
|
||||
cur.execute("""
|
||||
SELECT id,
|
||||
source_citation->>'article' as article,
|
||||
source_citation->>'article_type' as article_type,
|
||||
source_citation->>'source' as source
|
||||
FROM compliance.canonical_controls
|
||||
WHERE id = ANY(%s::uuid[])
|
||||
""", (ctrl_ids,))
|
||||
db_state = {}
|
||||
for row in cur.fetchall():
|
||||
db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
|
||||
|
||||
# Counters
|
||||
stats = Counter()
|
||||
updated_type = 0
|
||||
updated_article = 0
|
||||
updated_recital = 0
|
||||
errors = 0
|
||||
unchanged = 0
|
||||
|
||||
for i, r in enumerate(results):
|
||||
ctrl_id = r["ctrl_id"]
|
||||
article_label = r["article_label"]
|
||||
article_type = r["article_type"] # preamble, article, annex, section, unknown
|
||||
new_article = r["article_label"]
|
||||
new_type = r["article_type"]
|
||||
db = db_state.get(ctrl_id, {})
|
||||
|
||||
if not db:
|
||||
stats["missing_in_db"] += 1
|
||||
continue
|
||||
|
||||
old_type = db.get("article_type")
|
||||
old_article = db.get("article", "").strip()
|
||||
|
||||
# Decide what to update
|
||||
set_type = (old_type != new_type)
|
||||
set_article = (not old_article) or (force_article and old_article != new_article)
|
||||
set_recital = (new_type == "preamble")
|
||||
|
||||
if set_type:
|
||||
stats["type_" + ("new" if not old_type else "changed")] += 1
|
||||
else:
|
||||
stats["type_unchanged"] += 1
|
||||
|
||||
if not old_article and set_article:
|
||||
stats["article_new"] += 1
|
||||
elif old_article and old_article != new_article:
|
||||
if force_article:
|
||||
stats["article_force_changed"] += 1
|
||||
else:
|
||||
stats["article_skipped"] += 1
|
||||
else:
|
||||
stats["article_unchanged"] += 1
|
||||
|
||||
if set_recital:
|
||||
stats["recital"] += 1
|
||||
|
||||
if dry_run:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Update source_citation: set article and article_type
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET source_citation = source_citation
|
||||
|| jsonb_build_object('article', %s, 'article_type', %s),
|
||||
updated_at = now()
|
||||
WHERE id = %s::uuid
|
||||
AND (
|
||||
source_citation->>'article' IS DISTINCT FROM %s
|
||||
OR source_citation->>'article_type' IS DISTINCT FROM %s
|
||||
)
|
||||
""", (article_label, article_type, ctrl_id, article_label, article_type))
|
||||
# Build JSONB update
|
||||
updates = {}
|
||||
if set_type:
|
||||
updates["article_type"] = new_type
|
||||
if set_article:
|
||||
updates["article"] = new_article
|
||||
|
||||
if cur.rowcount > 0:
|
||||
updated += 1
|
||||
else:
|
||||
unchanged += 1
|
||||
if updates:
|
||||
# Merge into source_citation
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
|
||||
updated_at = now()
|
||||
WHERE id = %s::uuid
|
||||
""", (json.dumps(updates), ctrl_id))
|
||||
if set_type:
|
||||
updated_type += 1
|
||||
if set_article:
|
||||
updated_article += 1
|
||||
|
||||
# Mark preamble as recital_suspect
|
||||
if set_recital:
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET generation_metadata = jsonb_set(
|
||||
COALESCE(generation_metadata, '{}'::jsonb),
|
||||
'{recital_suspect}',
|
||||
'true'::jsonb
|
||||
),
|
||||
updated_at = now()
|
||||
WHERE id = %s::uuid
|
||||
""", (ctrl_id,))
|
||||
updated_recital += 1
|
||||
|
||||
except Exception as e:
|
||||
errors += 1
|
||||
@@ -58,12 +142,37 @@ for i, r in enumerate(results):
|
||||
conn.rollback()
|
||||
continue
|
||||
|
||||
if (i + 1) % 500 == 0:
|
||||
if (i + 1) % 1000 == 0:
|
||||
conn.commit()
|
||||
print(f" Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")
|
||||
print(f" Progress: {i+1}/{len(results)}")
|
||||
|
||||
conn.commit()
|
||||
print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")
|
||||
if not dry_run:
|
||||
conn.commit()
|
||||
|
||||
mode = "DRY-RUN" if dry_run else "APPLIED"
|
||||
print(f"\n{'='*60}")
|
||||
print(f" Mode: {mode}")
|
||||
print(f"{'='*60}")
|
||||
print(f"\n article_type:")
|
||||
print(f" New (was NULL): {stats['type_new']:5d}")
|
||||
print(f" Changed: {stats['type_changed']:5d}")
|
||||
print(f" Unchanged: {stats['type_unchanged']:5d}")
|
||||
print(f"\n article:")
|
||||
print(f" New (was empty): {stats['article_new']:5d}")
|
||||
if force_article:
|
||||
print(f" Force-changed: {stats['article_force_changed']:5d}")
|
||||
else:
|
||||
print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
|
||||
print(f" Unchanged: {stats['article_unchanged']:5d}")
|
||||
print(f"\n Preamble/Recital: {stats['recital']:5d}")
|
||||
print(f" Missing in DB: {stats['missing_in_db']:5d}")
|
||||
|
||||
if not dry_run:
|
||||
print(f"\n Updates written:")
|
||||
print(f" article_type: {updated_type:5d}")
|
||||
print(f" article: {updated_article:5d}")
|
||||
print(f" recital_suspect: {updated_recital:5d}")
|
||||
print(f" Errors: {errors:5d}")
|
||||
|
||||
# Verify: count by article_type
|
||||
cur.execute("""
|
||||
|
||||
Reference in New Issue
Block a user