chore(qa): PDF QA v3 — 6,259/7,943 controls matched (79%)
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 43s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 22s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 43s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 22s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
- Added NIST 800-53, OWASP Top 10/ASVS/SAMM/API/MASVS, ENISA ICS PDFs
- Improved normalize() for ligatures, smart quotes, dashes
- Added OWASP-specific index builder (A01:2021, V1.1, MASVS-*)
- 6,259 article assignments in DB (1,817 article, 1,355 preamble, 1,173 control, 790 annex, 666 section)
- Remaining 1,651 unmatched: Blue Guide (EN text vs DE PDF), OWASP multilingual translations (PT/AR/ID/ES)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
104
scripts/qa/apply_pdf_qa_results.py
Normal file
104
scripts/qa/apply_pdf_qa_results.py
Normal file
@@ -0,0 +1,104 @@
|
||||
"""Apply PDF QA results: update source_citation with correct article + article_type."""
|
||||
import os
|
||||
import json
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
RESULTS_FILE = "/tmp/pdf_qa_results.json"

# Number of processed result rows between commits.
COMMIT_EVERY = 500

# Only the first few per-row errors are printed; the rest are just counted.
MAX_PRINTED_ERRORS = 5

# Merge the article label/type into the existing citation JSON.  COALESCE
# guards against a NULL source_citation: `NULL || jsonb` is NULL, which would
# match the WHERE clause (and be counted as updated) yet store nothing.
# NOTE(review): assumes source_citation is a nullable jsonb column — confirm.
UPDATE_SQL = """
    UPDATE compliance.canonical_controls
    SET source_citation = COALESCE(source_citation, '{}'::jsonb)
            || jsonb_build_object('article', %s, 'article_type', %s),
        updated_at = now()
    WHERE id = %s::uuid
      AND (
        source_citation->>'article' IS DISTINCT FROM %s
        OR source_citation->>'article_type' IS DISTINCT FROM %s
      )
"""

DISTRIBUTION_SQL = """
    SELECT source_citation->>'article_type' as art_type, count(*)
    FROM compliance.canonical_controls
    WHERE source_citation->>'article_type' IS NOT NULL
    GROUP BY 1
    ORDER BY count(*) DESC
"""

SAMPLE_SQL = """
    SELECT control_id, source_citation->>'article', source_citation->>'article_type',
           source_citation->>'source'
    FROM compliance.canonical_controls
    WHERE source_citation->>'article_type' = %s
    LIMIT 5
"""


def load_results(path=RESULTS_FILE):
    """Return the parsed JSON list of QA results stored at *path*."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def connection_kwargs(db_url):
    """Translate a database URL into keyword arguments for psycopg2.connect.

    ``urlparse`` leaves percent-escapes in place, so user name, password and
    database name are unquoted here; otherwise credentials containing
    characters such as ``@`` or ``/`` would be sent still encoded.
    """
    parsed = urllib.parse.urlparse(db_url)

    def decode(value):
        return None if value is None else urllib.parse.unquote(value)

    return {
        "host": parsed.hostname,
        "port": parsed.port or 5432,
        "user": decode(parsed.username),
        "password": decode(parsed.password),
        "dbname": urllib.parse.unquote(parsed.path.lstrip('/')),
        "options": "-c search_path=compliance,public",
    }


def source_snippet(value, width=40):
    """Return the first *width* characters of *value*, or "" when it is None."""
    return (value or "")[:width]


def report_error(error_count, ctrl_id, exc):
    """Print a truncated error line for the first MAX_PRINTED_ERRORS failures."""
    if error_count <= MAX_PRINTED_ERRORS:
        print(f" ERROR {ctrl_id}: {str(exc)[:100]}")


def apply_results(conn, cur, results):
    """Apply every QA result to canonical_controls.

    Each UPDATE runs inside its own savepoint so that a failing row is undone
    on its own; a plain ``conn.rollback()`` would also throw away every
    successful update made since the last batch commit.

    Returns a ``(updated, unchanged, errors)`` tuple of row counts.
    """
    updated = unchanged = errors = 0
    total = len(results)

    for done, r in enumerate(results, start=1):
        ctrl_id = None
        try:
            ctrl_id = r["ctrl_id"]
            article_label = r["article_label"]
            article_type = r["article_type"]  # preamble, article, annex, section, unknown
        except (KeyError, TypeError) as exc:
            # Malformed entry in the results file: count it and keep going.
            errors += 1
            report_error(errors, ctrl_id, exc)
        else:
            cur.execute("SAVEPOINT qa_row")
            try:
                cur.execute(
                    UPDATE_SQL,
                    (article_label, article_type, ctrl_id, article_label, article_type),
                )
                changed = cur.rowcount > 0
            except psycopg2.Error as exc:
                # Undo only this row, then drop the savepoint.
                cur.execute("ROLLBACK TO SAVEPOINT qa_row")
                cur.execute("RELEASE SAVEPOINT qa_row")
                errors += 1
                report_error(errors, ctrl_id, exc)
            else:
                cur.execute("RELEASE SAVEPOINT qa_row")
                if changed:
                    updated += 1
                else:
                    # Either already up to date or no row with this id.
                    unchanged += 1

        if done % COMMIT_EVERY == 0:
            conn.commit()
            print(f" Progress: {done}/{total} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")

    conn.commit()
    return updated, unchanged, errors


def print_distribution(cur):
    """Print how many controls carry each article_type value."""
    cur.execute(DISTRIBUTION_SQL)
    print("\nArticle type distribution in DB:")
    for art_type, count in cur.fetchall():
        print(f" {str(art_type):12s}: {count:5d}")


def print_samples(cur, article_type):
    """Print up to five controls whose citation has the given article_type."""
    cur.execute(SAMPLE_SQL, (article_type,))
    print(f"\nSample {article_type} controls:")
    for control_id, article, art_type, source in cur.fetchall():
        # `source` may be absent from the citation JSON, hence the safe slice.
        print(f" {control_id}: {article} ({art_type}) — {source_snippet(source)}")


def main():
    """Load the QA results, apply them to the database and print a summary."""
    results = load_results()
    print(f"Loaded {len(results)} results")

    conn = psycopg2.connect(**connection_kwargs(os.environ['DATABASE_URL']))
    try:
        with conn.cursor() as cur:
            updated, unchanged, errors = apply_results(conn, cur, results)
            print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")

            print_distribution(cur)
            print_samples(cur, "preamble")
            print_samples(cur, "annex")
    finally:
        # Close the connection even when an update or query raises.
        conn.close()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user