chore(qa): add PDF-based control QA scripts and results
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against the
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
121
scripts/qa/qa_normalize_sources.py
Normal file
121
scripts/qa/qa_normalize_sources.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""
|
||||
Task 3: Normalize source_citation.source names.

The same regulation can end up with different source names, depending on
which pipeline run created the control. This script standardizes them to
one canonical name per regulation.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from sqlalchemy import create_engine, text as sql_text
|
||||
|
||||
# Connection string is taken from the environment; a missing DATABASE_URL
# raises KeyError right here, before any database work starts.
DB_URL = os.environ['DATABASE_URL']

# The "options" connect argument asks for search_path=compliance,public on
# every connection opened through this engine.
_CONNECT_ARGS = {"options": "-c search_path=compliance,public"}
engine = create_engine(DB_URL, connect_args=_CONNECT_ARGS)

# With --dry-run anywhere on the command line the script only reports what
# it would change and issues no UPDATE statements.
DRY_RUN = '--dry-run' in sys.argv
|
||||
|
||||
# Canonical display name for each regulation.
#
# Keys are the regulation ids looked up from
# generation_metadata->>'source_regulation'; values are the single name
# that source_citation->>'source' is normalized to. Regulations that are
# not listed here are left untouched by the script.
SOURCE_NAMES = {
    # --- EU regulations and directives (keyed by year/number) ---
    "eu_2023_1230": "Maschinenverordnung (EU) 2023/1230",
    "eu_2024_2847": "Cyber Resilience Act (CRA)",
    "eu_2024_1689": "KI-Verordnung (EU) 2024/1689",
    "eu_2022_2555": "NIS2-Richtlinie (EU) 2022/2555",
    "eu_2016_679": "DSGVO (EU) 2016/679",
    "eu_blue_guide_2022": "EU Blue Guide 2022",

    # --- NIST publications ---
    "nist_sp800_53r5": "NIST SP 800-53 Rev. 5",
    "nist_sp_800_218": "NIST SP 800-218 (SSDF)",
    "nist_csf_2_0": "NIST Cybersecurity Framework 2.0",
    "nist_sp800_63_3": "NIST SP 800-63-3",
    "nist_sp800_207": "NIST SP 800-207 (Zero Trust)",
    "nist_ai_rmf": "NIST AI Risk Management Framework",

    # --- OWASP projects ---
    "owasp_top10_2021": "OWASP Top 10 (2021)",
    "owasp_asvs": "OWASP ASVS 4.0",
    "owasp_samm": "OWASP SAMM 2.0",
    "owasp_api_top10_2023": "OWASP API Security Top 10 (2023)",
    "owasp_masvs": "OWASP MASVS 2.0",

    # --- CISA / ENISA / OECD guidance ---
    "cisa_secure_by_design": "CISA Secure by Design",
    "enisa_ics_scada_dependencies": "ENISA ICS/SCADA Dependencies",
    "enisa_supply_chain_good_practices": "ENISA Supply Chain Good Practices",
    "enisa_threat_landscape_supply_chain": "ENISA Threat Landscape Supply Chain",
    "enisa_cybersecurity_state_2024": "ENISA Cybersecurity State 2024",
    "oecd_ai_principles": "OECD KI-Empfehlung",

    # --- Further EU acts ---
    "gpsr": "Allgemeine Produktsicherheitsverordnung (GPSR)",
    "eu_2023_1542": "Batterieverordnung (EU) 2023/1542",
    "mica": "Markets in Crypto-Assets (MiCA)",
    "eu_2022_868": "Data Governance Act (DGA)",
    "dataact": "Data Act",
    "eucsa": "EU Cybersecurity Act (EUCSA)",
    "eaa": "European Accessibility Act (EAA)",
    "eu_2023_1803": "IFRS-Übernahmeverordnung",
    "amlr": "AML-Verordnung",

    # --- German federal law (both ids share one canonical name) ---
    "bdsg_2018_komplett": "Bundesdatenschutzgesetz (BDSG)",
    "bdsg": "Bundesdatenschutzgesetz (BDSG)",
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Script body: report every (regulation, source name) pair whose name differs
# from the canonical entry in SOURCE_NAMES, rewrite those names unless
# --dry-run was given, then print the resulting name distribution.
# ---------------------------------------------------------------------------
print("=" * 60)
print("TASK 3: NORMALIZE SOURCE NAMES")
print("=" * 60)

# All statements share one engine.begin() block, i.e. one transaction scope:
# the reads, the updates and the verification query run on the same
# connection.
with engine.begin() as conn:
    # Find all current source_name variants, grouped per regulation.
    # Only JSON objects are considered: jsonb_set() cannot write a key into
    # a scalar/array/null value, so such rows could not be normalized anyway.
    variants = conn.execute(sql_text("""
        SELECT generation_metadata->>'source_regulation' as reg,
               source_citation->>'source' as current_name,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation IS NOT NULL
          AND jsonb_typeof(source_citation) = 'object'
          AND generation_metadata->>'source_regulation' IS NOT NULL
        GROUP BY 1, 2
        ORDER BY 1, cnt DESC
    """))

    # Keep only the groups that belong to a known regulation and do not
    # already carry its canonical name. `current` is None when the 'source'
    # key is absent or JSON null; those groups are normalized as well.
    updates = []
    for reg, current, count in variants:
        canonical = SOURCE_NAMES.get(reg)
        if canonical and current != canonical:
            updates.append((reg, current, canonical, count))

    print(f"\n Source names to normalize: {len(updates)}")
    print(f"\n {'Regulation':30s} {'From':45s} → {'To':45s} {'Count':>5s}")
    print(f" {'-' * 130}")

    total_updated = 0
    for reg, old_name, new_name, count in updates:
        # A missing name cannot be sliced or formatted with the 's' spec,
        # so substitute a placeholder for display only.
        shown_old = "(missing)" if old_name is None else old_name
        print(f" {reg:30s} {shown_old[:45]:45s} → {new_name[:45]:45s} {count:5d}")

        if DRY_RUN:
            # Nothing is written; report the number of rows that would change.
            total_updated += count
            continue

        name_json = json.dumps(new_name)  # "name" with quotes for jsonb
        result = conn.execute(sql_text("""
            UPDATE compliance.canonical_controls
            SET source_citation = jsonb_set(
                source_citation,
                '{source}',
                CAST(:name_json AS jsonb)
            )
            WHERE generation_metadata->>'source_regulation' = :reg
              AND jsonb_typeof(source_citation) = 'object'
              AND source_citation->>'source' IS NOT DISTINCT FROM :old_name
        """), {"reg": reg, "old_name": old_name, "name_json": name_json})

        # `IS NOT DISTINCT FROM` is a NULL-safe equality, so rows whose name
        # is missing are matched too (plain `=` never matches NULL).
        # Prefer the number of rows the UPDATE actually touched; fall back to
        # the planned count if the driver reports no usable rowcount.
        changed = result.rowcount
        total_updated += changed if changed >= 0 else count

    print(f"\n Total controls updated: {total_updated}")
    print(f" Dry run: {DRY_RUN}")

    # Verify: list the name distribution after the updates, restricted to
    # (regulation, name) pairs that cover at least five controls.
    if not DRY_RUN:
        verified = conn.execute(sql_text("""
            SELECT generation_metadata->>'source_regulation' as reg,
                   source_citation->>'source' as name,
                   count(*)
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND generation_metadata->>'source_regulation' IS NOT NULL
            GROUP BY 1, 2
            HAVING count(*) >= 5
            ORDER BY count(*) DESC
        """))
        print("\n === Verified source names (>= 5 controls) ===")
        for reg, name, cnt in verified:
            # str() keeps the row printable when a name is still NULL.
            print(f" {str(reg):30s} {str(name):50s} {cnt:5d}")
|
||||
Reference in New Issue
Block a user