chore(qa): add PDF-based control QA scripts and results
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync. Key results (2026-03-20): - 4,110/7,943 controls matched to PDF (100% for major EU regs) - 3,366 article corrections, 705 new assignments - 1,290 controls from Erwägungsgründe (preamble) identified - 779 controls from Anhänge (annexes) identified Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
95
scripts/qa/pdf_qa_inventory.py
Normal file
95
scripts/qa/pdf_qa_inventory.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""Inventory: Which regulations have controls, how many, and do we have PDFs?"""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
|
||||
TEXT_DIR = Path(os.path.expanduser("~/rag-ingestion/texts"))
|
||||
|
||||
# DB connection
|
||||
db_url = os.environ['DATABASE_URL']
|
||||
parsed = urllib.parse.urlparse(db_url)
|
||||
conn = psycopg2.connect(
|
||||
host=parsed.hostname, port=parsed.port or 5432,
|
||||
user=parsed.username, password=parsed.password,
|
||||
dbname=parsed.path.lstrip('/'),
|
||||
options="-c search_path=compliance,public"
|
||||
)
|
||||
cur = conn.cursor()
|
||||
|
||||
# Get all regulations with controls (excluding duplicates/too_close)
|
||||
cur.execute("""
|
||||
SELECT
|
||||
source_citation->>'source' as source_name,
|
||||
count(*) as total,
|
||||
count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close')) as active,
|
||||
count(*) FILTER (WHERE source_citation->>'article' IS NOT NULL AND source_citation->>'article' != '') as has_article,
|
||||
count(*) FILTER (WHERE source_original_text IS NOT NULL AND length(source_original_text) > 50) as has_text
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation IS NOT NULL
|
||||
GROUP BY 1
|
||||
ORDER BY active DESC
|
||||
""")
|
||||
regs = cur.fetchall()
|
||||
|
||||
# List available PDFs and text files
|
||||
pdf_files = {f.stem: f for f in PDF_DIR.glob("*.pdf")} if PDF_DIR.exists() else {}
|
||||
txt_files = {f.stem: f for f in TEXT_DIR.glob("*.txt")} if TEXT_DIR.exists() else {}
|
||||
html_files = {f.stem: f for f in PDF_DIR.glob("*.html")} if PDF_DIR.exists() else {}
|
||||
|
||||
# Also check for XML/zip files
|
||||
all_files = {}
|
||||
if PDF_DIR.exists():
|
||||
for f in PDF_DIR.iterdir():
|
||||
all_files[f.stem] = f
|
||||
|
||||
print(f"{'Source':55s} {'Total':>6s} {'Active':>7s} {'w/Art':>6s} {'w/Text':>7s} {'PDF?':>5s}")
|
||||
print("-" * 92)
|
||||
|
||||
total_controls = 0
|
||||
total_active = 0
|
||||
total_with_text = 0
|
||||
total_with_pdf = 0
|
||||
no_pdf = []
|
||||
|
||||
for row in regs:
|
||||
source, total, active, has_art, has_text = row
|
||||
if not source:
|
||||
source = "(null)"
|
||||
total_controls += total
|
||||
total_active += active
|
||||
total_with_text += has_text if active > 0 else 0
|
||||
|
||||
# Try to find matching PDF
|
||||
has_pdf = "?"
|
||||
# Common name mappings
|
||||
name_lower = source.lower()
|
||||
for stem, path in all_files.items():
|
||||
if stem.lower() in name_lower or name_lower[:20] in stem.lower():
|
||||
has_pdf = path.suffix
|
||||
break
|
||||
|
||||
if active > 0:
|
||||
if has_pdf == "?":
|
||||
no_pdf.append((source, active, has_text))
|
||||
print(f"{source[:55]:55s} {total:6d} {active:7d} {has_art:6d} {has_text:7d} {has_pdf:>5s}")
|
||||
|
||||
print("-" * 92)
|
||||
print(f"{'TOTAL':55s} {total_controls:6d} {total_active:7d}")
|
||||
print(f"\nAvailable files in {PDF_DIR}: {len(all_files)}")
|
||||
print(f" PDFs: {len(pdf_files)}, TXT: {len(txt_files)}, HTML: {len(html_files)}")
|
||||
|
||||
if no_pdf:
|
||||
print(f"\n=== Regulations WITHOUT obvious PDF match ({len(no_pdf)}) ===")
|
||||
for source, active, has_text in no_pdf:
|
||||
print(f" {source[:60]:60s} {active:4d} controls, {has_text:4d} with text")
|
||||
|
||||
# Also list all available files for manual matching
|
||||
print(f"\n=== Available source files ({len(all_files)}) ===")
|
||||
for stem in sorted(all_files.keys()):
|
||||
print(f" {stem}{all_files[stem].suffix}")
|
||||
|
||||
conn.close()
|
||||
Reference in New Issue
Block a user