Files
breakpilot-compliance/scripts/qa/pdf_qa_inventory.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (recitals) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

96 lines
3.2 KiB
Python

"""Inventory: Which regulations have controls, how many, and do we have PDFs?"""
import os
import re
import json
import psycopg2
import urllib.parse
from pathlib import Path
# Directories holding the downloaded source documents (PDF/TXT/HTML/...).
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
TEXT_DIR = Path(os.path.expanduser("~/rag-ingestion/texts"))

# Connect to the compliance database described by DATABASE_URL,
# resolving unqualified table names against the `compliance` schema first.
parsed = urllib.parse.urlparse(os.environ['DATABASE_URL'])
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    user=parsed.username,
    password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public",
)
cur = conn.cursor()
# Get all regulations with controls (excluding duplicates/too_close).
# One row per distinct source_citation->>'source' value, with:
#   total       - all controls citing that source
#   active      - controls whose release_state is not 'duplicate'/'too_close'
#   has_article - controls with a non-empty article assignment
#   has_text    - controls carrying more than 50 chars of original source text
# Ordered by active count descending, so the biggest regulations print first.
cur.execute("""
SELECT
source_citation->>'source' as source_name,
count(*) as total,
count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close')) as active,
count(*) FILTER (WHERE source_citation->>'article' IS NOT NULL AND source_citation->>'article' != '') as has_article,
count(*) FILTER (WHERE source_original_text IS NOT NULL AND length(source_original_text) > 50) as has_text
FROM compliance.canonical_controls
WHERE source_citation IS NOT NULL
GROUP BY 1
ORDER BY active DESC
""")
# Rows are (source_name, total, active, has_article, has_text) tuples.
regs = cur.fetchall()
# List available PDFs and text files
pdf_files = {f.stem: f for f in PDF_DIR.glob("*.pdf")} if PDF_DIR.exists() else {}
txt_files = {f.stem: f for f in TEXT_DIR.glob("*.txt")} if TEXT_DIR.exists() else {}
html_files = {f.stem: f for f in PDF_DIR.glob("*.html")} if PDF_DIR.exists() else {}
# Also check for XML/zip files
all_files = {}
if PDF_DIR.exists():
for f in PDF_DIR.iterdir():
all_files[f.stem] = f
def _match_source_file(source_name, files):
    """Heuristically find a local source file for a regulation name.

    A file matches when its stem is contained in the regulation name, or the
    first 20 characters of the name are contained in the stem (both compared
    case-insensitively).  Returns the matching file's suffix (e.g. ".pdf"),
    or "?" when nothing matches.
    """
    name_lower = source_name.lower()
    for stem, path in files.items():
        if stem.lower() in name_lower or name_lower[:20] in stem.lower():
            return path.suffix
    return "?"


# Per-regulation report: control counts plus whether a source file was found.
print(f"{'Source':55s} {'Total':>6s} {'Active':>7s} {'w/Art':>6s} {'w/Text':>7s} {'PDF?':>5s}")
print("-" * 92)
total_controls = 0
total_active = 0
no_pdf = []  # active regulations with no matching local source file
for source, total, active, has_art, has_text in regs:
    if not source:
        source = "(null)"
    total_controls += total
    total_active += active
    has_pdf = _match_source_file(source, all_files)
    # Only regulations that still have active controls need a source file.
    if active > 0 and has_pdf == "?":
        no_pdf.append((source, active, has_text))
    print(f"{source[:55]:55s} {total:6d} {active:7d} {has_art:6d} {has_text:7d} {has_pdf:>5s}")
print("-" * 92)
print(f"{'TOTAL':55s} {total_controls:6d} {total_active:7d}")
print(f"\nAvailable files in {PDF_DIR}: {len(all_files)}")
print(f" PDFs: {len(pdf_files)}, TXT: {len(txt_files)}, HTML: {len(html_files)}")
if no_pdf:
    print(f"\n=== Regulations WITHOUT obvious PDF match ({len(no_pdf)}) ===")
    for source, active, has_text in no_pdf:
        print(f" {source[:60]:60s} {active:4d} controls, {has_text:4d} with text")
    # Also list all available files for manual matching
    print(f"\n=== Available source files ({len(all_files)}) ===")
    for stem in sorted(all_files.keys()):
        print(f" {stem}{all_files[stem].suffix}")
conn.close()