Files
breakpilot-compliance/scripts/qa/pdf_qa_inventory.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (recitals) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

96 lines
3.2 KiB
Python

"""Inventory: Which regulations have controls, how many, and do we have PDFs?"""
import os
import re
import json
import psycopg2
import urllib.parse
from pathlib import Path
# Directories holding the downloaded source documents (PDF/TXT/HTML/...).
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
TEXT_DIR = Path(os.path.expanduser("~/rag-ingestion/texts"))

# Connect to the compliance database described by DATABASE_URL,
# resolving unqualified table names against the `compliance` schema first.
parsed = urllib.parse.urlparse(os.environ['DATABASE_URL'])
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    user=parsed.username,
    password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public",
)
cur = conn.cursor()
# Get all regulations with controls (excluding duplicates/too_close).
# One row per distinct source_citation->>'source' value, with:
#   total       - all controls citing that source
#   active      - controls whose release_state is not 'duplicate'/'too_close'
#   has_article - controls with a non-empty article assignment
#   has_text    - controls carrying more than 50 chars of original source text
# Ordered by active count descending, so the biggest regulations print first.
cur.execute("""
SELECT
source_citation->>'source' as source_name,
count(*) as total,
count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close')) as active,
count(*) FILTER (WHERE source_citation->>'article' IS NOT NULL AND source_citation->>'article' != '') as has_article,
count(*) FILTER (WHERE source_original_text IS NOT NULL AND length(source_original_text) > 50) as has_text
FROM compliance.canonical_controls
WHERE source_citation IS NOT NULL
GROUP BY 1
ORDER BY active DESC
""")
# Rows are (source_name, total, active, has_article, has_text) tuples.
regs = cur.fetchall()
# List available PDFs and text files
pdf_files = {f.stem: f for f in PDF_DIR.glob("*.pdf")} if PDF_DIR.exists() else {}
txt_files = {f.stem: f for f in TEXT_DIR.glob("*.txt")} if TEXT_DIR.exists() else {}
html_files = {f.stem: f for f in PDF_DIR.glob("*.html")} if PDF_DIR.exists() else {}
# Also check for XML/zip files
all_files = {}
if PDF_DIR.exists():
for f in PDF_DIR.iterdir():
all_files[f.stem] = f
def _match_source_file(source_name, files):
    """Heuristically find a local source file for a regulation name.

    A file matches when its stem is contained in the regulation name, or the
    first 20 characters of the name are contained in the stem (both compared
    case-insensitively).  Returns the matching file's suffix (e.g. ".pdf"),
    or "?" when nothing matches.
    """
    name_lower = source_name.lower()
    for stem, path in files.items():
        if stem.lower() in name_lower or name_lower[:20] in stem.lower():
            return path.suffix
    return "?"


# Per-regulation report: control counts plus whether a source file was found.
print(f"{'Source':55s} {'Total':>6s} {'Active':>7s} {'w/Art':>6s} {'w/Text':>7s} {'PDF?':>5s}")
print("-" * 92)
total_controls = 0
total_active = 0
no_pdf = []  # active regulations with no matching local source file
for source, total, active, has_art, has_text in regs:
    if not source:
        source = "(null)"
    total_controls += total
    total_active += active
    has_pdf = _match_source_file(source, all_files)
    # Only regulations that still have active controls need a source file.
    if active > 0 and has_pdf == "?":
        no_pdf.append((source, active, has_text))
    print(f"{source[:55]:55s} {total:6d} {active:7d} {has_art:6d} {has_text:7d} {has_pdf:>5s}")
print("-" * 92)
print(f"{'TOTAL':55s} {total_controls:6d} {total_active:7d}")
print(f"\nAvailable files in {PDF_DIR}: {len(all_files)}")
print(f" PDFs: {len(pdf_files)}, TXT: {len(txt_files)}, HTML: {len(html_files)}")
if no_pdf:
    print(f"\n=== Regulations WITHOUT obvious PDF match ({len(no_pdf)}) ===")
    for source, active, has_text in no_pdf:
        print(f" {source[:60]:60s} {active:4d} controls, {has_text:4d} with text")
    # Also list all available files for manual matching
    print(f"\n=== Available source files ({len(all_files)}) ===")
    for stem in sorted(all_files.keys()):
        print(f" {stem}{all_files[stem].suffix}")
conn.close()