"""Inventory: Which regulations have controls, how many, and do we have PDFs?""" import os import re import json import psycopg2 import urllib.parse from pathlib import Path PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs")) TEXT_DIR = Path(os.path.expanduser("~/rag-ingestion/texts")) # DB connection db_url = os.environ['DATABASE_URL'] parsed = urllib.parse.urlparse(db_url) conn = psycopg2.connect( host=parsed.hostname, port=parsed.port or 5432, user=parsed.username, password=parsed.password, dbname=parsed.path.lstrip('/'), options="-c search_path=compliance,public" ) cur = conn.cursor() # Get all regulations with controls (excluding duplicates/too_close) cur.execute(""" SELECT source_citation->>'source' as source_name, count(*) as total, count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close')) as active, count(*) FILTER (WHERE source_citation->>'article' IS NOT NULL AND source_citation->>'article' != '') as has_article, count(*) FILTER (WHERE source_original_text IS NOT NULL AND length(source_original_text) > 50) as has_text FROM compliance.canonical_controls WHERE source_citation IS NOT NULL GROUP BY 1 ORDER BY active DESC """) regs = cur.fetchall() # List available PDFs and text files pdf_files = {f.stem: f for f in PDF_DIR.glob("*.pdf")} if PDF_DIR.exists() else {} txt_files = {f.stem: f for f in TEXT_DIR.glob("*.txt")} if TEXT_DIR.exists() else {} html_files = {f.stem: f for f in PDF_DIR.glob("*.html")} if PDF_DIR.exists() else {} # Also check for XML/zip files all_files = {} if PDF_DIR.exists(): for f in PDF_DIR.iterdir(): all_files[f.stem] = f print(f"{'Source':55s} {'Total':>6s} {'Active':>7s} {'w/Art':>6s} {'w/Text':>7s} {'PDF?':>5s}") print("-" * 92) total_controls = 0 total_active = 0 total_with_text = 0 total_with_pdf = 0 no_pdf = [] for row in regs: source, total, active, has_art, has_text = row if not source: source = "(null)" total_controls += total total_active += active total_with_text += has_text if active > 0 else 0 # Try to find matching PDF has_pdf = "?" # Common name mappings name_lower = source.lower() for stem, path in all_files.items(): if stem.lower() in name_lower or name_lower[:20] in stem.lower(): has_pdf = path.suffix break if active > 0: if has_pdf == "?": no_pdf.append((source, active, has_text)) print(f"{source[:55]:55s} {total:6d} {active:7d} {has_art:6d} {has_text:7d} {has_pdf:>5s}") print("-" * 92) print(f"{'TOTAL':55s} {total_controls:6d} {total_active:7d}") print(f"\nAvailable files in {PDF_DIR}: {len(all_files)}") print(f" PDFs: {len(pdf_files)}, TXT: {len(txt_files)}, HTML: {len(html_files)}") if no_pdf: print(f"\n=== Regulations WITHOUT obvious PDF match ({len(no_pdf)}) ===") for source, active, has_text in no_pdf: print(f" {source[:60]:60s} {active:4d} controls, {has_text:4d} with text") # Also list all available files for manual matching print(f"\n=== Available source files ({len(all_files)}) ===") for stem in sorted(all_files.keys()): print(f" {stem}{all_files[stem].suffix}") conn.close()