chore(qa): add PDF-based control QA scripts and results

QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync. Key results (2026-03-20): - 4,110/7,943 controls matched to PDF (100% for major EU regs) - 3,366 article corrections, 705 new assignments - 1,290 controls from Erwägungsgründe (preamble) identified - 779 controls from Anhänge (annexes) identified Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00
parent 1cc34c23d9
commit 9b0f25c105
12 changed files with 30839 additions and 0 deletions
@@ -0,0 +1,95 @@
+"""Inventory: Which regulations have controls, how many, and do we have PDFs?"""
+import os
+import re
+import json
+import psycopg2
+import urllib.parse
+from pathlib import Path
+
+PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
+TEXT_DIR = Path(os.path.expanduser("~/rag-ingestion/texts"))
+
+# DB connection
+db_url = os.environ['DATABASE_URL']
+parsed = urllib.parse.urlparse(db_url)
+conn = psycopg2.connect(
+    host=parsed.hostname, port=parsed.port or 5432,
+    user=parsed.username, password=parsed.password,
+    dbname=parsed.path.lstrip('/'),
+    options="-c search_path=compliance,public"
+)
+cur = conn.cursor()
+
+# Get all regulations with controls (excluding duplicates/too_close)
+cur.execute("""
+    SELECT
+        source_citation->>'source' as source_name,
+        count(*) as total,
+        count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close')) as active,
+        count(*) FILTER (WHERE source_citation->>'article' IS NOT NULL AND source_citation->>'article' != '') as has_article,
+        count(*) FILTER (WHERE source_original_text IS NOT NULL AND length(source_original_text) > 50) as has_text
+    FROM compliance.canonical_controls
+    WHERE source_citation IS NOT NULL
+    GROUP BY 1
+    ORDER BY active DESC
+""")
+regs = cur.fetchall()
+
+# List available PDFs and text files
+pdf_files = {f.stem: f for f in PDF_DIR.glob("*.pdf")} if PDF_DIR.exists() else {}
+txt_files = {f.stem: f for f in TEXT_DIR.glob("*.txt")} if TEXT_DIR.exists() else {}
+html_files = {f.stem: f for f in PDF_DIR.glob("*.html")} if PDF_DIR.exists() else {}
+
+# Also check for XML/zip files
+all_files = {}
+if PDF_DIR.exists():
+    for f in PDF_DIR.iterdir():
+        all_files[f.stem] = f
+
+print(f"{'Source':55s} {'Total':>6s} {'Active':>7s} {'w/Art':>6s} {'w/Text':>7s} {'PDF?':>5s}")
+print("-" * 92)
+
+total_controls = 0
+total_active = 0
+total_with_text = 0
+total_with_pdf = 0
+no_pdf = []
+
+for row in regs:
+    source, total, active, has_art, has_text = row
+    if not source:
+        source = "(null)"
+    total_controls += total
+    total_active += active
+    total_with_text += has_text if active > 0 else 0
+
+    # Try to find matching PDF
+    has_pdf = "?"
+    # Common name mappings
+    name_lower = source.lower()
+    for stem, path in all_files.items():
+        if stem.lower() in name_lower or name_lower[:20] in stem.lower():
+            has_pdf = path.suffix
+            break
+
+    if active > 0:
+        if has_pdf == "?":
+            no_pdf.append((source, active, has_text))
+        print(f"{source[:55]:55s} {total:6d} {active:7d} {has_art:6d} {has_text:7d} {has_pdf:>5s}")
+
+print("-" * 92)
+print(f"{'TOTAL':55s} {total_controls:6d} {total_active:7d}")
+print(f"\nAvailable files in {PDF_DIR}: {len(all_files)}")
+print(f"  PDFs: {len(pdf_files)}, TXT: {len(txt_files)}, HTML: {len(html_files)}")
+
+if no_pdf:
+    print(f"\n=== Regulations WITHOUT obvious PDF match ({len(no_pdf)}) ===")
+    for source, active, has_text in no_pdf:
+        print(f"  {source[:60]:60s} {active:4d} controls, {has_text:4d} with text")
+
+# Also list all available files for manual matching
+print(f"\n=== Available source files ({len(all_files)}) ===")
+for stem in sorted(all_files.keys()):
+    print(f"  {stem}{all_files[stem].suffix}")
+
+conn.close()