Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches each control's source_original_text directly against the original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
96 lines
3.2 KiB
Python
96 lines
3.2 KiB
Python
"""Inventory: Which regulations have controls, how many, and do we have PDFs?"""
|
|
import os
|
|
import re
|
|
import json
|
|
import psycopg2
|
|
import urllib.parse
|
|
from pathlib import Path
|
|
|
|
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
TEXT_DIR = Path(os.path.expanduser("~/rag-ingestion/texts"))

# One row per regulation (source_citation->>'source'). Note that duplicates
# and too_close controls are NOT filtered out of the result: `total` counts
# every control, while `active` counts only those whose release_state is
# neither 'duplicate' nor 'too_close'.
INVENTORY_SQL = """
    SELECT
        source_citation->>'source' as source_name,
        count(*) as total,
        count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close')) as active,
        count(*) FILTER (WHERE source_citation->>'article' IS NOT NULL AND source_citation->>'article' != '') as has_article,
        count(*) FILTER (WHERE source_original_text IS NOT NULL AND length(source_original_text) > 50) as has_text
    FROM compliance.canonical_controls
    WHERE source_citation IS NOT NULL
    GROUP BY 1
    ORDER BY active DESC
"""


def connection_kwargs(db_url):
    """Translate a PostgreSQL connection URL into psycopg2.connect() kwargs.

    ``urllib.parse.urlparse`` returns user name and password exactly as
    written in the URL, i.e. still percent-encoded. They are decoded here so
    that credentials containing characters such as ``@`` or ``/`` (written as
    ``%40`` / ``%2F`` in the URL) reach the server unmangled.

    Args:
        db_url: URL of the form ``postgresql://user:password@host:port/dbname``.

    Returns:
        Dict with ``host``, ``port`` (5432 when the URL has none), ``user``,
        ``password``, ``dbname`` and ``options``; absent user/password are
        ``None``.
    """
    parsed = urllib.parse.urlparse(db_url)
    unquote = urllib.parse.unquote
    return {
        "host": parsed.hostname,
        "port": parsed.port or 5432,
        "user": unquote(parsed.username) if parsed.username else None,
        "password": unquote(parsed.password) if parsed.password else None,
        "dbname": unquote(parsed.path.lstrip('/')),
        "options": "-c search_path=compliance,public",
    }


def fetch_regulations(db_url):
    """Run INVENTORY_SQL and return its rows.

    The connection is closed as soon as the rows are fetched, and also when
    the query raises, so it is never leaked or held open while the report is
    being printed.

    Returns:
        List of ``(source_name, total, active, has_article, has_text)`` tuples.
    """
    conn = psycopg2.connect(**connection_kwargs(db_url))
    try:
        with conn.cursor() as cur:
            cur.execute(INVENTORY_SQL)
            return cur.fetchall()
    finally:
        conn.close()


def list_directory(directory, pattern=None):
    """Return the sorted entries of *directory*, or [] if it is not a directory.

    Args:
        directory: ``Path`` to scan (non-recursively).
        pattern: optional glob pattern such as ``"*.pdf"``; all entries when
            omitted.
    """
    if not directory.is_dir():
        return []
    entries = directory.iterdir() if pattern is None else directory.glob(pattern)
    # Sorted so that matching and listing are reproducible between runs.
    return sorted(entries)


def find_matching_file(source, files):
    """Return the first path in *files* that plausibly belongs to *source*.

    Heuristic, case-insensitive substring match: either the file stem occurs
    in the regulation name, or the first 20 characters of the regulation name
    occur in the file stem. Very short stems can therefore produce false
    positives; the result is a hint for manual review, not a verified link.

    Returns:
        The matching ``Path`` or ``None``.
    """
    name_lower = source.lower()
    prefix = name_lower[:20]
    for path in files:
        stem_lower = path.stem.lower()
        if stem_lower in name_lower or prefix in stem_lower:
            return path
    return None


def format_table(regs, files):
    """Build the per-regulation inventory table.

    Args:
        regs: rows as returned by ``fetch_regulations``.
        files: candidate source files (``Path`` objects) to match against.

    Returns:
        ``(lines, unmatched)`` where *lines* are the printable table lines
        (header, rows, totals) and *unmatched* is a list of
        ``(source, active, has_text)`` for regulations with active controls
        but no matching file.
    """
    lines = [
        f"{'Source':55s} {'Total':>6s} {'Active':>7s} {'w/Art':>6s} {'w/Text':>7s} {'PDF?':>5s}",
        "-" * 92,
    ]
    total_controls = 0
    total_active = 0
    total_with_text = 0
    total_with_file = 0
    unmatched = []

    for source, total, active, has_art, has_text in regs:
        if not source:
            source = "(null)"
        total_controls += total
        total_active += active

        match = find_matching_file(source, files)
        marker = "?" if match is None else match.suffix

        # NOTE(review): the original indentation was lost; rows are emitted
        # only for regulations that still have active controls -- confirm.
        if active > 0:
            total_with_text += has_text
            if match is None:
                unmatched.append((source, active, has_text))
            else:
                total_with_file += active
            lines.append(
                f"{source[:55]:55s} {total:6d} {active:7d} {has_art:6d} {has_text:7d} {marker:>5s}"
            )

    lines.append("-" * 92)
    lines.append(f"{'TOTAL':55s} {total_controls:6d} {total_active:7d}")
    # Both figures only consider regulations with at least one active control.
    lines.append(
        f"Active controls with a matched file: {total_with_file}, "
        f"controls with source text: {total_with_text}"
    )
    return lines, unmatched


def format_unmatched(unmatched):
    """Return the printable section listing regulations without a file match.

    Returns an empty list when *unmatched* is empty so that no heading is
    printed for an empty section.
    """
    if not unmatched:
        return []
    lines = [f"\n=== Regulations WITHOUT obvious PDF match ({len(unmatched)}) ==="]
    for source, active, has_text in unmatched:
        lines.append(f" {source[:60]:60s} {active:4d} controls, {has_text:4d} with text")
    return lines


def main():
    """Print the regulation/PDF inventory for the DB named by DATABASE_URL."""
    regs = fetch_regulations(os.environ['DATABASE_URL'])

    # Every entry in PDF_DIR is a match candidate (PDF, HTML, XML, zip, ...).
    # Kept as a list of paths rather than a dict keyed by stem, so that
    # "foo.pdf" and "foo.html" no longer overwrite each other.
    all_files = list_directory(PDF_DIR)
    pdf_count = len(list_directory(PDF_DIR, "*.pdf"))
    txt_count = len(list_directory(TEXT_DIR, "*.txt"))
    html_count = len(list_directory(PDF_DIR, "*.html"))

    table_lines, unmatched = format_table(regs, all_files)
    for line in table_lines:
        print(line)

    print(f"\nAvailable files in {PDF_DIR}: {len(all_files)}")
    print(f" PDFs: {pdf_count}, TXT: {txt_count}, HTML: {html_count}")

    for line in format_unmatched(unmatched):
        print(line)

    # Also list all available files for manual matching
    print(f"\n=== Available source files ({len(all_files)}) ===")
    for path in all_files:
        print(f" {path.name}")


if __name__ == "__main__":
    main()