Files
breakpilot-compliance/scripts/qa/pdf_article_lookup_poc.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

132 lines
4.5 KiB
Python

"""POC v2: Find control's source text in PDF — distinguish headings from cross-refs."""
import os
import re
import fitz # PyMuPDF
import psycopg2
import urllib.parse
import unicodedata
# Source document: EU Battery Regulation 2023/1542 as a local PDF.
PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/battery_2023_1542.pdf")

# Step 1: flatten the whole PDF into one searchable string, one "\n"
# appended after each page so page boundaries never glue words together.
print("=== Step 1: Reading PDF ===")
doc = fitz.open(PDF_PATH)
full_text = "".join(page.get_text() + "\n" for page in doc)
print(f" Pages: {len(doc)}, Total chars: {len(full_text)}")
def normalize(s):
    """Return *s* cleaned for substring matching.

    Strips soft hyphens and zero-width spaces (PDF line-break artifacts),
    applies NFC so composed/decomposed umlauts compare equal, and collapses
    all whitespace runs to single spaces.
    """
    # '\u00ad' and '\xad' are the SAME codepoint (soft hyphen), so the
    # original double .replace() was redundant — one translate pass removes
    # both invisible characters in a single C-level sweep.
    s = s.translate({0x00AD: None, 0x200B: None})
    s = unicodedata.normalize('NFC', s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()
# Step 2: Build article heading index.
# Article headings in EU regulations sit alone on a line ("Artikel 76")
# followed by a title line ("Rücknahme"); cross-references appear inline
# ("gemäß Artikel 290 des Vertrags"). The line-anchored pattern below
# therefore only fires on genuine headings.
print("\n=== Step 2: Building article HEADING index ===")
heading_pattern = re.compile(r'(?:^|\n)\s*Artikel\s+(\d+[a-z]?)\s*\n', re.MULTILINE)

# Keep only plausible numbers: the Batterieverordnung has Artikel 1-96,
# so hits like 114/192/290 are cross-references to other acts.
candidates = [
    (m.start(), m.group(1))
    for m in heading_pattern.finditer(full_text)
    if int(re.match(r'(\d+)', m.group(1)).group(1)) <= 96
]

# Deduplicate per article label, keeping the earliest occurrence; sorting
# by offset first makes setdefault() pick the first position in the text.
first_offset = {}
for offset, label in sorted(candidates):
    first_offset.setdefault(label, offset)
headings = sorted((offset, label) for label, offset in first_offset.items())

print(f" Found {len(headings)} unique article headings")
for offset, label in headings[:15]:
    preview = full_text[offset:offset + 60].replace('\n', '|')
    print(f" Pos {offset:6d}: Artikel {label:3s}'{preview[:50]}'")
if len(headings) > 15:
    print(f" ... and {len(headings)-15} more (up to Artikel {headings[-1][1]})")
# Snippet search happens in normalized space, so translate each heading's
# raw byte offset into a normalized-text offset by measuring the
# normalized length of the prefix up to that heading.
full_norm = normalize(full_text)
heading_norm_positions = [
    (len(normalize(full_text[:raw_pos])), label)
    for raw_pos, label in headings
]
# Step 3: pull candidate controls from Postgres.
print("\n=== Step 3: Looking up controls ===")
dsn = urllib.parse.urlparse(os.environ['DATABASE_URL'])
conn = psycopg2.connect(
    host=dsn.hostname,
    port=dsn.port or 5432,
    user=dsn.username,
    password=dsn.password,
    dbname=dsn.path.lstrip('/'),
    # Pin the schema search path so unqualified names resolve predictably.
    options="-c search_path=compliance,public",
)
cur = conn.cursor()
# Only controls that cite regulation 2023/1542 AND carry original source
# text are matchable against the PDF. ('%%' is a literal '%' wildcard for
# the psycopg2 paramstyle.)
cur.execute("""
SELECT id, control_id, title, source_original_text,
source_citation->>'article' as existing_article
FROM compliance.canonical_controls
WHERE source_citation->>'source' LIKE '%%1542%%'
AND source_original_text IS NOT NULL
ORDER BY control_id
""")
controls = cur.fetchall()
print(f" Got {len(controls)} controls")
# Step 4: locate each control's source text in the normalized PDF text and
# attribute it to the article whose heading last precedes the hit.
print("\n=== Step 4: Matching controls to PDF articles ===")
found = 0
not_found = 0
results = []  # (db_id, control_id, resolved_article) for later write-back
for ctrl_id, control_id, title, orig_text, existing_art in controls:
    orig_norm = normalize(orig_text)
    matched = False
    # Probe with progressively shorter snippets taken ~25% into the text
    # (skips boilerplate prefixes); shorter probes tolerate OCR noise.
    for length in (80, 60, 40, 30):
        start = len(orig_norm) // 4  # len() >= 0, so no max(0, ...) clamp needed
        snippet = orig_norm[start:start + length]
        if len(snippet) < 20:
            # Too short to be a trustworthy probe (also covers empty text).
            continue
        pos = full_norm.find(snippet)
        if pos < 0:
            continue
        # Owning article = last heading at or before the match position;
        # anything before the first heading belongs to the preamble.
        article = "Preamble"
        for h_pos, h_num in reversed(heading_norm_positions):
            if h_pos <= pos:
                article = h_num
                break
        if existing_art == article:
            status = "MATCH"
        elif not existing_art:
            status = "NEW"
        else:
            # '->' separates old/new: 7 vs 6 now prints DIFF(7->6), not the
            # ambiguous DIFF(76) the bare concatenation produced.
            status = f"DIFF({existing_art}->{article})"
        print(f" {control_id:10s}: Artikel {article:3s} [{status}] {title[:55]}")
        found += 1
        matched = True
        results.append((ctrl_id, control_id, article))
        break
    if not matched:
        not_found += 1
        print(f" {control_id:10s}: NOT FOUND {title[:55]}")
        print(f" Text: '{orig_norm[20:70]}...'")
print(f"\n=== Result: {found}/{len(controls)} found ({not_found} not found) ===")
if headings:
    print(f" Articles covered: {headings[0][1]} - {headings[-1][1]}")
# Close the cursor explicitly before the connection.
cur.close()
conn.close()