Files
breakpilot-compliance/scripts/qa/pdf_article_lookup_poc.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

132 lines
4.5 KiB
Python

"""POC v2: Find control's source text in PDF — distinguish headings from cross-refs."""
import os
import re
import fitz # PyMuPDF
import psycopg2
import urllib.parse
import unicodedata
# Source document: EU Battery Regulation 2023/1542 as a local PDF.
PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/battery_2023_1542.pdf")

# Step 1: flatten the whole PDF into one searchable string, one "\n"
# appended after each page so page boundaries never glue words together.
print("=== Step 1: Reading PDF ===")
doc = fitz.open(PDF_PATH)
full_text = "".join(page.get_text() + "\n" for page in doc)
print(f" Pages: {len(doc)}, Total chars: {len(full_text)}")
def normalize(s):
    """Return *s* cleaned for substring matching.

    Strips soft hyphens and zero-width spaces (PDF line-break artifacts),
    applies NFC so composed/decomposed umlauts compare equal, and collapses
    all whitespace runs to single spaces.
    """
    # '\u00ad' and '\xad' are the SAME codepoint (soft hyphen), so the
    # original double .replace() was redundant — one translate pass removes
    # both invisible characters in a single C-level sweep.
    s = s.translate({0x00AD: None, 0x200B: None})
    s = unicodedata.normalize('NFC', s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()
# Step 2: Build article heading index.
# Article headings in EU regulations sit alone on a line ("Artikel 76")
# followed by a title line ("Rücknahme"); cross-references appear inline
# ("gemäß Artikel 290 des Vertrags"). The line-anchored pattern below
# therefore only fires on genuine headings.
print("\n=== Step 2: Building article HEADING index ===")
heading_pattern = re.compile(r'(?:^|\n)\s*Artikel\s+(\d+[a-z]?)\s*\n', re.MULTILINE)

# Keep only plausible numbers: the Batterieverordnung has Artikel 1-96,
# so hits like 114/192/290 are cross-references to other acts.
candidates = [
    (m.start(), m.group(1))
    for m in heading_pattern.finditer(full_text)
    if int(re.match(r'(\d+)', m.group(1)).group(1)) <= 96
]

# Deduplicate per article label, keeping the earliest occurrence; sorting
# by offset first makes setdefault() pick the first position in the text.
first_offset = {}
for offset, label in sorted(candidates):
    first_offset.setdefault(label, offset)
headings = sorted((offset, label) for label, offset in first_offset.items())

print(f" Found {len(headings)} unique article headings")
for offset, label in headings[:15]:
    preview = full_text[offset:offset + 60].replace('\n', '|')
    print(f" Pos {offset:6d}: Artikel {label:3s}'{preview[:50]}'")
if len(headings) > 15:
    print(f" ... and {len(headings)-15} more (up to Artikel {headings[-1][1]})")
# Snippet search happens in normalized space, so translate each heading's
# raw byte offset into a normalized-text offset by measuring the
# normalized length of the prefix up to that heading.
full_norm = normalize(full_text)
heading_norm_positions = [
    (len(normalize(full_text[:raw_pos])), label)
    for raw_pos, label in headings
]
# Step 3: pull candidate controls from Postgres.
print("\n=== Step 3: Looking up controls ===")
dsn = urllib.parse.urlparse(os.environ['DATABASE_URL'])
conn = psycopg2.connect(
    host=dsn.hostname,
    port=dsn.port or 5432,
    user=dsn.username,
    password=dsn.password,
    dbname=dsn.path.lstrip('/'),
    # Pin the schema search path so unqualified names resolve predictably.
    options="-c search_path=compliance,public",
)
cur = conn.cursor()
# Only controls that cite regulation 2023/1542 AND carry original source
# text are matchable against the PDF. ('%%' is a literal '%' wildcard for
# the psycopg2 paramstyle.)
cur.execute("""
SELECT id, control_id, title, source_original_text,
source_citation->>'article' as existing_article
FROM compliance.canonical_controls
WHERE source_citation->>'source' LIKE '%%1542%%'
AND source_original_text IS NOT NULL
ORDER BY control_id
""")
controls = cur.fetchall()
print(f" Got {len(controls)} controls")
# Step 4: locate each control's source text in the normalized PDF text and
# attribute it to the article whose heading last precedes the hit.
print("\n=== Step 4: Matching controls to PDF articles ===")
found = 0
not_found = 0
results = []  # (db_id, control_id, resolved_article) for later write-back
for ctrl_id, control_id, title, orig_text, existing_art in controls:
    orig_norm = normalize(orig_text)
    matched = False
    # Probe with progressively shorter snippets taken ~25% into the text
    # (skips boilerplate prefixes); shorter probes tolerate OCR noise.
    for length in (80, 60, 40, 30):
        start = len(orig_norm) // 4  # len() >= 0, so no max(0, ...) clamp needed
        snippet = orig_norm[start:start + length]
        if len(snippet) < 20:
            # Too short to be a trustworthy probe (also covers empty text).
            continue
        pos = full_norm.find(snippet)
        if pos < 0:
            continue
        # Owning article = last heading at or before the match position;
        # anything before the first heading belongs to the preamble.
        article = "Preamble"
        for h_pos, h_num in reversed(heading_norm_positions):
            if h_pos <= pos:
                article = h_num
                break
        if existing_art == article:
            status = "MATCH"
        elif not existing_art:
            status = "NEW"
        else:
            # '->' separates old/new: 7 vs 6 now prints DIFF(7->6), not the
            # ambiguous DIFF(76) the bare concatenation produced.
            status = f"DIFF({existing_art}->{article})"
        print(f" {control_id:10s}: Artikel {article:3s} [{status}] {title[:55]}")
        found += 1
        matched = True
        results.append((ctrl_id, control_id, article))
        break
    if not matched:
        not_found += 1
        print(f" {control_id:10s}: NOT FOUND {title[:55]}")
        print(f" Text: '{orig_norm[20:70]}...'")
print(f"\n=== Result: {found}/{len(controls)} found ({not_found} not found) ===")
if headings:
    print(f" Articles covered: {headings[0][1]} - {headings[-1][1]}")
# Close the cursor explicitly before the connection.
cur.close()
conn.close()