"""POC v2: Find control's source text in PDF — distinguish headings from cross-refs.""" import os import re import fitz # PyMuPDF import psycopg2 import urllib.parse import unicodedata PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/battery_2023_1542.pdf") # Step 1: Extract full text from PDF print("=== Step 1: Reading PDF ===") doc = fitz.open(PDF_PATH) full_text = "" for page in doc: full_text += page.get_text() + "\n" print(f" Pages: {len(doc)}, Total chars: {len(full_text)}") def normalize(s): """Remove soft hyphens, normalize whitespace.""" s = s.replace('\u00ad', '').replace('\xad', '') # soft hyphen s = s.replace('\u200b', '') # zero-width space s = unicodedata.normalize('NFC', s) s = re.sub(r'\s+', ' ', s) return s.strip() # Step 2: Build article heading index # Article headings in EU regulations are on their own line: "Artikel 76" # followed by a title line like: "Rücknahme" # Cross-references look like: "gemäß Artikel 290 des Vertrags" print("\n=== Step 2: Building article HEADING index ===") # Pattern: "Artikel N" at start of line, NOT preceded by text on same line heading_pattern = re.compile(r'(?:^|\n)\s*Artikel\s+(\d+[a-z]?)\s*\n', re.MULTILINE) headings = [] for match in heading_pattern.finditer(full_text): art_num = int(re.match(r'(\d+)', match.group(1)).group(1)) # Filter: Batterieverordnung has articles 1-96, not 114/192/290 if art_num <= 96: headings.append((match.start(), match.group(1))) # Sort by position headings.sort(key=lambda x: x[0]) # Deduplicate (keep first occurrence of each article) seen = set() unique_headings = [] for pos, num in headings: if num not in seen: seen.add(num) unique_headings.append((pos, num)) headings = unique_headings print(f" Found {len(headings)} unique article headings") for h in headings[:15]: # Show context ctx = full_text[h[0]:h[0]+60].replace('\n', '|') print(f" Pos {h[0]:6d}: Artikel {h[1]:3s} → '{ctx[:50]}'") if len(headings) > 15: print(f" ... and {len(headings)-15} more (up to Artikel {headings[-1][1]})") # Normalize full text for searching full_norm = normalize(full_text) # Precompute normalized heading positions heading_norm_positions = [] for pos, num in headings: norm_pos = len(normalize(full_text[:pos])) heading_norm_positions.append((norm_pos, num)) # Step 3: Get controls from DB print("\n=== Step 3: Looking up controls ===") db_url = os.environ['DATABASE_URL'] parsed = urllib.parse.urlparse(db_url) conn = psycopg2.connect( host=parsed.hostname, port=parsed.port or 5432, user=parsed.username, password=parsed.password, dbname=parsed.path.lstrip('/'), options="-c search_path=compliance,public" ) cur = conn.cursor() cur.execute(""" SELECT id, control_id, title, source_original_text, source_citation->>'article' as existing_article FROM compliance.canonical_controls WHERE source_citation->>'source' LIKE '%%1542%%' AND source_original_text IS NOT NULL ORDER BY control_id """) controls = cur.fetchall() print(f" Got {len(controls)} controls") # Step 4: Match print("\n=== Step 4: Matching controls to PDF articles ===") found = 0 not_found = 0 results = [] for ctrl in controls: ctrl_id, control_id, title, orig_text, existing_art = ctrl orig_norm = normalize(orig_text) matched = False for length in [80, 60, 40, 30]: start = max(0, len(orig_norm) // 4) snippet = orig_norm[start:start+length] if not snippet or len(snippet) < 20: continue pos = full_norm.find(snippet) if pos >= 0: # Find which article heading precedes this position article = "Preamble" for h_pos, h_num in reversed(heading_norm_positions): if h_pos <= pos: article = h_num break status = "MATCH" if existing_art == article else ("NEW" if not existing_art else f"DIFF({existing_art}→{article})") print(f" {control_id:10s}: Artikel {article:3s} [{status}] {title[:55]}") found += 1 matched = True results.append((ctrl_id, control_id, article)) break if not matched: not_found += 1 print(f" {control_id:10s}: NOT FOUND {title[:55]}") print(f" Text: '{orig_norm[20:70]}...'") print(f"\n=== Result: {found}/{len(controls)} found ({not_found} not found) ===") if headings: print(f" Articles covered: {headings[0][1]} - {headings[-1][1]}") conn.close()