"""Debug low match rates for Blue Guide, OWASP Top 10, CISA.

For every entry in ``SOURCES`` the script extracts the text of the source
PDF, normalizes it, fetches a small sample of controls for that source from
``compliance.canonical_controls`` and reports which controls' original text
cannot be located in the PDF text.

NOTE: only the Blue Guide and the OWASP Top 10 are listed in ``SOURCES``;
add a CISA entry there to debug that source as well.

The database is taken from the ``DATABASE_URL`` environment variable and the
PDFs are read from ``PDF_DIR``.
"""
import os
import re
import unicodedata
import urllib.parse

import fitz
import psycopg2

PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")

# (value of source_citation->>'source', PDF file name inside PDF_DIR)
SOURCES = (
    ("EU Blue Guide 2022", "blue_guide_2022.pdf"),
    ("OWASP Top 10 (2021)", "owasp_top10_2021.pdf"),
)

# Single-pass replacement table. No replacement output is itself a key, so
# one translate() call gives the same result as chained str.replace() calls.
_CHAR_MAP = str.maketrans({
    '\u00ad': '',      # soft hyphen
    '\u200b': '',      # zero-width space
    '\u00a0': ' ',     # no-break space
    '\ufb00': 'ff',    # ligatures
    '\ufb01': 'fi',
    '\ufb02': 'fl',
    '\ufb03': 'ffi',
    '\ufb04': 'ffl',
    '\u2018': "'",     # curly single quotes
    '\u2019': "'",
    '\u201c': '"',     # curly double quotes
    '\u201d': '"',
    '\u2013': '-',     # en dash
    '\u2014': '-',     # em dash
})
_WHITESPACE_RE = re.compile(r'\s+')

# Snippet probing parameters for the standard match.
_START_FRACTIONS = (0.25, 0.1, 0.5, 0.0)
_SNIPPET_LENGTHS = (80, 60, 40, 30, 20)
_MIN_SNIPPET_LEN = 15

# Parameters for the fallback "is any small piece in the PDF?" probe.
_PROBE_LEN = 10
_PROBE_LIMIT = 200

# There is no ORDER BY, so which 8 rows come back is up to the database.
# The query does not filter on match status: it samples controls of the
# source, and the matching below decides which of them are not found.
_SAMPLE_CONTROLS_SQL = """
    SELECT control_id, source_original_text
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = %s
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    LIMIT 8
"""


def normalize(s):
    """Return *s* with typographic variants folded and whitespace collapsed.

    Soft hyphens and zero-width spaces are removed, no-break spaces become
    plain spaces, f-ligatures are expanded, curly quotes and en/em dashes are
    replaced by their ASCII counterparts, the result is NFC-normalized, every
    whitespace run becomes a single space and the ends are stripped.
    """
    s = s.translate(_CHAR_MAP)
    s = unicodedata.normalize('NFC', s)
    return _WHITESPACE_RE.sub(' ', s).strip()


def _unquote(value):
    """Percent-decode a URL component, passing ``None`` through unchanged."""
    return None if value is None else urllib.parse.unquote(value)


def _connect(db_url):
    """Open a psycopg2 connection described by the URL *db_url*.

    ``urlparse`` leaves the user name and password percent-encoded, so they
    are decoded here before being handed to psycopg2 as keyword arguments.
    """
    parsed = urllib.parse.urlparse(db_url)
    return psycopg2.connect(
        host=parsed.hostname,
        port=parsed.port or 5432,
        user=_unquote(parsed.username),
        password=_unquote(parsed.password),
        dbname=parsed.path.lstrip('/'),
        options="-c search_path=compliance,public",
    )


def _read_pdf_text(path):
    """Return ``(page_count, text)`` for the PDF at *path*.

    The document is closed before returning.
    """
    with fitz.open(path) as doc:
        text = "".join(page.get_text() for page in doc)
        return len(doc), text


def _has_snippet_match(control_text, pdf_text):
    """Return True if any probe snippet of *control_text* occurs in *pdf_text*.

    Snippets are taken at several relative start offsets and several lengths;
    snippets shorter than ``_MIN_SNIPPET_LEN`` characters are skipped.
    """
    for start_frac in _START_FRACTIONS:
        start = int(len(control_text) * start_frac)
        for length in _SNIPPET_LENGTHS:
            snippet = control_text[start:start + length]
            if len(snippet) < _MIN_SNIPPET_LEN:
                continue
            if snippet in pdf_text:
                return True
    return False


def _first_partial_match(control_text, pdf_text):
    """Find the first ``_PROBE_LEN``-character window present in *pdf_text*.

    Windows start every ``_PROBE_LEN`` characters within the first
    ``_PROBE_LIMIT`` characters of *control_text*. Returns
    ``(offset, snippet, pdf_position)`` or ``None`` if no window matches.
    """
    # +1 so the last full window (and a text of exactly _PROBE_LEN
    # characters) is probed as well.
    stop = min(len(control_text) - _PROBE_LEN + 1, _PROBE_LIMIT)
    for i in range(0, stop, _PROBE_LEN):
        snippet = control_text[i:i + _PROBE_LEN]
        pos = pdf_text.find(snippet)
        if pos >= 0:
            return i, snippet, pos
    return None


def _report_not_found(ctrl_id, orig_norm, pdf_norm):
    """Print diagnostics for a control whose text was not found in the PDF."""
    print(f"\n {ctrl_id}: NOT FOUND")
    # Show what the control text looks like
    print(f" Control (norm, 50-110): '{orig_norm[50:110]}'")

    # Try to find even a 10-char match
    partial = _first_partial_match(orig_norm, pdf_norm)
    if partial is None:
        print(" No 10-char match found. Control text may be from a different source.")
        print(f" First 100 chars: '{orig_norm[:100]}'")
        return

    i, snippet, pos = partial
    print(f" Partial found at ctrl[{i}:{i+10}] = '{snippet}' → PDF pos {pos}")
    print(f" PDF context: '...{pdf_norm[max(0,pos-20):pos+30]}...'")


def _debug_source(cur, source, filename):
    """Compare sampled controls of *source* against the text of *filename*.

    *cur* is an open database cursor; *filename* is resolved inside
    ``PDF_DIR``. Results are printed to stdout.
    """
    print(f"\n{'='*60}")
    print(f"DEBUG: {source}")

    page_count, pdf_text = _read_pdf_text(os.path.join(PDF_DIR, filename))
    pdf_norm = normalize(pdf_text)
    print(f" PDF: {page_count} pages, {len(pdf_text):,} chars, normalized {len(pdf_norm):,}")

    cur.execute(_SAMPLE_CONTROLS_SQL, (source,))

    found = 0
    not_found = 0
    for ctrl_id, orig in cur.fetchall():
        orig_norm = normalize(orig)
        if _has_snippet_match(orig_norm, pdf_norm):
            found += 1
            continue
        not_found += 1
        _report_not_found(ctrl_id, orig_norm, pdf_norm)

    print(f"\n Result: {found} found, {not_found} not found")


def main():
    """Run the match-rate check for every configured source.

    Raises ``KeyError`` if ``DATABASE_URL`` is not set.
    """
    conn = _connect(os.environ['DATABASE_URL'])
    try:
        with conn.cursor() as cur:
            for source, filename in SOURCES:
                _debug_source(cur, source, filename)
    finally:
        # Close the connection even if a PDF is missing or a query fails.
        conn.close()


if __name__ == "__main__":
    main()