chore(qa): PDF QA v3 — 6,259/7,943 controls matched (79%)

- Added NIST 800-53, OWASP Top 10/ASVS/SAMM/API/MASVS, ENISA ICS PDFs
- Improved normalize() for ligatures, smart quotes, dashes
- Added OWASP-specific index builder (A01:2021, V1.1, MASVS-*)
- 6,259 article assignments in DB (1,817 article, 1,355 preamble, 1,173 control, 790 annex, 666 section)
- Remaining 1,651 unmatched: Blue Guide (EN text vs DE PDF), OWASP multilingual translations (PT/AR/ID/ES)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 07:57:52 +01:00
parent 24f02b52ed
commit 0e16640c28
4 changed files with 15335 additions and 37 deletions
@@ -0,0 +1,100 @@
+"""Debug low PDF match rates for the EU Blue Guide and OWASP Top 10 (CISA is not covered by this script yet)."""
+import os
+import re
+import fitz
+import psycopg2
+import urllib.parse
+import unicodedata
+
+def normalize(s):
+    s = s.replace('\u00ad', '').replace('\xad', '')
+    s = s.replace('\u200b', '').replace('\u00a0', ' ')
+    s = s.replace('\ufb01', 'fi').replace('\ufb02', 'fl')
+    s = s.replace('\ufb00', 'ff').replace('\ufb03', 'ffi').replace('\ufb04', 'ffl')
+    s = s.replace('\u2019', "'").replace('\u2018', "'")
+    s = s.replace('\u201c', '"').replace('\u201d', '"')
+    s = s.replace('\u2013', '-').replace('\u2014', '-')
+    s = unicodedata.normalize('NFC', s)
+    s = re.sub(r'\s+', ' ', s)
+    return s.strip()
+
+PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
+
+db_url = os.environ['DATABASE_URL']
+parsed = urllib.parse.urlparse(db_url)
+conn = psycopg2.connect(
+    host=parsed.hostname, port=parsed.port or 5432,
+    user=parsed.username, password=parsed.password,
+    dbname=parsed.path.lstrip('/'),
+    options="-c search_path=compliance,public"
+)
+cur = conn.cursor()
+
+for source, filename in [
+    ("EU Blue Guide 2022", "blue_guide_2022.pdf"),
+    ("OWASP Top 10 (2021)", "owasp_top10_2021.pdf"),
+]:
+    print(f"\n{'='*60}")
+    print(f"DEBUG: {source}")
+
+    # Read PDF
+    doc = fitz.open(os.path.join(PDF_DIR, filename))
+    pdf_text = ""
+    for p in doc:
+        pdf_text += p.get_text()
+    pdf_norm = normalize(pdf_text)
+    print(f"  PDF: {len(doc)} pages, {len(pdf_text):,} chars, normalized {len(pdf_norm):,}")
+
+    # Get sample NOT-FOUND controls
+    cur.execute("""
+        SELECT control_id, source_original_text
+        FROM compliance.canonical_controls
+        WHERE source_citation->>'source' = %s
+        AND source_original_text IS NOT NULL
+        AND release_state NOT IN ('duplicate', 'too_close')
+        LIMIT 8
+    """, (source,))
+
+    found = 0
+    not_found = 0
+    for row in cur.fetchall():
+        ctrl_id, orig = row
+        orig_norm = normalize(orig)
+
+        # Try standard matching
+        matched = False
+        for start_frac in [0.25, 0.1, 0.5, 0.0]:
+            for length in [80, 60, 40, 30, 20]:
+                start = max(0, int(len(orig_norm) * start_frac))
+                snippet = orig_norm[start:start+length]
+                if len(snippet) < 15:
+                    continue
+                if pdf_norm.find(snippet) >= 0:
+                    matched = True
+                    break
+            if matched:
+                break
+
+        if matched:
+            found += 1
+        else:
+            not_found += 1
+            print(f"\n  {ctrl_id}: NOT FOUND")
+            # Show what the control text looks like
+            print(f"    Control (norm, 50-110): '{orig_norm[50:110]}'")
+            # Try to find even a 10-char match
+            for i in range(0, min(len(orig_norm)-10, 200), 10):
+                snippet = orig_norm[i:i+10]
+                pos = pdf_norm.find(snippet)
+                if pos >= 0:
+                    print(f"    Partial found at ctrl[{i}:{i+10}] = '{snippet}' → PDF pos {pos}")
+                    print(f"    PDF context: '...{pdf_norm[max(0,pos-20):pos+30]}...'")
+                    break
+            else:
+                # No match at all — check char by char
+                print(f"    No 10-char match found. Control text may be from a different source.")
+                print(f"    First 100 chars: '{orig_norm[:100]}'")
+
+    print(f"\n  Result: {found} found, {not_found} not found")
+
+conn.close()