Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 43s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 22s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
- Added NIST 800-53, OWASP Top 10/ASVS/SAMM/API/MASVS, ENISA ICS PDFs - Improved normalize() for ligatures, smart quotes, dashes - Added OWASP-specific index builder (A01:2021, V1.1, MASVS-*) - 6,259 article assignments in DB (1,817 article, 1,355 preamble, 1,173 control, 790 annex, 666 section) - Remaining 1,651 unmatched: Blue Guide (EN text vs DE PDF), OWASP multilingual translations (PT/AR/ID/ES) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
101 lines
3.4 KiB
Python
101 lines
3.4 KiB
Python
"""Debug low match rates for Blue Guide, OWASP Top 10, CISA."""
|
|
import os
|
|
import re
|
|
import fitz
|
|
import psycopg2
|
|
import urllib.parse
|
|
import unicodedata
|
|
|
|
# Single-character clean-ups applied in one pass by ``normalize``.
# Keys are the characters PDF extraction tends to introduce; values are the
# plain-text equivalents used for substring matching.
_NORMALIZE_TABLE = str.maketrans({
    '\u00ad': '',      # soft hyphen (same code point as '\xad')
    '\u200b': '',      # zero-width space
    '\u00a0': ' ',     # no-break space
    '\ufb00': 'ff',    # ligatures
    '\ufb01': 'fi',
    '\ufb02': 'fl',
    '\ufb03': 'ffi',
    '\ufb04': 'ffl',
    '\u2018': "'",     # smart single quotes
    '\u2019': "'",
    '\u201c': '"',     # smart double quotes
    '\u201d': '"',
    '\u2013': '-',     # en dash
    '\u2014': '-',     # em dash
})


def normalize(s):
    """Return *s* in a canonical form suitable for substring comparison.

    Steps, in order:

    1. Drop soft hyphens and zero-width spaces; map no-break spaces, the
       ``ff``/``fi``/``fl``/``ffi``/``ffl`` ligatures, smart quotes and
       en/em dashes to their plain ASCII equivalents.
    2. Apply Unicode NFC normalization.
    3. Collapse every run of whitespace to a single space and strip the ends.

    Args:
        s: The text to normalize (``str``).

    Returns:
        The normalized string.
    """
    # One translate() pass replaces the former chain of .replace() calls;
    # none of the replacement outputs is itself a key, so the result is the same.
    s = s.translate(_NORMALIZE_TABLE)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
|
|
|
|
def _optional_unquote(value):
    """Percent-decode a URL component, passing ``None`` through unchanged."""
    return None if value is None else urllib.parse.unquote(value)


def _matches_pdf(text, haystack):
    """Return True if any probe snippet of *text* occurs verbatim in *haystack*.

    Snippets are taken at 25%, 10%, 50% and 0% of the way into *text*, with
    lengths 80, 60, 40, 30 and 20 characters; snippets shorter than 15
    characters are not tried.
    """
    for start_frac in (0.25, 0.1, 0.5, 0.0):
        start = int(len(text) * start_frac)
        for length in (80, 60, 40, 30, 20):
            snippet = text[start:start + length]
            if len(snippet) < 15:
                # Fewer than 15 chars remain after `start`; the shorter
                # lengths would yield the same too-short snippet.
                break
            if snippet in haystack:
                return True
    return False


# Directory holding the source PDFs that the controls were extracted from.
PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")

db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
# urlparse() leaves percent-escapes in place, so decode the components that
# may legitimately contain reserved characters (e.g. '@' or ':' in a password).
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=_optional_unquote(parsed.username),
    password=_optional_unquote(parsed.password),
    dbname=urllib.parse.unquote(parsed.path.lstrip('/')),
    options="-c search_path=compliance,public"
)

# try/finally so the connection is released even when a PDF is missing or a
# query fails part-way through.
try:
    cur = conn.cursor()

    # (value of source_citation->>'source', PDF file name under PDF_DIR)
    # NOTE(review): the module docstring also mentions CISA, which is not
    # listed here — confirm whether it should be added.
    for source, filename in [
        ("EU Blue Guide 2022", "blue_guide_2022.pdf"),
        ("OWASP Top 10 (2021)", "owasp_top10_2021.pdf"),
    ]:
        print(f"\n{'='*60}")
        print(f"DEBUG: {source}")

        # Read the whole PDF as one string; the context manager closes the
        # document once the text has been extracted.
        with fitz.open(os.path.join(PDF_DIR, filename)) as doc:
            page_count = len(doc)
            pdf_text = "".join(p.get_text() for p in doc)
        pdf_norm = normalize(pdf_text)
        print(f" PDF: {page_count} pages, {len(pdf_text):,} chars, normalized {len(pdf_norm):,}")

        # Sample up to 8 controls for this source. The query does not filter
        # on match status; matching is decided below. ORDER BY keeps the
        # sample stable between runs.
        cur.execute("""
            SELECT control_id, source_original_text
            FROM compliance.canonical_controls
            WHERE source_citation->>'source' = %s
            AND source_original_text IS NOT NULL
            AND release_state NOT IN ('duplicate', 'too_close')
            ORDER BY control_id
            LIMIT 8
        """, (source,))

        found = 0
        not_found = 0
        for ctrl_id, orig in cur.fetchall():
            orig_norm = normalize(orig)

            # Try standard matching
            if _matches_pdf(orig_norm, pdf_norm):
                found += 1
                continue

            not_found += 1
            print(f"\n {ctrl_id}: NOT FOUND")
            # Show what the control text looks like
            print(f" Control (norm, 50-110): '{orig_norm[50:110]}'")
            # Try to find even a 10-char match within the first ~200 chars.
            # The "- 9" bound includes the window that ends exactly at the
            # end of the text (so a 10-char text is probed once).
            for i in range(0, min(len(orig_norm) - 9, 200), 10):
                snippet = orig_norm[i:i+10]
                pos = pdf_norm.find(snippet)
                if pos >= 0:
                    print(f" Partial found at ctrl[{i}:{i+10}] = '{snippet}' → PDF pos {pos}")
                    print(f" PDF context: '...{pdf_norm[max(0,pos-20):pos+30]}...'")
                    break
            else:
                # None of the probed 10-char windows occurs in the PDF text.
                print(" No 10-char match found. Control text may be from a different source.")
                print(f" First 100 chars: '{orig_norm[:100]}'")

        print(f"\n Result: {found} found, {not_found} not found")
finally:
    conn.close()
|