"""OWASP Cleanup: 1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate' 2. Fix 47 wrong source attributions (found in different OWASP PDF) """ import os import re import json import unicodedata import psycopg2 import urllib.parse try: import fitz except ImportError: print("ERROR: PyMuPDF not installed") exit(1) PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs") def normalize(s): s = s.replace('\u00ad', '').replace('\xad', '') s = s.replace('\u200b', '').replace('\u00a0', ' ') s = s.replace('\ufb01', 'fi').replace('\ufb02', 'fl') s = s.replace('\ufb00', 'ff').replace('\ufb03', 'ffi').replace('\ufb04', 'ffl') s = s.replace('\u2019', "'").replace('\u2018', "'") s = s.replace('\u201c', '"').replace('\u201d', '"') s = s.replace('\u2013', '-').replace('\u2014', '-') s = s.replace('\u2022', '-').replace('\u00b7', '-') s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s) s = unicodedata.normalize('NFC', s) s = re.sub(r'\s+', ' ', s) return s.strip() # Load OWASP PDFs OWASP_PDFS = { "OWASP Top 10 (2021)": "owasp_top10_2021.pdf", "OWASP ASVS 4.0": "owasp_asvs_4_0.pdf", "OWASP SAMM 2.0": "owasp_samm_2_0.pdf", "OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf", "OWASP MASVS 2.0": "owasp_masvs_2_0.pdf", } pdf_norms = {} for name, filename in OWASP_PDFS.items(): path = os.path.join(PDF_DIR, filename) if not os.path.exists(path): continue doc = fitz.open(path) text = "" for page in doc: text += page.get_text() + "\n" doc.close() pdf_norms[name] = normalize(text) def build_owasp_index(text_norm, source_name): # We need the raw text for regex, but we already normalized. # Rebuild index from normalized text. 
items = [] if "Top 10" in source_name and "API" not in source_name: for m in re.finditer(r'(A\d{2}:\d{4})', text_norm): items.append((m.start(), m.group(1), "category")) elif "API" in source_name: for m in re.finditer(r'(API\d+:\d{4})', text_norm): items.append((m.start(), m.group(1), "category")) elif "ASVS" in source_name: for m in re.finditer(r'(V\d+\.\d+(?:\.\d+)?)\b', text_norm): items.append((m.start(), m.group(1), "requirement")) elif "MASVS" in source_name: for m in re.finditer(r'(MASVS-[A-Z]+-\d+)', text_norm): items.append((m.start(), m.group(1), "requirement")) items.sort(key=lambda x: x[0]) seen = set() unique = [] for pos, label, typ in items: if label not in seen: seen.add(label) unique.append((pos, label, typ)) return unique pdf_indexes = {} for name, norm in pdf_norms.items(): pdf_indexes[name] = build_owasp_index(norm, name) def find_in_pdf(orig_text, source_name): """Find control text in a specific PDF. Returns (label, type) or None.""" pdf_norm = pdf_norms.get(source_name) if not pdf_norm: return None orig_norm = normalize(orig_text) if len(orig_norm) < 20: return None idx = pdf_indexes.get(source_name, []) for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]: for length in [80, 60, 40, 30, 20]: start = max(0, int(len(orig_norm) * start_frac)) snippet = orig_norm[start:start+length] if not snippet or len(snippet) < 15: continue pos = pdf_norm.find(snippet) if pos >= 0: label = "Unknown" typ = "unknown" for h_pos, h_label, h_type in reversed(idx): if h_pos <= pos: label = h_label typ = h_type break return (label, typ) return None # DB db_url = os.environ['DATABASE_URL'] parsed = urllib.parse.urlparse(db_url) conn = psycopg2.connect( host=parsed.hostname, port=parsed.port or 5432, user=parsed.username, password=parsed.password, dbname=parsed.path.lstrip('/'), options="-c search_path=compliance,public" ) cur = conn.cursor() # ═══════════════════════════════════════════════════════════════ # STEP 1: Mark OWASP Top 10 multilingual controls as duplicate # 
# ═══════════════════════════════════════════════════════════════
# STEP 1: Mark OWASP Top 10 multilingual controls as duplicate
# ═══════════════════════════════════════════════════════════════
# The whole run is one transaction; the try/finally guarantees the
# connection is released even when a step raises (previously the
# connection leaked on any error).
try:
    print("=" * 60)
    print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
    print("=" * 60)

    # All active Top 10 controls that never received an article attribution.
    cur.execute("""
        SELECT id, control_id, title, source_original_text, release_state
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        ORDER BY control_id
    """)
    top10_unmatched = cur.fetchall()
    print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")

    # Split the unmatched set: controls whose text appears in a *different*
    # OWASP PDF get a source fix; the rest (multilingual copies matching no
    # PDF) are marked as duplicates.
    to_mark_dup = []
    to_fix_source = []
    for uid, cid, _title, text, _state in top10_unmatched:
        found_in = None
        found_result = None
        for other_src in OWASP_PDFS:
            if other_src == 'OWASP Top 10 (2021)':
                continue
            result = find_in_pdf(text, other_src)
            if result:
                found_in = other_src
                found_result = result
                break
        if found_in:
            to_fix_source.append(
                (uid, cid, found_in, found_result[0], found_result[1]))
        else:
            to_mark_dup.append((uid, cid))

    print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
    print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")

    # Mark as duplicate; the state is re-checked in the UPDATE so rows
    # changed by a concurrent run are not double-counted.
    dup_marked = 0
    for uid, cid in to_mark_dup:
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET release_state = 'duplicate'
            WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
        """, (uid,))
        if cur.rowcount > 0:
            dup_marked += 1
    print(f" Marked as duplicate: {dup_marked}")

    # ═══════════════════════════════════════════════════════════════
    # STEP 2: Fix wrong source attributions across ALL OWASP sources
    # ═══════════════════════════════════════════════════════════════
    print(f"\n{'='*60}")
    print("STEP 2: Fix wrong OWASP source attributions")
    print("=" * 60)

    all_fixes = list(to_fix_source)  # start with the Top 10 fixes from STEP 1

    # Also check controls attributed to ASVS, SAMM, API Top 10 and MASVS.
    for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0',
                   'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0']:
        cur.execute("""
            SELECT id, control_id, title, source_original_text
            FROM compliance.canonical_controls
            WHERE source_citation->>'source' = %s
              AND source_citation->>'article_type' IS NULL
              AND source_original_text IS NOT NULL
              AND release_state NOT IN ('duplicate', 'too_close')
        """, (source,))
        for uid, cid, _title, text in cur.fetchall():
            # Found in its own PDF: only the article info needs updating
            # (the guard avoids no-op writes when it is already correct).
            result = find_in_pdf(text, source)
            if result:
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET source_citation = source_citation || jsonb_build_object('article', %s, 'article_type', %s)
                    WHERE id = %s
                      AND (source_citation->>'article' IS DISTINCT FROM %s
                           OR source_citation->>'article_type' IS DISTINCT FROM %s)
                """, (result[0], result[1], uid, result[0], result[1]))
                continue
            # Otherwise look for the text in the remaining OWASP PDFs.
            for other_src in OWASP_PDFS:
                if other_src == source:
                    continue
                result = find_in_pdf(text, other_src)
                if result:
                    all_fixes.append((uid, cid, other_src, result[0], result[1]))
                    break

    print(f" Total wrong-source controls found: {len(all_fixes)}")

    # Apply the collected source fixes.
    fixed = 0
    for uid, cid, correct_source, label, typ in all_fixes:
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET source_citation = source_citation || jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
            WHERE id = %s
        """, (correct_source, label, typ, uid))
        if cur.rowcount > 0:
            fixed += 1
            print(f" {cid:10s} → {correct_source} / {label} [{typ}]")
    print(f" Fixed: {fixed} controls")

    conn.commit()

    # ═══════════════════════════════════════════════════════════════
    # SUMMARY
    # ═══════════════════════════════════════════════════════════════
    print(f"\n{'='*60}")
    print("ZUSAMMENFASSUNG")
    print("=" * 60)
    print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
    print(f" Wrong source attribution → fixed: {fixed}")

    # Final counts per release_state after the cleanup.
    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        GROUP BY release_state
        ORDER BY count(*) DESC
    """)
    print(f"\n DB release_state nach Cleanup:")
    for row in cur.fetchall():
        print(f" {row[0]:15s}: {row[1]:5d}")

    cur.execute("""
        SELECT count(*) FROM compliance.canonical_controls
        WHERE release_state NOT IN ('duplicate', 'too_close')
    """)
    active = cur.fetchone()[0]
    print(f"\n Aktive Controls: {active}")
finally:
    # Release resources even on failure; uncommitted work is rolled back
    # by the server when the connection closes.
    cur.close()
    conn.close()