Files
breakpilot-compliance/scripts/qa/owasp_cleanup.py
Benjamin Admin 643b26618f
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00

275 lines
9.8 KiB
Python

"""OWASP Cleanup:
1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate'
2. Fix 47 wrong source attributions (found in different OWASP PDF)
"""
import os
import re
import json
import unicodedata
import psycopg2
import urllib.parse
# PyMuPDF is required for PDF text extraction; abort early with a clear
# error instead of failing later with a NameError on `fitz`.
try:
    import fitz  # PyMuPDF
except ImportError:
    # raise SystemExit instead of the interactive-only exit() helper
    # (which comes from the site module and is absent under `python -S`):
    # the message goes to stderr and the process exits with status 1.
    raise SystemExit("ERROR: PyMuPDF not installed")

# Directory holding the locally mirrored OWASP reference PDFs.
PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
# Single-pass character cleanup table (replaces a chain of .replace calls;
# the original also replaced U+00AD twice — '\xad' and '\u00ad' are the
# same codepoint).
_CLEANUP_TABLE = str.maketrans({
    '\u00ad': '',    # soft hyphen — drop
    '\u200b': '',    # zero-width space — drop
    '\u00a0': ' ',   # no-break space → plain space
    '\ufb00': 'ff',  # expand typographic ligatures so substring
    '\ufb01': 'fi',  # matching against DB text works
    '\ufb02': 'fl',
    '\ufb03': 'ffi',
    '\ufb04': 'ffl',
    '\u2018': "'",   # curly quotes → ASCII
    '\u2019': "'",
    '\u201c': '"',
    '\u201d': '"',
    '\u2013': '-',   # en/em dash → hyphen
    '\u2014': '-',
    '\u2022': '-',   # bullet chars → hyphen
    '\u00b7': '-',
})
# C0 control chars except \t, \n, \r (those are handled by \s+ below).
_CTRL_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
_WS_RE = re.compile(r'\s+')


def normalize(s):
    """Normalize PDF-extracted text for substring matching.

    Drops soft hyphens / zero-width spaces, expands ligatures, folds
    typographic quotes/dashes/bullets to ASCII, strips control chars,
    applies Unicode NFC, and collapses all whitespace runs to single
    spaces. Returns the stripped result.
    """
    s = s.translate(_CLEANUP_TABLE)
    s = _CTRL_RE.sub('', s)
    s = unicodedata.normalize('NFC', s)
    return _WS_RE.sub(' ', s).strip()
# Load and normalize the full text of every available OWASP reference PDF.
OWASP_PDFS = {
    "OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
    "OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
    "OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
    "OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
}

# source name -> normalized full document text (missing PDFs are skipped)
pdf_norms = {}
for source_name, pdf_file in OWASP_PDFS.items():
    pdf_path = os.path.join(PDF_DIR, pdf_file)
    if not os.path.exists(pdf_path):
        continue
    document = fitz.open(pdf_path)
    page_texts = [page.get_text() + "\n" for page in document]
    document.close()
    pdf_norms[source_name] = normalize("".join(page_texts))
def build_owasp_index(text_norm, source_name):
    """Index heading labels in a normalized OWASP document text.

    Returns a position-sorted list of (offset, label, item_type) tuples,
    keeping only the first occurrence of each label. The label pattern is
    chosen from the source name; unrecognized sources (e.g. SAMM) yield [].

    Bug fix: the checks are ordered MASVS before ASVS — "ASVS" is a
    substring of "MASVS", so the original elif chain routed MASVS PDFs
    into the ASVS V-number pattern and never reached the MASVS branch.
    """
    if "MASVS" in source_name:
        pattern, item_type = r'(MASVS-[A-Z]+-\d+)', "requirement"
    elif "ASVS" in source_name:
        pattern, item_type = r'(V\d+\.\d+(?:\.\d+)?)\b', "requirement"
    elif "API" in source_name:
        pattern, item_type = r'(API\d+:\d{4})', "category"
    elif "Top 10" in source_name:
        pattern, item_type = r'(A\d{2}:\d{4})', "category"
    else:
        return []

    # finditer yields matches in document order; dedupe keeps the
    # earliest offset for each label.
    seen = set()
    unique = []
    for m in re.finditer(pattern, text_norm):
        label = m.group(1)
        if label not in seen:
            seen.add(label)
            unique.append((m.start(), label, item_type))
    return unique
# source name -> heading index for that PDF's normalized text
pdf_indexes = {
    source_name: build_owasp_index(norm_text, source_name)
    for source_name, norm_text in pdf_norms.items()
}
def find_in_pdf(orig_text, source_name):
    """Find control text in a specific PDF. Returns (label, type) or None.

    Tries snippets cut from several relative offsets of the normalized
    control text, longest first; on a hit, attributes it to the nearest
    heading at or before the match position.
    """
    corpus = pdf_norms.get(source_name)
    if not corpus:
        return None

    needle = normalize(orig_text)
    if len(needle) < 20:
        return None

    headings = pdf_indexes.get(source_name, [])
    # Offsets favour the beginning/middle of the text, where control
    # wording is most distinctive.
    for frac in (0.25, 0.1, 0.5, 0.0, 0.75):
        begin = max(0, int(len(needle) * frac))
        for width in (80, 60, 40, 30, 20):
            snippet = needle[begin:begin + width]
            if len(snippet) < 15:
                continue
            hit = corpus.find(snippet)
            if hit < 0:
                continue
            # Walk headings backwards to find the last one before the hit.
            for h_pos, h_label, h_type in reversed(headings):
                if h_pos <= hit:
                    return (h_label, h_type)
            return ("Unknown", "unknown")
    return None
# Database connection — DATABASE_URL is parsed into individual psycopg2
# keyword args; search_path pins lookups to the compliance schema.
parsed = urllib.parse.urlparse(os.environ['DATABASE_URL'])
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    user=parsed.username,
    password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public",
)
cur = conn.cursor()
# ──────────────────────────────────────────────────────────────────────
# STEP 1: OWASP Top 10 multilingual controls → duplicate
# ──────────────────────────────────────────────────────────────────────
print("=" * 60)
print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
print("=" * 60)

# Active Top-10 controls that never got an article attribution.
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
      AND source_citation->>'article_type' IS NULL
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_unmatched = cur.fetchall()
print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")

# Split: text found in a *different* OWASP PDF → fix attribution;
# not found anywhere → translated duplicate, mark it.
to_mark_dup = []
to_fix_source = []
for uid, cid, _title, body_text, _state in top10_unmatched:
    match = None
    for candidate in OWASP_PDFS:
        if candidate == 'OWASP Top 10 (2021)':
            continue
        hit = find_in_pdf(body_text, candidate)
        if hit:
            match = (candidate, hit)
            break
    if match is None:
        to_mark_dup.append((uid, cid))
    else:
        candidate, (label, label_type) = match
        to_fix_source.append((uid, cid, candidate, label, label_type))

print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")

# Flip the unfound controls to 'duplicate' (guard repeated against the
# state filter so re-runs are idempotent).
dup_marked = 0
for uid, _cid in to_mark_dup:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET release_state = 'duplicate'
        WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
    """, (uid,))
    if cur.rowcount:
        dup_marked += 1
print(f" Marked as duplicate: {dup_marked}")
# ──────────────────────────────────────────────────────────────────────
# STEP 2: Fix wrong source attributions across ALL OWASP sources
# ──────────────────────────────────────────────────────────────────────
print()
print("=" * 60)
print("STEP 2: Fix wrong OWASP source attributions")
print("=" * 60)

# Seed with the Top-10 reattributions collected in step 1.
all_fixes = list(to_fix_source)

# Sweep the remaining OWASP sources for misattributed controls.
for src_name in ('OWASP ASVS 4.0', 'OWASP SAMM 2.0',
                 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0'):
    cur.execute("""
        SELECT id, control_id, title, source_original_text
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
    """, (src_name,))
    for uid, cid, _title, body_text in cur.fetchall():
        # Prefer the control's own PDF: a hit there just fills in the
        # missing article metadata.
        own_hit = find_in_pdf(body_text, src_name)
        if own_hit:
            label, label_type = own_hit
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = source_citation ||
                    jsonb_build_object('article', %s, 'article_type', %s)
                WHERE id = %s
                  AND (source_citation->>'article' IS DISTINCT FROM %s
                       OR source_citation->>'article_type' IS DISTINCT FROM %s)
            """, (label, label_type, uid, label, label_type))
            continue
        # Otherwise: first other OWASP PDF containing the text wins.
        for candidate in OWASP_PDFS:
            if candidate == src_name:
                continue
            hit = find_in_pdf(body_text, candidate)
            if hit:
                all_fixes.append((uid, cid, candidate, hit[0], hit[1]))
                break

print(f" Total wrong-source controls found: {len(all_fixes)}")

# Rewrite source + article metadata for every misattributed control.
fixed = 0
for uid, cid, correct_source, label, typ in all_fixes:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET source_citation = source_citation ||
            jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
        WHERE id = %s
    """, (correct_source, label, typ, uid))
    if cur.rowcount:
        fixed += 1
        print(f" {cid:10s}{correct_source} / {label} [{typ}]")
print(f" Fixed: {fixed} controls")
conn.commit()
# ──────────────────────────────────────────────────────────────────────
# SUMMARY
# ──────────────────────────────────────────────────────────────────────
print()
print("=" * 60)
print("ZUSAMMENFASSUNG")
print("=" * 60)
print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
print(f" Wrong source attribution → fixed: {fixed}")

# Final per-state breakdown after cleanup.
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print("\n DB release_state nach Cleanup:")
for state, cnt in cur.fetchall():
    print(f" {state:15s}: {cnt:5d}")

# Remaining active (non-duplicate) controls.
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
print(f"\n Aktive Controls: {cur.fetchone()[0]}")
conn.close()