Files
breakpilot-compliance/scripts/qa/owasp_cleanup.py
Benjamin Admin 643b26618f
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00

275 lines
9.8 KiB
Python

"""OWASP Cleanup:
1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate'
2. Fix 47 wrong source attributions (found in different OWASP PDF)
"""
import os
import re
import json
import unicodedata
import psycopg2
import urllib.parse
# PyMuPDF is required for PDF text extraction; abort early with a clear
# error instead of failing later with a NameError on `fitz`.
try:
    import fitz  # PyMuPDF
except ImportError:
    # raise SystemExit instead of the interactive-only exit() helper
    # (which comes from the site module and is absent under `python -S`):
    # the message goes to stderr and the process exits with status 1.
    raise SystemExit("ERROR: PyMuPDF not installed")

# Directory holding the locally mirrored OWASP reference PDFs.
PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
# Single-pass character cleanup table (replaces a chain of .replace calls;
# the original also replaced U+00AD twice — '\xad' and '\u00ad' are the
# same codepoint).
_CLEANUP_TABLE = str.maketrans({
    '\u00ad': '',    # soft hyphen — drop
    '\u200b': '',    # zero-width space — drop
    '\u00a0': ' ',   # no-break space → plain space
    '\ufb00': 'ff',  # expand typographic ligatures so substring
    '\ufb01': 'fi',  # matching against DB text works
    '\ufb02': 'fl',
    '\ufb03': 'ffi',
    '\ufb04': 'ffl',
    '\u2018': "'",   # curly quotes → ASCII
    '\u2019': "'",
    '\u201c': '"',
    '\u201d': '"',
    '\u2013': '-',   # en/em dash → hyphen
    '\u2014': '-',
    '\u2022': '-',   # bullet chars → hyphen
    '\u00b7': '-',
})
# C0 control chars except \t, \n, \r (those are handled by \s+ below).
_CTRL_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
_WS_RE = re.compile(r'\s+')


def normalize(s):
    """Normalize PDF-extracted text for substring matching.

    Drops soft hyphens / zero-width spaces, expands ligatures, folds
    typographic quotes/dashes/bullets to ASCII, strips control chars,
    applies Unicode NFC, and collapses all whitespace runs to single
    spaces. Returns the stripped result.
    """
    s = s.translate(_CLEANUP_TABLE)
    s = _CTRL_RE.sub('', s)
    s = unicodedata.normalize('NFC', s)
    return _WS_RE.sub(' ', s).strip()
# Load and normalize the full text of every available OWASP reference PDF.
OWASP_PDFS = {
    "OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
    "OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
    "OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
    "OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
}

# source name -> normalized full document text (missing PDFs are skipped)
pdf_norms = {}
for source_name, pdf_file in OWASP_PDFS.items():
    pdf_path = os.path.join(PDF_DIR, pdf_file)
    if not os.path.exists(pdf_path):
        continue
    document = fitz.open(pdf_path)
    page_texts = [page.get_text() + "\n" for page in document]
    document.close()
    pdf_norms[source_name] = normalize("".join(page_texts))
def build_owasp_index(text_norm, source_name):
    """Index heading labels in a normalized OWASP document text.

    Returns a position-sorted list of (offset, label, item_type) tuples,
    keeping only the first occurrence of each label. The label pattern is
    chosen from the source name; unrecognized sources (e.g. SAMM) yield [].

    Bug fix: the checks are ordered MASVS before ASVS — "ASVS" is a
    substring of "MASVS", so the original elif chain routed MASVS PDFs
    into the ASVS V-number pattern and never reached the MASVS branch.
    """
    if "MASVS" in source_name:
        pattern, item_type = r'(MASVS-[A-Z]+-\d+)', "requirement"
    elif "ASVS" in source_name:
        pattern, item_type = r'(V\d+\.\d+(?:\.\d+)?)\b', "requirement"
    elif "API" in source_name:
        pattern, item_type = r'(API\d+:\d{4})', "category"
    elif "Top 10" in source_name:
        pattern, item_type = r'(A\d{2}:\d{4})', "category"
    else:
        return []

    # finditer yields matches in document order; dedupe keeps the
    # earliest offset for each label.
    seen = set()
    unique = []
    for m in re.finditer(pattern, text_norm):
        label = m.group(1)
        if label not in seen:
            seen.add(label)
            unique.append((m.start(), label, item_type))
    return unique
# source name -> heading index for that PDF's normalized text
pdf_indexes = {
    source_name: build_owasp_index(norm_text, source_name)
    for source_name, norm_text in pdf_norms.items()
}
def find_in_pdf(orig_text, source_name):
    """Find control text in a specific PDF. Returns (label, type) or None.

    Tries snippets cut from several relative offsets of the normalized
    control text, longest first; on a hit, attributes it to the nearest
    heading at or before the match position.
    """
    corpus = pdf_norms.get(source_name)
    if not corpus:
        return None

    needle = normalize(orig_text)
    if len(needle) < 20:
        return None

    headings = pdf_indexes.get(source_name, [])
    # Offsets favour the beginning/middle of the text, where control
    # wording is most distinctive.
    for frac in (0.25, 0.1, 0.5, 0.0, 0.75):
        begin = max(0, int(len(needle) * frac))
        for width in (80, 60, 40, 30, 20):
            snippet = needle[begin:begin + width]
            if len(snippet) < 15:
                continue
            hit = corpus.find(snippet)
            if hit < 0:
                continue
            # Walk headings backwards to find the last one before the hit.
            for h_pos, h_label, h_type in reversed(headings):
                if h_pos <= hit:
                    return (h_label, h_type)
            return ("Unknown", "unknown")
    return None
# Database connection — DATABASE_URL is parsed into individual psycopg2
# keyword args; search_path pins lookups to the compliance schema.
parsed = urllib.parse.urlparse(os.environ['DATABASE_URL'])
conn = psycopg2.connect(
    host=parsed.hostname,
    port=parsed.port or 5432,
    user=parsed.username,
    password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public",
)
cur = conn.cursor()
# ──────────────────────────────────────────────────────────────────────
# STEP 1: OWASP Top 10 multilingual controls → duplicate
# ──────────────────────────────────────────────────────────────────────
print("=" * 60)
print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
print("=" * 60)

# Active Top-10 controls that never got an article attribution.
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
      AND source_citation->>'article_type' IS NULL
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_unmatched = cur.fetchall()
print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")

# Split: text found in a *different* OWASP PDF → fix attribution;
# not found anywhere → translated duplicate, mark it.
to_mark_dup = []
to_fix_source = []
for uid, cid, _title, body_text, _state in top10_unmatched:
    match = None
    for candidate in OWASP_PDFS:
        if candidate == 'OWASP Top 10 (2021)':
            continue
        hit = find_in_pdf(body_text, candidate)
        if hit:
            match = (candidate, hit)
            break
    if match is None:
        to_mark_dup.append((uid, cid))
    else:
        candidate, (label, label_type) = match
        to_fix_source.append((uid, cid, candidate, label, label_type))

print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")

# Flip the unfound controls to 'duplicate' (guard repeated against the
# state filter so re-runs are idempotent).
dup_marked = 0
for uid, _cid in to_mark_dup:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET release_state = 'duplicate'
        WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
    """, (uid,))
    if cur.rowcount:
        dup_marked += 1
print(f" Marked as duplicate: {dup_marked}")
# ──────────────────────────────────────────────────────────────────────
# STEP 2: Fix wrong source attributions across ALL OWASP sources
# ──────────────────────────────────────────────────────────────────────
print()
print("=" * 60)
print("STEP 2: Fix wrong OWASP source attributions")
print("=" * 60)

# Seed with the Top-10 reattributions collected in step 1.
all_fixes = list(to_fix_source)

# Sweep the remaining OWASP sources for misattributed controls.
for src_name in ('OWASP ASVS 4.0', 'OWASP SAMM 2.0',
                 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0'):
    cur.execute("""
        SELECT id, control_id, title, source_original_text
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
    """, (src_name,))
    for uid, cid, _title, body_text in cur.fetchall():
        # Prefer the control's own PDF: a hit there just fills in the
        # missing article metadata.
        own_hit = find_in_pdf(body_text, src_name)
        if own_hit:
            label, label_type = own_hit
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = source_citation ||
                    jsonb_build_object('article', %s, 'article_type', %s)
                WHERE id = %s
                  AND (source_citation->>'article' IS DISTINCT FROM %s
                       OR source_citation->>'article_type' IS DISTINCT FROM %s)
            """, (label, label_type, uid, label, label_type))
            continue
        # Otherwise: first other OWASP PDF containing the text wins.
        for candidate in OWASP_PDFS:
            if candidate == src_name:
                continue
            hit = find_in_pdf(body_text, candidate)
            if hit:
                all_fixes.append((uid, cid, candidate, hit[0], hit[1]))
                break

print(f" Total wrong-source controls found: {len(all_fixes)}")

# Rewrite source + article metadata for every misattributed control.
fixed = 0
for uid, cid, correct_source, label, typ in all_fixes:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET source_citation = source_citation ||
            jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
        WHERE id = %s
    """, (correct_source, label, typ, uid))
    if cur.rowcount:
        fixed += 1
        print(f" {cid:10s}{correct_source} / {label} [{typ}]")
print(f" Fixed: {fixed} controls")
conn.commit()
# ──────────────────────────────────────────────────────────────────────
# SUMMARY
# ──────────────────────────────────────────────────────────────────────
print()
print("=" * 60)
print("ZUSAMMENFASSUNG")
print("=" * 60)
print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
print(f" Wrong source attribution → fixed: {fixed}")

# Final per-state breakdown after cleanup.
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print("\n DB release_state nach Cleanup:")
for state, cnt in cur.fetchall():
    print(f" {state:15s}: {cnt:5d}")

# Remaining active (non-duplicate) controls.
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
print(f"\n Aktive Controls: {cur.fetchone()[0]}")
conn.close()