Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
275 lines
9.8 KiB
Python
275 lines
9.8 KiB
Python
"""OWASP Cleanup:
|
|
1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate'
|
|
2. Fix 47 wrong source attributions (found in different OWASP PDF)
|
|
"""
|
|
import os
|
|
import re
|
|
import json
|
|
import unicodedata
|
|
import psycopg2
|
|
import urllib.parse
|
|
|
|
# Text extraction from the OWASP PDFs requires PyMuPDF (imported as `fitz`);
# abort immediately with a non-zero exit code if it is missing.
try:
    import fitz
except ImportError:
    print("ERROR: PyMuPDF not installed")
    exit(1)

# Directory containing the locally downloaded OWASP PDF files.
PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
|
|
|
|
# Single-pass character map, built once at import time:
#   - soft hyphen and zero-width space dropped, NBSP -> plain space
#   - common Latin ligatures (fi, fl, ff, ffi, ffl) expanded
#   - smart quotes, en/em dashes and bullet characters mapped to ASCII
# (The original code chained 14 .replace() calls per invocation and
# replaced '\u00ad' twice — '\xad' is the same codepoint.)
_CHAR_MAP = str.maketrans({
    '\u00ad': '',       # soft hyphen
    '\u200b': '',       # zero-width space
    '\u00a0': ' ',      # no-break space
    '\ufb01': 'fi', '\ufb02': 'fl',
    '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
    '\u2019': "'", '\u2018': "'",
    '\u201c': '"', '\u201d': '"',
    '\u2013': '-', '\u2014': '-',
    '\u2022': '-', '\u00b7': '-',
})
# ASCII control characters except \t, \n, \r (those fall to the
# whitespace-collapse step below). Compiled once, not per call.
_CTRL_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
_WS_RE = re.compile(r'\s+')


def normalize(s):
    """Normalize PDF-extracted text for exact-substring matching.

    Maps typographic characters to ASCII equivalents, strips control
    characters, applies Unicode NFC composition, and collapses every
    whitespace run to a single space. Returns the stripped result.
    """
    s = s.translate(_CHAR_MAP)
    s = _CTRL_RE.sub('', s)
    s = unicodedata.normalize('NFC', s)
    return _WS_RE.sub(' ', s).strip()
|
|
|
|
# Canonical source name -> local PDF filename under PDF_DIR.
OWASP_PDFS = {
    "OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
    "OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
    "OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
    "OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
}

# Extract and normalize the full text of every OWASP PDF that exists
# locally; missing PDFs are silently skipped.
pdf_norms = {}
for src_name, pdf_file in OWASP_PDFS.items():
    pdf_path = os.path.join(PDF_DIR, pdf_file)
    if not os.path.exists(pdf_path):
        continue
    doc = fitz.open(pdf_path)
    pages = [page.get_text() + "\n" for page in doc]
    doc.close()
    pdf_norms[src_name] = normalize("".join(pages))
|
|
|
|
def build_owasp_index(text_norm, source_name):
    """Index the control headings found in a normalized PDF text.

    Args:
        text_norm: normalized full text of one OWASP PDF.
        source_name: canonical source name (a key of OWASP_PDFS) used
            to select the heading pattern.

    Returns:
        List of (position, label, type) tuples, ascending by position,
        keeping only the first occurrence of each label. Empty list for
        sources with no known heading pattern (e.g. SAMM).
    """
    # Select the heading regex for this source. BUGFIX: "MASVS" must be
    # tested before "ASVS" — "ASVS" is a substring of "MASVS", so the
    # previous check order routed MASVS sources to the ASVS regex and
    # made the MASVS branch unreachable.
    if "MASVS" in source_name:
        pattern, typ = r'(MASVS-[A-Z]+-\d+)', "requirement"
    elif "ASVS" in source_name:
        pattern, typ = r'(V\d+\.\d+(?:\.\d+)?)\b', "requirement"
    elif "API" in source_name:
        pattern, typ = r'(API\d+:\d{4})', "category"
    elif "Top 10" in source_name:
        pattern, typ = r'(A\d{2}:\d{4})', "category"
    else:
        return []

    items = [(m.start(), m.group(1), typ) for m in re.finditer(pattern, text_norm)]
    items.sort(key=lambda x: x[0])

    # Deduplicate by label, keeping the earliest occurrence.
    seen = set()
    unique = []
    for pos, label, item_type in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, item_type))
    return unique
|
|
|
|
# Pre-computed heading index per source, keyed identically to pdf_norms.
pdf_indexes = {
    src: build_owasp_index(norm_text, src)
    for src, norm_text in pdf_norms.items()
}
|
|
|
|
def find_in_pdf(orig_text, source_name):
    """Find control text in a specific PDF. Returns (label, type) or None."""
    corpus = pdf_norms.get(source_name)
    if not corpus:
        return None

    needle = normalize(orig_text)
    if len(needle) < 20:
        return None

    headings = pdf_indexes.get(source_name, [])
    # Probe several regions of the control text with shrinking snippet
    # sizes until one is found verbatim in the PDF text.
    for frac in (0.25, 0.1, 0.5, 0.0, 0.75):
        offset = max(0, int(len(needle) * frac))
        for size in (80, 60, 40, 30, 20):
            probe = needle[offset:offset + size]
            if len(probe) < 15:
                continue
            hit = corpus.find(probe)
            if hit < 0:
                continue
            # Attribute the hit to the nearest heading at or before it.
            for h_pos, h_label, h_type in reversed(headings):
                if h_pos <= hit:
                    return (h_label, h_type)
            return ("Unknown", "unknown")
    return None
|
|
|
|
# ── Database connection ─────────────────────────────────────────────
# DATABASE_URL is mandatory; a missing variable fails fast with KeyError.
db_url = os.environ['DATABASE_URL']
url_parts = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=url_parts.hostname,
    port=url_parts.port or 5432,
    user=url_parts.username,
    password=url_parts.password,
    dbname=url_parts.path.lstrip('/'),
    options="-c search_path=compliance,public",
)
cur = conn.cursor()
|
|
|
|
# ═══════════════════════════════════════════════════════════════
# STEP 1: Mark OWASP Top 10 multilingual controls as duplicate
# ═══════════════════════════════════════════════════════════════
print("=" * 60)
print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
print("=" * 60)

# Active OWASP Top 10 controls whose text was never matched to a PDF
# heading (article_type IS NULL marks the unmatched ones); rows already
# flagged duplicate/too_close are excluded.
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
    AND source_citation->>'article_type' IS NULL
    AND source_original_text IS NOT NULL
    AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_unmatched = cur.fetchall()
print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")

# Separate: found in another OWASP PDF (wrong source attribution, fixed
# in Step 2) vs. not found in any PDF (treated as multilingual duplicate).
to_mark_dup = []
to_fix_source = []

for ctrl in top10_unmatched:
    uid, cid, title, text, state = ctrl

    # Check if the control text appears in any OTHER OWASP PDF.
    found_in = None
    found_result = None
    for other_src in OWASP_PDFS:
        if other_src == 'OWASP Top 10 (2021)':
            continue
        result = find_in_pdf(text, other_src)
        if result:
            found_in = other_src
            found_result = result
            break

    if found_in:
        # (id, control_id, correct source, heading label, heading type)
        to_fix_source.append((uid, cid, found_in, found_result[0], found_result[1]))
    else:
        to_mark_dup.append((uid, cid))

print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")

# Mark as duplicate. The release_state guard in the WHERE clause makes
# the UPDATE idempotent on re-runs; dup_marked counts actual changes.
dup_marked = 0
for uid, cid in to_mark_dup:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET release_state = 'duplicate'
        WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
    """, (uid,))
    if cur.rowcount > 0:
        dup_marked += 1

print(f" Marked as duplicate: {dup_marked}")
|
|
|
|
# ═══════════════════════════════════════════════════════════════
# STEP 2: Fix wrong source attributions across ALL OWASP sources
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("STEP 2: Fix wrong OWASP source attributions")
print("=" * 60)

all_fixes = list(to_fix_source)  # Start with the Top 10 fixes from Step 1

# Also scan the remaining OWASP sources (ASVS, SAMM, API Top 10, MASVS)
# for unmatched active controls.
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0']:
    cur.execute("""
        SELECT id, control_id, title, source_original_text
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
        AND source_citation->>'article_type' IS NULL
        AND source_original_text IS NOT NULL
        AND release_state NOT IN ('duplicate', 'too_close')
    """, (source,))
    controls = cur.fetchall()

    for ctrl in controls:
        uid, cid, title, text = ctrl
        # Try the control's own attributed PDF first.
        result = find_in_pdf(text, source)
        if result:
            # Found in its own PDF: the source is correct, only the
            # article info was missing — merge it into the JSONB
            # citation. The IS DISTINCT FROM guard skips rows that
            # already carry the same values (idempotent on re-runs).
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = source_citation ||
                    jsonb_build_object('article', %s, 'article_type', %s)
                WHERE id = %s
                AND (source_citation->>'article' IS DISTINCT FROM %s
                OR source_citation->>'article_type' IS DISTINCT FROM %s)
            """, (result[0], result[1], uid, result[0], result[1]))
            continue

        # Not in its own PDF — try the other OWASP PDFs; a hit means the
        # source attribution is wrong and gets queued for correction.
        for other_src in OWASP_PDFS:
            if other_src == source:
                continue
            result = find_in_pdf(text, other_src)
            if result:
                all_fixes.append((uid, cid, other_src, result[0], result[1]))
                break

print(f" Total wrong-source controls found: {len(all_fixes)}")

# Apply source fixes: the JSONB || overwrites 'source', 'article' and
# 'article_type' with the PDF where the text was actually found.
fixed = 0
for uid, cid, correct_source, label, typ in all_fixes:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET source_citation = source_citation ||
            jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
        WHERE id = %s
    """, (correct_source, label, typ, uid,))
    if cur.rowcount > 0:
        fixed += 1
        print(f" {cid:10s} → {correct_source} / {label} [{typ}]")

print(f" Fixed: {fixed} controls")

# Single commit persists both the Step 1 and Step 2 updates.
conn.commit()
|
|
|
|
# ═══════════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("ZUSAMMENFASSUNG")
print("=" * 60)
print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
print(f" Wrong source attribution → fixed: {fixed}")

# Final distribution of release_state values after the cleanup.
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\n DB release_state nach Cleanup:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")

# Count of controls still active (not flagged duplicate / too_close).
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
active = cur.fetchone()[0]
print(f"\n Aktive Controls: {active}")

conn.close()