Files
breakpilot-core/embedding-service/test_nist_normalization.py
T
Benjamin Admin a9671a572b fix(embedding): single-number ALL-CAPS section detection for ENISA/BSI
Add case-sensitive _SINGLE_NUM_ALLCAPS_RE for "1. INTRODUCTION" style
headers (ENISA, BSI docs). Cannot use _LEGAL_SECTION_RE for this because
it uses re.IGNORECASE which would false-positive on "1. Erstens" etc.

Also re-downloaded 2 corrupt PDFs from nist.gov (nistir_8259a, nist_ai_rmf)
— originals in MinIO were 263-byte XML error responses, not PDFs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-03 08:56:02 +02:00

249 lines
10 KiB
Python

"""
Tests for NIST/BSI/ENISA PDF text normalization and section detection.
Covers:
- _normalize_pdf_text() fixing broken multi-column PDF artifacts
- Section detection after normalization
- NIST CSF 2.0 compound IDs (GV.OC-01)
- NIST SP 800-53 control IDs (AC-1, AC-1(1))
- OWASP Top 10 IDs (A01:2021)
- Unicode normalization (ligatures, soft hyphens)
"""
from main import (
_normalize_pdf_text,
_extract_section_header,
_parse_section_metadata,
chunk_text_legal,
chunk_text_legal_structured,
)
# =========================================================================
# _normalize_pdf_text — broken spacing fixes
# =========================================================================
class TestNormalizePdfText:
    """Check that ``_normalize_pdf_text`` repairs PDF extraction artifacts.

    Each test feeds one kind of artifact (split section numbers, split
    control IDs, invisible characters, ligatures, repeated spaces) and
    compares the result with the expected clean text.
    """

    def test_broken_section_number(self):
        assert _normalize_pdf_text("1 . 1 Risk Framing") == "1.1 Risk Framing"

    def test_nested_section_number(self):
        assert _normalize_pdf_text("2 . 3 . 1 Subtitle") == "2.3.1 Subtitle"

    def test_broken_nist_control_id(self):
        assert _normalize_pdf_text("AC - 1 Account Management") == "AC-1 Account Management"

    def test_broken_nist_control_au(self):
        assert _normalize_pdf_text("AU - 2 Audit Events") == "AU-2 Audit Events"

    def test_broken_csf_compound_id(self):
        assert _normalize_pdf_text("GV . OC - 01 Context") == "GV.OC-01 Context"

    def test_broken_enhancement_parens(self):
        assert _normalize_pdf_text("AC-1( 1 ) Enhancement") == "AC-1(1) Enhancement"

    def test_soft_hyphen_removed(self):
        # U+00AD = soft hyphen
        assert _normalize_pdf_text("infor\u00admation") == "information"

    def test_zero_width_space_removed(self):
        # U+200B = zero-width space
        assert _normalize_pdf_text("data\u200bprotection") == "dataprotection"

    def test_ligature_fi_normalized(self):
        # U+FB01 = fi ligature
        assert _normalize_pdf_text("con\ufb01dential") == "confidential"

    def test_ligature_fl_normalized(self):
        # U+FB02 = fl ligature
        assert _normalize_pdf_text("over\ufb02ow") == "overflow"

    def test_multiple_spaces_collapsed(self):
        # The input must really contain runs of spaces; with an input equal
        # to the expected output the test could never detect a regression.
        assert _normalize_pdf_text("too   many    spaces") == "too many spaces"

    def test_newlines_preserved(self):
        result = _normalize_pdf_text("line one\nline two\n\nline three")
        assert "\n" in result
        assert "line one" in result
        assert "line three" in result

    def test_normal_text_unchanged(self):
        text = "AC-1 Account Management requires proper controls."
        assert _normalize_pdf_text(text) == text

    def test_combined_artifacts(self):
        """Multiple broken artifacts in one text block."""
        broken = "1 . 1 Overview\nAC - 1 Account Management\nGV . OC - 01 Context"
        fixed = _normalize_pdf_text(broken)
        assert "1.1 Overview" in fixed
        assert "AC-1 Account Management" in fixed
        assert "GV.OC-01 Context" in fixed
# =========================================================================
# Section detection after normalization
# =========================================================================
class TestNistSectionDetection:
    """Check that ``_extract_section_header`` returns a non-None result
    for each supported header style, including text repaired by
    ``_normalize_pdf_text``.
    """

    def test_nist_control_ac1(self):
        header = _extract_section_header("AC-1 Account Management")
        assert header is not None

    def test_nist_control_au2(self):
        header = _extract_section_header("AU-2 Audit Events")
        assert header is not None

    def test_nist_csf_compound(self):
        header = _extract_section_header("GV.OC-01 Organizational Context")
        assert header is not None

    def test_nist_enhancement(self):
        header = _extract_section_header("AC-1(1) Policy and Procedures")
        assert header is not None

    def test_owasp_top10(self):
        header = _extract_section_header("A01:2021 Broken Access Control")
        assert header is not None

    def test_owasp_without_year(self):
        header = _extract_section_header("A03 Injection")
        assert header is not None

    def test_numbered_section(self):
        header = _extract_section_header("2.1 Risk Framing")
        assert header is not None

    def test_deep_numbered_section(self):
        header = _extract_section_header("3.2.1 Assessment Methodology")
        assert header is not None

    def test_broken_then_normalized_detects(self):
        """A split NIST control ID is detected once the text is normalized."""
        repaired = _normalize_pdf_text("AC - 1 Account Management")
        assert _extract_section_header(repaired) is not None

    def test_broken_csf_then_normalized_detects(self):
        """A split CSF compound ID is detected once the text is normalized."""
        repaired = _normalize_pdf_text("GV . OC - 01 Organizational Context")
        assert _extract_section_header(repaired) is not None

    def test_broken_section_num_then_normalized(self):
        """A split dotted section number is detected once normalized."""
        repaired = _normalize_pdf_text("2 . 1 Risk Framing")
        assert _extract_section_header(repaired) is not None
# =========================================================================
# Section metadata extraction (_parse_section_metadata)
# =========================================================================
class TestNistSectionMetadata:
    """Check the ``section`` / ``section_title`` values produced by
    ``_parse_section_metadata``, plus single-number ALL-CAPS header
    detection and section tagging of structured chunks.
    """

    def test_nist_control_ac1_section(self):
        parsed = _parse_section_metadata("AC-1 POLICY AND PROCEDURES")
        assert parsed["section"] == "AC-1"

    def test_nist_control_au2_section(self):
        parsed = _parse_section_metadata("AU-2 Audit Events")
        assert parsed["section"] == "AU-2"

    def test_nist_enhancement_section(self):
        parsed = _parse_section_metadata("AC-1(1) Policy and Procedures")
        assert parsed["section"] == "AC-1(1)"

    def test_nist_csf_compound_section(self):
        parsed = _parse_section_metadata("GV.OC-01 Organizational Context")
        assert parsed["section"] == "GV.OC-01"

    def test_numbered_section(self):
        parsed = _parse_section_metadata("3.1 ACCESS CONTROL")
        assert parsed["section"] == "3.1"

    def test_deep_numbered_section(self):
        parsed = _parse_section_metadata("2.3.1 Subtitle")
        assert parsed["section"] == "2.3.1"

    def test_owasp_section(self):
        parsed = _parse_section_metadata("A01:2021 Broken Access Control")
        assert parsed["section"] == "A01:2021"

    def test_section_title_extracted(self):
        parsed = _parse_section_metadata("AC-1 POLICY AND PROCEDURES")
        assert parsed["section_title"] == "POLICY AND PROCEDURES"

    def test_numbered_section_title(self):
        parsed = _parse_section_metadata("3.1 ACCESS CONTROL")
        assert parsed["section_title"] == "ACCESS CONTROL"

    def test_single_number_allcaps_section(self):
        """An ENISA-style header such as '1. INTRODUCTION' is detected."""
        header = _extract_section_header("1. INTRODUCTION")
        assert header is not None

    def test_single_number_section_metadata(self):
        parsed = _parse_section_metadata("1. INTRODUCTION")
        assert parsed["section"] == "1"
        assert parsed["section_title"] == "INTRODUCTION"

    def test_single_number_lowercase_not_matched(self):
        """A numbered list item with a lowercase title is not a section."""
        header = _extract_section_header("1. First item in a list")
        assert header is None

    def test_structured_chunks_have_section(self):
        """Structured chunks carry the control ID of their section."""
        document = (
            "3.1 ACCESS CONTROL\n"
            "Overview of access control family.\n\n"
            "AC-1 POLICY AND PROCEDURES\n"
            "The organization develops, documents, and disseminates an access "
            "control policy that addresses purpose, scope, roles, responsibilities, "
            "management commitment, coordination among entities.\n\n"
            "AC-2 ACCOUNT MANAGEMENT\n"
            "The information system enforces approved authorizations for logical "
            "access to information and system resources.\n"
        )
        records = chunk_text_legal_structured(document, chunk_size=300, overlap=50)
        # Records without a "section" key count as an empty section.
        found = [record.get("section", "") for record in records]
        assert "AC-1" in found
        assert "AC-2" in found
# =========================================================================
# Chunking with NIST-style text
# =========================================================================
class TestNistChunking:
    """Check ``chunk_text_legal`` on text laid out as NIST control
    sections, both clean and repaired by ``_normalize_pdf_text``.
    """

    # Three consecutive control sections, each a header line followed by
    # one paragraph of body text.
    NIST_SAMPLE = (
        "AC-1 Account Management\n"
        "The organization develops, documents, and disseminates an access "
        "control policy that addresses purpose, scope, roles, responsibilities, "
        "management commitment, coordination among organizational entities, "
        "and compliance.\n\n"
        "AC-2 Access Enforcement\n"
        "The information system enforces approved authorizations for logical "
        "access to information and system resources in accordance with "
        "applicable access control policies.\n\n"
        "AC-3 Information Flow Enforcement\n"
        "The system enforces approved authorizations for controlling the flow "
        "of information within the system and between interconnected systems.\n"
    )

    def test_chunks_have_section_prefix(self):
        pieces = chunk_text_legal(self.NIST_SAMPLE, chunk_size=300, overlap=50)
        for marker in ("[AC-1", "[AC-2"):
            assert any(marker in piece for piece in pieces)

    def test_sections_detected(self):
        pieces = chunk_text_legal(self.NIST_SAMPLE, chunk_size=500, overlap=50)
        piece_count = len(pieces)
        assert piece_count >= 2

    def test_normalized_broken_text_chunks_correctly(self):
        """Text with split control IDs chunks by section once normalized."""
        raw = (
            "AC - 1 Account Management\n"
            "The organization develops, documents, and disseminates an access "
            "control policy that addresses purpose, scope, roles, responsibilities, "
            "management commitment, coordination among organizational entities, "
            "and compliance with applicable regulations and standards.\n\n"
            "AC - 2 Access Enforcement\n"
            "The information system enforces approved authorizations for logical "
            "access to information and system resources in accordance with "
            "applicable access control policies and procedures.\n"
        )
        repaired = _normalize_pdf_text(raw)
        pieces = chunk_text_legal(repaired, chunk_size=300, overlap=50)
        for marker in ("[AC-1", "[AC-2"):
            assert any(marker in piece for piece in pieces)