""" Tests for NIST/BSI/ENISA PDF text normalization and section detection. Covers: - _normalize_pdf_text() fixing broken multi-column PDF artifacts - Section detection after normalization - NIST CSF 2.0 compound IDs (GV.OC-01) - NIST SP 800-53 control IDs (AC-1, AC-1(1)) - OWASP Top 10 IDs (A01:2021) - Unicode normalization (ligatures, soft hyphens) """ from main import ( _normalize_pdf_text, _extract_section_header, _parse_section_metadata, chunk_text_legal, chunk_text_legal_structured, ) # ========================================================================= # _normalize_pdf_text — broken spacing fixes # ========================================================================= class TestNormalizePdfText: def test_broken_section_number(self): assert _normalize_pdf_text("1 . 1 Risk Framing") == "1.1 Risk Framing" def test_nested_section_number(self): assert _normalize_pdf_text("2 . 3 . 1 Subtitle") == "2.3.1 Subtitle" def test_broken_nist_control_id(self): assert _normalize_pdf_text("AC - 1 Account Management") == "AC-1 Account Management" def test_broken_nist_control_au(self): assert _normalize_pdf_text("AU - 2 Audit Events") == "AU-2 Audit Events" def test_broken_csf_compound_id(self): assert _normalize_pdf_text("GV . OC - 01 Context") == "GV.OC-01 Context" def test_broken_enhancement_parens(self): assert _normalize_pdf_text("AC-1( 1 ) Enhancement") == "AC-1(1) Enhancement" def test_soft_hyphen_removed(self): assert _normalize_pdf_text("infor\u00admation") == "information" def test_zero_width_space_removed(self): assert _normalize_pdf_text("data\u200bprotection") == "dataprotection" def test_ligature_fi_normalized(self): # U+FB01 = fi ligature assert _normalize_pdf_text("con\ufb01dential") == "confidential" def test_ligature_fl_normalized(self): # U+FB02 = fl ligature assert _normalize_pdf_text("over\ufb02ow") == "overflow" def test_multiple_spaces_collapsed(self): assert _normalize_pdf_text("too many spaces") == "too many spaces" def test_newlines_preserved(self): result = _normalize_pdf_text("line one\nline two\n\nline three") assert "\n" in result assert "line one" in result assert "line three" in result def test_normal_text_unchanged(self): text = "AC-1 Account Management requires proper controls." assert _normalize_pdf_text(text) == text def test_combined_artifacts(self): """Multiple broken artifacts in one text block.""" broken = "1 . 1 Overview\nAC - 1 Account Management\nGV . OC - 01 Context" fixed = _normalize_pdf_text(broken) assert "1.1 Overview" in fixed assert "AC-1 Account Management" in fixed assert "GV.OC-01 Context" in fixed # ========================================================================= # Section detection after normalization # ========================================================================= class TestNistSectionDetection: def test_nist_control_ac1(self): assert _extract_section_header("AC-1 Account Management") is not None def test_nist_control_au2(self): assert _extract_section_header("AU-2 Audit Events") is not None def test_nist_csf_compound(self): assert _extract_section_header("GV.OC-01 Organizational Context") is not None def test_nist_enhancement(self): assert _extract_section_header("AC-1(1) Policy and Procedures") is not None def test_owasp_top10(self): assert _extract_section_header("A01:2021 Broken Access Control") is not None def test_owasp_without_year(self): assert _extract_section_header("A03 Injection") is not None def test_numbered_section(self): assert _extract_section_header("2.1 Risk Framing") is not None def test_deep_numbered_section(self): assert _extract_section_header("3.2.1 Assessment Methodology") is not None def test_broken_then_normalized_detects(self): """After normalization, broken NIST IDs should be detected as sections.""" broken = "AC - 1 Account Management" normalized = _normalize_pdf_text(broken) assert _extract_section_header(normalized) is not None def test_broken_csf_then_normalized_detects(self): broken = "GV . OC - 01 Organizational Context" normalized = _normalize_pdf_text(broken) assert _extract_section_header(normalized) is not None def test_broken_section_num_then_normalized(self): broken = "2 . 1 Risk Framing" normalized = _normalize_pdf_text(broken) assert _extract_section_header(normalized) is not None # ========================================================================= # Section metadata extraction (_parse_section_metadata) # ========================================================================= class TestNistSectionMetadata: def test_nist_control_ac1_section(self): meta = _parse_section_metadata("AC-1 POLICY AND PROCEDURES") assert meta["section"] == "AC-1" def test_nist_control_au2_section(self): meta = _parse_section_metadata("AU-2 Audit Events") assert meta["section"] == "AU-2" def test_nist_enhancement_section(self): meta = _parse_section_metadata("AC-1(1) Policy and Procedures") assert meta["section"] == "AC-1(1)" def test_nist_csf_compound_section(self): meta = _parse_section_metadata("GV.OC-01 Organizational Context") assert meta["section"] == "GV.OC-01" def test_numbered_section(self): meta = _parse_section_metadata("3.1 ACCESS CONTROL") assert meta["section"] == "3.1" def test_deep_numbered_section(self): meta = _parse_section_metadata("2.3.1 Subtitle") assert meta["section"] == "2.3.1" def test_owasp_section(self): meta = _parse_section_metadata("A01:2021 Broken Access Control") assert meta["section"] == "A01:2021" def test_section_title_extracted(self): meta = _parse_section_metadata("AC-1 POLICY AND PROCEDURES") assert meta["section_title"] == "POLICY AND PROCEDURES" def test_numbered_section_title(self): meta = _parse_section_metadata("3.1 ACCESS CONTROL") assert meta["section_title"] == "ACCESS CONTROL" def test_structured_chunks_have_section(self): text = ( "3.1 ACCESS CONTROL\n" "Overview of access control family.\n\n" "AC-1 POLICY AND PROCEDURES\n" "The organization develops, documents, and disseminates an access " "control policy that addresses purpose, scope, roles, responsibilities, " "management commitment, coordination among entities.\n\n" "AC-2 ACCOUNT MANAGEMENT\n" "The information system enforces approved authorizations for logical " "access to information and system resources.\n" ) result = chunk_text_legal_structured(text, chunk_size=300, overlap=50) sections = [r.get("section", "") for r in result] assert any(s == "AC-1" for s in sections) assert any(s == "AC-2" for s in sections) # ========================================================================= # Chunking with NIST-style text # ========================================================================= class TestNistChunking: NIST_SAMPLE = ( "AC-1 Account Management\n" "The organization develops, documents, and disseminates an access " "control policy that addresses purpose, scope, roles, responsibilities, " "management commitment, coordination among organizational entities, " "and compliance.\n\n" "AC-2 Access Enforcement\n" "The information system enforces approved authorizations for logical " "access to information and system resources in accordance with " "applicable access control policies.\n\n" "AC-3 Information Flow Enforcement\n" "The system enforces approved authorizations for controlling the flow " "of information within the system and between interconnected systems.\n" ) def test_chunks_have_section_prefix(self): chunks = chunk_text_legal(self.NIST_SAMPLE, chunk_size=300, overlap=50) assert any("[AC-1" in c for c in chunks) assert any("[AC-2" in c for c in chunks) def test_sections_detected(self): chunks = chunk_text_legal(self.NIST_SAMPLE, chunk_size=500, overlap=50) assert len(chunks) >= 2 def test_normalized_broken_text_chunks_correctly(self): """Broken PDF text should chunk correctly after normalization.""" broken = ( "AC - 1 Account Management\n" "The organization develops, documents, and disseminates an access " "control policy that addresses purpose, scope, roles, responsibilities, " "management commitment, coordination among organizational entities, " "and compliance with applicable regulations and standards.\n\n" "AC - 2 Access Enforcement\n" "The information system enforces approved authorizations for logical " "access to information and system resources in accordance with " "applicable access control policies and procedures.\n" ) normalized = _normalize_pdf_text(broken) chunks = chunk_text_legal(normalized, chunk_size=300, overlap=50) assert any("[AC-1" in c for c in chunks) assert any("[AC-2" in c for c in chunks)