feat(embedding): NIST PDF text normalization + safe re-ingest script

Fix broken multi-column PDF extraction for NIST/BSI/ENISA documents:
- _normalize_pdf_text(): fixes broken section numbers (1 . 1 → 1.1),
  control IDs (AC - 1 → AC-1), ligatures, soft hyphens
- pdfplumber tolerances increased (x=3,y=4) for better column handling
- 3 new regex patterns: NIST CSF 2.0, NIST enhancements, OWASP Top 10
- reingest_nist.py: safe upload-before-delete for 4 lost NIST PDFs
- reingest_d5.py: safety fix — upload first, verify, then delete old

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-03 06:42:46 +02:00
parent 97a7f6f264
commit 0b0eed27b0
4 changed files with 738 additions and 9 deletions
+35 -3
View File
@@ -12,6 +12,7 @@ This service handles all ML-heavy operations, keeping the main klausur-service l
import logging
import re
import unicodedata
from typing import List, Optional
from contextlib import asynccontextmanager
@@ -299,6 +300,9 @@ _LEGAL_SECTION_RE = re.compile(
# NIST/ENISA/standard numbering
r'|\d+\.\d+(?:\.\d+)*\s+[A-ZÄÖÜ]' # 1.1 Title, 2.3.1 Subtitle
r'|[A-Z]{2,4}[-\.]\d+(?:\.\d+)*\b' # AC-1, AU-2, PO.1, PW.1.1
r'|[A-Z]{2}\.[A-Z]{2}-\d{2}\b' # GV.OC-01 (NIST CSF 2.0)
r'|[A-Z]{2,4}-\d+\(\d+\)' # AC-1(1) (NIST enhancements)
r'|A\d{2}(?::\d{4})?\b' # A01:2021 (OWASP Top 10)
r'|Table\s+\d+' # Table 1, Table A-1
r'|Figure\s+\d+' # Figure 1
r'|Appendix\s+[A-Z\d]' # Appendix A, Appendix 1
@@ -827,6 +831,34 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
pass
def _normalize_pdf_text(text: str) -> str:
"""Fix broken spacing from multi-column PDF extraction.
pdfplumber/pypdf often break section numbers in multi-column NIST/BSI/ENISA
PDFs: "1 . 1" instead of "1.1", "AC - 1" instead of "AC-1".
"""
# Unicode NFKC: decompose ligatures (fi → fi) before other fixes
text = unicodedata.normalize('NFKC', text)
# Remove soft hyphens and zero-width spaces
text = text.replace('\u00ad', '').replace('\u200b', '')
# "1 . 1" → "1.1" (broken section numbers, apply repeatedly for nested)
prev = None
while prev != text:
prev = text
text = re.sub(r'(\d+)\s+\.\s+(\d+)', r'\1.\2', text)
# "AC - 1" → "AC-1" (broken NIST control IDs, 2-4 uppercase letters)
text = re.sub(r'\b([A-Z]{2,4})\s+-\s+(\d+)\b', r'\1-\2', text)
# "GV . OC - 01" → "GV.OC-01" (NIST CSF 2.0 compound IDs)
text = re.sub(
r'\b([A-Z]{2})\s*\.\s*([A-Z]{2})\s*-\s*(\d{2})\b', r'\1.\2-\3', text
)
# "AC - 1 ( 1 )" → "AC-1(1)" (NIST enhancements with spaced parens)
text = re.sub(r'\(\s+(\d+)\s+\)', r'(\1)', text)
# Collapse multiple horizontal spaces (keep newlines)
text = re.sub(r'[^\S\n]{2,}', ' ', text)
return text
def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
"""Extract PDF using pdfplumber (best for multi-column EU regulation PDFs)."""
import io
@@ -839,12 +871,12 @@ def extract_pdf_pdfplumber(pdf_content: bytes) -> ExtractPDFResponse:
with pdfplumber.open(pdf_file) as pdf:
page_count = len(pdf.pages)
for page in pdf.pages:
text = page.extract_text(x_tolerance=2, y_tolerance=3)
text = page.extract_text(x_tolerance=3, y_tolerance=4)
if text:
text_parts.append(text)
return ExtractPDFResponse(
text="\n\n".join(text_parts),
text=_normalize_pdf_text("\n\n".join(text_parts)),
backend_used="pdfplumber",
pages=page_count,
table_count=0,
@@ -866,7 +898,7 @@ def extract_pdf_pypdf(pdf_content: bytes) -> ExtractPDFResponse:
text_parts.append(text)
return ExtractPDFResponse(
text="\n\n".join(text_parts),
text=_normalize_pdf_text("\n\n".join(text_parts)),
backend_used="pypdf",
pages=len(reader.pages),
table_count=0
@@ -0,0 +1,173 @@
"""
Tests for NIST/BSI/ENISA PDF text normalization and section detection.
Covers:
- _normalize_pdf_text() fixing broken multi-column PDF artifacts
- Section detection after normalization
- NIST CSF 2.0 compound IDs (GV.OC-01)
- NIST SP 800-53 control IDs (AC-1, AC-1(1))
- OWASP Top 10 IDs (A01:2021)
- Unicode normalization (ligatures, soft hyphens)
"""
from main import (
_normalize_pdf_text,
_extract_section_header,
chunk_text_legal,
)
# =========================================================================
# _normalize_pdf_text — broken spacing fixes
# =========================================================================
class TestNormalizePdfText:
    """_normalize_pdf_text(): repair of multi-column PDF extraction artifacts."""

    def test_broken_section_number(self):
        result = _normalize_pdf_text("1 . 1 Risk Framing")
        assert result == "1.1 Risk Framing"

    def test_nested_section_number(self):
        result = _normalize_pdf_text("2 . 3 . 1 Subtitle")
        assert result == "2.3.1 Subtitle"

    def test_broken_nist_control_id(self):
        result = _normalize_pdf_text("AC - 1 Account Management")
        assert result == "AC-1 Account Management"

    def test_broken_nist_control_au(self):
        result = _normalize_pdf_text("AU - 2 Audit Events")
        assert result == "AU-2 Audit Events"

    def test_broken_csf_compound_id(self):
        result = _normalize_pdf_text("GV . OC - 01 Context")
        assert result == "GV.OC-01 Context"

    def test_broken_enhancement_parens(self):
        result = _normalize_pdf_text("AC-1( 1 ) Enhancement")
        assert result == "AC-1(1) Enhancement"

    def test_soft_hyphen_removed(self):
        # U+00AD soft hyphen must vanish entirely.
        assert _normalize_pdf_text("infor\u00admation") == "information"

    def test_zero_width_space_removed(self):
        # U+200B zero-width space must vanish entirely.
        assert _normalize_pdf_text("data\u200bprotection") == "dataprotection"

    def test_ligature_fi_normalized(self):
        # U+FB01 (fi ligature) expands to "fi" via NFKC.
        assert _normalize_pdf_text("con\ufb01dential") == "confidential"

    def test_ligature_fl_normalized(self):
        # U+FB02 (fl ligature) expands to "fl" via NFKC.
        assert _normalize_pdf_text("over\ufb02ow") == "overflow"

    def test_multiple_spaces_collapsed(self):
        result = _normalize_pdf_text("too   many    spaces")
        assert result == "too many spaces"

    def test_newlines_preserved(self):
        result = _normalize_pdf_text("line one\nline two\n\nline three")
        # Whitespace collapsing must not eat line breaks or content.
        assert "\n" in result
        assert "line one" in result
        assert "line three" in result

    def test_normal_text_unchanged(self):
        clean = "AC-1 Account Management requires proper controls."
        assert _normalize_pdf_text(clean) == clean

    def test_combined_artifacts(self):
        """Multiple broken artifacts in one text block."""
        broken = "1 . 1 Overview\nAC - 1 Account Management\nGV . OC - 01 Context"
        fixed = _normalize_pdf_text(broken)
        for expected in ("1.1 Overview", "AC-1 Account Management", "GV.OC-01 Context"):
            assert expected in fixed
# =========================================================================
# Section detection after normalization
# =========================================================================
class TestNistSectionDetection:
    """Section-header detection for NIST/OWASP identifiers, incl. post-normalization."""

    def test_nist_control_ac1(self):
        header = _extract_section_header("AC-1 Account Management")
        assert header is not None

    def test_nist_control_au2(self):
        header = _extract_section_header("AU-2 Audit Events")
        assert header is not None

    def test_nist_csf_compound(self):
        header = _extract_section_header("GV.OC-01 Organizational Context")
        assert header is not None

    def test_nist_enhancement(self):
        header = _extract_section_header("AC-1(1) Policy and Procedures")
        assert header is not None

    def test_owasp_top10(self):
        header = _extract_section_header("A01:2021 Broken Access Control")
        assert header is not None

    def test_owasp_without_year(self):
        header = _extract_section_header("A03 Injection")
        assert header is not None

    def test_numbered_section(self):
        header = _extract_section_header("2.1 Risk Framing")
        assert header is not None

    def test_deep_numbered_section(self):
        header = _extract_section_header("3.2.1 Assessment Methodology")
        assert header is not None

    def test_broken_then_normalized_detects(self):
        """After normalization, broken NIST IDs should be detected as sections."""
        normalized = _normalize_pdf_text("AC - 1 Account Management")
        assert _extract_section_header(normalized) is not None

    def test_broken_csf_then_normalized_detects(self):
        normalized = _normalize_pdf_text("GV . OC - 01 Organizational Context")
        assert _extract_section_header(normalized) is not None

    def test_broken_section_num_then_normalized(self):
        normalized = _normalize_pdf_text("2 . 1 Risk Framing")
        assert _extract_section_header(normalized) is not None
# =========================================================================
# Chunking with NIST-style text
# =========================================================================
class TestNistChunking:
    """chunk_text_legal() on NIST SP 800-53-style control text."""

    NIST_SAMPLE = (
        "AC-1 Account Management\n"
        "The organization develops, documents, and disseminates an access "
        "control policy that addresses purpose, scope, roles, responsibilities, "
        "management commitment, coordination among organizational entities, "
        "and compliance.\n\n"
        "AC-2 Access Enforcement\n"
        "The information system enforces approved authorizations for logical "
        "access to information and system resources in accordance with "
        "applicable access control policies.\n\n"
        "AC-3 Information Flow Enforcement\n"
        "The system enforces approved authorizations for controlling the flow "
        "of information within the system and between interconnected systems.\n"
    )

    def test_chunks_have_section_prefix(self):
        chunks = chunk_text_legal(self.NIST_SAMPLE, chunk_size=300, overlap=50)
        # At least one chunk must carry each control's section prefix.
        ac1_chunks = [c for c in chunks if "[AC-1" in c]
        ac2_chunks = [c for c in chunks if "[AC-2" in c]
        assert ac1_chunks
        assert ac2_chunks

    def test_sections_detected(self):
        chunks = chunk_text_legal(self.NIST_SAMPLE, chunk_size=500, overlap=50)
        assert len(chunks) >= 2

    def test_normalized_broken_text_chunks_correctly(self):
        """Broken PDF text should chunk correctly after normalization."""
        broken = (
            "AC - 1 Account Management\n"
            "The organization develops, documents, and disseminates an access "
            "control policy that addresses purpose, scope, roles, responsibilities, "
            "management commitment, coordination among organizational entities, "
            "and compliance with applicable regulations and standards.\n\n"
            "AC - 2 Access Enforcement\n"
            "The information system enforces approved authorizations for logical "
            "access to information and system resources in accordance with "
            "applicable access control policies and procedures.\n"
        )
        chunks = chunk_text_legal(_normalize_pdf_text(broken), chunk_size=300, overlap=50)
        assert [c for c in chunks if "[AC-1" in c]
        assert [c for c in chunks if "[AC-2" in c]