From a9671a572bc15564635813621448be6aa22af374 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Sun, 3 May 2026 08:56:02 +0200
Subject: [PATCH] fix(embedding): single-number ALL-CAPS section detection for
 ENISA/BSI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a case-sensitive _SINGLE_NUM_ALLCAPS_RE for "1. INTRODUCTION"-style
headers (ENISA, BSI docs). _LEGAL_SECTION_RE cannot be reused for this
because it uses re.IGNORECASE, which would produce false positives on
lines such as "1. Erstens".

Also re-downloaded two corrupt PDFs from nist.gov (nistir_8259a,
nist_ai_rmf): the originals in MinIO were 263-byte XML error responses,
not PDFs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 embedding-service/main.py                    | 17 +++++++++++++----
 embedding-service/test_nist_normalization.py | 13 +++++++++++++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/embedding-service/main.py b/embedding-service/main.py
index fa6d9bf..4220f8b 100644
--- a/embedding-service/main.py
+++ b/embedding-service/main.py
@@ -319,6 +319,10 @@ _HEADING_RE = re.compile(
     re.MULTILINE
 )
 
+# Case-sensitive: single-number + ALL-CAPS title (e.g., "1. INTRODUCTION")
+# Separate regex because _LEGAL_SECTION_RE uses re.IGNORECASE
+_SINGLE_NUM_ALLCAPS_RE = re.compile(r'^\d+\.\s+[A-Z][A-Z\s]{4,}')
+
 
 def _detect_language(text: str) -> str:
     """Simple heuristic: count German vs English marker words."""
@@ -393,6 +397,7 @@ _SECTION_NUMBER_RE = re.compile(
     r'|([A-Z]{2}\.[A-Z]{2}-\d{2})'  # GV.OC-01 (NIST CSF 2.0)
     r'|([A-Z]{2,4}-\d+(?:\(\d+\))?)'  # AC-1, AC-1(1) (NIST controls)
     r'|(\d+\.\d+(?:\.\d+)*)'        # 3.1, 2.3.1 (numbered sections)
+    r'|(\d+)(?=\.\s+[A-Z]{5,})'    # 1 (from "1. INTRODUCTION", case-sensitive below)
     r'|(A\d{2}(?::\d{4})?)'         # A01:2021 (OWASP)
     r')',
     re.IGNORECASE
@@ -401,12 +406,16 @@ _SECTION_NUMBER_RE = re.compile(
 
 def _extract_section_header(line: str) -> Optional[str]:
     """Extract a legal section header from a line, or None."""
-    m = _LEGAL_SECTION_RE.match(line.strip())
+    stripped = line.strip()
+    m = _LEGAL_SECTION_RE.match(stripped)
     if m:
-        return line.strip()
-    m = _HEADING_RE.match(line.strip())
+        return stripped
+    # Case-sensitive check for "1. INTRODUCTION" style (ENISA/BSI docs)
+    if _SINGLE_NUM_ALLCAPS_RE.match(stripped):
+        return stripped
+    m = _HEADING_RE.match(stripped)
     if m:
-        return line.strip()
+        return stripped
     return None
 
 
diff --git a/embedding-service/test_nist_normalization.py b/embedding-service/test_nist_normalization.py
index 53bd0c4..8778105 100644
--- a/embedding-service/test_nist_normalization.py
+++ b/embedding-service/test_nist_normalization.py
@@ -168,6 +168,19 @@ class TestNistSectionMetadata:
         meta = _parse_section_metadata("3.1 ACCESS CONTROL")
         assert meta["section_title"] == "ACCESS CONTROL"
 
+    def test_single_number_allcaps_section(self):
+        """ENISA-style: '1. INTRODUCTION'"""
+        assert _extract_section_header("1. INTRODUCTION") is not None
+
+    def test_single_number_section_metadata(self):
+        meta = _parse_section_metadata("1. INTRODUCTION")
+        assert meta["section"] == "1"
+        assert meta["section_title"] == "INTRODUCTION"
+
+    def test_single_number_lowercase_not_matched(self):
+        """'1. First item' should NOT be a section (lowercase title)."""
+        assert _extract_section_header("1. First item in a list") is None
+
     def test_structured_chunks_have_section(self):
         text = (
             "3.1 ACCESS CONTROL\n"