From a9671a572bc15564635813621448be6aa22af374 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 3 May 2026 08:56:02 +0200 Subject: [PATCH] fix(embedding): single-number ALL-CAPS section detection for ENISA/BSI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add case-sensitive _SINGLE_NUM_ALLCAPS_RE for "1. INTRODUCTION" style headers (ENISA, BSI docs). Cannot use _LEGAL_SECTION_RE for this because it uses re.IGNORECASE which would false-positive on "1. Erstens" etc. Also re-downloaded 2 corrupt PDFs from nist.gov (nistir_8259a, nist_ai_rmf) — originals in MinIO were 263-byte XML error responses, not PDFs. Co-Authored-By: Claude Opus 4.6 (1M context) --- embedding-service/main.py | 17 +++++++++++++---- embedding-service/test_nist_normalization.py | 13 +++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/embedding-service/main.py b/embedding-service/main.py index fa6d9bf..4220f8b 100644 --- a/embedding-service/main.py +++ b/embedding-service/main.py @@ -319,6 +319,10 @@ _HEADING_RE = re.compile( re.MULTILINE ) +# Case-sensitive: single-number + ALL-CAPS title (e.g., "1. INTRODUCTION") +# Separate regex because _LEGAL_SECTION_RE uses re.IGNORECASE +_SINGLE_NUM_ALLCAPS_RE = re.compile(r'^\d+\.\s+[A-Z][A-Z\s]{4,}') + def _detect_language(text: str) -> str: """Simple heuristic: count German vs English marker words.""" @@ -393,6 +397,7 @@ _SECTION_NUMBER_RE = re.compile( r'|([A-Z]{2}\.[A-Z]{2}-\d{2})' # GV.OC-01 (NIST CSF 2.0) r'|([A-Z]{2,4}-\d+(?:\(\d+\))?)' # AC-1, AC-1(1) (NIST controls) r'|(\d+\.\d+(?:\.\d+)*)' # 3.1, 2.3.1 (numbered sections) + r'|(\d+)(?=\.\s+[A-Z]{5,})' # 1 (from "1. INTRODUCTION", case-sensitive below) r'|(A\d{2}(?::\d{4})?)' # A01:2021 (OWASP) r')', re.IGNORECASE @@ -401,12 +406,16 @@ _SECTION_NUMBER_RE = re.compile( def _extract_section_header(line: str) -> Optional[str]: """Extract a legal section header from a line, or None.""" - m = _LEGAL_SECTION_RE.match(line.strip()) + stripped = line.strip() + m = _LEGAL_SECTION_RE.match(stripped) if m: - return line.strip() - m = _HEADING_RE.match(line.strip()) + return stripped + # Case-sensitive check for "1. INTRODUCTION" style (ENISA/BSI docs) + if _SINGLE_NUM_ALLCAPS_RE.match(stripped): + return stripped + m = _HEADING_RE.match(stripped) if m: - return line.strip() + return stripped return None diff --git a/embedding-service/test_nist_normalization.py b/embedding-service/test_nist_normalization.py index 53bd0c4..8778105 100644 --- a/embedding-service/test_nist_normalization.py +++ b/embedding-service/test_nist_normalization.py @@ -168,6 +168,19 @@ class TestNistSectionMetadata: meta = _parse_section_metadata("3.1 ACCESS CONTROL") assert meta["section_title"] == "ACCESS CONTROL" + def test_single_number_allcaps_section(self): + """ENISA-style: '1. INTRODUCTION'""" + assert _extract_section_header("1. INTRODUCTION") is not None + + def test_single_number_section_metadata(self): + meta = _parse_section_metadata("1. INTRODUCTION") + assert meta["section"] == "1" + assert meta["section_title"] == "INTRODUCTION" + + def test_single_number_lowercase_not_matched(self): + """'1. First item' should NOT be a section (lowercase title).""" + assert _extract_section_header("1. First item in a list") is None + def test_structured_chunks_have_section(self): text = ( "3.1 ACCESS CONTROL\n"