fix(embedding): single-number ALL-CAPS section detection for ENISA/BSI
Add a case-sensitive _SINGLE_NUM_ALLCAPS_RE for "1. INTRODUCTION"-style headers (ENISA, BSI docs). _LEGAL_SECTION_RE cannot be reused for this because it is compiled with re.IGNORECASE, which would produce false positives on ordinary numbered list items such as "1. Erstens". Also re-downloaded two corrupt PDFs from nist.gov (nistir_8259a, nist_ai_rmf) — the originals stored in MinIO were 263-byte XML error responses, not PDFs. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -168,6 +168,19 @@ class TestNistSectionMetadata:
|
||||
meta = _parse_section_metadata("3.1 ACCESS CONTROL")
|
||||
assert meta["section_title"] == "ACCESS CONTROL"
|
||||
|
||||
def test_single_number_allcaps_section(self):
|
||||
"""ENISA-style: '1. INTRODUCTION'"""
|
||||
assert _extract_section_header("1. INTRODUCTION") is not None
|
||||
|
||||
def test_single_number_section_metadata(self):
|
||||
meta = _parse_section_metadata("1. INTRODUCTION")
|
||||
assert meta["section"] == "1"
|
||||
assert meta["section_title"] == "INTRODUCTION"
|
||||
|
||||
def test_single_number_lowercase_not_matched(self):
|
||||
"""'1. First item' should NOT be a section (lowercase title)."""
|
||||
assert _extract_section_header("1. First item in a list") is None
|
||||
|
||||
def test_structured_chunks_have_section(self):
|
||||
text = (
|
||||
"3.1 ACCESS CONTROL\n"
|
||||
|
||||
Reference in New Issue
Block a user