fix(embedding): single-number ALL-CAPS section detection for ENISA/BSI
Add a case-sensitive _SINGLE_NUM_ALLCAPS_RE for "1. INTRODUCTION"-style headers (ENISA, BSI docs). _LEGAL_SECTION_RE cannot be used for this because it is compiled with re.IGNORECASE, which would produce false positives on "1. Erstens" etc. Also re-downloaded 2 corrupt PDFs from nist.gov (nistir_8259a, nist_ai_rmf) — the originals in MinIO were 263-byte XML error responses, not PDFs. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -319,6 +319,10 @@ _HEADING_RE = re.compile(
|
|||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Case-sensitive: single-number + ALL-CAPS title (e.g., "1. INTRODUCTION")
|
||||||
|
# Separate regex because _LEGAL_SECTION_RE uses re.IGNORECASE
|
||||||
|
# Matches a single section number followed by an ALL-CAPS title. Deliberately
# compiled WITHOUT re.IGNORECASE so list items like "1. First item" are rejected.
# Umlauts are part of the upper-case class (and ß, which has no common capital
# form, is allowed after the first letter) so German ALL-CAPS headings such as
# "1. EINFÜHRUNG" or "3. MAßNAHMEN" in BSI documents are detected as well.
_SINGLE_NUM_ALLCAPS_RE = re.compile(r'^\d+\.\s+[A-ZÄÖÜ][A-ZÄÖÜß\s]{4,}')
|
||||||
|
|
||||||
|
|
||||||
def _detect_language(text: str) -> str:
|
def _detect_language(text: str) -> str:
|
||||||
"""Simple heuristic: count German vs English marker words."""
|
"""Simple heuristic: count German vs English marker words."""
|
||||||
@@ -393,6 +397,7 @@ _SECTION_NUMBER_RE = re.compile(
|
|||||||
r'|([A-Z]{2}\.[A-Z]{2}-\d{2})' # GV.OC-01 (NIST CSF 2.0)
|
r'|([A-Z]{2}\.[A-Z]{2}-\d{2})' # GV.OC-01 (NIST CSF 2.0)
|
||||||
r'|([A-Z]{2,4}-\d+(?:\(\d+\))?)' # AC-1, AC-1(1) (NIST controls)
|
r'|([A-Z]{2,4}-\d+(?:\(\d+\))?)' # AC-1, AC-1(1) (NIST controls)
|
||||||
r'|(\d+\.\d+(?:\.\d+)*)' # 3.1, 2.3.1 (numbered sections)
|
r'|(\d+\.\d+(?:\.\d+)*)' # 3.1, 2.3.1 (numbered sections)
|
||||||
|
r'|(\d+)(?=\.\s+[A-Z]{5,})' # 1 (from "1. INTRODUCTION", case-sensitive below)
|
||||||
r'|(A\d{2}(?::\d{4})?)' # A01:2021 (OWASP)
|
r'|(A\d{2}(?::\d{4})?)' # A01:2021 (OWASP)
|
||||||
r')',
|
r')',
|
||||||
re.IGNORECASE
|
re.IGNORECASE
|
||||||
@@ -401,12 +406,16 @@ _SECTION_NUMBER_RE = re.compile(
|
|||||||
|
|
||||||
def _extract_section_header(line: str) -> Optional[str]:
|
def _extract_section_header(line: str) -> Optional[str]:
|
||||||
"""Extract a legal section header from a line, or None."""
|
"""Extract a legal section header from a line, or None."""
|
||||||
m = _LEGAL_SECTION_RE.match(line.strip())
|
stripped = line.strip()
|
||||||
|
m = _LEGAL_SECTION_RE.match(stripped)
|
||||||
if m:
|
if m:
|
||||||
return line.strip()
|
return stripped
|
||||||
m = _HEADING_RE.match(line.strip())
|
# Case-sensitive check for "1. INTRODUCTION" style (ENISA/BSI docs)
|
||||||
|
if _SINGLE_NUM_ALLCAPS_RE.match(stripped):
|
||||||
|
return stripped
|
||||||
|
m = _HEADING_RE.match(stripped)
|
||||||
if m:
|
if m:
|
||||||
return line.strip()
|
return stripped
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -168,6 +168,19 @@ class TestNistSectionMetadata:
|
|||||||
meta = _parse_section_metadata("3.1 ACCESS CONTROL")
|
meta = _parse_section_metadata("3.1 ACCESS CONTROL")
|
||||||
assert meta["section_title"] == "ACCESS CONTROL"
|
assert meta["section_title"] == "ACCESS CONTROL"
|
||||||
|
|
||||||
|
def test_single_number_allcaps_section(self):
|
||||||
|
"""ENISA-style: '1. INTRODUCTION'"""
|
||||||
|
assert _extract_section_header("1. INTRODUCTION") is not None
|
||||||
|
|
||||||
|
def test_single_number_section_metadata(self):
|
||||||
|
meta = _parse_section_metadata("1. INTRODUCTION")
|
||||||
|
assert meta["section"] == "1"
|
||||||
|
assert meta["section_title"] == "INTRODUCTION"
|
||||||
|
|
||||||
|
def test_single_number_lowercase_not_matched(self):
|
||||||
|
"""'1. First item' should NOT be a section (lowercase title)."""
|
||||||
|
assert _extract_section_header("1. First item in a list") is None
|
||||||
|
|
||||||
def test_structured_chunks_have_section(self):
|
def test_structured_chunks_have_section(self):
|
||||||
text = (
|
text = (
|
||||||
"3.1 ACCESS CONTROL\n"
|
"3.1 ACCESS CONTROL\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user