fix(embedding): add NIST control IDs to _SECTION_NUMBER_RE
_SECTION_NUMBER_RE only had patterns for §/Art/Section/Kapitel/Annex and missed NIST-style identifiers (AC-1, GV.OC-01, 3.1, A01:2021). This caused a 0% section rate for all NIST/BSI/ENISA documents: the sections were detected correctly, but the section NUMBER was not extracted from the header.

Also adds:
- reupload_legal_strategy.py: re-upload with legal chunking
- extract_and_upload_nist.py: local PDF extraction workaround
- qdrant-snapshot.sh: backup mechanism for Qdrant collections

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,7 +13,9 @@ Covers:
|
||||
from main import (
|
||||
_normalize_pdf_text,
|
||||
_extract_section_header,
|
||||
_parse_section_metadata,
|
||||
chunk_text_legal,
|
||||
chunk_text_legal_structured,
|
||||
)
|
||||
|
||||
|
||||
@@ -124,6 +126,66 @@ class TestNistSectionDetection:
|
||||
assert _extract_section_header(normalized) is not None
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Section metadata extraction (_parse_section_metadata)
|
||||
# =========================================================================
|
||||
|
||||
class TestNistSectionMetadata:
    """Checks the "section" / "section_title" values parsed from headers.

    Each test feeds one header string to ``_parse_section_metadata`` and
    compares a single key of the returned mapping; the final test runs
    ``chunk_text_legal_structured`` over a short multi-section text.
    """

    def test_nist_control_ac1_section(self):
        """Header "AC-1 ..." yields section "AC-1"."""
        parsed = _parse_section_metadata("AC-1 POLICY AND PROCEDURES")
        assert parsed["section"] == "AC-1"

    def test_nist_control_au2_section(self):
        """Header "AU-2 ..." yields section "AU-2"."""
        parsed = _parse_section_metadata("AU-2 Audit Events")
        assert parsed["section"] == "AU-2"

    def test_nist_enhancement_section(self):
        """The parenthesised enhancement suffix is kept in the section."""
        parsed = _parse_section_metadata("AC-1(1) Policy and Procedures")
        assert parsed["section"] == "AC-1(1)"

    def test_nist_csf_compound_section(self):
        """A dotted, dashed identifier is returned whole."""
        parsed = _parse_section_metadata("GV.OC-01 Organizational Context")
        assert parsed["section"] == "GV.OC-01"

    def test_numbered_section(self):
        """A two-level numeric header yields its number as the section."""
        parsed = _parse_section_metadata("3.1 ACCESS CONTROL")
        assert parsed["section"] == "3.1"

    def test_deep_numbered_section(self):
        """A three-level numeric header yields its full number."""
        parsed = _parse_section_metadata("2.3.1 Subtitle")
        assert parsed["section"] == "2.3.1"

    def test_owasp_section(self):
        """An "A01:2021"-style identifier is returned as the section."""
        parsed = _parse_section_metadata("A01:2021 Broken Access Control")
        assert parsed["section"] == "A01:2021"

    def test_section_title_extracted(self):
        """The text after a control ID is returned as the section title."""
        parsed = _parse_section_metadata("AC-1 POLICY AND PROCEDURES")
        assert parsed["section_title"] == "POLICY AND PROCEDURES"

    def test_numbered_section_title(self):
        """The text after a numeric prefix is returned as the section title."""
        parsed = _parse_section_metadata("3.1 ACCESS CONTROL")
        assert parsed["section_title"] == "ACCESS CONTROL"

    def test_structured_chunks_have_section(self):
        """Structured chunking reports both "AC-1" and "AC-2" as sections."""
        document = (
            "3.1 ACCESS CONTROL\n"
            "Overview of access control family.\n\n"
            "AC-1 POLICY AND PROCEDURES\n"
            "The organization develops, documents, and disseminates an access "
            "control policy that addresses purpose, scope, roles, responsibilities, "
            "management commitment, coordination among entities.\n\n"
            "AC-2 ACCOUNT MANAGEMENT\n"
            "The information system enforces approved authorizations for logical "
            "access to information and system resources.\n"
        )
        chunks = chunk_text_legal_structured(document, chunk_size=300, overlap=50)
        # A chunk without a "section" key counts as an empty section.
        found = [chunk.get("section", "") for chunk in chunks]
        assert "AC-1" in found
        assert "AC-2" in found
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Chunking with NIST-style text
|
||||
# =========================================================================
|
||||
|
||||
Reference in New Issue
Block a user