93099b2770
D2: RAG service stores section/section_title/paragraph/paragraph_num/page from embedding service chunks_with_metadata into Qdrant payloads. D3: Control generator prefers section > article > section_title from Qdrant, adds page to source_citation and generation_metadata. D4: Validated with real BGB §§ 312-312k text. Found and fixed critical bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing only the first chunk per document to have metadata. All subsequent chunks lost section info. Also fixes pre-existing lint issues (unused imports, ambiguous variable names, duplicate dict key, bare except). 456 tests passing (58 embedding + 387 pipeline + 11 rag-service). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
218 lines
8.6 KiB
Python
218 lines
8.6 KiB
Python
"""
|
|
D4 Validation: BGB § 312k structural chunking test.
|
|
|
|
Tests that real German legal text is correctly chunked with structural
|
|
metadata (section, section_title, paragraph, paragraph_num).
|
|
This is the gate test before re-ingesting all 297 legal sources.
|
|
"""
|
|
|
|
import os
|
|
import pytest
|
|
|
|
from main import chunk_text_legal, chunk_text_legal_structured
|
|
|
|
# Path to the BGB excerpt used by every test in this module. It is resolved
# relative to the directory containing this file.
FIXTURE_PATH = os.path.join(
    os.path.dirname(__file__),
    "tests",
    "fixtures",
    "bgb_312_excerpt.txt",
)

# Reasonable defaults for legal text. Both values are forwarded unchanged to
# the chunkers under test (second and third positional argument).
CHUNK_SIZE = 1500
OVERLAP = 100
|
|
|
|
|
|
@pytest.fixture
def bgb_text():
    """Return the full contents of the file at FIXTURE_PATH, read as UTF-8."""
    with open(FIXTURE_PATH, encoding="utf-8") as excerpt_file:
        contents = excerpt_file.read()
    return contents
|
|
|
|
|
|
@pytest.fixture
def plain_chunks(bgb_text):
    """Run ``chunk_text_legal`` over the excerpt with CHUNK_SIZE and OVERLAP.

    The tests in this module treat the result as a sequence of strings.
    """
    chunks = chunk_text_legal(bgb_text, CHUNK_SIZE, OVERLAP)
    return chunks
|
|
|
|
|
|
@pytest.fixture
def structured_chunks(bgb_text):
    """Run ``chunk_text_legal_structured`` over the excerpt.

    Uses the same CHUNK_SIZE and OVERLAP as ``plain_chunks``. The tests in
    this module treat each element as a mapping and read the keys
    ``section``, ``section_title``, ``paragraph``, ``paragraph_num``,
    ``index`` and ``text``.
    """
    chunk_records = chunk_text_legal_structured(bgb_text, CHUNK_SIZE, OVERLAP)
    return chunk_records
|
|
|
|
|
|
# =========================================================================
|
|
# Basic sanity
|
|
# =========================================================================
|
|
|
|
class TestChunkingSanity:
|
|
|
|
def test_fixture_loads(self, bgb_text):
|
|
assert len(bgb_text) > 2000, "BGB excerpt should be substantial"
|
|
assert "§ 312k" in bgb_text
|
|
assert "§ 312 " in bgb_text
|
|
|
|
def test_chunk_count_reasonable(self, plain_chunks):
|
|
assert 4 <= len(plain_chunks) <= 30, (
|
|
f"Expected 4-30 chunks, got {len(plain_chunks)}"
|
|
)
|
|
|
|
def test_structured_same_count(self, plain_chunks, structured_chunks):
|
|
assert len(plain_chunks) == len(structured_chunks)
|
|
|
|
def test_no_empty_chunks(self, plain_chunks):
|
|
for i, chunk in enumerate(plain_chunks):
|
|
assert chunk.strip(), f"Chunk {i} is empty"
|
|
|
|
def test_chunk_sizes_reasonable(self, plain_chunks):
|
|
for i, chunk in enumerate(plain_chunks):
|
|
assert len(chunk) < 3000, f"Chunk {i} too large: {len(chunk)} chars"
|
|
assert len(chunk) > 30, f"Chunk {i} too small: {len(chunk)} chars"
|
|
|
|
|
|
# =========================================================================
|
|
# Section detection
|
|
# =========================================================================
|
|
|
|
class TestSectionDetection:
|
|
|
|
def test_all_four_sections_detected(self, structured_chunks):
|
|
"""All 4 BGB sections should appear as section metadata."""
|
|
found_sections = set()
|
|
for meta in structured_chunks:
|
|
if meta["section"]:
|
|
found_sections.add(meta["section"])
|
|
|
|
assert "§ 312" in found_sections or any(
|
|
s.startswith("§ 312") and s != "§ 312a" and s != "§ 312g" and s != "§ 312k"
|
|
for s in found_sections
|
|
), f"§ 312 not found. Sections: {found_sections}"
|
|
assert "§ 312a" in found_sections, f"§ 312a not found. Sections: {found_sections}"
|
|
assert "§ 312g" in found_sections, f"§ 312g not found. Sections: {found_sections}"
|
|
assert "§ 312k" in found_sections, f"§ 312k not found. Sections: {found_sections}"
|
|
|
|
def test_section_prefix_in_chunks(self, plain_chunks):
|
|
"""Most chunks should have [§ ...] prefix."""
|
|
prefixed = sum(1 for c in plain_chunks if c.startswith("[§"))
|
|
ratio = prefixed / len(plain_chunks)
|
|
assert ratio >= 0.8, (
|
|
f"Only {ratio:.0%} chunks have section prefix (expected >= 80%)"
|
|
)
|
|
|
|
def test_312k_has_own_chunk(self, plain_chunks):
|
|
"""§ 312k must appear as a chunk section header, not merged into another §."""
|
|
chunks_with_312k = [c for c in plain_chunks if "[§ 312k" in c]
|
|
assert len(chunks_with_312k) >= 1, (
|
|
"§ 312k should have at least 1 dedicated chunk"
|
|
)
|
|
|
|
|
|
# =========================================================================
|
|
# § 312k specific metadata
|
|
# =========================================================================
|
|
|
|
class TestSection312k:
|
|
|
|
def _312k_chunks(self, structured_chunks):
|
|
return [m for m in structured_chunks if m["section"] == "§ 312k"]
|
|
|
|
def test_312k_section_metadata(self, structured_chunks):
|
|
"""§ 312k chunks should have section='§ 312k' with a title."""
|
|
chunks = self._312k_chunks(structured_chunks)
|
|
assert len(chunks) >= 1, "No chunks with section='§ 312k'"
|
|
for meta in chunks:
|
|
assert meta["section"] == "§ 312k"
|
|
# Title should contain key words
|
|
title = meta["section_title"].lower()
|
|
assert "kuendigung" in title or "verbrauchervertrae" in title, (
|
|
f"Unexpected section_title: {meta['section_title']}"
|
|
)
|
|
|
|
def test_312k_paragraph_extraction(self, structured_chunks):
|
|
"""At least some § 312k chunks should have paragraph references."""
|
|
chunks = self._312k_chunks(structured_chunks)
|
|
paragraphs_found = [m["paragraph"] for m in chunks if m["paragraph"]]
|
|
# § 312k has (1) through (6), at least some should be detected
|
|
assert len(paragraphs_found) >= 1, (
|
|
"No paragraph references found in § 312k chunks"
|
|
)
|
|
|
|
def test_312k_content_present(self, structured_chunks):
|
|
"""§ 312k chunk text should contain key legal terms."""
|
|
chunks = self._312k_chunks(structured_chunks)
|
|
all_text = " ".join(m["text"] for m in chunks)
|
|
assert "Kuendigungsschaltflaeche" in all_text or "kuendigen" in all_text.lower()
|
|
assert "Webseite" in all_text or "elektronischen" in all_text
|
|
|
|
def test_312k_not_merged_with_312g(self, structured_chunks):
|
|
"""§ 312k and § 312g should be separate sections, not merged."""
|
|
sections_312g = [m for m in structured_chunks if m["section"] == "§ 312g"]
|
|
sections_312k = self._312k_chunks(structured_chunks)
|
|
assert len(sections_312g) >= 1, "§ 312g missing"
|
|
assert len(sections_312k) >= 1, "§ 312k missing"
|
|
# Verify they are different chunks (no overlap in indices)
|
|
g_indices = {m["index"] for m in sections_312g}
|
|
k_indices = {m["index"] for m in sections_312k}
|
|
assert g_indices.isdisjoint(k_indices), (
|
|
f"§ 312g and § 312k share chunk indices: {g_indices & k_indices}"
|
|
)
|
|
|
|
|
|
# =========================================================================
|
|
# Metadata quality across all sections
|
|
# =========================================================================
|
|
|
|
class TestMetadataQuality:
|
|
|
|
def test_most_chunks_have_section(self, structured_chunks):
|
|
"""At least 90% of chunks should have a section reference."""
|
|
with_section = sum(1 for m in structured_chunks if m["section"])
|
|
ratio = with_section / len(structured_chunks)
|
|
assert ratio >= 0.9, (
|
|
f"Only {ratio:.0%} chunks have section metadata (expected >= 90%)"
|
|
)
|
|
|
|
def test_section_titles_not_empty(self, structured_chunks):
|
|
"""Chunks with a section should also have a section_title."""
|
|
for meta in structured_chunks:
|
|
if meta["section"]:
|
|
assert meta["section_title"], (
|
|
f"Chunk {meta['index']} has section={meta['section']} but no title"
|
|
)
|
|
|
|
def test_paragraph_nums_are_integers(self, structured_chunks):
|
|
"""paragraph_num should be int or None, never str."""
|
|
for meta in structured_chunks:
|
|
pn = meta["paragraph_num"]
|
|
assert pn is None or isinstance(pn, int), (
|
|
f"Chunk {meta['index']}: paragraph_num={pn!r} (type={type(pn).__name__})"
|
|
)
|
|
|
|
def test_indices_sequential(self, structured_chunks):
|
|
"""Chunk indices should be 0, 1, 2, ... in order."""
|
|
for i, meta in enumerate(structured_chunks):
|
|
assert meta["index"] == i, (
|
|
f"Expected index {i}, got {meta['index']}"
|
|
)
|
|
|
|
|
|
# =========================================================================
|
|
# Edge cases
|
|
# =========================================================================
|
|
|
|
class TestEdgeCases:
|
|
|
|
def test_numbered_list_not_false_section(self, structured_chunks):
|
|
"""Numbered items (1., 2., 3.) inside a § should NOT create new sections."""
|
|
for meta in structured_chunks:
|
|
section = meta["section"]
|
|
# Section should always start with § or be empty
|
|
if section:
|
|
assert section.startswith("§"), (
|
|
f"Unexpected section format: {section!r}"
|
|
)
|
|
|
|
def test_subsection_letters_preserved(self, plain_chunks):
|
|
"""Lettered subsections (a, b, c, d, e) in § 312k(2) should be in the text."""
|
|
all_text = " ".join(plain_chunks)
|
|
# § 312k Abs 2 Nr 1 has a) through e)
|
|
for letter in ["a)", "b)", "c)", "d)", "e)"]:
|
|
assert letter in all_text, (
|
|
f"Subsection letter {letter} from § 312k(2) missing"
|
|
)
|