Files
breakpilot-core/embedding-service/test_d4_bgb.py
T
Benjamin Admin 93099b2770 feat(pipeline): structural metadata end-to-end (Blocks D2-D4)
D2: RAG service stores section/section_title/paragraph/paragraph_num/page
from embedding service chunks_with_metadata into Qdrant payloads.

D3: Control generator prefers section > article > section_title from
Qdrant, adds page to source_citation and generation_metadata.

D4: Validated with real BGB §§ 312-312k text. Found and fixed critical
bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing
only the first chunk per document to have metadata. All subsequent
chunks lost section info.

Also fixes pre-existing lint issues (unused imports, ambiguous variable
names, duplicate dict key, bare except).

456 tests passing (58 embedding + 387 pipeline + 11 rag-service).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-01 20:34:00 +02:00

218 lines
8.6 KiB
Python

"""
D4 Validation: BGB § 312k structural chunking test.
Tests that real German legal text is correctly chunked with structural
metadata (section, section_title, paragraph, paragraph_num).
This is the gate test before re-ingesting all 297 legal sources.
"""
import os
import pytest
from main import chunk_text_legal, chunk_text_legal_structured
# Location of the BGB excerpt, resolved relative to the directory that
# contains this test module.
FIXTURE_PATH = os.path.join(
    os.path.dirname(__file__),
    *("tests", "fixtures", "bgb_312_excerpt.txt"),
)

# Chunk size and overlap handed to both chunkers under test; chosen as
# reasonable defaults for legal text.
CHUNK_SIZE, OVERLAP = 1500, 100
@pytest.fixture
def bgb_text():
    """Return the entire contents of the file at ``FIXTURE_PATH``, read as UTF-8."""
    with open(FIXTURE_PATH, encoding="utf-8") as fixture_file:
        contents = fixture_file.read()
    return contents
@pytest.fixture
def plain_chunks(bgb_text):
    """Return the result of ``chunk_text_legal`` on the excerpt.

    Uses the module-level ``CHUNK_SIZE`` and ``OVERLAP`` settings.
    """
    chunks = chunk_text_legal(bgb_text, CHUNK_SIZE, OVERLAP)
    return chunks
@pytest.fixture
def structured_chunks(bgb_text):
    """Return the result of ``chunk_text_legal_structured`` on the excerpt.

    Uses the module-level ``CHUNK_SIZE`` and ``OVERLAP`` settings.
    """
    records = chunk_text_legal_structured(bgb_text, CHUNK_SIZE, OVERLAP)
    return records
# =========================================================================
# Basic sanity
# =========================================================================
class TestChunkingSanity:
def test_fixture_loads(self, bgb_text):
assert len(bgb_text) > 2000, "BGB excerpt should be substantial"
assert "§ 312k" in bgb_text
assert "§ 312 " in bgb_text
def test_chunk_count_reasonable(self, plain_chunks):
assert 4 <= len(plain_chunks) <= 30, (
f"Expected 4-30 chunks, got {len(plain_chunks)}"
)
def test_structured_same_count(self, plain_chunks, structured_chunks):
assert len(plain_chunks) == len(structured_chunks)
def test_no_empty_chunks(self, plain_chunks):
for i, chunk in enumerate(plain_chunks):
assert chunk.strip(), f"Chunk {i} is empty"
def test_chunk_sizes_reasonable(self, plain_chunks):
for i, chunk in enumerate(plain_chunks):
assert len(chunk) < 3000, f"Chunk {i} too large: {len(chunk)} chars"
assert len(chunk) > 30, f"Chunk {i} too small: {len(chunk)} chars"
# =========================================================================
# Section detection
# =========================================================================
class TestSectionDetection:
def test_all_four_sections_detected(self, structured_chunks):
"""All 4 BGB sections should appear as section metadata."""
found_sections = set()
for meta in structured_chunks:
if meta["section"]:
found_sections.add(meta["section"])
assert "§ 312" in found_sections or any(
s.startswith("§ 312") and s != "§ 312a" and s != "§ 312g" and s != "§ 312k"
for s in found_sections
), f"§ 312 not found. Sections: {found_sections}"
assert "§ 312a" in found_sections, f"§ 312a not found. Sections: {found_sections}"
assert "§ 312g" in found_sections, f"§ 312g not found. Sections: {found_sections}"
assert "§ 312k" in found_sections, f"§ 312k not found. Sections: {found_sections}"
def test_section_prefix_in_chunks(self, plain_chunks):
"""Most chunks should have [§ ...] prefix."""
prefixed = sum(1 for c in plain_chunks if c.startswith(""))
ratio = prefixed / len(plain_chunks)
assert ratio >= 0.8, (
f"Only {ratio:.0%} chunks have section prefix (expected >= 80%)"
)
def test_312k_has_own_chunk(self, plain_chunks):
"""§ 312k must appear as a chunk section header, not merged into another §."""
chunks_with_312k = [c for c in plain_chunks if "[§ 312k" in c]
assert len(chunks_with_312k) >= 1, (
"§ 312k should have at least 1 dedicated chunk"
)
# =========================================================================
# § 312k specific metadata
# =========================================================================
class TestSection312k:
def _312k_chunks(self, structured_chunks):
return [m for m in structured_chunks if m["section"] == "§ 312k"]
def test_312k_section_metadata(self, structured_chunks):
"""§ 312k chunks should have section='§ 312k' with a title."""
chunks = self._312k_chunks(structured_chunks)
assert len(chunks) >= 1, "No chunks with section='§ 312k'"
for meta in chunks:
assert meta["section"] == "§ 312k"
# Title should contain key words
title = meta["section_title"].lower()
assert "kuendigung" in title or "verbrauchervertrae" in title, (
f"Unexpected section_title: {meta['section_title']}"
)
def test_312k_paragraph_extraction(self, structured_chunks):
"""At least some § 312k chunks should have paragraph references."""
chunks = self._312k_chunks(structured_chunks)
paragraphs_found = [m["paragraph"] for m in chunks if m["paragraph"]]
# § 312k has (1) through (6), at least some should be detected
assert len(paragraphs_found) >= 1, (
"No paragraph references found in § 312k chunks"
)
def test_312k_content_present(self, structured_chunks):
"""§ 312k chunk text should contain key legal terms."""
chunks = self._312k_chunks(structured_chunks)
all_text = " ".join(m["text"] for m in chunks)
assert "Kuendigungsschaltflaeche" in all_text or "kuendigen" in all_text.lower()
assert "Webseite" in all_text or "elektronischen" in all_text
def test_312k_not_merged_with_312g(self, structured_chunks):
"""§ 312k and § 312g should be separate sections, not merged."""
sections_312g = [m for m in structured_chunks if m["section"] == "§ 312g"]
sections_312k = self._312k_chunks(structured_chunks)
assert len(sections_312g) >= 1, "§ 312g missing"
assert len(sections_312k) >= 1, "§ 312k missing"
# Verify they are different chunks (no overlap in indices)
g_indices = {m["index"] for m in sections_312g}
k_indices = {m["index"] for m in sections_312k}
assert g_indices.isdisjoint(k_indices), (
f"§ 312g and § 312k share chunk indices: {g_indices & k_indices}"
)
# =========================================================================
# Metadata quality across all sections
# =========================================================================
class TestMetadataQuality:
def test_most_chunks_have_section(self, structured_chunks):
"""At least 90% of chunks should have a section reference."""
with_section = sum(1 for m in structured_chunks if m["section"])
ratio = with_section / len(structured_chunks)
assert ratio >= 0.9, (
f"Only {ratio:.0%} chunks have section metadata (expected >= 90%)"
)
def test_section_titles_not_empty(self, structured_chunks):
"""Chunks with a section should also have a section_title."""
for meta in structured_chunks:
if meta["section"]:
assert meta["section_title"], (
f"Chunk {meta['index']} has section={meta['section']} but no title"
)
def test_paragraph_nums_are_integers(self, structured_chunks):
"""paragraph_num should be int or None, never str."""
for meta in structured_chunks:
pn = meta["paragraph_num"]
assert pn is None or isinstance(pn, int), (
f"Chunk {meta['index']}: paragraph_num={pn!r} (type={type(pn).__name__})"
)
def test_indices_sequential(self, structured_chunks):
"""Chunk indices should be 0, 1, 2, ... in order."""
for i, meta in enumerate(structured_chunks):
assert meta["index"] == i, (
f"Expected index {i}, got {meta['index']}"
)
# =========================================================================
# Edge cases
# =========================================================================
class TestEdgeCases:
def test_numbered_list_not_false_section(self, structured_chunks):
"""Numbered items (1., 2., 3.) inside a § should NOT create new sections."""
for meta in structured_chunks:
section = meta["section"]
# Section should always start with § or be empty
if section:
assert section.startswith("§"), (
f"Unexpected section format: {section!r}"
)
def test_subsection_letters_preserved(self, plain_chunks):
"""Lettered subsections (a, b, c, d, e) in § 312k(2) should be in the text."""
all_text = " ".join(plain_chunks)
# § 312k Abs 2 Nr 1 has a) through e)
for letter in ["a)", "b)", "c)", "d)", "e)"]:
assert letter in all_text, (
f"Subsection letter {letter} from § 312k(2) missing"
)