# breakpilot-core/embedding-service/test_d4_bgb.py

"""
D4 Validation: BGB § 312k structural chunking test.

Tests that real German legal text is correctly chunked with structural
metadata (section, section_title, paragraph, paragraph_num).
This is the gate test before re-ingesting all 297 legal sources.
"""

import os
import pytest

from main import chunk_text_legal, chunk_text_legal_structured

# Location of the BGB excerpt used by every test here, resolved relative to
# the directory that contains this module.
FIXTURE_PATH = os.path.join(
    os.path.dirname(__file__),
    "tests",
    "fixtures",
    "bgb_312_excerpt.txt",
)

# Reasonable defaults for legal text; both values are handed unchanged to the
# chunkers as their second and third positional arguments.
# NOTE(review): units are presumably characters — confirm against the chunker.
CHUNK_SIZE, OVERLAP = 1500, 100


@pytest.fixture
def bgb_text():
    """Return the whole BGB excerpt at ``FIXTURE_PATH`` as one UTF-8 decoded string."""
    with open(FIXTURE_PATH, encoding="utf-8") as fixture_file:
        contents = fixture_file.read()
    return contents


@pytest.fixture
def plain_chunks(bgb_text):
    """Run ``chunk_text_legal`` over the excerpt with the module-level defaults.

    The tests in this module treat the result as a sequence of strings.
    """
    chunks = chunk_text_legal(bgb_text, CHUNK_SIZE, OVERLAP)
    return chunks


@pytest.fixture
def structured_chunks(bgb_text):
    """Run ``chunk_text_legal_structured`` over the excerpt with the module-level defaults.

    The tests in this module treat the result as a sequence of dicts and read
    the keys ``index``, ``section``, ``section_title``, ``paragraph``,
    ``paragraph_num`` and ``text`` from each entry.
    """
    chunk_metadata = chunk_text_legal_structured(bgb_text, CHUNK_SIZE, OVERLAP)
    return chunk_metadata


# =========================================================================
# Basic sanity
# =========================================================================

class TestChunkingSanity:

    def test_fixture_loads(self, bgb_text):
        assert len(bgb_text) > 2000, "BGB excerpt should be substantial"
        assert "§ 312k" in bgb_text
        assert "§ 312 " in bgb_text

    def test_chunk_count_reasonable(self, plain_chunks):
        assert 4 <= len(plain_chunks) <= 30, (
            f"Expected 4-30 chunks, got {len(plain_chunks)}"
        )

    def test_structured_same_count(self, plain_chunks, structured_chunks):
        assert len(plain_chunks) == len(structured_chunks)

    def test_no_empty_chunks(self, plain_chunks):
        for i, chunk in enumerate(plain_chunks):
            assert chunk.strip(), f"Chunk {i} is empty"

    def test_chunk_sizes_reasonable(self, plain_chunks):
        for i, chunk in enumerate(plain_chunks):
            assert len(chunk) < 3000, f"Chunk {i} too large: {len(chunk)} chars"
            assert len(chunk) > 30, f"Chunk {i} too small: {len(chunk)} chars"


# =========================================================================
# Section detection
# =========================================================================

class TestSectionDetection:

    def test_all_four_sections_detected(self, structured_chunks):
        """All 4 BGB sections should appear as section metadata."""
        found_sections = set()
        for meta in structured_chunks:
            if meta["section"]:
                found_sections.add(meta["section"])

        assert "§ 312" in found_sections or any(
            s.startswith("§ 312") and s != "§ 312a" and s != "§ 312g" and s != "§ 312k"
            for s in found_sections
        ), f"§ 312 not found. Sections: {found_sections}"
        assert "§ 312a" in found_sections, f"§ 312a not found. Sections: {found_sections}"
        assert "§ 312g" in found_sections, f"§ 312g not found. Sections: {found_sections}"
        assert "§ 312k" in found_sections, f"§ 312k not found. Sections: {found_sections}"

    def test_section_prefix_in_chunks(self, plain_chunks):
        """Most chunks should have [§ ...] prefix."""
        prefixed = sum(1 for c in plain_chunks if c.startswith("[§"))
        ratio = prefixed / len(plain_chunks)
        assert ratio >= 0.8, (
            f"Only {ratio:.0%} chunks have section prefix (expected >= 80%)"
        )

    def test_312k_has_own_chunk(self, plain_chunks):
        """§ 312k must appear as a chunk section header, not merged into another §."""
        chunks_with_312k = [c for c in plain_chunks if "[§ 312k" in c]
        assert len(chunks_with_312k) >= 1, (
            "§ 312k should have at least 1 dedicated chunk"
        )


# =========================================================================
# § 312k specific metadata
# =========================================================================

class TestSection312k:

    def _312k_chunks(self, structured_chunks):
        return [m for m in structured_chunks if m["section"] == "§ 312k"]

    def test_312k_section_metadata(self, structured_chunks):
        """§ 312k chunks should have section='§ 312k' with a title."""
        chunks = self._312k_chunks(structured_chunks)
        assert len(chunks) >= 1, "No chunks with section='§ 312k'"
        for meta in chunks:
            assert meta["section"] == "§ 312k"
            # Title should contain key words
            title = meta["section_title"].lower()
            assert "kuendigung" in title or "verbrauchervertrae" in title, (
                f"Unexpected section_title: {meta['section_title']}"
            )

    def test_312k_paragraph_extraction(self, structured_chunks):
        """At least some § 312k chunks should have paragraph references."""
        chunks = self._312k_chunks(structured_chunks)
        paragraphs_found = [m["paragraph"] for m in chunks if m["paragraph"]]
        # § 312k has (1) through (6), at least some should be detected
        assert len(paragraphs_found) >= 1, (
            "No paragraph references found in § 312k chunks"
        )

    def test_312k_content_present(self, structured_chunks):
        """§ 312k chunk text should contain key legal terms."""
        chunks = self._312k_chunks(structured_chunks)
        all_text = " ".join(m["text"] for m in chunks)
        assert "Kuendigungsschaltflaeche" in all_text or "kuendigen" in all_text.lower()
        assert "Webseite" in all_text or "elektronischen" in all_text

    def test_312k_not_merged_with_312g(self, structured_chunks):
        """§ 312k and § 312g should be separate sections, not merged."""
        sections_312g = [m for m in structured_chunks if m["section"] == "§ 312g"]
        sections_312k = self._312k_chunks(structured_chunks)
        assert len(sections_312g) >= 1, "§ 312g missing"
        assert len(sections_312k) >= 1, "§ 312k missing"
        # Verify they are different chunks (no overlap in indices)
        g_indices = {m["index"] for m in sections_312g}
        k_indices = {m["index"] for m in sections_312k}
        assert g_indices.isdisjoint(k_indices), (
            f"§ 312g and § 312k share chunk indices: {g_indices & k_indices}"
        )


# =========================================================================
# Metadata quality across all sections
# =========================================================================

class TestMetadataQuality:

    def test_most_chunks_have_section(self, structured_chunks):
        """At least 90% of chunks should have a section reference."""
        with_section = sum(1 for m in structured_chunks if m["section"])
        ratio = with_section / len(structured_chunks)
        assert ratio >= 0.9, (
            f"Only {ratio:.0%} chunks have section metadata (expected >= 90%)"
        )

    def test_section_titles_not_empty(self, structured_chunks):
        """Chunks with a section should also have a section_title."""
        for meta in structured_chunks:
            if meta["section"]:
                assert meta["section_title"], (
                    f"Chunk {meta['index']} has section={meta['section']} but no title"
                )

    def test_paragraph_nums_are_integers(self, structured_chunks):
        """paragraph_num should be int or None, never str."""
        for meta in structured_chunks:
            pn = meta["paragraph_num"]
            assert pn is None or isinstance(pn, int), (
                f"Chunk {meta['index']}: paragraph_num={pn!r} (type={type(pn).__name__})"
            )

    def test_indices_sequential(self, structured_chunks):
        """Chunk indices should be 0, 1, 2, ... in order."""
        for i, meta in enumerate(structured_chunks):
            assert meta["index"] == i, (
                f"Expected index {i}, got {meta['index']}"
            )


# =========================================================================
# Edge cases
# =========================================================================

class TestEdgeCases:

    def test_numbered_list_not_false_section(self, structured_chunks):
        """Numbered items (1., 2., 3.) inside a § should NOT create new sections."""
        for meta in structured_chunks:
            section = meta["section"]
            # Section should always start with § or be empty
            if section:
                assert section.startswith("§"), (
                    f"Unexpected section format: {section!r}"
                )

    def test_subsection_letters_preserved(self, plain_chunks):
        """Lettered subsections (a, b, c, d, e) in § 312k(2) should be in the text."""
        all_text = " ".join(plain_chunks)
        # § 312k Abs 2 Nr 1 has a) through e)
        for letter in ["a)", "b)", "c)", "d)", "e)"]:
            assert letter in all_text, (
                f"Subsection letter {letter} from § 312k(2) missing"
            )