""" D4 Validation: BGB § 312k structural chunking test. Tests that real German legal text is correctly chunked with structural metadata (section, section_title, paragraph, paragraph_num). This is the gate test before re-ingesting all 297 legal sources. """ import os import pytest from main import chunk_text_legal, chunk_text_legal_structured FIXTURE_PATH = os.path.join( os.path.dirname(__file__), "tests", "fixtures", "bgb_312_excerpt.txt" ) # Reasonable defaults for legal text CHUNK_SIZE = 1500 OVERLAP = 100 @pytest.fixture def bgb_text(): with open(FIXTURE_PATH, encoding="utf-8") as f: return f.read() @pytest.fixture def plain_chunks(bgb_text): return chunk_text_legal(bgb_text, CHUNK_SIZE, OVERLAP) @pytest.fixture def structured_chunks(bgb_text): return chunk_text_legal_structured(bgb_text, CHUNK_SIZE, OVERLAP) # ========================================================================= # Basic sanity # ========================================================================= class TestChunkingSanity: def test_fixture_loads(self, bgb_text): assert len(bgb_text) > 2000, "BGB excerpt should be substantial" assert "§ 312k" in bgb_text assert "§ 312 " in bgb_text def test_chunk_count_reasonable(self, plain_chunks): assert 4 <= len(plain_chunks) <= 30, ( f"Expected 4-30 chunks, got {len(plain_chunks)}" ) def test_structured_same_count(self, plain_chunks, structured_chunks): assert len(plain_chunks) == len(structured_chunks) def test_no_empty_chunks(self, plain_chunks): for i, chunk in enumerate(plain_chunks): assert chunk.strip(), f"Chunk {i} is empty" def test_chunk_sizes_reasonable(self, plain_chunks): for i, chunk in enumerate(plain_chunks): assert len(chunk) < 3000, f"Chunk {i} too large: {len(chunk)} chars" assert len(chunk) > 30, f"Chunk {i} too small: {len(chunk)} chars" # ========================================================================= # Section detection # ========================================================================= class TestSectionDetection: def test_all_four_sections_detected(self, structured_chunks): """All 4 BGB sections should appear as section metadata.""" found_sections = set() for meta in structured_chunks: if meta["section"]: found_sections.add(meta["section"]) assert "§ 312" in found_sections or any( s.startswith("§ 312") and s != "§ 312a" and s != "§ 312g" and s != "§ 312k" for s in found_sections ), f"§ 312 not found. Sections: {found_sections}" assert "§ 312a" in found_sections, f"§ 312a not found. Sections: {found_sections}" assert "§ 312g" in found_sections, f"§ 312g not found. Sections: {found_sections}" assert "§ 312k" in found_sections, f"§ 312k not found. Sections: {found_sections}" def test_section_prefix_in_chunks(self, plain_chunks): """Most chunks should have [§ ...] prefix.""" prefixed = sum(1 for c in plain_chunks if c.startswith("[§")) ratio = prefixed / len(plain_chunks) assert ratio >= 0.8, ( f"Only {ratio:.0%} chunks have section prefix (expected >= 80%)" ) def test_312k_has_own_chunk(self, plain_chunks): """§ 312k must appear as a chunk section header, not merged into another §.""" chunks_with_312k = [c for c in plain_chunks if "[§ 312k" in c] assert len(chunks_with_312k) >= 1, ( "§ 312k should have at least 1 dedicated chunk" ) # ========================================================================= # § 312k specific metadata # ========================================================================= class TestSection312k: def _312k_chunks(self, structured_chunks): return [m for m in structured_chunks if m["section"] == "§ 312k"] def test_312k_section_metadata(self, structured_chunks): """§ 312k chunks should have section='§ 312k' with a title.""" chunks = self._312k_chunks(structured_chunks) assert len(chunks) >= 1, "No chunks with section='§ 312k'" for meta in chunks: assert meta["section"] == "§ 312k" # Title should contain key words title = meta["section_title"].lower() assert "kuendigung" in title or "verbrauchervertrae" in title, ( f"Unexpected section_title: {meta['section_title']}" ) def test_312k_paragraph_extraction(self, structured_chunks): """At least some § 312k chunks should have paragraph references.""" chunks = self._312k_chunks(structured_chunks) paragraphs_found = [m["paragraph"] for m in chunks if m["paragraph"]] # § 312k has (1) through (6), at least some should be detected assert len(paragraphs_found) >= 1, ( "No paragraph references found in § 312k chunks" ) def test_312k_content_present(self, structured_chunks): """§ 312k chunk text should contain key legal terms.""" chunks = self._312k_chunks(structured_chunks) all_text = " ".join(m["text"] for m in chunks) assert "Kuendigungsschaltflaeche" in all_text or "kuendigen" in all_text.lower() assert "Webseite" in all_text or "elektronischen" in all_text def test_312k_not_merged_with_312g(self, structured_chunks): """§ 312k and § 312g should be separate sections, not merged.""" sections_312g = [m for m in structured_chunks if m["section"] == "§ 312g"] sections_312k = self._312k_chunks(structured_chunks) assert len(sections_312g) >= 1, "§ 312g missing" assert len(sections_312k) >= 1, "§ 312k missing" # Verify they are different chunks (no overlap in indices) g_indices = {m["index"] for m in sections_312g} k_indices = {m["index"] for m in sections_312k} assert g_indices.isdisjoint(k_indices), ( f"§ 312g and § 312k share chunk indices: {g_indices & k_indices}" ) # ========================================================================= # Metadata quality across all sections # ========================================================================= class TestMetadataQuality: def test_most_chunks_have_section(self, structured_chunks): """At least 90% of chunks should have a section reference.""" with_section = sum(1 for m in structured_chunks if m["section"]) ratio = with_section / len(structured_chunks) assert ratio >= 0.9, ( f"Only {ratio:.0%} chunks have section metadata (expected >= 90%)" ) def test_section_titles_not_empty(self, structured_chunks): """Chunks with a section should also have a section_title.""" for meta in structured_chunks: if meta["section"]: assert meta["section_title"], ( f"Chunk {meta['index']} has section={meta['section']} but no title" ) def test_paragraph_nums_are_integers(self, structured_chunks): """paragraph_num should be int or None, never str.""" for meta in structured_chunks: pn = meta["paragraph_num"] assert pn is None or isinstance(pn, int), ( f"Chunk {meta['index']}: paragraph_num={pn!r} (type={type(pn).__name__})" ) def test_indices_sequential(self, structured_chunks): """Chunk indices should be 0, 1, 2, ... in order.""" for i, meta in enumerate(structured_chunks): assert meta["index"] == i, ( f"Expected index {i}, got {meta['index']}" ) # ========================================================================= # Edge cases # ========================================================================= class TestEdgeCases: def test_numbered_list_not_false_section(self, structured_chunks): """Numbered items (1., 2., 3.) inside a § should NOT create new sections.""" for meta in structured_chunks: section = meta["section"] # Section should always start with § or be empty if section: assert section.startswith("§"), ( f"Unexpected section format: {section!r}" ) def test_subsection_letters_preserved(self, plain_chunks): """Lettered subsections (a, b, c, d, e) in § 312k(2) should be in the text.""" all_text = " ".join(plain_chunks) # § 312k Abs 2 Nr 1 has a) through e) for letter in ["a)", "b)", "c)", "d)", "e)"]: assert letter in all_text, ( f"Subsection letter {letter} from § 312k(2) missing" )