"""Tests for document upload payload building — structural metadata (D2).""" # Mirror the constant from api/documents.py to avoid heavy import chain # (api → jose, qdrant_client, minio, etc.) _STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page") def _build_payload( chunk: str, index: int, chunks_meta: list[dict], extra_metadata: "dict | None" = None, ) -> dict: """Replicate the payload-building logic from documents.py for unit testing.""" payload = { "document_id": "test-doc-id", "object_name": "test/path.pdf", "filename": "path.pdf", "chunk_index": index, "chunk_text": chunk, "data_type": "law", "bundesland": "bund", "use_case": "compliance", "year": "2026", **(extra_metadata or {}), } if index < len(chunks_meta): meta = chunks_meta[index] for field in _STRUCT_FIELDS: value = meta.get(field) if value is not None and value != "": payload[field] = value return payload class TestPayloadStructuralMetadata: """Tests for structural metadata merging into Qdrant payloads.""" def test_payload_contains_structural_metadata(self): """Metadata fields from chunks_with_metadata land in the payload.""" meta = [ { "text": "chunk text", "section": "§ 312k", "section_title": "Kuendigungsbutton", "paragraph": "Abs. 1", "paragraph_num": 1, "page": 847, "index": 0, } ] payload = _build_payload("chunk text", 0, meta) assert payload["section"] == "§ 312k" assert payload["section_title"] == "Kuendigungsbutton" assert payload["paragraph"] == "Abs. 1" assert payload["paragraph_num"] == 1 assert payload["page"] == 847 def test_payload_without_metadata_backwards_compat(self): """Empty metadata list → payload has no structural fields.""" payload = _build_payload("chunk text", 0, []) for field in _STRUCT_FIELDS: assert field not in payload def test_payload_skips_empty_values(self): """Empty string and None values are NOT added to payload.""" meta = [ { "text": "chunk text", "section": "", "section_title": "", "paragraph": "", "paragraph_num": None, "page": None, "index": 0, } ] payload = _build_payload("chunk text", 0, meta) for field in _STRUCT_FIELDS: assert field not in payload def test_metadata_overrides_extra_metadata(self): """Auto-extracted metadata takes precedence over manual extra_metadata.""" meta = [ { "text": "chunk text", "section": "§ 25", "section_title": "", "paragraph": "", "paragraph_num": None, "page": None, "index": 0, } ] extra = {"section": "manual-value"} payload = _build_payload("chunk text", 0, meta, extra_metadata=extra) assert payload["section"] == "§ 25" def test_partial_metadata_alignment(self): """3 chunks but only 2 metadata entries → third payload has no structural fields.""" meta = [ { "text": "c1", "section": "§ 1", "section_title": "", "paragraph": "", "paragraph_num": None, "page": None, "index": 0, }, { "text": "c2", "section": "§ 2", "section_title": "", "paragraph": "", "paragraph_num": None, "page": None, "index": 1, }, ] p0 = _build_payload("c1", 0, meta) p1 = _build_payload("c2", 1, meta) p2 = _build_payload("c3", 2, meta) assert p0["section"] == "§ 1" assert p1["section"] == "§ 2" assert "section" not in p2 def test_zero_paragraph_num_is_kept(self): """paragraph_num=0 is a valid value and should be stored.""" meta = [ { "text": "chunk", "section": "", "section_title": "", "paragraph": "", "paragraph_num": 0, "page": None, "index": 0, } ] payload = _build_payload("chunk", 0, meta) # 0 is not None and not "" → should be stored assert payload["paragraph_num"] == 0 def test_page_zero_is_kept(self): """page=0 is a valid value (first page) and should be stored.""" meta = [ { "text": "chunk", "section": "", "section_title": "", "paragraph": "", "paragraph_num": None, "page": 0, "index": 0, } ] payload = _build_payload("chunk", 0, meta) assert payload["page"] == 0