feat(pipeline): structural metadata end-to-end (Blocks D2-D4)
D2: RAG service stores section/section_title/paragraph/paragraph_num/page from embedding service chunks_with_metadata into Qdrant payloads. D3: Control generator prefers section > article > section_title from Qdrant, adds page to source_citation and generation_metadata. D4: Validated with real BGB §§ 312-312k text. Found and fixed critical bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing only the first chunk per document to have metadata. All subsequent chunks lost section info. Also fixes pre-existing lint issues (unused imports, ambiguous variable names, duplicate dict key, bare except). 456 tests passing (58 embedding + 387 pipeline + 11 rag-service). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,135 @@
|
||||
"""Tests for EmbeddingClient.chunk_text() — ChunkResult with metadata (D2)."""
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from embedding_client import ChunkResult, EmbeddingClient
|
||||
|
||||
|
||||
@pytest.fixture
def client():
    """Provide an EmbeddingClient built against patched settings.

    The fixture yields (rather than returns) from inside the ``patch``
    context so the patched ``embedding_client.settings`` stays active for
    the whole test, not only while the client is being constructed. With a
    plain ``return`` the patch is reverted before the test body runs.
    """
    with patch("embedding_client.settings") as mock_settings:
        mock_settings.EMBEDDING_SERVICE_URL = "http://localhost:8087"
        yield EmbeddingClient()
|
||||
|
||||
|
||||
def _mock_response(json_data: dict, status_code: int = 200):
|
||||
"""Create a mock httpx response (sync methods like .json() and .raise_for_status())."""
|
||||
resp = MagicMock()
|
||||
resp.status_code = status_code
|
||||
resp.json.return_value = json_data
|
||||
return resp
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chunk_text_returns_chunk_result(client):
    """chunk_text returns ChunkResult with both chunks and metadata."""
    mock_json = {
        "chunks": ["chunk1 text", "chunk2 text"],
        "chunks_with_metadata": [
            {
                "text": "chunk1 text",
                "section": "§ 25",
                "section_title": "Informationspflichten",
                "paragraph": "Abs. 1",
                "paragraph_num": 1,
                "page": None,
                "index": 0,
            },
            {
                "text": "chunk2 text",
                "section": "§ 25",
                "section_title": "Informationspflichten",
                "paragraph": "Abs. 2",
                "paragraph_num": 2,
                "page": None,
                "index": 1,
            },
        ],
        "count": 2,
        "strategy": "recursive",
    }

    # Replace httpx.AsyncClient so no real HTTP request is made. The mock is
    # wired as its own async context manager; presumably chunk_text uses
    # ``async with httpx.AsyncClient(...)`` — confirm against the client code.
    with patch("httpx.AsyncClient") as mock_client_cls:
        mock_client = AsyncMock()
        mock_client.post.return_value = _mock_response(mock_json)
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=False)
        mock_client_cls.return_value = mock_client

        # The call must happen while the patch is active.
        result = await client.chunk_text("some legal text")

    assert isinstance(result, ChunkResult)
    assert result.chunks == ["chunk1 text", "chunk2 text"]
    assert len(result.chunks_with_metadata) == 2
    assert result.chunks_with_metadata[0]["section"] == "§ 25"
    assert result.chunks_with_metadata[1]["paragraph"] == "Abs. 2"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chunk_text_without_metadata_field(client):
    """Embedding service response without chunks_with_metadata → empty list."""
    # Response deliberately omits the "chunks_with_metadata" key.
    mock_json = {
        "chunks": ["chunk1"],
        "count": 1,
        "strategy": "semantic",
    }

    # Replace httpx.AsyncClient so no real HTTP request is made; the mock
    # acts as its own async context manager.
    with patch("httpx.AsyncClient") as mock_client_cls:
        mock_client = AsyncMock()
        mock_client.post.return_value = _mock_response(mock_json)
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=False)
        mock_client_cls.return_value = mock_client

        # The call must happen while the patch is active.
        result = await client.chunk_text("text", strategy="semantic")

    assert isinstance(result, ChunkResult)
    assert result.chunks == ["chunk1"]
    assert result.chunks_with_metadata == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chunk_text_with_null_metadata(client):
    """chunks_with_metadata: null in response → empty list."""
    # The key is present but explicitly null (None after JSON decoding).
    mock_json = {
        "chunks": ["chunk1"],
        "chunks_with_metadata": None,
        "count": 1,
        "strategy": "recursive",
    }

    # Replace httpx.AsyncClient so no real HTTP request is made; the mock
    # acts as its own async context manager.
    with patch("httpx.AsyncClient") as mock_client_cls:
        mock_client = AsyncMock()
        mock_client.post.return_value = _mock_response(mock_json)
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=False)
        mock_client_cls.return_value = mock_client

        # The call must happen while the patch is active.
        result = await client.chunk_text("text")

    # Same result-shape checks as the sibling tests, so a null metadata
    # field is also shown not to disturb the plain chunk list.
    assert isinstance(result, ChunkResult)
    assert result.chunks == ["chunk1"]
    assert result.chunks_with_metadata == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chunk_text_empty(client):
    """Empty text → empty chunks and metadata."""
    mock_json = {
        "chunks": [],
        "chunks_with_metadata": [],
        "count": 0,
        "strategy": "recursive",
    }

    # Replace httpx.AsyncClient so no real HTTP request is made; the mock
    # acts as its own async context manager.
    with patch("httpx.AsyncClient") as mock_client_cls:
        mock_client = AsyncMock()
        mock_client.post.return_value = _mock_response(mock_json)
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=False)
        mock_client_cls.return_value = mock_client

        # The call must happen while the patch is active.
        result = await client.chunk_text("")

    assert result.chunks == []
    assert result.chunks_with_metadata == []
|
||||
Reference in New Issue
Block a user