feat(pipeline): structural metadata end-to-end (Blocks D2-D4)

D2: RAG service stores section/section_title/paragraph/paragraph_num/page
from embedding service chunks_with_metadata into Qdrant payloads.

D3: Control generator prefers section > article > section_title from
Qdrant, adds page to source_citation and generation_metadata.

D4: Validated with real BGB §§ 312-312k text. Found and fixed critical
bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing
only the first chunk per document to have metadata. All subsequent
chunks lost section info.

Also fixes pre-existing lint issues (unused imports, ambiguous variable
names, duplicate dict key, bare except).

456 tests passing (58 embedding + 387 pipeline + 11 rag-service).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-01 20:34:00 +02:00
parent da21339e76
commit 93099b2770
15 changed files with 1086 additions and 25 deletions
@@ -0,0 +1,268 @@
#!/usr/bin/env python3
"""
D4 Integration Test: Upload BGB excerpt → verify Qdrant payloads.
Usage:
# Dry-run (local chunking only, no services needed)
python3 scripts/test_d4_integration.py --dry-run
# Against Mac Mini
python3 scripts/test_d4_integration.py \
--rag-url https://macmini:8097 \
--qdrant-url http://macmini:6333
# Against production
python3 scripts/test_d4_integration.py \
--rag-url https://rag-prod:8097 \
--qdrant-url http://qdrant-prod:6333
"""
import argparse
import json
import os
import sys
import time
import httpx
# Path to the BGB fixture shared with the embedding-service test suite.
# Assumes this script sits two directory levels below the repo root
# (e.g. <root>/<proj>/scripts/) — TODO confirm against actual layout.
FIXTURE_PATH = os.path.join(
    os.path.dirname(__file__), "..", "..", "embedding-service",
    "tests", "fixtures", "bgb_312_excerpt.txt",
)
# Target Qdrant collection and a distinctive synthetic regulation code so
# test chunks are easy to identify and clean up afterwards.
COLLECTION = "bp_compliance_gesetze"
REG_CODE = "BGB_D4_TEST"
# Expected sections in the BGB excerpt
EXPECTED_SECTIONS = {"§ 312", "§ 312a", "§ 312g", "§ 312k"}
def load_fixture() -> str:
    """Read the BGB excerpt fixture and return its full text."""
    with open(FIXTURE_PATH, encoding="utf-8") as handle:
        content = handle.read()
    return content
def upload_document(rag_url: str, text: str) -> dict:
    """Upload BGB excerpt to RAG service.

    Posts the fixture as a multipart upload and returns the parsed JSON
    response (contains document_id, chunks_count, vectors_indexed).
    """
    # Document-level metadata, forwarded verbatim to the service.
    doc_meta = {
        "regulation_code": REG_CODE,
        "regulation_name_de": "BGB (D4 Test)",
        "source_type": "law",
    }
    form_fields = {
        "collection": COLLECTION,
        "data_type": "law",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        "chunk_strategy": "recursive",
        "chunk_size": "1500",
        "chunk_overlap": "100",
        "metadata_json": json.dumps(doc_meta),
    }
    upload_files = {"file": ("bgb_312_test.txt", text.encode(), "text/plain")}
    # NOTE(review): verify=False disables TLS verification — presumably
    # because test hosts use self-signed certs; confirm before reuse.
    with httpx.Client(timeout=60.0, verify=False) as client:
        response = client.post(
            f"{rag_url}/api/v1/documents/upload",
            files=upload_files,
            data=form_fields,
        )
        response.raise_for_status()
        return response.json()
def scroll_chunks(qdrant_url: str, document_id: str) -> list[dict]:
    """Scroll Qdrant for chunks matching this document_id.

    Pages through the scroll API until ``next_page_offset`` is null.

    Args:
        qdrant_url: Base URL of the Qdrant instance.
        document_id: Value of the ``document_id`` payload field to filter on.

    Returns:
        All matching points with payloads included and vectors excluded.
    """
    all_points: list[dict] = []
    offset = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            body: dict = {
                "limit": 100,
                "with_payload": True,
                "with_vector": False,
                "filter": {
                    "must": [{
                        "key": "document_id",
                        "match": {"value": document_id},
                    }]
                },
            }
            # BUG FIX: compare against None instead of truthiness — Qdrant can
            # return a falsy-but-valid offset (e.g. integer point id 0), which
            # a bare `if offset:` would misinterpret as end-of-scroll and drop
            # the remaining pages.
            if offset is not None:
                body["offset"] = offset
            resp = client.post(
                f"{qdrant_url}/collections/{COLLECTION}/points/scroll",
                json=body,
            )
            resp.raise_for_status()
            data = resp.json()["result"]
            all_points.extend(data["points"])
            offset = data.get("next_page_offset")
            if offset is None:
                break
    return all_points
def delete_test_data(qdrant_url: str, document_id: str):
    """Clean up test chunks from Qdrant."""
    # Delete every point whose payload carries our test document_id.
    delete_filter = {
        "must": [{
            "key": "document_id",
            "match": {"value": document_id},
        }]
    }
    with httpx.Client(timeout=30.0) as client:
        response = client.post(
            f"{qdrant_url}/collections/{COLLECTION}/points/delete",
            json={"filter": delete_filter},
        )
        response.raise_for_status()
def verify_chunks(points: list[dict]) -> dict:
    """Analyze chunks and return a verification report.

    Counts metadata coverage (section/paragraph/page), collects per-chunk
    details, and records human-readable issues when expectations are missed.
    Note: ``sections_found`` is a set, so the report is not JSON-serializable
    as-is.
    """
    report = {
        "total_chunks": len(points),
        "sections_found": set(),
        "chunks_with_section": 0,
        "chunks_with_paragraph": 0,
        "chunks_with_page": 0,
        "section_details": [],
        "issues": [],
    }
    for point in points:
        payload = point.get("payload", {})
        section = payload.get("section", "")
        paragraph = payload.get("paragraph", "")
        page = payload.get("page")
        if section:
            report["sections_found"].add(section)
            report["chunks_with_section"] += 1
        if paragraph:
            report["chunks_with_paragraph"] += 1
        if page is not None:
            report["chunks_with_page"] += 1
        report["section_details"].append({
            "chunk_index": payload.get("chunk_index", "?"),
            "section": section,
            "section_title": payload.get("section_title", "")[:40],
            "paragraph": paragraph,
            "paragraph_num": payload.get("paragraph_num"),
            "page": page,
            "text_preview": payload.get("chunk_text", "")[:60],
        })
    # Checks
    missing = EXPECTED_SECTIONS - report["sections_found"]
    if missing:
        report["issues"].append(f"Missing sections: {missing}")
    if "§ 312k" not in report["sections_found"]:
        report["issues"].append("CRITICAL: § 312k not found!")
    section_ratio = report["chunks_with_section"] / max(report["total_chunks"], 1)
    if section_ratio < 0.9:
        report["issues"].append(
            f"Only {section_ratio:.0%} chunks have section metadata (expected >= 90%)"
        )
    return report
def print_report(report: dict):
    """Pretty-print the verification report produced by verify_chunks()."""
    print("\n" + "=" * 60)
    print("D4 VALIDATION REPORT")
    print("=" * 60)
    print(f"Total chunks: {report['total_chunks']}")
    print(f"With section: {report['chunks_with_section']}")
    print(f"With paragraph: {report['chunks_with_paragraph']}")
    print(f"With page: {report['chunks_with_page']}")
    print(f"Sections found: {sorted(report['sections_found'])}")
    print("\nChunk details:")
    details = sorted(report["section_details"], key=lambda item: item["chunk_index"])
    for d in details:
        line = (
            f"  [{d['chunk_index']:2}] "
            f"section={d['section']!r:12s} "
            f"title={d['section_title']!r:30s} "
            f"para={d['paragraph']!r:8s}"
        )
        print(line)
    issues = report["issues"]
    if not issues:
        print("\nRESULT: PASS — all sections detected, metadata quality OK")
        return
    print(f"\nISSUES ({len(issues)}):")
    for issue in issues:
        print(f"  - {issue}")
    print("\nRESULT: FAIL")
def main():
    """CLI entry point for the D4 integration test.

    Flow: load the fixture, then either chunk locally (--dry-run) or
    upload to the RAG service, scroll Qdrant for the stored chunks,
    verify their structural metadata, and optionally clean up.
    Exits non-zero when the verification report contains issues.
    """
    parser = argparse.ArgumentParser(description="D4 Integration Test")
    parser.add_argument("--rag-url", default="https://macmini:8097")
    parser.add_argument("--qdrant-url", default="http://macmini:6333")
    parser.add_argument("--dry-run", action="store_true",
                        help="Only test local chunking, no upload")
    parser.add_argument("--keep", action="store_true",
                        help="Don't delete test data after verification")
    args = parser.parse_args()
    text = load_fixture()
    print(f"Loaded BGB excerpt: {len(text)} chars")
    if args.dry_run:
        # Import chunking directly
        # (reaches into the embedding-service source tree; no services needed)
        sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "embedding-service"))
        from main import chunk_text_legal_structured
        chunks = chunk_text_legal_structured(text, 1500, 100)
        # Build fake points for verification
        # (mimics the Qdrant payload shape so verify_chunks() can be reused)
        points = [{"payload": {
            "chunk_index": c["index"],
            "chunk_text": c["text"],
            "section": c["section"],
            "section_title": c["section_title"],
            "paragraph": c["paragraph"],
            "paragraph_num": c["paragraph_num"],
            "page": c["page"],
        }} for c in chunks]
        report = verify_chunks(points)
        print_report(report)
        sys.exit(1 if report["issues"] else 0)
    # Full integration test
    print(f"Uploading to {args.rag_url} → collection={COLLECTION}...")
    result = upload_document(args.rag_url, text)
    doc_id = result["document_id"]
    print(f"  document_id: {doc_id}")
    print(f"  chunks_count: {result['chunks_count']}")
    print(f"  vectors_indexed: {result['vectors_indexed']}")
    # Give Qdrant a moment to finish indexing before scrolling.
    print("Waiting 2s for indexing...")
    time.sleep(2)
    print(f"Scrolling Qdrant at {args.qdrant_url}...")
    points = scroll_chunks(args.qdrant_url, doc_id)
    print(f"  Found {len(points)} points")
    report = verify_chunks(points)
    print_report(report)
    if not args.keep:
        print(f"\nCleaning up test data (document_id={doc_id})...")
        delete_test_data(args.qdrant_url, doc_id)
        print("  Deleted.")
    sys.exit(1 if report["issues"] else 0)
if __name__ == "__main__":
    main()
+13 -7
View File
@@ -25,8 +25,7 @@ import re
import uuid
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional
import httpx
from pydantic import BaseModel
@@ -34,7 +33,7 @@ from sqlalchemy import text
from sqlalchemy.orm import Session
from .rag_client import ComplianceRAGClient, RAGSearchResult, get_rag_client
from .similarity_detector import check_similarity, SimilarityReport
from .similarity_detector import check_similarity
logger = logging.getLogger(__name__)
@@ -1019,11 +1018,12 @@ class ControlGeneratorPipeline:
regulation_name=reg_name,
regulation_short=reg_short,
category=payload.get("category", "") or payload.get("data_type", ""),
article=payload.get("article", "") or payload.get("section_title", "") or payload.get("section", ""),
article=payload.get("section", "") or payload.get("article", "") or payload.get("section_title", ""),
paragraph=payload.get("paragraph", ""),
source_url=payload.get("source_url", "") or payload.get("source", "") or payload.get("url", ""),
score=0.0,
collection=collection,
page=payload.get("page"),
)
all_results.append(chunk)
collection_new += 1
@@ -1127,6 +1127,7 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"page": chunk.page,
"license": license_info.get("license", ""),
"source_type": license_info.get("source_type", "law"),
"url": chunk.source_url or "",
@@ -1141,6 +1142,7 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
"source_regulation": chunk.regulation_code,
"source_article": effective_article,
"source_paragraph": effective_paragraph,
"source_page": chunk.page,
}
return control
@@ -1194,6 +1196,7 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"page": chunk.page,
"license": license_info.get("license", ""),
"license_notice": attribution,
"source_type": license_info.get("source_type", "standard"),
@@ -1209,6 +1212,7 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
"source_regulation": chunk.regulation_code,
"source_article": effective_article,
"source_paragraph": effective_paragraph,
"source_page": chunk.page,
}
return control
@@ -1368,6 +1372,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne A
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"page": chunk.page,
"license": lic.get("license", ""),
"license_notice": lic.get("attribution", ""),
"source_type": lic.get("source_type", "law"),
@@ -1384,6 +1389,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne A
"source_regulation": chunk.regulation_code,
"source_article": effective_article,
"source_paragraph": effective_paragraph,
"source_page": chunk.page,
"batch_size": len(chunks),
"document_grouped": same_doc,
}
@@ -1479,14 +1485,14 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
) -> list[Optional[GeneratedControl]]:
"""Process a batch of (chunk, license_info) through stages 3-5."""
# Split by license rule: Rule 1+2 → structure, Rule 3 → reform
structure_items = [(c, l) for c, l in batch_items if l["rule"] in (1, 2)]
reform_items = [(c, l) for c, l in batch_items if l["rule"] == 3]
structure_items = [(c, lic) for c, lic in batch_items if lic["rule"] in (1, 2)]
reform_items = [(c, lic) for c, lic in batch_items if lic["rule"] == 3]
all_controls: dict[int, Optional[GeneratedControl]] = {}
if structure_items:
s_chunks = [c for c, _ in structure_items]
s_lics = [l for _, l in structure_items]
s_lics = [lic for _, lic in structure_items]
try:
s_controls = await self._structure_batch(s_chunks, s_lics)
except Exception as e:
@@ -24,7 +24,6 @@ import json
import logging
import os
import re
import uuid
from dataclasses import dataclass, field
from typing import Optional
@@ -56,7 +55,7 @@ ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
# Patterns are defined in normative_patterns.py and imported here
# with local aliases for backward compatibility.
from .normative_patterns import (
from .normative_patterns import ( # noqa: E402
PFLICHT_RE as _PFLICHT_RE,
EMPFEHLUNG_RE as _EMPFEHLUNG_RE,
KANN_RE as _KANN_RE,
@@ -3472,7 +3471,7 @@ class DecompositionPass:
"category": atomic.category,
"parent_uuid": parent_uuid,
"gen_meta": json.dumps({
"decomposition_source": candidate_id,
"decomposition_source_id": candidate_id,
"decomposition_method": "pass0b",
"engine_version": "v2",
"action_object_class": getattr(atomic, "domain", ""),
@@ -4104,6 +4103,8 @@ def _format_citation(citation) -> str:
parts.append(c["article"])
if c.get("paragraph"):
parts.append(c["paragraph"])
if c.get("page") is not None:
parts.append(f"S. {c['page']}")
return " ".join(parts) if parts else citation
except (json.JSONDecodeError, TypeError):
return citation
+1
View File
@@ -34,6 +34,7 @@ class RAGSearchResult:
source_url: str
score: float
collection: str = ""
page: Optional[int] = None
class ComplianceRAGClient:
+166
View File
@@ -0,0 +1,166 @@
"""Tests for D3: Structural metadata flow (section priority, page in citation)."""
import json
from typing import Optional
from services.rag_client import RAGSearchResult
def _make_chunk(
    article: str = "",
    paragraph: str = "",
    page: Optional[int] = None,
) -> RAGSearchResult:
    """Build a RAGSearchResult with fixed defaults and overridable fields."""
    fields = {
        "text": "Test chunk text",
        "regulation_code": "DSGVO",
        "regulation_name": "Datenschutz-Grundverordnung",
        "regulation_short": "DSGVO",
        "category": "data_protection",
        "article": article,
        "paragraph": paragraph,
        "source_url": "https://example.com",
        "score": 0.95,
        "collection": "bp_compliance_de",
        "page": page,
    }
    return RAGSearchResult(**fields)
class TestRAGSearchResultPage:
    """RAGSearchResult now carries a page field."""

    def test_page_default_none(self):
        # Default construction leaves page unset.
        assert _make_chunk().page is None

    def test_page_set(self):
        result = _make_chunk(page=42)
        assert result.page == 42

    def test_page_zero(self):
        # 0 is a valid page number and must not be coerced to None.
        result = _make_chunk(page=0)
        assert result.page == 0
class TestQdrantPayloadPriority:
    """section (D2) should take priority over article (legacy)."""

    @staticmethod
    def _resolve(payload: dict) -> str:
        """Mirror of the control-generator chain: section > article > section_title."""
        return payload.get("section", "") or payload.get("article", "") or payload.get("section_title", "")

    def test_section_preferred_over_article(self):
        payload = {"section": "§ 312k", "article": "Art. 312", "section_title": "Kuendigungsbutton"}
        assert self._resolve(payload) == "§ 312k"

    def test_article_fallback_when_no_section(self):
        payload = {"section": "", "article": "Art. 35", "section_title": ""}
        assert self._resolve(payload) == "Art. 35"

    def test_section_title_last_resort(self):
        payload = {"section": "", "article": "", "section_title": "Informationspflichten"}
        assert self._resolve(payload) == "Informationspflichten"

    def test_all_empty(self):
        payload = {"section": "", "article": "", "section_title": ""}
        assert self._resolve(payload) == ""

    def test_page_from_payload(self):
        assert {"page": 847}.get("page") == 847

    def test_page_none_from_payload(self):
        assert {}.get("page") is None
class TestSourceCitationPage:
    """source_citation dict should include page when available."""

    def _build_citation(self, chunk: RAGSearchResult) -> dict:
        """Mirrors the citation-building logic from control_generator.py."""
        citation = {
            "source": chunk.regulation_name,
            "article": chunk.article,
            "paragraph": chunk.paragraph,
            "page": chunk.page,
            "license": "free_use",
            "source_type": "law",
            "url": chunk.source_url or "",
        }
        return citation

    def test_citation_with_page(self):
        source = _make_chunk(article="§ 312k", paragraph="Abs. 1", page=847)
        assert self._build_citation(source)["page"] == 847

    def test_citation_without_page(self):
        source = _make_chunk(article="§ 312k", paragraph="Abs. 1")
        assert self._build_citation(source)["page"] is None

    def test_citation_serializable(self):
        # page must survive a JSON round-trip (citations are persisted as JSON).
        source = _make_chunk(article="Art. 35", page=12)
        restored = json.loads(json.dumps(self._build_citation(source)))
        assert restored["page"] == 12
class TestFormatCitation:
    """_format_citation should include page number."""

    def _format_citation(self, citation) -> str:
        """Mirrors _format_citation from decomposition_pass.py."""
        # Guard clauses replace the original nested structure; the observable
        # behavior is identical for every input class.
        if not citation:
            return ""
        if not isinstance(citation, str):
            return str(citation)
        try:
            c = json.loads(citation)
        except (json.JSONDecodeError, TypeError):
            return citation
        if not isinstance(c, dict):
            return str(citation)
        parts = []
        for key in ("source", "article", "paragraph"):
            if c.get(key):
                parts.append(c[key])
        if c.get("page") is not None:
            parts.append(f"S. {c['page']}")
        return " ".join(parts) if parts else citation

    def test_format_with_page(self):
        payload = {"source": "DSGVO", "article": "Art. 35", "paragraph": "Abs. 1", "page": 42}
        assert self._format_citation(json.dumps(payload)) == "DSGVO Art. 35 Abs. 1 S. 42"

    def test_format_without_page(self):
        payload = {"source": "BGB", "article": "§ 312k", "paragraph": ""}
        assert self._format_citation(json.dumps(payload)) == "BGB § 312k"

    def test_format_page_zero(self):
        # page 0 is falsy but valid — the `is not None` check must keep it.
        payload = {"source": "BGB", "article": "§ 1", "paragraph": "", "page": 0}
        assert self._format_citation(json.dumps(payload)) == "BGB § 1 S. 0"

    def test_format_empty_citation(self):
        assert self._format_citation("") == ""
        assert self._format_citation(None) == ""
+12 -9
View File
@@ -10,8 +10,8 @@ Provides REST endpoints for:
This service handles all ML-heavy operations, keeping the main klausur-service lightweight.
"""
import os
import logging
import re
from typing import List, Optional
from contextlib import asynccontextmanager
@@ -282,8 +282,6 @@ ENGLISH_ABBREVIATIONS = {
ALL_ABBREVIATIONS = GERMAN_ABBREVIATIONS | ENGLISH_ABBREVIATIONS
# Regex pattern for legal section headers (§, Art., Article, Section, etc.)
import re
_LEGAL_SECTION_RE = re.compile(
r'^(?:'
r'§\s*\d+' # § 25, § 5a
@@ -411,8 +409,6 @@ def _parse_section_metadata(header: str) -> dict:
# Find which group matched
for i, g in enumerate(m.groups(), 1):
if g:
# Reconstruct the section reference
prefix = header[:m.start()].strip()
section = header[m.start():m.end()].strip()
break
@@ -577,7 +573,14 @@ def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
if space_idx > 0:
overlap_text = overlap_text[space_idx + 1:]
if overlap_text:
chunk = overlap_text + ' ' + chunk
# Insert overlap AFTER the [§ ...] prefix to preserve it
# for structured metadata extraction
prefix_match = re.match(r'\[.+?\]\s*', chunk)
if prefix_match:
pos = prefix_match.end()
chunk = chunk[:pos] + overlap_text + ' ' + chunk[pos:]
else:
chunk = overlap_text + ' ' + chunk
final_chunks.append(chunk.strip())
return [c for c in final_chunks if c]
@@ -742,13 +745,13 @@ def detect_pdf_backends() -> List[str]:
available = []
try:
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.pdf import partition_pdf # noqa: F401
available.append("unstructured")
except ImportError:
pass
try:
from pypdf import PdfReader
from pypdf import PdfReader # noqa: F401
available.append("pypdf")
except ImportError:
pass
@@ -808,7 +811,7 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
import os as os_module
try:
os_module.unlink(tmp_path)
except:
except OSError:
pass
-1
View File
@@ -11,7 +11,6 @@ Covers:
- Long sentence force-splitting
"""
import pytest
from main import (
chunk_text_legal,
chunk_text_recursive,
+217
View File
@@ -0,0 +1,217 @@
"""
D4 Validation: BGB § 312k structural chunking test.
Tests that real German legal text is correctly chunked with structural
metadata (section, section_title, paragraph, paragraph_num).
This is the gate test before re-ingesting all 297 legal sources.
"""
import os
import pytest
from main import chunk_text_legal, chunk_text_legal_structured
# Path to the shared BGB fixture. NOTE(review): resolved relative to this
# module as ./tests/fixtures/..., which assumes the module lives at the
# embedding-service root (consistent with `from main import ...`) — confirm.
FIXTURE_PATH = os.path.join(
    os.path.dirname(__file__), "tests", "fixtures", "bgb_312_excerpt.txt"
)
# Reasonable defaults for legal text
CHUNK_SIZE = 1500
OVERLAP = 100
@pytest.fixture
def bgb_text():
    """Raw text of the BGB §§ 312-312k excerpt fixture."""
    with open(FIXTURE_PATH, encoding="utf-8") as f:
        return f.read()
@pytest.fixture
def plain_chunks(bgb_text):
    """Plain string chunks produced by the legal chunking strategy."""
    return chunk_text_legal(bgb_text, CHUNK_SIZE, OVERLAP)
@pytest.fixture
def structured_chunks(bgb_text):
    """Chunk dicts carrying structural metadata (section, paragraph, ...)."""
    return chunk_text_legal_structured(bgb_text, CHUNK_SIZE, OVERLAP)
# =========================================================================
# Basic sanity
# =========================================================================
class TestChunkingSanity:
def test_fixture_loads(self, bgb_text):
assert len(bgb_text) > 2000, "BGB excerpt should be substantial"
assert "§ 312k" in bgb_text
assert "§ 312 " in bgb_text
def test_chunk_count_reasonable(self, plain_chunks):
assert 4 <= len(plain_chunks) <= 30, (
f"Expected 4-30 chunks, got {len(plain_chunks)}"
)
def test_structured_same_count(self, plain_chunks, structured_chunks):
assert len(plain_chunks) == len(structured_chunks)
def test_no_empty_chunks(self, plain_chunks):
for i, chunk in enumerate(plain_chunks):
assert chunk.strip(), f"Chunk {i} is empty"
def test_chunk_sizes_reasonable(self, plain_chunks):
for i, chunk in enumerate(plain_chunks):
assert len(chunk) < 3000, f"Chunk {i} too large: {len(chunk)} chars"
assert len(chunk) > 30, f"Chunk {i} too small: {len(chunk)} chars"
# =========================================================================
# Section detection
# =========================================================================
class TestSectionDetection:
def test_all_four_sections_detected(self, structured_chunks):
"""All 4 BGB sections should appear as section metadata."""
found_sections = set()
for meta in structured_chunks:
if meta["section"]:
found_sections.add(meta["section"])
assert "§ 312" in found_sections or any(
s.startswith("§ 312") and s != "§ 312a" and s != "§ 312g" and s != "§ 312k"
for s in found_sections
), f"§ 312 not found. Sections: {found_sections}"
assert "§ 312a" in found_sections, f"§ 312a not found. Sections: {found_sections}"
assert "§ 312g" in found_sections, f"§ 312g not found. Sections: {found_sections}"
assert "§ 312k" in found_sections, f"§ 312k not found. Sections: {found_sections}"
def test_section_prefix_in_chunks(self, plain_chunks):
"""Most chunks should have [§ ...] prefix."""
prefixed = sum(1 for c in plain_chunks if c.startswith(""))
ratio = prefixed / len(plain_chunks)
assert ratio >= 0.8, (
f"Only {ratio:.0%} chunks have section prefix (expected >= 80%)"
)
def test_312k_has_own_chunk(self, plain_chunks):
"""§ 312k must appear as a chunk section header, not merged into another §."""
chunks_with_312k = [c for c in plain_chunks if "[§ 312k" in c]
assert len(chunks_with_312k) >= 1, (
"§ 312k should have at least 1 dedicated chunk"
)
# =========================================================================
# § 312k specific metadata
# =========================================================================
class TestSection312k:
def _312k_chunks(self, structured_chunks):
return [m for m in structured_chunks if m["section"] == "§ 312k"]
def test_312k_section_metadata(self, structured_chunks):
"""§ 312k chunks should have section='§ 312k' with a title."""
chunks = self._312k_chunks(structured_chunks)
assert len(chunks) >= 1, "No chunks with section='§ 312k'"
for meta in chunks:
assert meta["section"] == "§ 312k"
# Title should contain key words
title = meta["section_title"].lower()
assert "kuendigung" in title or "verbrauchervertrae" in title, (
f"Unexpected section_title: {meta['section_title']}"
)
def test_312k_paragraph_extraction(self, structured_chunks):
"""At least some § 312k chunks should have paragraph references."""
chunks = self._312k_chunks(structured_chunks)
paragraphs_found = [m["paragraph"] for m in chunks if m["paragraph"]]
# § 312k has (1) through (6), at least some should be detected
assert len(paragraphs_found) >= 1, (
"No paragraph references found in § 312k chunks"
)
def test_312k_content_present(self, structured_chunks):
"""§ 312k chunk text should contain key legal terms."""
chunks = self._312k_chunks(structured_chunks)
all_text = " ".join(m["text"] for m in chunks)
assert "Kuendigungsschaltflaeche" in all_text or "kuendigen" in all_text.lower()
assert "Webseite" in all_text or "elektronischen" in all_text
def test_312k_not_merged_with_312g(self, structured_chunks):
"""§ 312k and § 312g should be separate sections, not merged."""
sections_312g = [m for m in structured_chunks if m["section"] == "§ 312g"]
sections_312k = self._312k_chunks(structured_chunks)
assert len(sections_312g) >= 1, "§ 312g missing"
assert len(sections_312k) >= 1, "§ 312k missing"
# Verify they are different chunks (no overlap in indices)
g_indices = {m["index"] for m in sections_312g}
k_indices = {m["index"] for m in sections_312k}
assert g_indices.isdisjoint(k_indices), (
f"§ 312g and § 312k share chunk indices: {g_indices & k_indices}"
)
# =========================================================================
# Metadata quality across all sections
# =========================================================================
class TestMetadataQuality:
def test_most_chunks_have_section(self, structured_chunks):
"""At least 90% of chunks should have a section reference."""
with_section = sum(1 for m in structured_chunks if m["section"])
ratio = with_section / len(structured_chunks)
assert ratio >= 0.9, (
f"Only {ratio:.0%} chunks have section metadata (expected >= 90%)"
)
def test_section_titles_not_empty(self, structured_chunks):
"""Chunks with a section should also have a section_title."""
for meta in structured_chunks:
if meta["section"]:
assert meta["section_title"], (
f"Chunk {meta['index']} has section={meta['section']} but no title"
)
def test_paragraph_nums_are_integers(self, structured_chunks):
"""paragraph_num should be int or None, never str."""
for meta in structured_chunks:
pn = meta["paragraph_num"]
assert pn is None or isinstance(pn, int), (
f"Chunk {meta['index']}: paragraph_num={pn!r} (type={type(pn).__name__})"
)
def test_indices_sequential(self, structured_chunks):
"""Chunk indices should be 0, 1, 2, ... in order."""
for i, meta in enumerate(structured_chunks):
assert meta["index"] == i, (
f"Expected index {i}, got {meta['index']}"
)
# =========================================================================
# Edge cases
# =========================================================================
class TestEdgeCases:
def test_numbered_list_not_false_section(self, structured_chunks):
"""Numbered items (1., 2., 3.) inside a § should NOT create new sections."""
for meta in structured_chunks:
section = meta["section"]
# Section should always start with § or be empty
if section:
assert section.startswith("§"), (
f"Unexpected section format: {section!r}"
)
def test_subsection_letters_preserved(self, plain_chunks):
"""Lettered subsections (a, b, c, d, e) in § 312k(2) should be in the text."""
all_text = " ".join(plain_chunks)
# § 312k Abs 2 Nr 1 has a) through e)
for letter in ["a)", "b)", "c)", "d)", "e)"]:
assert letter in all_text, (
f"Subsection letter {letter} from § 312k(2) missing"
)
+62
View File
@@ -0,0 +1,62 @@
§ 312 Anwendungsbereich
(1) Die Vorschriften der Kapitel 1 und 2 dieses Untertitels sind auf Verbrauchervertraege anzuwenden, bei denen sich der Verbraucher zu der Zahlung eines Preises verpflichtet.
(1a) Die Vorschriften der Kapitel 1 und 2 dieses Untertitels sind auch auf Verbrauchervertraege anzuwenden, bei denen der Verbraucher dem Unternehmer personenbezogene Daten bereitstellt oder sich hierzu verpflichtet. Dies gilt nicht, wenn der Unternehmer die vom Verbraucher bereitgestellten personenbezogenen Daten ausschliesslich verarbeitet, um seine Leistungspflicht oder an ihn gestellte rechtliche Anforderungen zu erfuellen, und sie zu keinem anderen Zweck verarbeitet.
(2) Von den Vorschriften der Kapitel 1 und 2 dieses Untertitels ist nur § 312a Absatz 1, 3, 4 und 6 auf folgende Vertraege anzuwenden:
1. notariell beurkundete Vertraege
2. Vertraege ueber die Begruendung, den Erwerb oder die Uebertragung von Eigentum oder anderen Rechten an Grundstuecken
3. Vertraege ueber den Bau von neuen Gebaeuden oder erhebliche Umbaumassnahmen an bestehenden Gebaeuden
4. Vertraege ueber Reiseleistungen nach § 651a
5. Vertraege ueber die Befoerderung von Personen
6. Vertraege, die unter Einsatz von Warenautomaten oder automatisierten Geschaeftsraeumen geschlossen werden
§ 312a Allgemeine Pflichten und Grundsaetze bei Verbrauchervertraegen
(1) Ruft der Unternehmer oder eine Person, die in seinem Namen oder Auftrag handelt, den Verbraucher an, um mit diesem einen Vertrag zu schliessen, hat der Anrufer zu Beginn des Gespraechs seine Identitaet und gegebenenfalls die Identitaet der Person, fuer die er anruft, sowie den geschaeftlichen Zweck des Anrufs offenzulegen.
(2) Der Unternehmer ist verpflichtet, den Verbraucher nach Massgabe des Artikels 246 des Einfuehrungsgesetzes zum Buergerlichen Gesetzbuche zu informieren. Der Unternehmer kann von dem Verbraucher Fracht-, Liefer- oder Versandkosten und sonstige Kosten nur verlangen, soweit er den Verbraucher ueber diese Kosten entsprechend den Anforderungen aus Artikel 246 Absatz 1 Nummer 3 des Einfuehrungsgesetzes zum Buergerlichen Gesetzbuche informiert hat. Die Saetze 1 und 2 sind weder auf ausserhalb von Geschaeftsraeumen geschlossene Vertraege noch auf Fernabsatzvertraege noch auf Vertraege ueber Finanzdienstleistungen anzuwenden.
(3) Eine Vereinbarung, die auf eine ueber das vereinbarte Entgelt fuer die Hauptleistung hinausgehende Zahlung des Verbrauchers gerichtet ist, kann ein Unternehmer mit einem Verbraucher nur ausdruecklich treffen. Schliesst der Unternehmer und der Verbraucher einen Vertrag im elektronischen Geschaeftsverkehr, wird eine solche Vereinbarung nur Vertragsbestandteil, wenn der Unternehmer die Vereinbarung nicht durch eine Voreinstellung herbeifuehrt.
(4) Eine Vereinbarung, durch die ein Verbraucher verpflichtet wird, ein Entgelt dafuer zu zahlen, dass der Verbraucher fuer die Erfuellung seiner vertraglichen Pflichten ein bestimmtes Zahlungsmittel nutzt, ist unwirksam, wenn fuer den Verbraucher keine zumutbare und gaengige unentgeltliche Zahlungsmoeglichkeit besteht oder das vereinbarte Entgelt ueber die Kosten hinausgeht, die dem Unternehmer durch die Nutzung des Zahlungsmittels entstehen.
(5) Eine Vereinbarung, durch die ein Verbraucher verpflichtet wird, ein Entgelt dafuer zu zahlen, dass der Verbraucher den Unternehmer wegen Fragen oder Erklaerungen zu einem zwischen ihnen geschlossenen Vertrag ueber eine Rufnummer anruft, die der Unternehmer fuer solche Zwecke bereithaelt, ist unwirksam, wenn das vereinbarte Entgelt das Entgelt fuer die blosse Nutzung des Telekommunikationsdienstes uebersteigt.
(6) Ist eine Vereinbarung nach den Absaetzen 3 bis 5 nicht Vertragsbestandteil geworden oder ist sie unwirksam, bleibt der Vertrag im Uebrigen wirksam.
§ 312g Widerrufsrecht
(1) Dem Verbraucher steht bei ausserhalb von Geschaeftsraeumen geschlossenen Vertraegen und bei Fernabsatzvertraegen ein Widerrufsrecht gemaess § 355 zu.
(2) Das Widerrufsrecht besteht, soweit die Parteien nichts anderes vereinbart haben, nicht bei folgenden Vertraegen:
1. Vertraege zur Lieferung von Waren, die nicht vorgefertigt sind und fuer deren Herstellung eine individuelle Auswahl oder Bestimmung durch den Verbraucher massgeblich ist oder die eindeutig auf die persoenlichen Beduerfnisse des Verbrauchers zugeschnitten sind,
2. Vertraege zur Lieferung von Waren, die schnell verderben koennen oder deren Verfallsdatum schnell ueberschritten wuerde,
3. Vertraege zur Lieferung versiegelter Waren, die aus Gruenden des Gesundheitsschutzes oder der Hygiene nicht zur Rueckgabe geeignet sind, wenn ihre Versiegelung nach der Lieferung entfernt wurde.
(3) Das Widerrufsrecht besteht ferner nicht bei Vertraegen, bei denen dem Verbraucher bereits auf Grund der §§ 495, 506 bis 513 ein Widerrufsrecht zusteht.
§ 312k Kuendigung von Verbrauchervertraegen im elektronischen Geschaeftsverkehr
(1) Wird Verbrauchern ueber eine Webseite ermoeglicht, einen Vertrag im elektronischen Geschaeftsverkehr zu schliessen, der auf die Begruendung eines Dauerschuldverhaeltnisses gerichtet ist, das einen Unternehmer zu einer entgeltlichen Leistung verpflichtet, so treffen den Unternehmer die Pflichten nach dieser Vorschrift. Dies gilt nicht
1. fuer Vertraege, fuer deren Kuendigung gesetzlich ausschliesslich eine strengere Form als die Textform vorgesehen ist, und
2. in Bezug auf Webseiten, die Finanzdienstleistungen betreffen, oder fuer Vertraege ueber Finanzdienstleistungen.
(2) Der Unternehmer hat sicherzustellen, dass der Verbraucher auf der Webseite eine Erklaerung zur ordentlichen oder ausserordentlichen Kuendigung eines auf der Webseite abschliessbaren Vertrags nach Absatz 1 Satz 1 ueber eine Kuendigungsschaltflaeche abgeben kann. Die Kuendigungsschaltflaeche muss gut lesbar mit nichts anderem als den Woertern "Vertraege hier kuendigen" oder mit einer entsprechenden eindeutigen Formulierung beschriftet sein. Sie muss den Verbraucher unmittelbar zu einer Bestaetigungsseite fuehren, die
1. den Verbraucher auffordert und ihm ermoeglicht Angaben zu machen
a) zur Art der Kuendigung sowie im Falle der ausserordentlichen Kuendigung zum Kuendigungsgrund,
b) zu seiner eindeutigen Identifizierbarkeit,
c) zur eindeutigen Bezeichnung des Vertrags,
d) zum Zeitpunkt, zu dem die Kuendigung das Vertragsverhaeltnis beenden soll,
e) zur schnellen elektronischen Uebermittlung der Kuendigungsbestaetigung an ihn und
2. eine Bestaetigungsschaltflaeche enthaelt, ueber deren Betaetigung der Verbraucher die Kuendigungserklaerung abgeben kann und die gut lesbar mit nichts anderem als den Woertern "jetzt kuendigen" oder mit einer entsprechenden eindeutigen Formulierung beschriftet ist.
Die Schaltflaechen und die Bestaetigungsseite muessen staendig verfuegbar sowie unmittelbar und leicht zugaenglich sein.
(3) Der Verbraucher muss seine durch das Betaetigen der Bestaetigungsschaltflaeche abgegebene Kuendigungserklaerung mit dem Datum und der Uhrzeit der Abgabe auf einem dauerhaften Datentraeger so speichern koennen, dass erkennbar ist, dass die Kuendigungserklaerung durch das Betaetigen der Bestaetigungsschaltflaeche abgegeben wurde.
(4) Der Unternehmer hat dem Verbraucher den Inhalt sowie Datum und Uhrzeit des Zugangs der Kuendigungserklaerung sowie den Zeitpunkt, zu dem das Vertragsverhaeltnis durch die Kuendigung beendet werden soll, sofort auf elektronischem Wege in Textform zu bestaetigen. Es wird vermutet, dass eine durch das Betaetigen der Bestaetigungsschaltflaeche abgegebene Kuendigungserklaerung dem Unternehmer unmittelbar nach ihrer Abgabe zugegangen ist.
(5) Wenn der Verbraucher bei der Abgabe der Kuendigungserklaerung keinen Zeitpunkt angibt, zu dem die Kuendigung das Vertragsverhaeltnis beenden soll, wirkt die Kuendigung im Zweifel zum fruehestmoeglichen Zeitpunkt.
(6) Werden die Schaltflaechen und die Bestaetigungsseite nicht entsprechend den Absaetzen 1 und 2 zur Verfuegung gestellt, kann ein Verbraucher einen Vertrag, fuer dessen Kuendigung die Schaltflaechen und die Bestaetigungsseite zur Verfuegung zu stellen sind, jederzeit und ohne Einhaltung einer Kuendigungsfrist kuendigen. Die Moeglichkeit des Verbrauchers zur ausserordentlichen Kuendigung bleibt hiervon unberuehrt.
+14 -1
View File
@@ -14,6 +14,9 @@ logger = logging.getLogger("rag-service.api.documents")
router = APIRouter(prefix="/api/v1/documents")
# Structural metadata fields from embedding-service chunks_with_metadata (D2)
_STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
# ---- Request / Response models --------------------------------------------
@@ -110,7 +113,7 @@ async def upload_document(
# --- Chunk ---
try:
chunks = await embedding_client.chunk_text(
chunk_result = await embedding_client.chunk_text(
text=text,
strategy=chunk_strategy,
chunk_size=chunk_size,
@@ -120,6 +123,9 @@ async def upload_document(
logger.error("Chunking failed: %s", exc)
raise HTTPException(status_code=500, detail=f"Chunking failed: {exc}")
chunks = chunk_result.chunks
chunks_meta = chunk_result.chunks_with_metadata
if not chunks:
raise HTTPException(status_code=400, detail="Chunking produced zero chunks")
@@ -154,6 +160,13 @@ async def upload_document(
"year": year,
**extra_metadata,
}
# Merge structural metadata from embedding service (D2)
if i < len(chunks_meta):
meta = chunks_meta[i]
for field in _STRUCT_FIELDS:
value = meta.get(field)
if value is not None and value != "":
payload[field] = value
payloads.append(payload)
# --- Index in Qdrant ---
+15 -4
View File
@@ -1,6 +1,6 @@
import logging
import os
from typing import Optional
from dataclasses import dataclass
import httpx
@@ -19,6 +19,14 @@ _OLLAMA_EMBED_MODEL = os.getenv("OLLAMA_EMBED_MODEL", "bge-m3")
_EMBED_BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", "32"))
@dataclass
class ChunkResult:
    """Result from the embedding service /chunk endpoint.

    Built by EmbeddingClient.chunk_text() from the service's JSON response.
    """
    # Plain chunk strings, in document order.
    chunks: list[str]
    # One structural-metadata dict per chunk (section, paragraph, page, ...);
    # empty list when the response carries no usable chunks_with_metadata.
    chunks_with_metadata: list[dict]
class EmbeddingClient:
"""
Hybrid client:
@@ -120,10 +128,10 @@ class EmbeddingClient:
strategy: str = "recursive",
chunk_size: int = 512,
overlap: int = 50,
) -> list[str]:
) -> ChunkResult:
"""
Ask the embedding service to chunk a long text.
Returns a list of chunk strings.
Returns ChunkResult with plain chunks and structural metadata.
"""
async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
response = await client.post(
@@ -137,7 +145,10 @@ class EmbeddingClient:
)
response.raise_for_status()
data = response.json()
return data.get("chunks", [])
return ChunkResult(
chunks=data.get("chunks", []),
chunks_with_metadata=data.get("chunks_with_metadata") or [],
)
# ------------------------------------------------------------------
# PDF extraction (via embedding-service)
View File
+7
View File
@@ -0,0 +1,7 @@
"""Shared test fixtures for rag-service tests."""
import os
import sys
# Ensure rag-service root is on sys.path so imports resolve
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+172
View File
@@ -0,0 +1,172 @@
"""Tests for document upload payload building — structural metadata (D2)."""
# Mirror the constant from api/documents.py to avoid heavy import chain
# (api → jose, qdrant_client, minio, etc.)
_STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
def _build_payload(
chunk: str,
index: int,
chunks_meta: list[dict],
extra_metadata: "dict | None" = None,
) -> dict:
"""Replicate the payload-building logic from documents.py for unit testing."""
payload = {
"document_id": "test-doc-id",
"object_name": "test/path.pdf",
"filename": "path.pdf",
"chunk_index": index,
"chunk_text": chunk,
"data_type": "law",
"bundesland": "bund",
"use_case": "compliance",
"year": "2026",
**(extra_metadata or {}),
}
if index < len(chunks_meta):
meta = chunks_meta[index]
for field in _STRUCT_FIELDS:
value = meta.get(field)
if value is not None and value != "":
payload[field] = value
return payload
class TestPayloadStructuralMetadata:
    """Tests for structural metadata merging into Qdrant payloads."""

    @staticmethod
    def _meta_entry(index=0, text="chunk", **fields) -> dict:
        # Build a chunks_with_metadata entry; defaults model "no metadata found".
        entry = {
            "text": text,
            "section": "",
            "section_title": "",
            "paragraph": "",
            "paragraph_num": None,
            "page": None,
            "index": index,
        }
        entry.update(fields)
        return entry

    def test_payload_contains_structural_metadata(self):
        """Metadata fields from chunks_with_metadata land in the payload."""
        meta = [self._meta_entry(
            text="chunk text",
            section="§ 312k",
            section_title="Kuendigungsbutton",
            paragraph="Abs. 1",
            paragraph_num=1,
            page=847,
        )]
        payload = _build_payload("chunk text", 0, meta)
        expected = {
            "section": "§ 312k",
            "section_title": "Kuendigungsbutton",
            "paragraph": "Abs. 1",
            "paragraph_num": 1,
            "page": 847,
        }
        for field, want in expected.items():
            assert payload[field] == want

    def test_payload_without_metadata_backwards_compat(self):
        """Empty metadata list → payload has no structural fields."""
        payload = _build_payload("chunk text", 0, [])
        assert not any(field in payload for field in _STRUCT_FIELDS)

    def test_payload_skips_empty_values(self):
        """Empty string and None values are NOT added to payload."""
        payload = _build_payload("chunk text", 0, [self._meta_entry(text="chunk text")])
        assert all(field not in payload for field in _STRUCT_FIELDS)

    def test_metadata_overrides_extra_metadata(self):
        """Auto-extracted metadata takes precedence over manual extra_metadata."""
        meta = [self._meta_entry(text="chunk text", section="§ 25")]
        payload = _build_payload(
            "chunk text", 0, meta, extra_metadata={"section": "manual-value"}
        )
        assert payload["section"] == "§ 25"

    def test_partial_metadata_alignment(self):
        """3 chunks but only 2 metadata entries → third payload has no structural fields."""
        meta = [
            self._meta_entry(index=0, text="c1", section="§ 1"),
            self._meta_entry(index=1, text="c2", section="§ 2"),
        ]
        first = _build_payload("c1", 0, meta)
        second = _build_payload("c2", 1, meta)
        third = _build_payload("c3", 2, meta)
        assert first["section"] == "§ 1"
        assert second["section"] == "§ 2"
        assert "section" not in third

    def test_zero_paragraph_num_is_kept(self):
        """paragraph_num=0 is a valid value and should be stored."""
        payload = _build_payload("chunk", 0, [self._meta_entry(paragraph_num=0)])
        # 0 is not None and not "" → should be stored
        assert payload["paragraph_num"] == 0

    def test_page_zero_is_kept(self):
        """page=0 is a valid value (first page) and should be stored."""
        payload = _build_payload("chunk", 0, [self._meta_entry(page=0)])
        assert payload["page"] == 0
+135
View File
@@ -0,0 +1,135 @@
"""Tests for EmbeddingClient.chunk_text() — ChunkResult with metadata (D2)."""
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from embedding_client import ChunkResult, EmbeddingClient
@pytest.fixture
def client():
    """Build an EmbeddingClient while embedding_client.settings is stubbed."""
    patcher = patch("embedding_client.settings")
    cfg = patcher.start()
    try:
        cfg.EMBEDDING_SERVICE_URL = "http://localhost:8087"
        return EmbeddingClient()
    finally:
        patcher.stop()
def _mock_response(json_data: dict, status_code: int = 200):
"""Create a mock httpx response (sync methods like .json() and .raise_for_status())."""
resp = MagicMock()
resp.status_code = status_code
resp.json.return_value = json_data
return resp
@pytest.mark.asyncio
async def test_chunk_text_returns_chunk_result(client):
    """chunk_text returns ChunkResult with both chunks and metadata."""
    texts = ["chunk1 text", "chunk2 text"]
    paragraphs = ["Abs. 1", "Abs. 2"]
    metadata = [
        {
            "text": texts[i],
            "section": "§ 25",
            "section_title": "Informationspflichten",
            "paragraph": paragraphs[i],
            "paragraph_num": i + 1,
            "page": None,
            "index": i,
        }
        for i in range(2)
    ]
    payload = {
        "chunks": texts,
        "chunks_with_metadata": metadata,
        "count": 2,
        "strategy": "recursive",
    }

    # Stub out httpx.AsyncClient with an async-context-manager session.
    session = AsyncMock()
    session.post.return_value = _mock_response(payload)
    session.__aenter__ = AsyncMock(return_value=session)
    session.__aexit__ = AsyncMock(return_value=False)

    with patch("httpx.AsyncClient", return_value=session):
        result = await client.chunk_text("some legal text")

    assert isinstance(result, ChunkResult)
    assert result.chunks == ["chunk1 text", "chunk2 text"]
    assert len(result.chunks_with_metadata) == 2
    assert result.chunks_with_metadata[0]["section"] == "§ 25"
    assert result.chunks_with_metadata[1]["paragraph"] == "Abs. 2"
@pytest.mark.asyncio
async def test_chunk_text_without_metadata_field(client):
    """Embedding service response without chunks_with_metadata → empty list."""
    payload = {"chunks": ["chunk1"], "count": 1, "strategy": "semantic"}

    session = AsyncMock()
    session.post.return_value = _mock_response(payload)
    session.__aenter__ = AsyncMock(return_value=session)
    session.__aexit__ = AsyncMock(return_value=False)

    with patch("httpx.AsyncClient", return_value=session):
        outcome = await client.chunk_text("text", strategy="semantic")

    assert isinstance(outcome, ChunkResult)
    assert outcome.chunks == ["chunk1"]
    assert outcome.chunks_with_metadata == []
@pytest.mark.asyncio
async def test_chunk_text_with_null_metadata(client):
    """chunks_with_metadata: null in response → empty list."""
    payload = {
        "chunks": ["chunk1"],
        "chunks_with_metadata": None,
        "count": 1,
        "strategy": "recursive",
    }

    session = AsyncMock()
    session.post.return_value = _mock_response(payload)
    session.__aenter__ = AsyncMock(return_value=session)
    session.__aexit__ = AsyncMock(return_value=False)

    with patch("httpx.AsyncClient", return_value=session):
        outcome = await client.chunk_text("text")

    assert outcome.chunks_with_metadata == []
@pytest.mark.asyncio
async def test_chunk_text_empty(client):
    """Empty text → empty chunks and metadata."""
    payload = {
        "chunks": [],
        "chunks_with_metadata": [],
        "count": 0,
        "strategy": "recursive",
    }

    session = AsyncMock()
    session.post.return_value = _mock_response(payload)
    session.__aenter__ = AsyncMock(return_value=session)
    session.__aexit__ = AsyncMock(return_value=False)

    with patch("httpx.AsyncClient", return_value=session):
        outcome = await client.chunk_text("")

    assert outcome.chunks == []
    assert outcome.chunks_with_metadata == []