feat(pipeline): structural metadata end-to-end (Blocks D2-D4)
D2: RAG service stores section/section_title/paragraph/paragraph_num/page from embedding service chunks_with_metadata into Qdrant payloads. D3: Control generator prefers section > article > section_title from Qdrant, adds page to source_citation and generation_metadata. D4: Validated with real BGB §§ 312-312k text. Found and fixed critical bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing only the first chunk per document to have metadata. All subsequent chunks lost section info. Also fixes pre-existing lint issues (unused imports, ambiguous variable names, duplicate dict key, bare except). 456 tests passing (58 embedding + 387 pipeline + 11 rag-service). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,268 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
D4 Integration Test: Upload BGB excerpt → verify Qdrant payloads.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Dry-run (local chunking only, no services needed)
|
||||||
|
python3 scripts/test_d4_integration.py --dry-run
|
||||||
|
|
||||||
|
# Against Mac Mini
|
||||||
|
python3 scripts/test_d4_integration.py \
|
||||||
|
--rag-url https://macmini:8097 \
|
||||||
|
--qdrant-url http://macmini:6333
|
||||||
|
|
||||||
|
# Against production
|
||||||
|
python3 scripts/test_d4_integration.py \
|
||||||
|
--rag-url https://rag-prod:8097 \
|
||||||
|
--qdrant-url http://qdrant-prod:6333
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
FIXTURE_PATH = os.path.join(
|
||||||
|
os.path.dirname(__file__), "..", "..", "embedding-service",
|
||||||
|
"tests", "fixtures", "bgb_312_excerpt.txt",
|
||||||
|
)
|
||||||
|
COLLECTION = "bp_compliance_gesetze"
|
||||||
|
REG_CODE = "BGB_D4_TEST"
|
||||||
|
|
||||||
|
# Expected sections in the BGB excerpt
|
||||||
|
EXPECTED_SECTIONS = {"§ 312", "§ 312a", "§ 312g", "§ 312k"}
|
||||||
|
|
||||||
|
|
||||||
|
def load_fixture() -> str:
    """Return the BGB § 312 excerpt fixture as UTF-8 text."""
    with open(FIXTURE_PATH, encoding="utf-8") as handle:
        contents = handle.read()
    return contents
|
||||||
|
|
||||||
|
|
||||||
|
def upload_document(rag_url: str, text: str) -> dict:
    """Upload BGB excerpt to RAG service."""
    # Document-level metadata forwarded verbatim to the service.
    doc_meta = {
        "regulation_code": REG_CODE,
        "regulation_name_de": "BGB (D4 Test)",
        "source_type": "law",
    }

    form_fields = {
        "collection": COLLECTION,
        "data_type": "law",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        "chunk_strategy": "recursive",
        "chunk_size": "1500",
        "chunk_overlap": "100",
        "metadata_json": json.dumps(doc_meta),
    }

    # verify=False: test targets presumably use self-signed certs — confirm.
    with httpx.Client(timeout=60.0, verify=False) as client:
        response = client.post(
            f"{rag_url}/api/v1/documents/upload",
            files={"file": ("bgb_312_test.txt", text.encode(), "text/plain")},
            data=form_fields,
        )
        response.raise_for_status()
        return response.json()
|
||||||
|
|
||||||
|
|
||||||
|
def scroll_chunks(qdrant_url: str, document_id: str) -> list[dict]:
    """Scroll Qdrant for chunks matching this document_id."""
    # Filter is constant across pages; only the offset changes.
    doc_filter = {
        "must": [{
            "key": "document_id",
            "match": {"value": document_id},
        }]
    }

    collected: list[dict] = []
    page_offset = None

    with httpx.Client(timeout=30.0) as client:
        while True:
            request_body: dict = {
                "limit": 100,
                "with_payload": True,
                "with_vector": False,
                "filter": doc_filter,
            }
            if page_offset:
                request_body["offset"] = page_offset

            response = client.post(
                f"{qdrant_url}/collections/{COLLECTION}/points/scroll",
                json=request_body,
            )
            response.raise_for_status()
            result = response.json()["result"]
            collected.extend(result["points"])

            # Qdrant signals the last page with a null next_page_offset.
            page_offset = result.get("next_page_offset")
            if not page_offset:
                return collected
|
||||||
|
|
||||||
|
|
||||||
|
def delete_test_data(qdrant_url: str, document_id: str):
    """Clean up test chunks from Qdrant."""
    delete_request = {
        "filter": {
            "must": [{
                "key": "document_id",
                "match": {"value": document_id},
            }]
        }
    }
    with httpx.Client(timeout=30.0) as client:
        response = client.post(
            f"{qdrant_url}/collections/{COLLECTION}/points/delete",
            json=delete_request,
        )
        response.raise_for_status()
|
||||||
|
|
||||||
|
|
||||||
|
def verify_chunks(points: list[dict]) -> dict:
    """Analyze chunks and return a verification report.

    Counts how many chunks carry section / paragraph / page metadata,
    collects per-chunk details, and records issues when expected BGB
    sections are missing or metadata coverage drops below 90%.
    """
    sections_found: set = set()
    details: list = []
    issues: list = []
    with_section = 0
    with_paragraph = 0
    with_page = 0

    for point in points:
        payload = point.get("payload", {})
        section = payload.get("section", "")
        paragraph = payload.get("paragraph", "")
        page = payload.get("page")

        if section:
            sections_found.add(section)
            with_section += 1
        if paragraph:
            with_paragraph += 1
        if page is not None:
            with_page += 1

        details.append({
            "chunk_index": payload.get("chunk_index", "?"),
            "section": section,
            "section_title": payload.get("section_title", "")[:40],
            "paragraph": paragraph,
            "paragraph_num": payload.get("paragraph_num"),
            "page": page,
            "text_preview": payload.get("chunk_text", "")[:60],
        })

    # Checks
    missing = EXPECTED_SECTIONS - sections_found
    if missing:
        issues.append(f"Missing sections: {missing}")

    if "§ 312k" not in sections_found:
        issues.append("CRITICAL: § 312k not found!")

    total = len(points)
    # max(..., 1) guards against division by zero on an empty point set.
    section_ratio = with_section / max(total, 1)
    if section_ratio < 0.9:
        issues.append(
            f"Only {section_ratio:.0%} chunks have section metadata (expected >= 90%)"
        )

    return {
        "total_chunks": total,
        "sections_found": sections_found,
        "chunks_with_section": with_section,
        "chunks_with_paragraph": with_paragraph,
        "chunks_with_page": with_page,
        "section_details": details,
        "issues": issues,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def print_report(report: dict):
    """Print verification report."""
    divider = "=" * 60
    print("\n" + divider)
    print("D4 VALIDATION REPORT")
    print(divider)
    print(f"Total chunks: {report['total_chunks']}")
    print(f"With section: {report['chunks_with_section']}")
    print(f"With paragraph: {report['chunks_with_paragraph']}")
    print(f"With page: {report['chunks_with_page']}")
    print(f"Sections found: {sorted(report['sections_found'])}")

    print("\nChunk details:")
    ordered = sorted(report["section_details"], key=lambda entry: entry["chunk_index"])
    for entry in ordered:
        line = (
            f" [{entry['chunk_index']:2}] "
            f"section={entry['section']!r:12s} "
            f"title={entry['section_title']!r:30s} "
            f"para={entry['paragraph']!r:8s}"
        )
        print(line)

    issues = report["issues"]
    if not issues:
        print("\nRESULT: PASS — all sections detected, metadata quality OK")
        return

    print(f"\nISSUES ({len(issues)}):")
    for issue in issues:
        print(f" - {issue}")
    print("\nRESULT: FAIL")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: validate structural chunk metadata end-to-end.

    Two modes:
      --dry-run : chunk the fixture locally and verify metadata (no services).
      default   : upload to the RAG service, scroll Qdrant, verify payloads,
                  then delete the test data unless --keep is given.

    Exits with status 1 when the verification report contains issues, else 0.
    """
    parser = argparse.ArgumentParser(description="D4 Integration Test")
    parser.add_argument("--rag-url", default="https://macmini:8097")
    parser.add_argument("--qdrant-url", default="http://macmini:6333")
    parser.add_argument("--dry-run", action="store_true",
                        help="Only test local chunking, no upload")
    parser.add_argument("--keep", action="store_true",
                        help="Don't delete test data after verification")
    args = parser.parse_args()

    text = load_fixture()
    print(f"Loaded BGB excerpt: {len(text)} chars")

    if args.dry_run:
        # Import chunking directly.
        # NOTE(review): assumes the embedding-service checkout sits two
        # directories up and exposes chunk_text_legal_structured in its
        # main.py — confirm before running from another layout.
        sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "embedding-service"))
        from main import chunk_text_legal_structured
        chunks = chunk_text_legal_structured(text, 1500, 100)
        # Build fake points for verification, shaped like the Qdrant
        # payloads the RAG service would write.
        points = [{"payload": {
            "chunk_index": c["index"],
            "chunk_text": c["text"],
            "section": c["section"],
            "section_title": c["section_title"],
            "paragraph": c["paragraph"],
            "paragraph_num": c["paragraph_num"],
            "page": c["page"],
        }} for c in chunks]
        report = verify_chunks(points)
        print_report(report)
        sys.exit(1 if report["issues"] else 0)

    # Full integration test
    print(f"Uploading to {args.rag_url} → collection={COLLECTION}...")
    result = upload_document(args.rag_url, text)
    doc_id = result["document_id"]
    print(f" document_id: {doc_id}")
    print(f" chunks_count: {result['chunks_count']}")
    print(f" vectors_indexed: {result['vectors_indexed']}")

    # Short pause so the freshly written points are visible to scroll.
    print("Waiting 2s for indexing...")
    time.sleep(2)

    print(f"Scrolling Qdrant at {args.qdrant_url}...")
    points = scroll_chunks(args.qdrant_url, doc_id)
    print(f" Found {len(points)} points")

    report = verify_chunks(points)
    print_report(report)

    if not args.keep:
        print(f"\nCleaning up test data (document_id={doc_id})...")
        delete_test_data(args.qdrant_url, doc_id)
        print(" Deleted.")

    sys.exit(1 if report["issues"] else 0)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point; main() exits non-zero when validation fails.
    main()
|
||||||
@@ -25,8 +25,7 @@ import re
|
|||||||
import uuid
|
import uuid
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from dataclasses import dataclass, field, asdict
|
from dataclasses import dataclass, field, asdict
|
||||||
from datetime import datetime, timezone
|
from typing import Dict, List, Optional
|
||||||
from typing import Dict, List, Optional, Set
|
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
@@ -34,7 +33,7 @@ from sqlalchemy import text
|
|||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
from .rag_client import ComplianceRAGClient, RAGSearchResult, get_rag_client
|
from .rag_client import ComplianceRAGClient, RAGSearchResult, get_rag_client
|
||||||
from .similarity_detector import check_similarity, SimilarityReport
|
from .similarity_detector import check_similarity
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -1019,11 +1018,12 @@ class ControlGeneratorPipeline:
|
|||||||
regulation_name=reg_name,
|
regulation_name=reg_name,
|
||||||
regulation_short=reg_short,
|
regulation_short=reg_short,
|
||||||
category=payload.get("category", "") or payload.get("data_type", ""),
|
category=payload.get("category", "") or payload.get("data_type", ""),
|
||||||
article=payload.get("article", "") or payload.get("section_title", "") or payload.get("section", ""),
|
article=payload.get("section", "") or payload.get("article", "") or payload.get("section_title", ""),
|
||||||
paragraph=payload.get("paragraph", ""),
|
paragraph=payload.get("paragraph", ""),
|
||||||
source_url=payload.get("source_url", "") or payload.get("source", "") or payload.get("url", ""),
|
source_url=payload.get("source_url", "") or payload.get("source", "") or payload.get("url", ""),
|
||||||
score=0.0,
|
score=0.0,
|
||||||
collection=collection,
|
collection=collection,
|
||||||
|
page=payload.get("page"),
|
||||||
)
|
)
|
||||||
all_results.append(chunk)
|
all_results.append(chunk)
|
||||||
collection_new += 1
|
collection_new += 1
|
||||||
@@ -1127,6 +1127,7 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
|
|||||||
"source": canonical_source,
|
"source": canonical_source,
|
||||||
"article": effective_article,
|
"article": effective_article,
|
||||||
"paragraph": effective_paragraph,
|
"paragraph": effective_paragraph,
|
||||||
|
"page": chunk.page,
|
||||||
"license": license_info.get("license", ""),
|
"license": license_info.get("license", ""),
|
||||||
"source_type": license_info.get("source_type", "law"),
|
"source_type": license_info.get("source_type", "law"),
|
||||||
"url": chunk.source_url or "",
|
"url": chunk.source_url or "",
|
||||||
@@ -1141,6 +1142,7 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
|
|||||||
"source_regulation": chunk.regulation_code,
|
"source_regulation": chunk.regulation_code,
|
||||||
"source_article": effective_article,
|
"source_article": effective_article,
|
||||||
"source_paragraph": effective_paragraph,
|
"source_paragraph": effective_paragraph,
|
||||||
|
"source_page": chunk.page,
|
||||||
}
|
}
|
||||||
return control
|
return control
|
||||||
|
|
||||||
@@ -1194,6 +1196,7 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
|
|||||||
"source": canonical_source,
|
"source": canonical_source,
|
||||||
"article": effective_article,
|
"article": effective_article,
|
||||||
"paragraph": effective_paragraph,
|
"paragraph": effective_paragraph,
|
||||||
|
"page": chunk.page,
|
||||||
"license": license_info.get("license", ""),
|
"license": license_info.get("license", ""),
|
||||||
"license_notice": attribution,
|
"license_notice": attribution,
|
||||||
"source_type": license_info.get("source_type", "standard"),
|
"source_type": license_info.get("source_type", "standard"),
|
||||||
@@ -1209,6 +1212,7 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
|
|||||||
"source_regulation": chunk.regulation_code,
|
"source_regulation": chunk.regulation_code,
|
||||||
"source_article": effective_article,
|
"source_article": effective_article,
|
||||||
"source_paragraph": effective_paragraph,
|
"source_paragraph": effective_paragraph,
|
||||||
|
"source_page": chunk.page,
|
||||||
}
|
}
|
||||||
return control
|
return control
|
||||||
|
|
||||||
@@ -1368,6 +1372,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne A
|
|||||||
"source": canonical_source,
|
"source": canonical_source,
|
||||||
"article": effective_article,
|
"article": effective_article,
|
||||||
"paragraph": effective_paragraph,
|
"paragraph": effective_paragraph,
|
||||||
|
"page": chunk.page,
|
||||||
"license": lic.get("license", ""),
|
"license": lic.get("license", ""),
|
||||||
"license_notice": lic.get("attribution", ""),
|
"license_notice": lic.get("attribution", ""),
|
||||||
"source_type": lic.get("source_type", "law"),
|
"source_type": lic.get("source_type", "law"),
|
||||||
@@ -1384,6 +1389,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne A
|
|||||||
"source_regulation": chunk.regulation_code,
|
"source_regulation": chunk.regulation_code,
|
||||||
"source_article": effective_article,
|
"source_article": effective_article,
|
||||||
"source_paragraph": effective_paragraph,
|
"source_paragraph": effective_paragraph,
|
||||||
|
"source_page": chunk.page,
|
||||||
"batch_size": len(chunks),
|
"batch_size": len(chunks),
|
||||||
"document_grouped": same_doc,
|
"document_grouped": same_doc,
|
||||||
}
|
}
|
||||||
@@ -1479,14 +1485,14 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
|
|||||||
) -> list[Optional[GeneratedControl]]:
|
) -> list[Optional[GeneratedControl]]:
|
||||||
"""Process a batch of (chunk, license_info) through stages 3-5."""
|
"""Process a batch of (chunk, license_info) through stages 3-5."""
|
||||||
# Split by license rule: Rule 1+2 → structure, Rule 3 → reform
|
# Split by license rule: Rule 1+2 → structure, Rule 3 → reform
|
||||||
structure_items = [(c, l) for c, l in batch_items if l["rule"] in (1, 2)]
|
structure_items = [(c, lic) for c, lic in batch_items if lic["rule"] in (1, 2)]
|
||||||
reform_items = [(c, l) for c, l in batch_items if l["rule"] == 3]
|
reform_items = [(c, lic) for c, lic in batch_items if lic["rule"] == 3]
|
||||||
|
|
||||||
all_controls: dict[int, Optional[GeneratedControl]] = {}
|
all_controls: dict[int, Optional[GeneratedControl]] = {}
|
||||||
|
|
||||||
if structure_items:
|
if structure_items:
|
||||||
s_chunks = [c for c, _ in structure_items]
|
s_chunks = [c for c, _ in structure_items]
|
||||||
s_lics = [l for _, l in structure_items]
|
s_lics = [lic for _, lic in structure_items]
|
||||||
try:
|
try:
|
||||||
s_controls = await self._structure_batch(s_chunks, s_lics)
|
s_controls = await self._structure_batch(s_chunks, s_lics)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -24,7 +24,6 @@ import json
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import uuid
|
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@@ -56,7 +55,7 @@ ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
|
|||||||
# Patterns are defined in normative_patterns.py and imported here
|
# Patterns are defined in normative_patterns.py and imported here
|
||||||
# with local aliases for backward compatibility.
|
# with local aliases for backward compatibility.
|
||||||
|
|
||||||
from .normative_patterns import (
|
from .normative_patterns import ( # noqa: E402
|
||||||
PFLICHT_RE as _PFLICHT_RE,
|
PFLICHT_RE as _PFLICHT_RE,
|
||||||
EMPFEHLUNG_RE as _EMPFEHLUNG_RE,
|
EMPFEHLUNG_RE as _EMPFEHLUNG_RE,
|
||||||
KANN_RE as _KANN_RE,
|
KANN_RE as _KANN_RE,
|
||||||
@@ -3472,7 +3471,7 @@ class DecompositionPass:
|
|||||||
"category": atomic.category,
|
"category": atomic.category,
|
||||||
"parent_uuid": parent_uuid,
|
"parent_uuid": parent_uuid,
|
||||||
"gen_meta": json.dumps({
|
"gen_meta": json.dumps({
|
||||||
"decomposition_source": candidate_id,
|
"decomposition_source_id": candidate_id,
|
||||||
"decomposition_method": "pass0b",
|
"decomposition_method": "pass0b",
|
||||||
"engine_version": "v2",
|
"engine_version": "v2",
|
||||||
"action_object_class": getattr(atomic, "domain", ""),
|
"action_object_class": getattr(atomic, "domain", ""),
|
||||||
@@ -4104,6 +4103,8 @@ def _format_citation(citation) -> str:
|
|||||||
parts.append(c["article"])
|
parts.append(c["article"])
|
||||||
if c.get("paragraph"):
|
if c.get("paragraph"):
|
||||||
parts.append(c["paragraph"])
|
parts.append(c["paragraph"])
|
||||||
|
if c.get("page") is not None:
|
||||||
|
parts.append(f"S. {c['page']}")
|
||||||
return " ".join(parts) if parts else citation
|
return " ".join(parts) if parts else citation
|
||||||
except (json.JSONDecodeError, TypeError):
|
except (json.JSONDecodeError, TypeError):
|
||||||
return citation
|
return citation
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ class RAGSearchResult:
|
|||||||
source_url: str
|
source_url: str
|
||||||
score: float
|
score: float
|
||||||
collection: str = ""
|
collection: str = ""
|
||||||
|
page: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
class ComplianceRAGClient:
|
class ComplianceRAGClient:
|
||||||
|
|||||||
@@ -0,0 +1,166 @@
|
|||||||
|
"""Tests for D3: Structural metadata flow (section priority, page in citation)."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from services.rag_client import RAGSearchResult
|
||||||
|
|
||||||
|
|
||||||
|
def _make_chunk(
    article: str = "",
    paragraph: str = "",
    page: Optional[int] = None,
) -> RAGSearchResult:
    """Build a RAGSearchResult with fixed test defaults.

    Only article, paragraph and page vary per test; everything else is
    a stable DSGVO-flavored placeholder.
    """
    fields = dict(
        text="Test chunk text",
        regulation_code="DSGVO",
        regulation_name="Datenschutz-Grundverordnung",
        regulation_short="DSGVO",
        category="data_protection",
        article=article,
        paragraph=paragraph,
        source_url="https://example.com",
        score=0.95,
        collection="bp_compliance_de",
        page=page,
    )
    return RAGSearchResult(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRAGSearchResultPage:
    """RAGSearchResult now carries a page field."""

    def test_page_default_none(self):
        # page must default to None when not supplied.
        assert _make_chunk().page is None

    def test_page_set(self):
        result = _make_chunk(page=42)
        assert result.page == 42

    def test_page_zero(self):
        # Page 0 is a legitimate value and must not be treated as "unset".
        result = _make_chunk(page=0)
        assert result.page == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestQdrantPayloadPriority:
    """section (D2) should take priority over article (legacy)."""

    @staticmethod
    def _resolve_article(payload: dict) -> str:
        # Mirror of the control generator's priority chain:
        # section > article > section_title.
        return (
            payload.get("section", "")
            or payload.get("article", "")
            or payload.get("section_title", "")
        )

    def test_section_preferred_over_article(self):
        payload = {"section": "§ 312k", "article": "Art. 312", "section_title": "Kuendigungsbutton"}
        assert self._resolve_article(payload) == "§ 312k"

    def test_article_fallback_when_no_section(self):
        payload = {"section": "", "article": "Art. 35", "section_title": ""}
        assert self._resolve_article(payload) == "Art. 35"

    def test_section_title_last_resort(self):
        payload = {"section": "", "article": "", "section_title": "Informationspflichten"}
        assert self._resolve_article(payload) == "Informationspflichten"

    def test_all_empty(self):
        payload = {"section": "", "article": "", "section_title": ""}
        assert self._resolve_article(payload) == ""

    def test_page_from_payload(self):
        payload = {"page": 847}
        assert payload.get("page") == 847

    def test_page_none_from_payload(self):
        # Missing key yields None, matching "no page available".
        payload = {}
        assert payload.get("page") is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestSourceCitationPage:
    """source_citation dict should include page when available."""

    def _build_citation(self, chunk: RAGSearchResult) -> dict:
        """Mirrors the citation-building logic from control_generator.py."""
        citation = {
            "source": chunk.regulation_name,
            "article": chunk.article,
            "paragraph": chunk.paragraph,
            "page": chunk.page,
            "license": "free_use",
            "source_type": "law",
            "url": chunk.source_url or "",
        }
        return citation

    def test_citation_with_page(self):
        citation = self._build_citation(
            _make_chunk(article="§ 312k", paragraph="Abs. 1", page=847)
        )
        assert citation["page"] == 847

    def test_citation_without_page(self):
        citation = self._build_citation(
            _make_chunk(article="§ 312k", paragraph="Abs. 1")
        )
        assert citation["page"] is None

    def test_citation_serializable(self):
        # Round-trip through JSON: the page value must survive intact.
        original = self._build_citation(_make_chunk(article="Art. 35", page=12))
        restored = json.loads(json.dumps(original))
        assert restored["page"] == 12
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatCitation:
    """_format_citation should include page number."""

    def _format_citation(self, citation) -> str:
        """Mirrors _format_citation from decomposition_pass.py."""
        if not citation:
            return ""
        if isinstance(citation, str):
            try:
                parsed = json.loads(citation)
                if isinstance(parsed, dict):
                    pieces = []
                    for key in ("source", "article", "paragraph"):
                        if parsed.get(key):
                            pieces.append(parsed[key])
                    # "is not None" so that page 0 still renders as "S. 0".
                    if parsed.get("page") is not None:
                        pieces.append(f"S. {parsed['page']}")
                    return " ".join(pieces) if pieces else citation
            except (json.JSONDecodeError, TypeError):
                return citation
        return str(citation)

    def test_format_with_page(self):
        payload = json.dumps({
            "source": "DSGVO",
            "article": "Art. 35",
            "paragraph": "Abs. 1",
            "page": 42,
        })
        assert self._format_citation(payload) == "DSGVO Art. 35 Abs. 1 S. 42"

    def test_format_without_page(self):
        payload = json.dumps({
            "source": "BGB",
            "article": "§ 312k",
            "paragraph": "",
        })
        assert self._format_citation(payload) == "BGB § 312k"

    def test_format_page_zero(self):
        payload = json.dumps({
            "source": "BGB",
            "article": "§ 1",
            "paragraph": "",
            "page": 0,
        })
        assert self._format_citation(payload) == "BGB § 1 S. 0"

    def test_format_empty_citation(self):
        assert self._format_citation("") == ""
        assert self._format_citation(None) == ""
|
||||||
@@ -10,8 +10,8 @@ Provides REST endpoints for:
|
|||||||
This service handles all ML-heavy operations, keeping the main klausur-service lightweight.
|
This service handles all ML-heavy operations, keeping the main klausur-service lightweight.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
@@ -282,8 +282,6 @@ ENGLISH_ABBREVIATIONS = {
|
|||||||
ALL_ABBREVIATIONS = GERMAN_ABBREVIATIONS | ENGLISH_ABBREVIATIONS
|
ALL_ABBREVIATIONS = GERMAN_ABBREVIATIONS | ENGLISH_ABBREVIATIONS
|
||||||
|
|
||||||
# Regex pattern for legal section headers (§, Art., Article, Section, etc.)
|
# Regex pattern for legal section headers (§, Art., Article, Section, etc.)
|
||||||
import re
|
|
||||||
|
|
||||||
_LEGAL_SECTION_RE = re.compile(
|
_LEGAL_SECTION_RE = re.compile(
|
||||||
r'^(?:'
|
r'^(?:'
|
||||||
r'§\s*\d+' # § 25, § 5a
|
r'§\s*\d+' # § 25, § 5a
|
||||||
@@ -411,8 +409,6 @@ def _parse_section_metadata(header: str) -> dict:
|
|||||||
# Find which group matched
|
# Find which group matched
|
||||||
for i, g in enumerate(m.groups(), 1):
|
for i, g in enumerate(m.groups(), 1):
|
||||||
if g:
|
if g:
|
||||||
# Reconstruct the section reference
|
|
||||||
prefix = header[:m.start()].strip()
|
|
||||||
section = header[m.start():m.end()].strip()
|
section = header[m.start():m.end()].strip()
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -577,7 +573,14 @@ def chunk_text_legal(text: str, chunk_size: int, overlap: int) -> List[str]:
|
|||||||
if space_idx > 0:
|
if space_idx > 0:
|
||||||
overlap_text = overlap_text[space_idx + 1:]
|
overlap_text = overlap_text[space_idx + 1:]
|
||||||
if overlap_text:
|
if overlap_text:
|
||||||
chunk = overlap_text + ' ' + chunk
|
# Insert overlap AFTER the [§ ...] prefix to preserve it
|
||||||
|
# for structured metadata extraction
|
||||||
|
prefix_match = re.match(r'\[.+?\]\s*', chunk)
|
||||||
|
if prefix_match:
|
||||||
|
pos = prefix_match.end()
|
||||||
|
chunk = chunk[:pos] + overlap_text + ' ' + chunk[pos:]
|
||||||
|
else:
|
||||||
|
chunk = overlap_text + ' ' + chunk
|
||||||
final_chunks.append(chunk.strip())
|
final_chunks.append(chunk.strip())
|
||||||
|
|
||||||
return [c for c in final_chunks if c]
|
return [c for c in final_chunks if c]
|
||||||
@@ -742,13 +745,13 @@ def detect_pdf_backends() -> List[str]:
|
|||||||
available = []
|
available = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from unstructured.partition.pdf import partition_pdf
|
from unstructured.partition.pdf import partition_pdf # noqa: F401
|
||||||
available.append("unstructured")
|
available.append("unstructured")
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader # noqa: F401
|
||||||
available.append("pypdf")
|
available.append("pypdf")
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
@@ -808,7 +811,7 @@ def extract_pdf_unstructured(pdf_content: bytes) -> ExtractPDFResponse:
|
|||||||
import os as os_module
|
import os as os_module
|
||||||
try:
|
try:
|
||||||
os_module.unlink(tmp_path)
|
os_module.unlink(tmp_path)
|
||||||
except:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ Covers:
|
|||||||
- Long sentence force-splitting
|
- Long sentence force-splitting
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pytest
|
|
||||||
from main import (
|
from main import (
|
||||||
chunk_text_legal,
|
chunk_text_legal,
|
||||||
chunk_text_recursive,
|
chunk_text_recursive,
|
||||||
|
|||||||
@@ -0,0 +1,217 @@
|
|||||||
|
"""
|
||||||
|
D4 Validation: BGB § 312k structural chunking test.
|
||||||
|
|
||||||
|
Tests that real German legal text is correctly chunked with structural
|
||||||
|
metadata (section, section_title, paragraph, paragraph_num).
|
||||||
|
This is the gate test before re-ingesting all 297 legal sources.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from main import chunk_text_legal, chunk_text_legal_structured
|
||||||
|
|
||||||
|
FIXTURE_PATH = os.path.join(
|
||||||
|
os.path.dirname(__file__), "tests", "fixtures", "bgb_312_excerpt.txt"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Reasonable defaults for legal text
|
||||||
|
CHUNK_SIZE = 1500
|
||||||
|
OVERLAP = 100
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def bgb_text():
    """Raw text of the BGB §§ 312–312k excerpt fixture."""
    with open(FIXTURE_PATH, encoding="utf-8") as handle:
        return handle.read()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def plain_chunks(bgb_text):
    """String-only chunks of the BGB fixture (legacy chunker)."""
    chunks = chunk_text_legal(bgb_text, CHUNK_SIZE, OVERLAP)
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def structured_chunks(bgb_text):
    """Chunks of the BGB fixture with structural metadata dicts."""
    chunks = chunk_text_legal_structured(bgb_text, CHUNK_SIZE, OVERLAP)
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Basic sanity
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
class TestChunkingSanity:
|
||||||
|
|
||||||
|
def test_fixture_loads(self, bgb_text):
|
||||||
|
assert len(bgb_text) > 2000, "BGB excerpt should be substantial"
|
||||||
|
assert "§ 312k" in bgb_text
|
||||||
|
assert "§ 312 " in bgb_text
|
||||||
|
|
||||||
|
def test_chunk_count_reasonable(self, plain_chunks):
|
||||||
|
assert 4 <= len(plain_chunks) <= 30, (
|
||||||
|
f"Expected 4-30 chunks, got {len(plain_chunks)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_structured_same_count(self, plain_chunks, structured_chunks):
|
||||||
|
assert len(plain_chunks) == len(structured_chunks)
|
||||||
|
|
||||||
|
def test_no_empty_chunks(self, plain_chunks):
|
||||||
|
for i, chunk in enumerate(plain_chunks):
|
||||||
|
assert chunk.strip(), f"Chunk {i} is empty"
|
||||||
|
|
||||||
|
def test_chunk_sizes_reasonable(self, plain_chunks):
|
||||||
|
for i, chunk in enumerate(plain_chunks):
|
||||||
|
assert len(chunk) < 3000, f"Chunk {i} too large: {len(chunk)} chars"
|
||||||
|
assert len(chunk) > 30, f"Chunk {i} too small: {len(chunk)} chars"
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Section detection
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
class TestSectionDetection:
|
||||||
|
|
||||||
|
def test_all_four_sections_detected(self, structured_chunks):
|
||||||
|
"""All 4 BGB sections should appear as section metadata."""
|
||||||
|
found_sections = set()
|
||||||
|
for meta in structured_chunks:
|
||||||
|
if meta["section"]:
|
||||||
|
found_sections.add(meta["section"])
|
||||||
|
|
||||||
|
assert "§ 312" in found_sections or any(
|
||||||
|
s.startswith("§ 312") and s != "§ 312a" and s != "§ 312g" and s != "§ 312k"
|
||||||
|
for s in found_sections
|
||||||
|
), f"§ 312 not found. Sections: {found_sections}"
|
||||||
|
assert "§ 312a" in found_sections, f"§ 312a not found. Sections: {found_sections}"
|
||||||
|
assert "§ 312g" in found_sections, f"§ 312g not found. Sections: {found_sections}"
|
||||||
|
assert "§ 312k" in found_sections, f"§ 312k not found. Sections: {found_sections}"
|
||||||
|
|
||||||
|
def test_section_prefix_in_chunks(self, plain_chunks):
|
||||||
|
"""Most chunks should have [§ ...] prefix."""
|
||||||
|
prefixed = sum(1 for c in plain_chunks if c.startswith("[§"))
|
||||||
|
ratio = prefixed / len(plain_chunks)
|
||||||
|
assert ratio >= 0.8, (
|
||||||
|
f"Only {ratio:.0%} chunks have section prefix (expected >= 80%)"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_312k_has_own_chunk(self, plain_chunks):
|
||||||
|
"""§ 312k must appear as a chunk section header, not merged into another §."""
|
||||||
|
chunks_with_312k = [c for c in plain_chunks if "[§ 312k" in c]
|
||||||
|
assert len(chunks_with_312k) >= 1, (
|
||||||
|
"§ 312k should have at least 1 dedicated chunk"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# § 312k specific metadata
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
class TestSection312k:
|
||||||
|
|
||||||
|
def _312k_chunks(self, structured_chunks):
|
||||||
|
return [m for m in structured_chunks if m["section"] == "§ 312k"]
|
||||||
|
|
||||||
|
def test_312k_section_metadata(self, structured_chunks):
|
||||||
|
"""§ 312k chunks should have section='§ 312k' with a title."""
|
||||||
|
chunks = self._312k_chunks(structured_chunks)
|
||||||
|
assert len(chunks) >= 1, "No chunks with section='§ 312k'"
|
||||||
|
for meta in chunks:
|
||||||
|
assert meta["section"] == "§ 312k"
|
||||||
|
# Title should contain key words
|
||||||
|
title = meta["section_title"].lower()
|
||||||
|
assert "kuendigung" in title or "verbrauchervertrae" in title, (
|
||||||
|
f"Unexpected section_title: {meta['section_title']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_312k_paragraph_extraction(self, structured_chunks):
|
||||||
|
"""At least some § 312k chunks should have paragraph references."""
|
||||||
|
chunks = self._312k_chunks(structured_chunks)
|
||||||
|
paragraphs_found = [m["paragraph"] for m in chunks if m["paragraph"]]
|
||||||
|
# § 312k has (1) through (6), at least some should be detected
|
||||||
|
assert len(paragraphs_found) >= 1, (
|
||||||
|
"No paragraph references found in § 312k chunks"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_312k_content_present(self, structured_chunks):
|
||||||
|
"""§ 312k chunk text should contain key legal terms."""
|
||||||
|
chunks = self._312k_chunks(structured_chunks)
|
||||||
|
all_text = " ".join(m["text"] for m in chunks)
|
||||||
|
assert "Kuendigungsschaltflaeche" in all_text or "kuendigen" in all_text.lower()
|
||||||
|
assert "Webseite" in all_text or "elektronischen" in all_text
|
||||||
|
|
||||||
|
def test_312k_not_merged_with_312g(self, structured_chunks):
|
||||||
|
"""§ 312k and § 312g should be separate sections, not merged."""
|
||||||
|
sections_312g = [m for m in structured_chunks if m["section"] == "§ 312g"]
|
||||||
|
sections_312k = self._312k_chunks(structured_chunks)
|
||||||
|
assert len(sections_312g) >= 1, "§ 312g missing"
|
||||||
|
assert len(sections_312k) >= 1, "§ 312k missing"
|
||||||
|
# Verify they are different chunks (no overlap in indices)
|
||||||
|
g_indices = {m["index"] for m in sections_312g}
|
||||||
|
k_indices = {m["index"] for m in sections_312k}
|
||||||
|
assert g_indices.isdisjoint(k_indices), (
|
||||||
|
f"§ 312g and § 312k share chunk indices: {g_indices & k_indices}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Metadata quality across all sections
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
class TestMetadataQuality:
|
||||||
|
|
||||||
|
def test_most_chunks_have_section(self, structured_chunks):
|
||||||
|
"""At least 90% of chunks should have a section reference."""
|
||||||
|
with_section = sum(1 for m in structured_chunks if m["section"])
|
||||||
|
ratio = with_section / len(structured_chunks)
|
||||||
|
assert ratio >= 0.9, (
|
||||||
|
f"Only {ratio:.0%} chunks have section metadata (expected >= 90%)"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_section_titles_not_empty(self, structured_chunks):
|
||||||
|
"""Chunks with a section should also have a section_title."""
|
||||||
|
for meta in structured_chunks:
|
||||||
|
if meta["section"]:
|
||||||
|
assert meta["section_title"], (
|
||||||
|
f"Chunk {meta['index']} has section={meta['section']} but no title"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_paragraph_nums_are_integers(self, structured_chunks):
|
||||||
|
"""paragraph_num should be int or None, never str."""
|
||||||
|
for meta in structured_chunks:
|
||||||
|
pn = meta["paragraph_num"]
|
||||||
|
assert pn is None or isinstance(pn, int), (
|
||||||
|
f"Chunk {meta['index']}: paragraph_num={pn!r} (type={type(pn).__name__})"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_indices_sequential(self, structured_chunks):
|
||||||
|
"""Chunk indices should be 0, 1, 2, ... in order."""
|
||||||
|
for i, meta in enumerate(structured_chunks):
|
||||||
|
assert meta["index"] == i, (
|
||||||
|
f"Expected index {i}, got {meta['index']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Edge cases
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
class TestEdgeCases:
|
||||||
|
|
||||||
|
def test_numbered_list_not_false_section(self, structured_chunks):
|
||||||
|
"""Numbered items (1., 2., 3.) inside a § should NOT create new sections."""
|
||||||
|
for meta in structured_chunks:
|
||||||
|
section = meta["section"]
|
||||||
|
# Section should always start with § or be empty
|
||||||
|
if section:
|
||||||
|
assert section.startswith("§"), (
|
||||||
|
f"Unexpected section format: {section!r}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_subsection_letters_preserved(self, plain_chunks):
|
||||||
|
"""Lettered subsections (a, b, c, d, e) in § 312k(2) should be in the text."""
|
||||||
|
all_text = " ".join(plain_chunks)
|
||||||
|
# § 312k Abs 2 Nr 1 has a) through e)
|
||||||
|
for letter in ["a)", "b)", "c)", "d)", "e)"]:
|
||||||
|
assert letter in all_text, (
|
||||||
|
f"Subsection letter {letter} from § 312k(2) missing"
|
||||||
|
)
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
§ 312 Anwendungsbereich
|
||||||
|
|
||||||
|
(1) Die Vorschriften der Kapitel 1 und 2 dieses Untertitels sind auf Verbrauchervertraege anzuwenden, bei denen sich der Verbraucher zu der Zahlung eines Preises verpflichtet.
|
||||||
|
|
||||||
|
(1a) Die Vorschriften der Kapitel 1 und 2 dieses Untertitels sind auch auf Verbrauchervertraege anzuwenden, bei denen der Verbraucher dem Unternehmer personenbezogene Daten bereitstellt oder sich hierzu verpflichtet. Dies gilt nicht, wenn der Unternehmer die vom Verbraucher bereitgestellten personenbezogenen Daten ausschliesslich verarbeitet, um seine Leistungspflicht oder an ihn gestellte rechtliche Anforderungen zu erfuellen, und sie zu keinem anderen Zweck verarbeitet.
|
||||||
|
|
||||||
|
(2) Von den Vorschriften der Kapitel 1 und 2 dieses Untertitels ist nur § 312a Absatz 1, 3, 4 und 6 auf folgende Vertraege anzuwenden:
|
||||||
|
1. notariell beurkundete Vertraege
|
||||||
|
2. Vertraege ueber die Begruendung, den Erwerb oder die Uebertragung von Eigentum oder anderen Rechten an Grundstuecken
|
||||||
|
3. Vertraege ueber den Bau von neuen Gebaeuden oder erhebliche Umbaumassnahmen an bestehenden Gebaeuden
|
||||||
|
4. Vertraege ueber Reiseleistungen nach § 651a
|
||||||
|
5. Vertraege ueber die Befoerderung von Personen
|
||||||
|
6. Vertraege, die unter Einsatz von Warenautomaten oder automatisierten Geschaeftsraeumen geschlossen werden
|
||||||
|
|
||||||
|
§ 312a Allgemeine Pflichten und Grundsaetze bei Verbrauchervertraegen
|
||||||
|
|
||||||
|
(1) Ruft der Unternehmer oder eine Person, die in seinem Namen oder Auftrag handelt, den Verbraucher an, um mit diesem einen Vertrag zu schliessen, hat der Anrufer zu Beginn des Gespraechs seine Identitaet und gegebenenfalls die Identitaet der Person, fuer die er anruft, sowie den geschaeftlichen Zweck des Anrufs offenzulegen.
|
||||||
|
|
||||||
|
(2) Der Unternehmer ist verpflichtet, den Verbraucher nach Massgabe des Artikels 246 des Einfuehrungsgesetzes zum Buergerlichen Gesetzbuche zu informieren. Der Unternehmer kann von dem Verbraucher Fracht-, Liefer- oder Versandkosten und sonstige Kosten nur verlangen, soweit er den Verbraucher ueber diese Kosten entsprechend den Anforderungen aus Artikel 246 Absatz 1 Nummer 3 des Einfuehrungsgesetzes zum Buergerlichen Gesetzbuche informiert hat. Die Saetze 1 und 2 sind weder auf ausserhalb von Geschaeftsraeumen geschlossene Vertraege noch auf Fernabsatzvertraege noch auf Vertraege ueber Finanzdienstleistungen anzuwenden.
|
||||||
|
|
||||||
|
(3) Eine Vereinbarung, die auf eine ueber das vereinbarte Entgelt fuer die Hauptleistung hinausgehende Zahlung des Verbrauchers gerichtet ist, kann ein Unternehmer mit einem Verbraucher nur ausdruecklich treffen. Schliesst der Unternehmer und der Verbraucher einen Vertrag im elektronischen Geschaeftsverkehr, wird eine solche Vereinbarung nur Vertragsbestandteil, wenn der Unternehmer die Vereinbarung nicht durch eine Voreinstellung herbeifuehrt.
|
||||||
|
|
||||||
|
(4) Eine Vereinbarung, durch die ein Verbraucher verpflichtet wird, ein Entgelt dafuer zu zahlen, dass der Verbraucher fuer die Erfuellung seiner vertraglichen Pflichten ein bestimmtes Zahlungsmittel nutzt, ist unwirksam, wenn fuer den Verbraucher keine zumutbare und gaengige unentgeltliche Zahlungsmoeglichkeit besteht oder das vereinbarte Entgelt ueber die Kosten hinausgeht, die dem Unternehmer durch die Nutzung des Zahlungsmittels entstehen.
|
||||||
|
|
||||||
|
(5) Eine Vereinbarung, durch die ein Verbraucher verpflichtet wird, ein Entgelt dafuer zu zahlen, dass der Verbraucher den Unternehmer wegen Fragen oder Erklaerungen zu einem zwischen ihnen geschlossenen Vertrag ueber eine Rufnummer anruft, die der Unternehmer fuer solche Zwecke bereithaelt, ist unwirksam, wenn das vereinbarte Entgelt das Entgelt fuer die blosse Nutzung des Telekommunikationsdienstes uebersteigt.
|
||||||
|
|
||||||
|
(6) Ist eine Vereinbarung nach den Absaetzen 3 bis 5 nicht Vertragsbestandteil geworden oder ist sie unwirksam, bleibt der Vertrag im Uebrigen wirksam.
|
||||||
|
|
||||||
|
§ 312g Widerrufsrecht
|
||||||
|
|
||||||
|
(1) Dem Verbraucher steht bei ausserhalb von Geschaeftsraeumen geschlossenen Vertraegen und bei Fernabsatzvertraegen ein Widerrufsrecht gemaess § 355 zu.
|
||||||
|
|
||||||
|
(2) Das Widerrufsrecht besteht, soweit die Parteien nichts anderes vereinbart haben, nicht bei folgenden Vertraegen:
|
||||||
|
1. Vertraege zur Lieferung von Waren, die nicht vorgefertigt sind und fuer deren Herstellung eine individuelle Auswahl oder Bestimmung durch den Verbraucher massgeblich ist oder die eindeutig auf die persoenlichen Beduerfnisse des Verbrauchers zugeschnitten sind,
|
||||||
|
2. Vertraege zur Lieferung von Waren, die schnell verderben koennen oder deren Verfallsdatum schnell ueberschritten wuerde,
|
||||||
|
3. Vertraege zur Lieferung versiegelter Waren, die aus Gruenden des Gesundheitsschutzes oder der Hygiene nicht zur Rueckgabe geeignet sind, wenn ihre Versiegelung nach der Lieferung entfernt wurde.
|
||||||
|
|
||||||
|
(3) Das Widerrufsrecht besteht ferner nicht bei Vertraegen, bei denen dem Verbraucher bereits auf Grund der §§ 495, 506 bis 513 ein Widerrufsrecht zusteht.
|
||||||
|
|
||||||
|
§ 312k Kuendigung von Verbrauchervertraegen im elektronischen Geschaeftsverkehr
|
||||||
|
|
||||||
|
(1) Wird Verbrauchern ueber eine Webseite ermoeglicht, einen Vertrag im elektronischen Geschaeftsverkehr zu schliessen, der auf die Begruendung eines Dauerschuldverhaeltnisses gerichtet ist, das einen Unternehmer zu einer entgeltlichen Leistung verpflichtet, so treffen den Unternehmer die Pflichten nach dieser Vorschrift. Dies gilt nicht
|
||||||
|
1. fuer Vertraege, fuer deren Kuendigung gesetzlich ausschliesslich eine strengere Form als die Textform vorgesehen ist, und
|
||||||
|
2. in Bezug auf Webseiten, die Finanzdienstleistungen betreffen, oder fuer Vertraege ueber Finanzdienstleistungen.
|
||||||
|
|
||||||
|
(2) Der Unternehmer hat sicherzustellen, dass der Verbraucher auf der Webseite eine Erklaerung zur ordentlichen oder ausserordentlichen Kuendigung eines auf der Webseite abschliessbaren Vertrags nach Absatz 1 Satz 1 ueber eine Kuendigungsschaltflaeche abgeben kann. Die Kuendigungsschaltflaeche muss gut lesbar mit nichts anderem als den Woertern "Vertraege hier kuendigen" oder mit einer entsprechenden eindeutigen Formulierung beschriftet sein. Sie muss den Verbraucher unmittelbar zu einer Bestaetigungsseite fuehren, die
|
||||||
|
1. den Verbraucher auffordert und ihm ermoeglicht Angaben zu machen
|
||||||
|
a) zur Art der Kuendigung sowie im Falle der ausserordentlichen Kuendigung zum Kuendigungsgrund,
|
||||||
|
b) zu seiner eindeutigen Identifizierbarkeit,
|
||||||
|
c) zur eindeutigen Bezeichnung des Vertrags,
|
||||||
|
d) zum Zeitpunkt, zu dem die Kuendigung das Vertragsverhaeltnis beenden soll,
|
||||||
|
e) zur schnellen elektronischen Uebermittlung der Kuendigungsbestaetigung an ihn und
|
||||||
|
2. eine Bestaetigungsschaltflaeche enthaelt, ueber deren Betaetigung der Verbraucher die Kuendigungserklaerung abgeben kann und die gut lesbar mit nichts anderem als den Woertern "jetzt kuendigen" oder mit einer entsprechenden eindeutigen Formulierung beschriftet ist.
|
||||||
|
Die Schaltflaechen und die Bestaetigungsseite muessen staendig verfuegbar sowie unmittelbar und leicht zugaenglich sein.
|
||||||
|
|
||||||
|
(3) Der Verbraucher muss seine durch das Betaetigen der Bestaetigungsschaltflaeche abgegebene Kuendigungserklaerung mit dem Datum und der Uhrzeit der Abgabe auf einem dauerhaften Datentraeger so speichern koennen, dass erkennbar ist, dass die Kuendigungserklaerung durch das Betaetigen der Bestaetigungsschaltflaeche abgegeben wurde.
|
||||||
|
|
||||||
|
(4) Der Unternehmer hat dem Verbraucher den Inhalt sowie Datum und Uhrzeit des Zugangs der Kuendigungserklaerung sowie den Zeitpunkt, zu dem das Vertragsverhaeltnis durch die Kuendigung beendet werden soll, sofort auf elektronischem Wege in Textform zu bestaetigen. Es wird vermutet, dass eine durch das Betaetigen der Bestaetigungsschaltflaeche abgegebene Kuendigungserklaerung dem Unternehmer unmittelbar nach ihrer Abgabe zugegangen ist.
|
||||||
|
|
||||||
|
(5) Wenn der Verbraucher bei der Abgabe der Kuendigungserklaerung keinen Zeitpunkt angibt, zu dem die Kuendigung das Vertragsverhaeltnis beenden soll, wirkt die Kuendigung im Zweifel zum fruehestmoeglichen Zeitpunkt.
|
||||||
|
|
||||||
|
(6) Werden die Schaltflaechen und die Bestaetigungsseite nicht entsprechend den Absaetzen 1 und 2 zur Verfuegung gestellt, kann ein Verbraucher einen Vertrag, fuer dessen Kuendigung die Schaltflaechen und die Bestaetigungsseite zur Verfuegung zu stellen sind, jederzeit und ohne Einhaltung einer Kuendigungsfrist kuendigen. Die Moeglichkeit des Verbrauchers zur ausserordentlichen Kuendigung bleibt hiervon unberuehrt.
|
||||||
@@ -14,6 +14,9 @@ logger = logging.getLogger("rag-service.api.documents")
|
|||||||
|
|
||||||
router = APIRouter(prefix="/api/v1/documents")
|
router = APIRouter(prefix="/api/v1/documents")
|
||||||
|
|
||||||
|
# Structural metadata fields from embedding-service chunks_with_metadata (D2)
|
||||||
|
_STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
|
||||||
|
|
||||||
|
|
||||||
# ---- Request / Response models --------------------------------------------
|
# ---- Request / Response models --------------------------------------------
|
||||||
|
|
||||||
@@ -110,7 +113,7 @@ async def upload_document(
|
|||||||
|
|
||||||
# --- Chunk ---
|
# --- Chunk ---
|
||||||
try:
|
try:
|
||||||
chunks = await embedding_client.chunk_text(
|
chunk_result = await embedding_client.chunk_text(
|
||||||
text=text,
|
text=text,
|
||||||
strategy=chunk_strategy,
|
strategy=chunk_strategy,
|
||||||
chunk_size=chunk_size,
|
chunk_size=chunk_size,
|
||||||
@@ -120,6 +123,9 @@ async def upload_document(
|
|||||||
logger.error("Chunking failed: %s", exc)
|
logger.error("Chunking failed: %s", exc)
|
||||||
raise HTTPException(status_code=500, detail=f"Chunking failed: {exc}")
|
raise HTTPException(status_code=500, detail=f"Chunking failed: {exc}")
|
||||||
|
|
||||||
|
chunks = chunk_result.chunks
|
||||||
|
chunks_meta = chunk_result.chunks_with_metadata
|
||||||
|
|
||||||
if not chunks:
|
if not chunks:
|
||||||
raise HTTPException(status_code=400, detail="Chunking produced zero chunks")
|
raise HTTPException(status_code=400, detail="Chunking produced zero chunks")
|
||||||
|
|
||||||
@@ -154,6 +160,13 @@ async def upload_document(
|
|||||||
"year": year,
|
"year": year,
|
||||||
**extra_metadata,
|
**extra_metadata,
|
||||||
}
|
}
|
||||||
|
# Merge structural metadata from embedding service (D2)
|
||||||
|
if i < len(chunks_meta):
|
||||||
|
meta = chunks_meta[i]
|
||||||
|
for field in _STRUCT_FIELDS:
|
||||||
|
value = meta.get(field)
|
||||||
|
if value is not None and value != "":
|
||||||
|
payload[field] = value
|
||||||
payloads.append(payload)
|
payloads.append(payload)
|
||||||
|
|
||||||
# --- Index in Qdrant ---
|
# --- Index in Qdrant ---
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import Optional
|
from dataclasses import dataclass
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
@@ -19,6 +19,14 @@ _OLLAMA_EMBED_MODEL = os.getenv("OLLAMA_EMBED_MODEL", "bge-m3")
|
|||||||
_EMBED_BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", "32"))
|
_EMBED_BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", "32"))
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ChunkResult:
|
||||||
|
"""Result from the embedding service /chunk endpoint."""
|
||||||
|
|
||||||
|
chunks: list[str]
|
||||||
|
chunks_with_metadata: list[dict]
|
||||||
|
|
||||||
|
|
||||||
class EmbeddingClient:
|
class EmbeddingClient:
|
||||||
"""
|
"""
|
||||||
Hybrid client:
|
Hybrid client:
|
||||||
@@ -120,10 +128,10 @@ class EmbeddingClient:
|
|||||||
strategy: str = "recursive",
|
strategy: str = "recursive",
|
||||||
chunk_size: int = 512,
|
chunk_size: int = 512,
|
||||||
overlap: int = 50,
|
overlap: int = 50,
|
||||||
) -> list[str]:
|
) -> ChunkResult:
|
||||||
"""
|
"""
|
||||||
Ask the embedding service to chunk a long text.
|
Ask the embedding service to chunk a long text.
|
||||||
Returns a list of chunk strings.
|
Returns ChunkResult with plain chunks and structural metadata.
|
||||||
"""
|
"""
|
||||||
async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
|
async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
|
||||||
response = await client.post(
|
response = await client.post(
|
||||||
@@ -137,7 +145,10 @@ class EmbeddingClient:
|
|||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
data = response.json()
|
data = response.json()
|
||||||
return data.get("chunks", [])
|
return ChunkResult(
|
||||||
|
chunks=data.get("chunks", []),
|
||||||
|
chunks_with_metadata=data.get("chunks_with_metadata") or [],
|
||||||
|
)
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# PDF extraction (via embedding-service)
|
# PDF extraction (via embedding-service)
|
||||||
|
|||||||
@@ -0,0 +1,7 @@
|
|||||||
|
"""Shared test fixtures for rag-service tests."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Ensure rag-service root is on sys.path so imports resolve
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||||
@@ -0,0 +1,172 @@
|
|||||||
|
"""Tests for document upload payload building — structural metadata (D2)."""
|
||||||
|
|
||||||
|
# Mirror the constant from api/documents.py to avoid heavy import chain
|
||||||
|
# (api → jose, qdrant_client, minio, etc.)
|
||||||
|
_STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
|
||||||
|
|
||||||
|
|
||||||
|
def _build_payload(
|
||||||
|
chunk: str,
|
||||||
|
index: int,
|
||||||
|
chunks_meta: list[dict],
|
||||||
|
extra_metadata: "dict | None" = None,
|
||||||
|
) -> dict:
|
||||||
|
"""Replicate the payload-building logic from documents.py for unit testing."""
|
||||||
|
payload = {
|
||||||
|
"document_id": "test-doc-id",
|
||||||
|
"object_name": "test/path.pdf",
|
||||||
|
"filename": "path.pdf",
|
||||||
|
"chunk_index": index,
|
||||||
|
"chunk_text": chunk,
|
||||||
|
"data_type": "law",
|
||||||
|
"bundesland": "bund",
|
||||||
|
"use_case": "compliance",
|
||||||
|
"year": "2026",
|
||||||
|
**(extra_metadata or {}),
|
||||||
|
}
|
||||||
|
if index < len(chunks_meta):
|
||||||
|
meta = chunks_meta[index]
|
||||||
|
for field in _STRUCT_FIELDS:
|
||||||
|
value = meta.get(field)
|
||||||
|
if value is not None and value != "":
|
||||||
|
payload[field] = value
|
||||||
|
return payload
|
||||||
|
|
||||||
|
|
||||||
|
class TestPayloadStructuralMetadata:
|
||||||
|
"""Tests for structural metadata merging into Qdrant payloads."""
|
||||||
|
|
||||||
|
def test_payload_contains_structural_metadata(self):
|
||||||
|
"""Metadata fields from chunks_with_metadata land in the payload."""
|
||||||
|
meta = [
|
||||||
|
{
|
||||||
|
"text": "chunk text",
|
||||||
|
"section": "§ 312k",
|
||||||
|
"section_title": "Kuendigungsbutton",
|
||||||
|
"paragraph": "Abs. 1",
|
||||||
|
"paragraph_num": 1,
|
||||||
|
"page": 847,
|
||||||
|
"index": 0,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
payload = _build_payload("chunk text", 0, meta)
|
||||||
|
|
||||||
|
assert payload["section"] == "§ 312k"
|
||||||
|
assert payload["section_title"] == "Kuendigungsbutton"
|
||||||
|
assert payload["paragraph"] == "Abs. 1"
|
||||||
|
assert payload["paragraph_num"] == 1
|
||||||
|
assert payload["page"] == 847
|
||||||
|
|
||||||
|
def test_payload_without_metadata_backwards_compat(self):
|
||||||
|
"""Empty metadata list → payload has no structural fields."""
|
||||||
|
payload = _build_payload("chunk text", 0, [])
|
||||||
|
|
||||||
|
for field in _STRUCT_FIELDS:
|
||||||
|
assert field not in payload
|
||||||
|
|
||||||
|
def test_payload_skips_empty_values(self):
|
||||||
|
"""Empty string and None values are NOT added to payload."""
|
||||||
|
meta = [
|
||||||
|
{
|
||||||
|
"text": "chunk text",
|
||||||
|
"section": "",
|
||||||
|
"section_title": "",
|
||||||
|
"paragraph": "",
|
||||||
|
"paragraph_num": None,
|
||||||
|
"page": None,
|
||||||
|
"index": 0,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
payload = _build_payload("chunk text", 0, meta)
|
||||||
|
|
||||||
|
for field in _STRUCT_FIELDS:
|
||||||
|
assert field not in payload
|
||||||
|
|
||||||
|
def test_metadata_overrides_extra_metadata(self):
|
||||||
|
"""Auto-extracted metadata takes precedence over manual extra_metadata."""
|
||||||
|
meta = [
|
||||||
|
{
|
||||||
|
"text": "chunk text",
|
||||||
|
"section": "§ 25",
|
||||||
|
"section_title": "",
|
||||||
|
"paragraph": "",
|
||||||
|
"paragraph_num": None,
|
||||||
|
"page": None,
|
||||||
|
"index": 0,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
extra = {"section": "manual-value"}
|
||||||
|
|
||||||
|
payload = _build_payload("chunk text", 0, meta, extra_metadata=extra)
|
||||||
|
|
||||||
|
assert payload["section"] == "§ 25"
|
||||||
|
|
||||||
|
def test_partial_metadata_alignment(self):
|
||||||
|
"""3 chunks but only 2 metadata entries → third payload has no structural fields."""
|
||||||
|
meta = [
|
||||||
|
{
|
||||||
|
"text": "c1",
|
||||||
|
"section": "§ 1",
|
||||||
|
"section_title": "",
|
||||||
|
"paragraph": "",
|
||||||
|
"paragraph_num": None,
|
||||||
|
"page": None,
|
||||||
|
"index": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": "c2",
|
||||||
|
"section": "§ 2",
|
||||||
|
"section_title": "",
|
||||||
|
"paragraph": "",
|
||||||
|
"paragraph_num": None,
|
||||||
|
"page": None,
|
||||||
|
"index": 1,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
p0 = _build_payload("c1", 0, meta)
|
||||||
|
p1 = _build_payload("c2", 1, meta)
|
||||||
|
p2 = _build_payload("c3", 2, meta)
|
||||||
|
|
||||||
|
assert p0["section"] == "§ 1"
|
||||||
|
assert p1["section"] == "§ 2"
|
||||||
|
assert "section" not in p2
|
||||||
|
|
||||||
|
def test_zero_paragraph_num_is_kept(self):
|
||||||
|
"""paragraph_num=0 is a valid value and should be stored."""
|
||||||
|
meta = [
|
||||||
|
{
|
||||||
|
"text": "chunk",
|
||||||
|
"section": "",
|
||||||
|
"section_title": "",
|
||||||
|
"paragraph": "",
|
||||||
|
"paragraph_num": 0,
|
||||||
|
"page": None,
|
||||||
|
"index": 0,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
payload = _build_payload("chunk", 0, meta)
|
||||||
|
|
||||||
|
# 0 is not None and not "" → should be stored
|
||||||
|
assert payload["paragraph_num"] == 0
|
||||||
|
|
||||||
|
def test_page_zero_is_kept(self):
|
||||||
|
"""page=0 is a valid value (first page) and should be stored."""
|
||||||
|
meta = [
|
||||||
|
{
|
||||||
|
"text": "chunk",
|
||||||
|
"section": "",
|
||||||
|
"section_title": "",
|
||||||
|
"paragraph": "",
|
||||||
|
"paragraph_num": None,
|
||||||
|
"page": 0,
|
||||||
|
"index": 0,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
payload = _build_payload("chunk", 0, meta)
|
||||||
|
|
||||||
|
assert payload["page"] == 0
|
||||||
@@ -0,0 +1,135 @@
|
|||||||
|
"""Tests for EmbeddingClient.chunk_text() — ChunkResult with metadata (D2)."""
|
||||||
|
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from embedding_client import ChunkResult, EmbeddingClient
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def client():
|
||||||
|
with patch("embedding_client.settings") as mock_settings:
|
||||||
|
mock_settings.EMBEDDING_SERVICE_URL = "http://localhost:8087"
|
||||||
|
return EmbeddingClient()
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_response(json_data: dict, status_code: int = 200):
|
||||||
|
"""Create a mock httpx response (sync methods like .json() and .raise_for_status())."""
|
||||||
|
resp = MagicMock()
|
||||||
|
resp.status_code = status_code
|
||||||
|
resp.json.return_value = json_data
|
||||||
|
return resp
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_chunk_text_returns_chunk_result(client):
|
||||||
|
"""chunk_text returns ChunkResult with both chunks and metadata."""
|
||||||
|
mock_json = {
|
||||||
|
"chunks": ["chunk1 text", "chunk2 text"],
|
||||||
|
"chunks_with_metadata": [
|
||||||
|
{
|
||||||
|
"text": "chunk1 text",
|
||||||
|
"section": "§ 25",
|
||||||
|
"section_title": "Informationspflichten",
|
||||||
|
"paragraph": "Abs. 1",
|
||||||
|
"paragraph_num": 1,
|
||||||
|
"page": None,
|
||||||
|
"index": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": "chunk2 text",
|
||||||
|
"section": "§ 25",
|
||||||
|
"section_title": "Informationspflichten",
|
||||||
|
"paragraph": "Abs. 2",
|
||||||
|
"paragraph_num": 2,
|
||||||
|
"page": None,
|
||||||
|
"index": 1,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"count": 2,
|
||||||
|
"strategy": "recursive",
|
||||||
|
}
|
||||||
|
|
||||||
|
with patch("httpx.AsyncClient") as mock_client_cls:
|
||||||
|
mock_client = AsyncMock()
|
||||||
|
mock_client.post.return_value = _mock_response(mock_json)
|
||||||
|
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
||||||
|
mock_client.__aexit__ = AsyncMock(return_value=False)
|
||||||
|
mock_client_cls.return_value = mock_client
|
||||||
|
|
||||||
|
result = await client.chunk_text("some legal text")
|
||||||
|
|
||||||
|
assert isinstance(result, ChunkResult)
|
||||||
|
assert result.chunks == ["chunk1 text", "chunk2 text"]
|
||||||
|
assert len(result.chunks_with_metadata) == 2
|
||||||
|
assert result.chunks_with_metadata[0]["section"] == "§ 25"
|
||||||
|
assert result.chunks_with_metadata[1]["paragraph"] == "Abs. 2"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_chunk_text_without_metadata_field(client):
|
||||||
|
"""Embedding service response without chunks_with_metadata → empty list."""
|
||||||
|
mock_json = {
|
||||||
|
"chunks": ["chunk1"],
|
||||||
|
"count": 1,
|
||||||
|
"strategy": "semantic",
|
||||||
|
}
|
||||||
|
|
||||||
|
with patch("httpx.AsyncClient") as mock_client_cls:
|
||||||
|
mock_client = AsyncMock()
|
||||||
|
mock_client.post.return_value = _mock_response(mock_json)
|
||||||
|
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
||||||
|
mock_client.__aexit__ = AsyncMock(return_value=False)
|
||||||
|
mock_client_cls.return_value = mock_client
|
||||||
|
|
||||||
|
result = await client.chunk_text("text", strategy="semantic")
|
||||||
|
|
||||||
|
assert isinstance(result, ChunkResult)
|
||||||
|
assert result.chunks == ["chunk1"]
|
||||||
|
assert result.chunks_with_metadata == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_chunk_text_with_null_metadata(client):
|
||||||
|
"""chunks_with_metadata: null in response → empty list."""
|
||||||
|
mock_json = {
|
||||||
|
"chunks": ["chunk1"],
|
||||||
|
"chunks_with_metadata": None,
|
||||||
|
"count": 1,
|
||||||
|
"strategy": "recursive",
|
||||||
|
}
|
||||||
|
|
||||||
|
with patch("httpx.AsyncClient") as mock_client_cls:
|
||||||
|
mock_client = AsyncMock()
|
||||||
|
mock_client.post.return_value = _mock_response(mock_json)
|
||||||
|
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
||||||
|
mock_client.__aexit__ = AsyncMock(return_value=False)
|
||||||
|
mock_client_cls.return_value = mock_client
|
||||||
|
|
||||||
|
result = await client.chunk_text("text")
|
||||||
|
|
||||||
|
assert result.chunks_with_metadata == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_chunk_text_empty(client):
|
||||||
|
"""Empty text → empty chunks and metadata."""
|
||||||
|
mock_json = {
|
||||||
|
"chunks": [],
|
||||||
|
"chunks_with_metadata": [],
|
||||||
|
"count": 0,
|
||||||
|
"strategy": "recursive",
|
||||||
|
}
|
||||||
|
|
||||||
|
with patch("httpx.AsyncClient") as mock_client_cls:
|
||||||
|
mock_client = AsyncMock()
|
||||||
|
mock_client.post.return_value = _mock_response(mock_json)
|
||||||
|
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
||||||
|
mock_client.__aexit__ = AsyncMock(return_value=False)
|
||||||
|
mock_client_cls.return_value = mock_client
|
||||||
|
|
||||||
|
result = await client.chunk_text("")
|
||||||
|
|
||||||
|
assert result.chunks == []
|
||||||
|
assert result.chunks_with_metadata == []
|
||||||
Reference in New Issue
Block a user