From 510f513811b2bf103041b7dfffc4a2e561420c10 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 6 May 2026 14:28:32 +0200 Subject: [PATCH] fix: Qdrant search uses chunk_text + section/category filter Payload structure: chunk_text (not text), section (Article 13), category, regulation_id. Scrolls 100 points per collection, filters client-side against regulation keywords. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../services/rag_document_checker.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/backend-compliance/compliance/services/rag_document_checker.py b/backend-compliance/compliance/services/rag_document_checker.py index 6187328..b01a45a 100644 --- a/backend-compliance/compliance/services/rag_document_checker.py +++ b/backend-compliance/compliance/services/rag_document_checker.py @@ -116,41 +116,51 @@ async def _search_via_sdk(regulations: list[str], top_k: int) -> list[dict]: async def _search_via_qdrant(regulations: list[str], top_k: int) -> list[dict]: - """Search directly in local Qdrant — keyword scroll with filter.""" + """Search directly in local Qdrant — scroll with payload filter.""" try: - # Search in multiple collections all_results = [] - for collection in ["bp_compliance_datenschutz", "bp_compliance_gesetze", "atomic_controls_dedup"]: + collections = ["bp_compliance_datenschutz", "bp_compliance_gesetze"] + + for collection in collections: + # Scroll through points, filter by section/regulation matching async with httpx.AsyncClient(timeout=10.0) as client: - # Scroll with text filter (Qdrant scroll endpoint) resp = await client.post(f"{QDRANT_URL}/collections/{collection}/points/scroll", json={ - "limit": top_k, + "limit": 100, # Fetch more, filter client-side "with_payload": True, "with_vector": False, }) if resp.status_code != 200: continue + data = resp.json() for point in data.get("result", {}).get("points", []): payload = point.get("payload", {}) - text = payload.get("text", "") or payload.get("content", "") or payload.get("chunk_text", "") - if not text: + chunk = payload.get("chunk_text", "") + section = payload.get("section", "") + category = payload.get("category", "") + reg_id = payload.get("regulation_id", "") + section_title = payload.get("section_title", "") + + if not chunk or len(chunk) < 50: continue - # Filter: only keep results that mention our regulations - text_lower = text.lower() - reg_match = any( - r.lower().replace("§", "").replace("art.", "art").strip() in text_lower + + # Match against regulation keywords + searchable = f"{section} {category} {reg_id} {section_title} {chunk[:200]}".lower() + matched = any( + kw.lower() in searchable for r in regulations + for kw in [r, r.replace("Art. ", "Article "), r.replace("§", "")] ) - if reg_match and len(text) > 50: + if matched: all_results.append({ - "text": text[:500], - "regulation": payload.get("regulation_code", "") or payload.get("regulation_short", ""), - "article": payload.get("article", ""), + "text": chunk[:500], + "regulation": reg_id or section or category, + "article": section, "score": 0.5, }) - logger.info("Qdrant direct search: found %d controls", len(all_results)) + logger.info("Qdrant direct search: found %d controls from %d collections", + len(all_results), len(collections)) return all_results[:top_k] except Exception as e: