From 510f513811b2bf103041b7dfffc4a2e561420c10 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Wed, 6 May 2026 14:28:32 +0200
Subject: [PATCH] fix: Qdrant search uses chunk_text + section/category filter

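Read the chunk from the `chunk_text` payload field (not `text`) and
match regulation keywords against `section` (e.g. "Article 13"),
`category`, `regulation_id`, `section_title` and the first 200
characters of the chunk. Scroll up to 100 points per collection and
filter client-side, then return the first `top_k` matches. The
`atomic_controls_dedup` collection is no longer searched.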

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../services/rag_document_checker.py          | 42 ++++++++++++-------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/backend-compliance/compliance/services/rag_document_checker.py b/backend-compliance/compliance/services/rag_document_checker.py
index 6187328..b01a45a 100644
--- a/backend-compliance/compliance/services/rag_document_checker.py
+++ b/backend-compliance/compliance/services/rag_document_checker.py
@@ -116,41 +116,51 @@ async def _search_via_sdk(regulations: list[str], top_k: int) -> list[dict]:
 
 
 async def _search_via_qdrant(regulations: list[str], top_k: int) -> list[dict]:
-    """Search directly in local Qdrant — keyword scroll with filter."""
+    """Search directly in local Qdrant — scroll with payload filter."""
     try:
-        # Search in multiple collections
         all_results = []
-        for collection in ["bp_compliance_datenschutz", "bp_compliance_gesetze", "atomic_controls_dedup"]:
+        collections = ["bp_compliance_datenschutz", "bp_compliance_gesetze"]
+
+        for collection in collections:
+            # Scroll through points, filter by section/regulation matching
             async with httpx.AsyncClient(timeout=10.0) as client:
-                # Scroll with text filter (Qdrant scroll endpoint)
                 resp = await client.post(f"{QDRANT_URL}/collections/{collection}/points/scroll", json={
-                    "limit": top_k,
+                    "limit": 100,  # Fetch more, filter client-side
                     "with_payload": True,
                     "with_vector": False,
                 })
             if resp.status_code != 200:
                 continue
+
             data = resp.json()
             for point in data.get("result", {}).get("points", []):
                 payload = point.get("payload", {})
-                text = payload.get("text", "") or payload.get("content", "") or payload.get("chunk_text", "")
-                if not text:
+                chunk = payload.get("chunk_text", "")
+                section = payload.get("section", "")
+                category = payload.get("category", "")
+                reg_id = payload.get("regulation_id", "")
+                section_title = payload.get("section_title", "")
+
+                if not chunk or len(chunk) < 50:
                     continue
-                # Filter: only keep results that mention our regulations
-                text_lower = text.lower()
-                reg_match = any(
-                    r.lower().replace("§", "").replace("art.", "art").strip() in text_lower
+
+                # Match against regulation keywords
+                searchable = f"{section} {category} {reg_id} {section_title} {chunk[:200]}".lower()
+                matched = any(
+                    kw.lower() in searchable
                     for r in regulations
+                    for kw in [r, r.replace("Art. ", "Article "), r.replace("§", "")]
                 )
-                if reg_match and len(text) > 50:
+                if matched:
                     all_results.append({
-                        "text": text[:500],
-                        "regulation": payload.get("regulation_code", "") or payload.get("regulation_short", ""),
-                        "article": payload.get("article", ""),
+                        "text": chunk[:500],
+                        "regulation": reg_id or section or category,
+                        "article": section,
                         "score": 0.5,
                     })
 
-        logger.info("Qdrant direct search: found %d controls", len(all_results))
+        logger.info("Qdrant direct search: found %d controls from %d collections",
+                     len(all_results), len(collections))
         return all_results[:top_k]
 
     except Exception as e: