fix: Qdrant search uses chunk_text + section/category filter
Payload structure: chunk_text (not text), section (e.g. "Article 13"), category, regulation_id. Scrolls 100 points per collection and filters them client-side against regulation keywords.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -116,41 +116,51 @@ async def _search_via_sdk(regulations: list[str], top_k: int) -> list[dict]:
|
|||||||
|
|
||||||
|
|
||||||
async def _search_via_qdrant(regulations: list[str], top_k: int) -> list[dict]:
|
async def _search_via_qdrant(regulations: list[str], top_k: int) -> list[dict]:
|
||||||
"""Search directly in local Qdrant — keyword scroll with filter."""
|
"""Search directly in local Qdrant — scroll with payload filter."""
|
||||||
try:
|
try:
|
||||||
# Search in multiple collections
|
|
||||||
all_results = []
|
all_results = []
|
||||||
for collection in ["bp_compliance_datenschutz", "bp_compliance_gesetze", "atomic_controls_dedup"]:
|
collections = ["bp_compliance_datenschutz", "bp_compliance_gesetze"]
|
||||||
|
|
||||||
|
for collection in collections:
|
||||||
|
# Scroll through points, filter by section/regulation matching
|
||||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||||
# Scroll with text filter (Qdrant scroll endpoint)
|
|
||||||
resp = await client.post(f"{QDRANT_URL}/collections/{collection}/points/scroll", json={
|
resp = await client.post(f"{QDRANT_URL}/collections/{collection}/points/scroll", json={
|
||||||
"limit": top_k,
|
"limit": 100, # Fetch more, filter client-side
|
||||||
"with_payload": True,
|
"with_payload": True,
|
||||||
"with_vector": False,
|
"with_vector": False,
|
||||||
})
|
})
|
||||||
if resp.status_code != 200:
|
if resp.status_code != 200:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
for point in data.get("result", {}).get("points", []):
|
for point in data.get("result", {}).get("points", []):
|
||||||
payload = point.get("payload", {})
|
payload = point.get("payload", {})
|
||||||
text = payload.get("text", "") or payload.get("content", "") or payload.get("chunk_text", "")
|
chunk = payload.get("chunk_text", "")
|
||||||
if not text:
|
section = payload.get("section", "")
|
||||||
|
category = payload.get("category", "")
|
||||||
|
reg_id = payload.get("regulation_id", "")
|
||||||
|
section_title = payload.get("section_title", "")
|
||||||
|
|
||||||
|
if not chunk or len(chunk) < 50:
|
||||||
continue
|
continue
|
||||||
# Filter: only keep results that mention our regulations
|
|
||||||
text_lower = text.lower()
|
# Match against regulation keywords
|
||||||
reg_match = any(
|
searchable = f"{section} {category} {reg_id} {section_title} {chunk[:200]}".lower()
|
||||||
r.lower().replace("§", "").replace("art.", "art").strip() in text_lower
|
matched = any(
|
||||||
|
kw.lower() in searchable
|
||||||
for r in regulations
|
for r in regulations
|
||||||
|
for kw in [r, r.replace("Art. ", "Article "), r.replace("§", "")]
|
||||||
)
|
)
|
||||||
if reg_match and len(text) > 50:
|
if matched:
|
||||||
all_results.append({
|
all_results.append({
|
||||||
"text": text[:500],
|
"text": chunk[:500],
|
||||||
"regulation": payload.get("regulation_code", "") or payload.get("regulation_short", ""),
|
"regulation": reg_id or section or category,
|
||||||
"article": payload.get("article", ""),
|
"article": section,
|
||||||
"score": 0.5,
|
"score": 0.5,
|
||||||
})
|
})
|
||||||
|
|
||||||
logger.info("Qdrant direct search: found %d controls", len(all_results))
|
logger.info("Qdrant direct search: found %d controls from %d collections",
|
||||||
|
len(all_results), len(collections))
|
||||||
return all_results[:top_k]
|
return all_results[:top_k]
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user