fix(anchor-finder): use correct Qdrant payload fields (regulation_id, regulation_name_de)

Qdrant payloads use regulation_id (not regulation_code), along with
regulation_name_de, guideline_name, download_url, etc. Also search the
bp_compliance_datenschutz collection, where the OWASP/ENISA docs live.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-21 18:17:36 +02:00
parent b29dc33708
commit fb53c8be90

View File

@@ -32,7 +32,7 @@ EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
_OPEN_SOURCE_RULES = {1, 2}
# Collections to search for anchors (open-source frameworks)
_ANCHOR_COLLECTIONS = ["bp_compliance_ce"]
_ANCHOR_COLLECTIONS = ["bp_compliance_ce", "bp_compliance_datenschutz"]
@dataclass
@@ -123,9 +123,11 @@ class AnchorFinder:
for hit in results:
payload = hit.get("payload", {})
# Qdrant payloads use regulation_id (not regulation_code)
regulation_code = (
payload.get("regulation_code", "")
or payload.get("metadata", {}).get("regulation_code", "")
payload.get("regulation_id", "")
or payload.get("regulation_code", "")
or payload.get("metadata", {}).get("regulation_id", "")
)
if not regulation_code:
continue
@@ -136,25 +138,23 @@ class AnchorFinder:
continue
# Build reference key for dedup
article = payload.get("article", "") or payload.get("metadata", {}).get("article", "")
category = payload.get("category", "") or payload.get("metadata", {}).get("category", "")
ref = article or category or ""
article = payload.get("article", "") or payload.get("category", "") or ""
ref = article
key = f"{regulation_code}:{ref}"
if key in seen:
continue
seen.add(key)
reg_name = (
payload.get("regulation_name", "")
or payload.get("metadata", {}).get("regulation_name", "")
)
reg_short = (
payload.get("regulation_short", "")
or payload.get("metadata", {}).get("regulation_short", "")
payload.get("regulation_name_de", "")
or payload.get("regulation_name_en", "")
or payload.get("guideline_name", "")
)
reg_short = payload.get("regulation_short", "")
source_url = (
payload.get("source_url", "")
or payload.get("metadata", {}).get("source_url", "")
payload.get("download_url", "")
or payload.get("source_url", "")
or payload.get("source", "")
)
framework_name = license_info.get("name", reg_name or reg_short or regulation_code)