fix(anchor-finder): use correct Qdrant payload fields (regulation_id, regulation_name_de)

Payloads in the Qdrant collections use regulation_id (not regulation_code),
along with fields such as regulation_name_de, guideline_name, and download_url.
Also search the bp_compliance_datenschutz collection, where the OWASP/ENISA
docs live.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-21 18:17:36 +02:00
parent b29dc33708
commit fb53c8be90

View File

@@ -32,7 +32,7 @@ EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
_OPEN_SOURCE_RULES = {1, 2} _OPEN_SOURCE_RULES = {1, 2}
# Collections to search for anchors (open-source frameworks) # Collections to search for anchors (open-source frameworks)
_ANCHOR_COLLECTIONS = ["bp_compliance_ce"] _ANCHOR_COLLECTIONS = ["bp_compliance_ce", "bp_compliance_datenschutz"]
@dataclass @dataclass
@@ -123,9 +123,11 @@ class AnchorFinder:
for hit in results: for hit in results:
payload = hit.get("payload", {}) payload = hit.get("payload", {})
# Qdrant payloads use regulation_id (not regulation_code)
regulation_code = ( regulation_code = (
payload.get("regulation_code", "") payload.get("regulation_id", "")
or payload.get("metadata", {}).get("regulation_code", "") or payload.get("regulation_code", "")
or payload.get("metadata", {}).get("regulation_id", "")
) )
if not regulation_code: if not regulation_code:
continue continue
@@ -136,25 +138,23 @@ class AnchorFinder:
continue continue
# Build reference key for dedup # Build reference key for dedup
article = payload.get("article", "") or payload.get("metadata", {}).get("article", "") article = payload.get("article", "") or payload.get("category", "") or ""
category = payload.get("category", "") or payload.get("metadata", {}).get("category", "") ref = article
ref = article or category or ""
key = f"{regulation_code}:{ref}" key = f"{regulation_code}:{ref}"
if key in seen: if key in seen:
continue continue
seen.add(key) seen.add(key)
reg_name = ( reg_name = (
payload.get("regulation_name", "") payload.get("regulation_name_de", "")
or payload.get("metadata", {}).get("regulation_name", "") or payload.get("regulation_name_en", "")
) or payload.get("guideline_name", "")
reg_short = (
payload.get("regulation_short", "")
or payload.get("metadata", {}).get("regulation_short", "")
) )
reg_short = payload.get("regulation_short", "")
source_url = ( source_url = (
payload.get("source_url", "") payload.get("download_url", "")
or payload.get("metadata", {}).get("source_url", "") or payload.get("source_url", "")
or payload.get("source", "")
) )
framework_name = license_info.get("name", reg_name or reg_short or regulation_code) framework_name = license_info.get("name", reg_name or reg_short or regulation_code)