diff --git a/control-pipeline/services/anchor_finder.py b/control-pipeline/services/anchor_finder.py index 596f315..8807567 100644 --- a/control-pipeline/services/anchor_finder.py +++ b/control-pipeline/services/anchor_finder.py @@ -32,7 +32,7 @@ EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087") _OPEN_SOURCE_RULES = {1, 2} # Collections to search for anchors (open-source frameworks) -_ANCHOR_COLLECTIONS = ["bp_compliance_ce"] +_ANCHOR_COLLECTIONS = ["bp_compliance_ce", "bp_compliance_datenschutz"] @dataclass @@ -123,9 +123,11 @@ class AnchorFinder: for hit in results: payload = hit.get("payload", {}) + # Qdrant payloads use regulation_id (not regulation_code) regulation_code = ( - payload.get("regulation_code", "") - or payload.get("metadata", {}).get("regulation_code", "") + payload.get("regulation_id", "") + or payload.get("regulation_code", "") + or payload.get("metadata", {}).get("regulation_id", "") ) if not regulation_code: continue @@ -136,25 +138,23 @@ class AnchorFinder: continue # Build reference key for dedup - article = payload.get("article", "") or payload.get("metadata", {}).get("article", "") - category = payload.get("category", "") or payload.get("metadata", {}).get("category", "") - ref = article or category or "" + article = payload.get("article", "") or payload.get("category", "") or "" + ref = article key = f"{regulation_code}:{ref}" if key in seen: continue seen.add(key) reg_name = ( - payload.get("regulation_name", "") - or payload.get("metadata", {}).get("regulation_name", "") - ) - reg_short = ( - payload.get("regulation_short", "") - or payload.get("metadata", {}).get("regulation_short", "") + payload.get("regulation_name_de", "") + or payload.get("regulation_name_en", "") + or payload.get("guideline_name", "") ) + reg_short = payload.get("regulation_short", "") source_url = ( - payload.get("source_url", "") - or payload.get("metadata", {}).get("source_url", "") + payload.get("download_url", "") + or payload.get("source_url", "") + or payload.get("source", "") ) framework_name = license_info.get("name", reg_name or reg_short or regulation_code)