diff --git a/backend-compliance/compliance/services/v1_enrichment.py b/backend-compliance/compliance/services/v1_enrichment.py index 39f0f34..9223a0b 100644 --- a/backend-compliance/compliance/services/v1_enrichment.py +++ b/backend-compliance/compliance/services/v1_enrichment.py @@ -124,13 +124,19 @@ async def enrich_v1_matches( continue # Search Qdrant (cross-regulation, no pattern filter) + # Collection is atomic_controls_dedup (contains ~51k atomare Controls) results = await qdrant_search_cross_regulation( - embedding, top_k=10, + embedding, top_k=20, + collection="atomic_controls_dedup", ) - # Filter: only regulatory controls (with source_citation) - # and above threshold + # For each hit: resolve to a regulatory parent with source_citation. + # Atomic controls in Qdrant usually have parent_control_uuid → parent + # has the source_citation. We deduplicate by parent to avoid + # listing the same regulation multiple times. rank = 0 + seen_parents: set[str] = set() + for hit in results: score = hit.get("score", 0) if score < V1_MATCH_THRESHOLD: @@ -141,27 +147,50 @@ async def enrich_v1_matches( if not matched_uuid or matched_uuid == str(v1.id): continue - # Check if matched control has source_citation + # Try the matched control itself first, then its parent matched_row = db.execute(text(""" - SELECT id, control_id, title, source_citation, severity, category - FROM canonical_controls - WHERE id = CAST(:uuid AS uuid) - AND source_citation IS NOT NULL + SELECT c.id, c.control_id, c.title, c.source_citation, + c.severity, c.category, c.parent_control_uuid + FROM canonical_controls c + WHERE c.id = CAST(:uuid AS uuid) """), {"uuid": matched_uuid}).fetchone() if not matched_row: continue + # Resolve to regulatory control (one with source_citation) + reg_row = matched_row + if not reg_row.source_citation and reg_row.parent_control_uuid: + # Look up parent — the parent has the source_citation + parent_row = db.execute(text(""" + SELECT id, control_id, title, source_citation, + severity, category, parent_control_uuid + FROM canonical_controls + WHERE id = CAST(:uuid AS uuid) + AND source_citation IS NOT NULL + """), {"uuid": str(reg_row.parent_control_uuid)}).fetchone() + if parent_row: + reg_row = parent_row + + if not reg_row.source_citation: + continue + + # Deduplicate by parent UUID + parent_key = str(reg_row.id) + if parent_key in seen_parents: + continue + seen_parents.add(parent_key) + rank += 1 if rank > V1_MAX_MATCHES: break # Extract source info - source_citation = matched_row.source_citation or {} + source_citation = reg_row.source_citation or {} matched_source = source_citation.get("source") if isinstance(source_citation, dict) else None matched_article = source_citation.get("article") if isinstance(source_citation, dict) else None - # Insert match (ON CONFLICT skip) + # Insert match — link to the regulatory parent (not the atomic child) db.execute(text(""" INSERT INTO v1_control_matches (v1_control_uuid, matched_control_uuid, similarity_score, @@ -174,7 +203,7 @@ async def enrich_v1_matches( match_rank = EXCLUDED.match_rank """), { "v1_uuid": str(v1.id), - "matched_uuid": str(matched_row.id), + "matched_uuid": str(reg_row.id), "score": round(score, 3), "rank": rank, "source": matched_source, @@ -187,8 +216,8 @@ async def enrich_v1_matches( sample_matches.append({ "v1_control_id": v1.control_id, "v1_title": v1.title, - "matched_control_id": matched_row.control_id, - "matched_title": matched_row.title, + "matched_control_id": reg_row.control_id, + "matched_title": reg_row.title, "matched_source": matched_source, "matched_article": matched_article, "similarity_score": round(score, 3), diff --git a/backend-compliance/tests/test_v1_enrichment.py b/backend-compliance/tests/test_v1_enrichment.py index cc95fe8..3c4bcf2 100644 --- a/backend-compliance/tests/test_v1_enrichment.py +++ b/backend-compliance/tests/test_v1_enrichment.py @@ -68,11 +68,24 @@ class TestV1EnrichmentExecution: ] mock_count = MagicMock(cnt=1) - mock_matched_row = MagicMock( + + # Atomic control found in Qdrant (has parent, no source_citation) + mock_atomic_row = MagicMock( + id="uuid-atomic-1", + control_id="SEC-042-A01", + title="Verschluesselung (atomar)", + source_citation=None, # Atomic controls don't have source_citation + parent_control_uuid="uuid-reg-1", + severity="high", + category="encryption", + ) + # Parent control (has source_citation) + mock_parent_row = MagicMock( id="uuid-reg-1", control_id="SEC-042", title="Verschluesselung personenbezogener Daten", source_citation={"source": "DSGVO (EU) 2016/679", "article": "Art. 32"}, + parent_control_uuid=None, severity="high", category="encryption", ) @@ -81,9 +94,9 @@ class TestV1EnrichmentExecution: { "score": 0.89, "payload": { - "control_uuid": "uuid-reg-1", - "control_id": "SEC-042", - "title": "Verschluesselung", + "control_uuid": "uuid-atomic-1", + "control_id": "SEC-042-A01", + "title": "Verschluesselung (atomar)", }, }, { @@ -100,18 +113,19 @@ class TestV1EnrichmentExecution: mock_session.return_value.__enter__ = MagicMock(return_value=db) mock_session.return_value.__exit__ = MagicMock(return_value=False) - # Multiple execute calls: v1 list, count, matched_row lookup, insert - call_count = [0] + # Route queries to correct mock data def side_effect_execute(query, params=None): - call_count[0] += 1 result = MagicMock() - # fetchall for v1 controls list + query_str = str(query) result.fetchall.return_value = mock_v1 - # fetchone for count and matched row - if "COUNT" in str(query): + if "COUNT" in query_str: result.fetchone.return_value = mock_count - elif "source_citation IS NOT NULL" in str(query): - result.fetchone.return_value = mock_matched_row + elif "source_citation IS NOT NULL" in query_str: + # Parent lookup + result.fetchone.return_value = mock_parent_row + elif "c.id = CAST" in query_str or "canonical_controls c" in query_str: + # Direct atomic control lookup + result.fetchone.return_value = mock_atomic_row else: result.fetchone.return_value = mock_count return result