fix: V1 Enrichment — Qdrant Collection + Parent-Resolution fuer regulatorische Matches
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 33s
CI/CD / test-python-backend-compliance (push) Successful in 30s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 1s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 33s
CI/CD / test-python-backend-compliance (push) Successful in 30s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 1s
Die atomic_controls_dedup Collection (51k Punkte) enthaelt nur atomare Controls ohne source_citation. Jetzt wird der Parent-Control aufgeloest, der die Rechtsgrundlage traegt. Deduplizierung nach Parent-UUID verhindert mehrfache Eintraege fuer die gleiche Regulation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -124,13 +124,19 @@ async def enrich_v1_matches(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Search Qdrant (cross-regulation, no pattern filter)
|
# Search Qdrant (cross-regulation, no pattern filter)
|
||||||
|
# Collection is atomic_controls_dedup (contains ~51k atomic controls)
|
||||||
results = await qdrant_search_cross_regulation(
|
results = await qdrant_search_cross_regulation(
|
||||||
embedding, top_k=10,
|
embedding, top_k=20,
|
||||||
|
collection="atomic_controls_dedup",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Filter: only regulatory controls (with source_citation)
|
# For each hit: resolve to a regulatory parent with source_citation.
|
||||||
# and above threshold
|
# Atomic controls in Qdrant usually have parent_control_uuid → parent
|
||||||
|
# has the source_citation. We deduplicate by parent to avoid
|
||||||
|
# listing the same regulation multiple times.
|
||||||
rank = 0
|
rank = 0
|
||||||
|
seen_parents: set[str] = set()
|
||||||
|
|
||||||
for hit in results:
|
for hit in results:
|
||||||
score = hit.get("score", 0)
|
score = hit.get("score", 0)
|
||||||
if score < V1_MATCH_THRESHOLD:
|
if score < V1_MATCH_THRESHOLD:
|
||||||
@@ -141,27 +147,50 @@ async def enrich_v1_matches(
|
|||||||
if not matched_uuid or matched_uuid == str(v1.id):
|
if not matched_uuid or matched_uuid == str(v1.id):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check if matched control has source_citation
|
# Try the matched control itself first, then its parent
|
||||||
matched_row = db.execute(text("""
|
matched_row = db.execute(text("""
|
||||||
SELECT id, control_id, title, source_citation, severity, category
|
SELECT c.id, c.control_id, c.title, c.source_citation,
|
||||||
FROM canonical_controls
|
c.severity, c.category, c.parent_control_uuid
|
||||||
WHERE id = CAST(:uuid AS uuid)
|
FROM canonical_controls c
|
||||||
AND source_citation IS NOT NULL
|
WHERE c.id = CAST(:uuid AS uuid)
|
||||||
"""), {"uuid": matched_uuid}).fetchone()
|
"""), {"uuid": matched_uuid}).fetchone()
|
||||||
|
|
||||||
if not matched_row:
|
if not matched_row:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Resolve to regulatory control (one with source_citation)
|
||||||
|
reg_row = matched_row
|
||||||
|
if not reg_row.source_citation and reg_row.parent_control_uuid:
|
||||||
|
# Look up parent — the parent has the source_citation
|
||||||
|
parent_row = db.execute(text("""
|
||||||
|
SELECT id, control_id, title, source_citation,
|
||||||
|
severity, category, parent_control_uuid
|
||||||
|
FROM canonical_controls
|
||||||
|
WHERE id = CAST(:uuid AS uuid)
|
||||||
|
AND source_citation IS NOT NULL
|
||||||
|
"""), {"uuid": str(reg_row.parent_control_uuid)}).fetchone()
|
||||||
|
if parent_row:
|
||||||
|
reg_row = parent_row
|
||||||
|
|
||||||
|
if not reg_row.source_citation:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Deduplicate by the resolved regulatory control's UUID (the parent, or the match itself if it carries source_citation)
|
||||||
|
parent_key = str(reg_row.id)
|
||||||
|
if parent_key in seen_parents:
|
||||||
|
continue
|
||||||
|
seen_parents.add(parent_key)
|
||||||
|
|
||||||
rank += 1
|
rank += 1
|
||||||
if rank > V1_MAX_MATCHES:
|
if rank > V1_MAX_MATCHES:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Extract source info
|
# Extract source info
|
||||||
source_citation = matched_row.source_citation or {}
|
source_citation = reg_row.source_citation or {}
|
||||||
matched_source = source_citation.get("source") if isinstance(source_citation, dict) else None
|
matched_source = source_citation.get("source") if isinstance(source_citation, dict) else None
|
||||||
matched_article = source_citation.get("article") if isinstance(source_citation, dict) else None
|
matched_article = source_citation.get("article") if isinstance(source_citation, dict) else None
|
||||||
|
|
||||||
# Insert match (ON CONFLICT skip)
|
# Insert match — link to the regulatory parent (not the atomic child)
|
||||||
db.execute(text("""
|
db.execute(text("""
|
||||||
INSERT INTO v1_control_matches
|
INSERT INTO v1_control_matches
|
||||||
(v1_control_uuid, matched_control_uuid, similarity_score,
|
(v1_control_uuid, matched_control_uuid, similarity_score,
|
||||||
@@ -174,7 +203,7 @@ async def enrich_v1_matches(
|
|||||||
match_rank = EXCLUDED.match_rank
|
match_rank = EXCLUDED.match_rank
|
||||||
"""), {
|
"""), {
|
||||||
"v1_uuid": str(v1.id),
|
"v1_uuid": str(v1.id),
|
||||||
"matched_uuid": str(matched_row.id),
|
"matched_uuid": str(reg_row.id),
|
||||||
"score": round(score, 3),
|
"score": round(score, 3),
|
||||||
"rank": rank,
|
"rank": rank,
|
||||||
"source": matched_source,
|
"source": matched_source,
|
||||||
@@ -187,8 +216,8 @@ async def enrich_v1_matches(
|
|||||||
sample_matches.append({
|
sample_matches.append({
|
||||||
"v1_control_id": v1.control_id,
|
"v1_control_id": v1.control_id,
|
||||||
"v1_title": v1.title,
|
"v1_title": v1.title,
|
||||||
"matched_control_id": matched_row.control_id,
|
"matched_control_id": reg_row.control_id,
|
||||||
"matched_title": matched_row.title,
|
"matched_title": reg_row.title,
|
||||||
"matched_source": matched_source,
|
"matched_source": matched_source,
|
||||||
"matched_article": matched_article,
|
"matched_article": matched_article,
|
||||||
"similarity_score": round(score, 3),
|
"similarity_score": round(score, 3),
|
||||||
|
|||||||
@@ -68,11 +68,24 @@ class TestV1EnrichmentExecution:
|
|||||||
]
|
]
|
||||||
|
|
||||||
mock_count = MagicMock(cnt=1)
|
mock_count = MagicMock(cnt=1)
|
||||||
mock_matched_row = MagicMock(
|
|
||||||
|
# Atomic control found in Qdrant (has parent, no source_citation)
|
||||||
|
mock_atomic_row = MagicMock(
|
||||||
|
id="uuid-atomic-1",
|
||||||
|
control_id="SEC-042-A01",
|
||||||
|
title="Verschluesselung (atomar)",
|
||||||
|
source_citation=None, # Atomic controls don't have source_citation
|
||||||
|
parent_control_uuid="uuid-reg-1",
|
||||||
|
severity="high",
|
||||||
|
category="encryption",
|
||||||
|
)
|
||||||
|
# Parent control (has source_citation)
|
||||||
|
mock_parent_row = MagicMock(
|
||||||
id="uuid-reg-1",
|
id="uuid-reg-1",
|
||||||
control_id="SEC-042",
|
control_id="SEC-042",
|
||||||
title="Verschluesselung personenbezogener Daten",
|
title="Verschluesselung personenbezogener Daten",
|
||||||
source_citation={"source": "DSGVO (EU) 2016/679", "article": "Art. 32"},
|
source_citation={"source": "DSGVO (EU) 2016/679", "article": "Art. 32"},
|
||||||
|
parent_control_uuid=None,
|
||||||
severity="high",
|
severity="high",
|
||||||
category="encryption",
|
category="encryption",
|
||||||
)
|
)
|
||||||
@@ -81,9 +94,9 @@ class TestV1EnrichmentExecution:
|
|||||||
{
|
{
|
||||||
"score": 0.89,
|
"score": 0.89,
|
||||||
"payload": {
|
"payload": {
|
||||||
"control_uuid": "uuid-reg-1",
|
"control_uuid": "uuid-atomic-1",
|
||||||
"control_id": "SEC-042",
|
"control_id": "SEC-042-A01",
|
||||||
"title": "Verschluesselung",
|
"title": "Verschluesselung (atomar)",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -100,18 +113,19 @@ class TestV1EnrichmentExecution:
|
|||||||
mock_session.return_value.__enter__ = MagicMock(return_value=db)
|
mock_session.return_value.__enter__ = MagicMock(return_value=db)
|
||||||
mock_session.return_value.__exit__ = MagicMock(return_value=False)
|
mock_session.return_value.__exit__ = MagicMock(return_value=False)
|
||||||
|
|
||||||
# Multiple execute calls: v1 list, count, matched_row lookup, insert
|
# Route queries to correct mock data
|
||||||
call_count = [0]
|
|
||||||
def side_effect_execute(query, params=None):
|
def side_effect_execute(query, params=None):
|
||||||
call_count[0] += 1
|
|
||||||
result = MagicMock()
|
result = MagicMock()
|
||||||
# fetchall for v1 controls list
|
query_str = str(query)
|
||||||
result.fetchall.return_value = mock_v1
|
result.fetchall.return_value = mock_v1
|
||||||
# fetchone for count and matched row
|
if "COUNT" in query_str:
|
||||||
if "COUNT" in str(query):
|
|
||||||
result.fetchone.return_value = mock_count
|
result.fetchone.return_value = mock_count
|
||||||
elif "source_citation IS NOT NULL" in str(query):
|
elif "source_citation IS NOT NULL" in query_str:
|
||||||
result.fetchone.return_value = mock_matched_row
|
# Parent lookup
|
||||||
|
result.fetchone.return_value = mock_parent_row
|
||||||
|
elif "c.id = CAST" in query_str or "canonical_controls c" in query_str:
|
||||||
|
# Direct atomic control lookup
|
||||||
|
result.fetchone.return_value = mock_atomic_row
|
||||||
else:
|
else:
|
||||||
result.fetchone.return_value = mock_count
|
result.fetchone.return_value = mock_count
|
||||||
return result
|
return result
|
||||||
|
|||||||
Reference in New Issue
Block a user