fix: V1 Enrichment — Qdrant Collection + Parent-Resolution fuer regulatorische Matches
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 33s
CI/CD / test-python-backend-compliance (push) Successful in 30s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 1s

Die atomic_controls_dedup Collection (51k Punkte) enthaelt nur atomare
Controls ohne source_citation. Jetzt wird der Parent-Control aufgeloest,
der die Rechtsgrundlage traegt. Deduplizierung nach Parent-UUID verhindert
mehrfache Eintraege fuer die gleiche Regulation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-26 10:52:41 +01:00
parent db7c207464
commit 81c9ce5de3
2 changed files with 68 additions and 25 deletions

View File

@@ -124,13 +124,19 @@ async def enrich_v1_matches(
continue
# Search Qdrant (cross-regulation, no pattern filter)
# Collection is atomic_controls_dedup (contains ~51k atomare Controls)
results = await qdrant_search_cross_regulation(
embedding, top_k=10,
embedding, top_k=20,
collection="atomic_controls_dedup",
)
# Filter: only regulatory controls (with source_citation)
# and above threshold
# For each hit: resolve to a regulatory parent with source_citation.
# Atomic controls in Qdrant usually have parent_control_uuid → parent
# has the source_citation. We deduplicate by parent to avoid
# listing the same regulation multiple times.
rank = 0
seen_parents: set[str] = set()
for hit in results:
score = hit.get("score", 0)
if score < V1_MATCH_THRESHOLD:
@@ -141,27 +147,50 @@ async def enrich_v1_matches(
if not matched_uuid or matched_uuid == str(v1.id):
continue
# Check if matched control has source_citation
# Try the matched control itself first, then its parent
matched_row = db.execute(text("""
SELECT id, control_id, title, source_citation, severity, category
FROM canonical_controls
WHERE id = CAST(:uuid AS uuid)
AND source_citation IS NOT NULL
SELECT c.id, c.control_id, c.title, c.source_citation,
c.severity, c.category, c.parent_control_uuid
FROM canonical_controls c
WHERE c.id = CAST(:uuid AS uuid)
"""), {"uuid": matched_uuid}).fetchone()
if not matched_row:
continue
# Resolve to regulatory control (one with source_citation)
reg_row = matched_row
if not reg_row.source_citation and reg_row.parent_control_uuid:
# Look up parent — the parent has the source_citation
parent_row = db.execute(text("""
SELECT id, control_id, title, source_citation,
severity, category, parent_control_uuid
FROM canonical_controls
WHERE id = CAST(:uuid AS uuid)
AND source_citation IS NOT NULL
"""), {"uuid": str(reg_row.parent_control_uuid)}).fetchone()
if parent_row:
reg_row = parent_row
if not reg_row.source_citation:
continue
# Deduplicate by parent UUID
parent_key = str(reg_row.id)
if parent_key in seen_parents:
continue
seen_parents.add(parent_key)
rank += 1
if rank > V1_MAX_MATCHES:
break
# Extract source info
source_citation = matched_row.source_citation or {}
source_citation = reg_row.source_citation or {}
matched_source = source_citation.get("source") if isinstance(source_citation, dict) else None
matched_article = source_citation.get("article") if isinstance(source_citation, dict) else None
# Insert match (ON CONFLICT skip)
# Insert match — link to the regulatory parent (not the atomic child)
db.execute(text("""
INSERT INTO v1_control_matches
(v1_control_uuid, matched_control_uuid, similarity_score,
@@ -174,7 +203,7 @@ async def enrich_v1_matches(
match_rank = EXCLUDED.match_rank
"""), {
"v1_uuid": str(v1.id),
"matched_uuid": str(matched_row.id),
"matched_uuid": str(reg_row.id),
"score": round(score, 3),
"rank": rank,
"source": matched_source,
@@ -187,8 +216,8 @@ async def enrich_v1_matches(
sample_matches.append({
"v1_control_id": v1.control_id,
"v1_title": v1.title,
"matched_control_id": matched_row.control_id,
"matched_title": matched_row.title,
"matched_control_id": reg_row.control_id,
"matched_title": reg_row.title,
"matched_source": matched_source,
"matched_article": matched_article,
"similarity_score": round(score, 3),