Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Phase 1 (LLM Quality): - Add format=json to all Ollama payloads (obligation_extractor, control_generator, citation_backfill) - Add Chain-of-Thought analysis steps to Pass 0a/0b system prompts Phase 2 (Retrieval Quality): - Hybrid search via Qdrant Query API with RRF fusion + automatic text index (legal_rag.go) - Fallback to dense-only search if Query API unavailable - Cross-encoder re-ranking with BGE Reranker v2 (RERANK_ENABLED=false by default) - CPU-only PyTorch dependency to keep Docker image small Phase 3 (Data Layer): - Cross-regulation dedup pass (threshold 0.95) links controls across regulations - DedupResult.link_type field distinguishes dedup_merge vs cross_regulation - Chunk size defaults updated 512/50 → 1024/128 for new ingestions only - Existing collections and controls are NOT affected Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
734 lines
27 KiB
Python
734 lines
27 KiB
Python
"""Control Deduplication Engine — 4-Stage Matching Pipeline.

Prevents duplicate atomic controls during Pass 0b by checking candidates
against existing controls before insertion.

Stages:
  1. Pattern-Gate: pattern_id must match (hard gate)
  2. Action-Check: normalized action verb must match (hard gate)
  3. Object-Norm:  normalized object must match (soft gate with high threshold)
  4. Embedding:    cosine similarity with tiered thresholds (Qdrant)

Verdicts:
  - NEW:    create a new atomic control
  - LINK:   add parent link to existing control (similarity > LINK_THRESHOLD)
  - REVIEW: queue for human review (REVIEW_THRESHOLD < sim <= LINK_THRESHOLD)
"""
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional, Callable, Awaitable
|
|
|
|
import httpx
|
|
|
|
logger = logging.getLogger(__name__)


# ── Configuration ────────────────────────────────────────────────────
# Every setting is read from the environment once, at import time.

# Feature flag: only the value "true" (any letter case) switches it on.
DEDUP_ENABLED = os.environ.get("DEDUP_ENABLED", "true").lower() == "true"

# Similarity above which a same-action/same-object match is linked.
LINK_THRESHOLD = float(os.environ.get("DEDUP_LINK_THRESHOLD", "0.92"))

# Similarity above which a match is queued for human review.
REVIEW_THRESHOLD = float(os.environ.get("DEDUP_REVIEW_THRESHOLD", "0.85"))

# Stricter link threshold applied when the normalized objects differ.
LINK_THRESHOLD_DIFF_OBJECT = float(
    os.environ.get("DEDUP_LINK_THRESHOLD_DIFF_OBJ", "0.95")
)

# Link threshold for the cross-regulation pass (no pattern_id filter).
CROSS_REG_LINK_THRESHOLD = float(
    os.environ.get("DEDUP_CROSS_REG_THRESHOLD", "0.95")
)

# Qdrant collection name and the base URLs of the two backing services.
QDRANT_COLLECTION = os.environ.get("DEDUP_QDRANT_COLLECTION", "atomic_controls")
QDRANT_URL = os.environ.get("QDRANT_URL", "http://host.docker.internal:6333")
EMBEDDING_URL = os.environ.get("EMBEDDING_URL", "http://embedding-service:8087")
|
|
|
|
|
|
# ── Result Dataclass ─────────────────────────────────────────────────
|
|
|
|
@dataclass
class DedupResult:
    """Outcome of a single dedup check.

    Only ``verdict`` is mandatory; every other field has a neutral default
    so callers can fill in just what the deciding stage knows.
    """

    # One of "new", "link" or "review".
    verdict: str

    # Identity of the existing control that was matched, when there is one.
    matched_control_uuid: Optional[str] = None
    matched_control_id: Optional[str] = None
    matched_title: Optional[str] = None

    # Name of the pipeline stage that produced the verdict.
    stage: str = ""

    # Similarity score reported for the matched control.
    similarity_score: float = 0.0

    # Either "dedup_merge" or "cross_regulation".
    link_type: str = "dedup_merge"

    # Free-form diagnostic data; a fresh dict per instance.
    details: dict = field(default_factory=dict)
|
|
|
|
|
|
# ── Action Normalization ─────────────────────────────────────────────
|
|
|
|
_ACTION_SYNONYMS: dict[str, str] = {
    # German → canonical English
    "implementieren": "implement",
    "umsetzen": "implement",
    "einrichten": "implement",
    "einführen": "implement",
    "aufbauen": "implement",
    "bereitstellen": "implement",
    "aktivieren": "implement",
    "konfigurieren": "configure",
    "einstellen": "configure",
    "parametrieren": "configure",
    "testen": "test",
    "prüfen": "test",
    "überprüfen": "test",
    "verifizieren": "test",
    "validieren": "test",
    "kontrollieren": "test",
    "auditieren": "audit",
    "dokumentieren": "document",
    "protokollieren": "log",
    "aufzeichnen": "log",
    "loggen": "log",
    "überwachen": "monitor",
    "monitoring": "monitor",
    "beobachten": "monitor",
    "schulen": "train",
    "trainieren": "train",
    "sensibilisieren": "train",
    "löschen": "delete",
    "entfernen": "delete",
    "verschlüsseln": "encrypt",
    "sperren": "block",
    "beschränken": "restrict",
    "einschränken": "restrict",
    "begrenzen": "restrict",
    "autorisieren": "authorize",
    "genehmigen": "authorize",
    "freigeben": "authorize",
    "authentifizieren": "authenticate",
    "identifizieren": "identify",
    "melden": "report",
    "benachrichtigen": "notify",
    "informieren": "notify",
    "aktualisieren": "update",
    "erneuern": "update",
    "sichern": "backup",
    "wiederherstellen": "restore",
    # English passthrough
    "implement": "implement",
    "configure": "configure",
    "test": "test",
    "verify": "test",
    "validate": "test",
    "audit": "audit",
    "document": "document",
    "log": "log",
    "monitor": "monitor",
    "train": "train",
    "delete": "delete",
    "encrypt": "encrypt",
    "restrict": "restrict",
    "authorize": "authorize",
    "authenticate": "authenticate",
    "report": "report",
    "update": "update",
    "backup": "backup",
    "restore": "restore",
}

# Shortest fragment that may be completed to a known verb by prefix. Below
# this, a fragment is a prefix of many unrelated verbs and the result would
# depend only on dict order.
_MIN_ACTION_PREFIX_LEN = 3


def normalize_action(action: str) -> str:
    """Normalize an action verb to a canonical English form.

    Lookup order:
      1. exact match of the stripped, lower-cased input;
      2. match after removing one trailing conjugation suffix;
      3. prefix match against the known verbs, in table order.

    Args:
        action: Raw action verb (German or English); may be empty.

    Returns:
        The canonical verb from ``_ACTION_SYNONYMS``. Returns ``""`` for
        empty or whitespace-only input, and the cleaned input itself when
        nothing matches.
    """
    if not action:
        return ""
    action = action.strip().lower()
    if not action:
        # Whitespace-only input: without this guard the empty string would
        # be a prefix of every verb and resolve to the first table entry.
        return ""

    if action in _ACTION_SYNONYMS:
        return _ACTION_SYNONYMS[action]

    # Strip German infinitive/conjugation suffixes for lookup
    action_base = re.sub(r"(en|t|st|e|te|tet|end)$", "", action)
    if action_base in _ACTION_SYNONYMS:
        return _ACTION_SYNONYMS[action_base]

    # Fuzzy: the input extends a known verb, or (for inputs long enough to
    # be meaningful) a known verb extends the input.
    may_complete = len(action) >= _MIN_ACTION_PREFIX_LEN
    for verb, canonical in _ACTION_SYNONYMS.items():
        if action.startswith(verb) or (may_complete and verb.startswith(action)):
            return canonical

    return action  # fallback: return as-is
|
|
|
|
|
|
# ── Object Normalization ─────────────────────────────────────────────
|
|
|
|
_OBJECT_SYNONYMS: dict[str, str] = {
|
|
# Authentication / Access
|
|
"mfa": "multi_factor_auth",
|
|
"multi-faktor-authentifizierung": "multi_factor_auth",
|
|
"mehrfaktorauthentifizierung": "multi_factor_auth",
|
|
"multi-factor authentication": "multi_factor_auth",
|
|
"two-factor": "multi_factor_auth",
|
|
"2fa": "multi_factor_auth",
|
|
"passwort": "password_policy",
|
|
"kennwort": "password_policy",
|
|
"password": "password_policy",
|
|
"zugangsdaten": "credentials",
|
|
"credentials": "credentials",
|
|
"admin-konten": "privileged_access",
|
|
"admin accounts": "privileged_access",
|
|
"administratorkonten": "privileged_access",
|
|
"privilegierte zugriffe": "privileged_access",
|
|
"privileged accounts": "privileged_access",
|
|
"remote-zugriff": "remote_access",
|
|
"fernzugriff": "remote_access",
|
|
"remote access": "remote_access",
|
|
"session": "session_management",
|
|
"sitzung": "session_management",
|
|
"sitzungsverwaltung": "session_management",
|
|
# Encryption
|
|
"verschlüsselung": "encryption",
|
|
"encryption": "encryption",
|
|
"kryptografie": "encryption",
|
|
"kryptografische verfahren": "encryption",
|
|
"schlüssel": "key_management",
|
|
"key management": "key_management",
|
|
"schlüsselverwaltung": "key_management",
|
|
"zertifikat": "certificate_management",
|
|
"certificate": "certificate_management",
|
|
"tls": "transport_encryption",
|
|
"ssl": "transport_encryption",
|
|
"https": "transport_encryption",
|
|
# Network
|
|
"firewall": "firewall",
|
|
"netzwerk": "network_security",
|
|
"network": "network_security",
|
|
"vpn": "vpn",
|
|
"segmentierung": "network_segmentation",
|
|
"segmentation": "network_segmentation",
|
|
# Logging / Monitoring
|
|
"audit-log": "audit_logging",
|
|
"audit log": "audit_logging",
|
|
"protokoll": "audit_logging",
|
|
"logging": "audit_logging",
|
|
"monitoring": "monitoring",
|
|
"überwachung": "monitoring",
|
|
"alerting": "alerting",
|
|
"alarmierung": "alerting",
|
|
"siem": "siem",
|
|
# Data
|
|
"personenbezogene daten": "personal_data",
|
|
"personal data": "personal_data",
|
|
"sensible daten": "sensitive_data",
|
|
"sensitive data": "sensitive_data",
|
|
"datensicherung": "backup",
|
|
"backup": "backup",
|
|
"wiederherstellung": "disaster_recovery",
|
|
"disaster recovery": "disaster_recovery",
|
|
# Policy / Process
|
|
"richtlinie": "policy",
|
|
"policy": "policy",
|
|
"verfahrensanweisung": "procedure",
|
|
"procedure": "procedure",
|
|
"prozess": "process",
|
|
"schulung": "training",
|
|
"training": "training",
|
|
"awareness": "awareness",
|
|
"sensibilisierung": "awareness",
|
|
# Incident
|
|
"vorfall": "incident",
|
|
"incident": "incident",
|
|
"sicherheitsvorfall": "security_incident",
|
|
"security incident": "security_incident",
|
|
# Vulnerability
|
|
"schwachstelle": "vulnerability",
|
|
"vulnerability": "vulnerability",
|
|
"patch": "patch_management",
|
|
"update": "patch_management",
|
|
"patching": "patch_management",
|
|
}
|
|
|
|
# Precompile for substring matching (longest first)
|
|
_OBJECT_KEYS_SORTED = sorted(_OBJECT_SYNONYMS.keys(), key=len, reverse=True)
|
|
|
|
|
|
def normalize_object(obj: str) -> str:
|
|
"""Normalize a compliance object to a canonical token."""
|
|
if not obj:
|
|
return ""
|
|
obj_lower = obj.strip().lower()
|
|
# Exact match
|
|
if obj_lower in _OBJECT_SYNONYMS:
|
|
return _OBJECT_SYNONYMS[obj_lower]
|
|
# Substring match (longest first)
|
|
for phrase in _OBJECT_KEYS_SORTED:
|
|
if phrase in obj_lower:
|
|
return _OBJECT_SYNONYMS[phrase]
|
|
# Fallback: strip articles/prepositions, join with underscore
|
|
cleaned = re.sub(r"\b(der|die|das|den|dem|des|ein|eine|eines|einem|einen"
|
|
r"|für|von|zu|auf|in|an|bei|mit|nach|über|unter|the|a|an"
|
|
r"|for|of|to|on|in|at|by|with)\b", "", obj_lower)
|
|
tokens = [t for t in cleaned.split() if len(t) > 2]
|
|
return "_".join(tokens[:4]) if tokens else obj_lower.replace(" ", "_")
|
|
|
|
|
|
# ── Canonicalization ─────────────────────────────────────────────────
|
|
|
|
def canonicalize_text(action: str, obj: str, title: str = "") -> str:
    """Compose the canonical text that is sent to the embedding service.

    The text starts with the normalized action and the normalized object.
    When a title is given, up to five of its words (longer than three
    characters, German filler words removed) are appended after "for".

    Args:
        action: Raw action verb.
        obj: Raw object description.
        title: Optional control title used as extra context.

    Returns:
        The space-joined canonical text.
    """
    words = [normalize_action(action), normalize_object(obj)]

    if not title:
        return " ".join(words)

    # Remove common filler words before picking the title keywords.
    stripped_title = re.sub(
        r"\b(und|oder|für|von|zu|der|die|das|den|dem|des|ein|eine"
        r"|bei|mit|nach|gemäß|gem\.|laut|entsprechend)\b",
        "",
        title.lower(),
    )
    keywords = [w for w in stripped_title.split() if len(w) > 3][:5]
    if keywords:
        words += ["for", *keywords]

    return " ".join(words)
|
|
|
|
|
|
# ── Embedding Helper ─────────────────────────────────────────────────
|
|
|
|
async def get_embedding(text: str) -> list[float]:
    """Fetch the embedding vector for one text from the embedding service.

    Best-effort: any failure is logged as a warning and reported to the
    caller as an empty list.

    Args:
        text: The text to embed.

    Returns:
        The first vector under the response's "embeddings" key, or ``[]``
        when the response has none or the request failed.
    """
    endpoint = f"{EMBEDDING_URL}/embed"
    request_body = {"texts": [text]}
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(endpoint, json=request_body)
            vectors = resp.json().get("embeddings", [])
            if vectors:
                return vectors[0]
            return []
    except Exception as e:
        logger.warning("Embedding failed: %s", e)
        return []
|
|
|
|
|
|
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equally long vectors.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. Returns ``0.0`` when either vector is
        empty, the lengths differ, or either vector has zero magnitude.
    """
    comparable = bool(a) and bool(b) and len(a) == len(b)
    if not comparable:
        return 0.0

    magnitude_a = sum(v * v for v in a) ** 0.5
    magnitude_b = sum(v * v for v in b) ** 0.5
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0

    dot_product = sum(p * q for p, q in zip(a, b))
    return dot_product / (magnitude_a * magnitude_b)
|
|
|
|
|
|
# ── Qdrant Helpers ───────────────────────────────────────────────────
|
|
|
|
async def qdrant_search(
    embedding: list[float],
    pattern_id: str,
    top_k: int = 10,
) -> list[dict]:
    """Find similar atomic controls in Qdrant that share ``pattern_id``.

    Best-effort: a non-200 response or any exception is logged as a warning
    and reported as an empty list.

    Args:
        embedding: Query vector; an empty vector short-circuits to ``[]``.
        pattern_id: Value the payload field "pattern_id" must equal.
        top_k: Maximum number of hits requested.

    Returns:
        The list under the response's "result" key, or ``[]``.
    """
    if not embedding:
        return []

    same_pattern_only = {
        "must": [
            {"key": "pattern_id", "match": {"value": pattern_id}}
        ]
    }
    request_body: dict = {
        "vector": embedding,
        "limit": top_k,
        "with_payload": True,
        "filter": same_pattern_only,
    }
    endpoint = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search"

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(endpoint, json=request_body)
            if resp.status_code == 200:
                return resp.json().get("result", [])
            logger.warning("Qdrant search failed: %d", resp.status_code)
            return []
    except Exception as e:
        logger.warning("Qdrant search error: %s", e)
        return []
|
|
|
|
|
|
async def qdrant_search_cross_regulation(
    embedding: list[float],
    top_k: int = 5,
) -> list[dict]:
    """Find similar controls in Qdrant without any pattern_id filter.

    Used for cross-regulation linking (e.g. DSGVO Art. 25 ↔ NIS2 Art. 21).
    Best-effort: a non-200 response or any exception is logged as a warning
    and reported as an empty list.

    Args:
        embedding: Query vector; an empty vector short-circuits to ``[]``.
        top_k: Maximum number of hits requested.

    Returns:
        The list under the response's "result" key, or ``[]``.
    """
    if not embedding:
        return []

    request_body: dict = {
        "vector": embedding,
        "limit": top_k,
        "with_payload": True,
    }
    endpoint = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search"

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(endpoint, json=request_body)
            if resp.status_code == 200:
                return resp.json().get("result", [])
            logger.warning("Qdrant cross-reg search failed: %d", resp.status_code)
            return []
    except Exception as e:
        logger.warning("Qdrant cross-reg search error: %s", e)
        return []
|
|
|
|
|
|
async def qdrant_upsert(
    point_id: str,
    embedding: list[float],
    payload: dict,
) -> bool:
    """Upsert a single point into the configured Qdrant collection.

    Best-effort: failures are logged as warnings and reported as ``False``.

    Args:
        point_id: ID of the point to write.
        embedding: Vector to store; an empty vector is rejected up front.
        payload: Payload stored alongside the vector.

    Returns:
        ``True`` when Qdrant answered with HTTP 200, otherwise ``False``.
    """
    if not embedding:
        return False

    body = {
        "points": [{
            "id": point_id,
            "vector": embedding,
            "payload": payload,
        }]
    }
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.put(
                f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points",
                json=body,
            )
            if resp.status_code != 200:
                # Log the rejection like the search helpers do, so a failed
                # write is distinguishable from an empty embedding.
                logger.warning("Qdrant upsert failed: %d", resp.status_code)
                return False
            return True
    except Exception as e:
        logger.warning("Qdrant upsert error: %s", e)
        return False
|
|
|
|
|
|
async def ensure_qdrant_collection(vector_size: int = 1024) -> bool:
    """Create the Qdrant collection if it doesn't exist (idempotent).

    A newly created collection uses cosine distance and gets keyword
    payload indexes on the fields used by the dedup pipeline. A failed
    index creation is logged but does not fail the call.

    Args:
        vector_size: Dimensionality of the vectors stored in the collection.

    Returns:
        ``True`` when the collection already exists or was created,
        ``False`` when creation failed or Qdrant could not be reached.
    """
    collection_url = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}"
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            # Check if exists
            resp = await client.get(collection_url)
            if resp.status_code == 200:
                return True

            # Create
            resp = await client.put(
                collection_url,
                json={
                    "vectors": {"size": vector_size, "distance": "Cosine"},
                },
            )
            if resp.status_code != 200:
                logger.error("Failed to create Qdrant collection: %d", resp.status_code)
                return False

            logger.info("Created Qdrant collection: %s", QDRANT_COLLECTION)

            # Create payload indexes
            for field_name in ["pattern_id", "action_normalized", "object_normalized", "control_id"]:
                index_resp = await client.put(
                    f"{collection_url}/index",
                    json={"field_name": field_name, "field_schema": "keyword"},
                )
                if index_resp.status_code != 200:
                    # The collection is still usable, so only warn.
                    logger.warning(
                        "Failed to create Qdrant payload index %s: %d",
                        field_name, index_resp.status_code,
                    )
            return True
    except Exception as e:
        logger.warning("Qdrant collection check error: %s", e)
        return False
|
|
|
|
|
|
# ── Main Dedup Checker ───────────────────────────────────────────────
|
|
|
|
class ControlDedupChecker:
    """4-stage dedup checker for atomic controls.

    Usage:
        checker = ControlDedupChecker(db_session)
        result = await checker.check_duplicate(action, obj, title, pattern_id)
        if result.verdict == "link":
            checker.add_parent_link(result.matched_control_uuid, parent_uuid)
        elif result.verdict == "review":
            checker.write_review(candidate_id, title, objective, result)
        else:
            # Insert the new control, then index it:
            await checker.index_control(uuid, control_id, title, action, obj, pattern_id)
    """

    def __init__(
        self,
        db,
        embed_fn: Optional[Callable[[str], Awaitable[list[float]]]] = None,
        search_fn: Optional[Callable] = None,
    ):
        """Create a checker bound to one database session.

        Args:
            db: Session object providing ``execute()`` and ``commit()``.
            embed_fn: Async callable turning text into a vector; defaults
                to ``get_embedding``.
            search_fn: Async callable ``(embedding, pattern_id, top_k=...)``
                returning Qdrant hits; defaults to ``qdrant_search``.
        """
        self.db = db
        self._embed = embed_fn or get_embedding
        self._search = search_fn or qdrant_search
        self._cache: dict[str, list[dict]] = {}  # pattern_id → existing controls

    def _load_existing(self, pattern_id: str) -> list[dict]:
        """Load existing atomic controls with the same pattern_id from the DB.

        Results are cached per pattern_id until ``index_control`` registers
        a new control for that pattern.
        """
        if pattern_id in self._cache:
            return self._cache[pattern_id]

        from sqlalchemy import text

        rows = self.db.execute(text("""
            SELECT id::text, control_id, title, objective,
                   pattern_id,
                   generation_metadata->>'obligation_type' as obligation_type
            FROM canonical_controls
            WHERE parent_control_uuid IS NOT NULL
              AND release_state != 'deprecated'
              AND pattern_id = :pid
        """), {"pid": pattern_id}).fetchall()

        result = [
            {
                "uuid": r[0], "control_id": r[1], "title": r[2],
                "objective": r[3], "pattern_id": r[4],
                "obligation_type": r[5],
            }
            for r in rows
        ]
        self._cache[pattern_id] = result
        return result

    async def check_duplicate(
        self,
        action: str,
        obj: str,
        title: str,
        pattern_id: Optional[str],
    ) -> DedupResult:
        """Run the 4-stage dedup pipeline + cross-regulation linking.

        Every "new" verdict reached after an embedding was computed gets a
        second chance in ``_check_cross_regulation``.

        Args:
            action: Raw action verb of the candidate.
            obj: Raw object of the candidate.
            title: Candidate title, used as extra embedding context.
            pattern_id: Pattern of the candidate; without it no dedup runs.

        Returns:
            DedupResult with verdict: new/link/review.
        """
        # No pattern_id → can't dedup meaningfully
        if not pattern_id:
            return DedupResult(verdict="new", stage="no_pattern")

        # Stage 1: Pattern-Gate
        existing = self._load_existing(pattern_id)
        if not existing:
            return DedupResult(
                verdict="new", stage="pattern_gate",
                details={"reason": "no existing controls with this pattern_id"},
            )

        # Stages 2 + 3: the DB rows carry no normalized action/object, so
        # both are compared against the payload of the best Qdrant hit below.
        norm_action = normalize_action(action)
        norm_object = normalize_object(obj)

        # Stage 4: Embedding Similarity
        canonical = canonicalize_text(action, obj, title)
        embedding = await self._embed(canonical)
        if not embedding:
            # Can't compute embedding → default to new
            return DedupResult(
                verdict="new", stage="embedding_unavailable",
                details={"canonical_text": canonical},
            )

        # Search Qdrant
        results = await self._search(embedding, pattern_id, top_k=5)

        if not results:
            # No intra-pattern matches → try cross-regulation
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="no_qdrant_matches",
                details={"canonical_text": canonical, "action": norm_action, "object": norm_object},
            ), pattern_id=pattern_id)

        # Evaluate best match
        best = results[0]
        best_score = best.get("score", 0.0)
        best_payload = best.get("payload") or {}  # tolerate a null payload
        best_action = best_payload.get("action_normalized", "")
        best_object = best_payload.get("object_normalized", "")

        # Action differs → NEW (even if embedding is high)
        if best_action and norm_action and best_action != norm_action:
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="action_mismatch",
                similarity_score=best_score,
                matched_control_id=best_payload.get("control_id"),
                details={
                    "candidate_action": norm_action,
                    "existing_action": best_action,
                    "similarity": best_score,
                },
            ), pattern_id=pattern_id)

        # Object differs → use higher threshold
        if best_object and norm_object and best_object != norm_object:
            if best_score > LINK_THRESHOLD_DIFF_OBJECT:
                return DedupResult(
                    verdict="link", stage="embedding_diff_object",
                    matched_control_uuid=best_payload.get("control_uuid"),
                    matched_control_id=best_payload.get("control_id"),
                    matched_title=best_payload.get("title"),
                    similarity_score=best_score,
                    details={"candidate_object": norm_object, "existing_object": best_object},
                )
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="object_mismatch_below_threshold",
                similarity_score=best_score,
                matched_control_id=best_payload.get("control_id"),
                details={
                    "candidate_object": norm_object,
                    "existing_object": best_object,
                    "threshold": LINK_THRESHOLD_DIFF_OBJECT,
                },
            ), pattern_id=pattern_id)

        # Same action + same object → tiered thresholds
        if best_score > LINK_THRESHOLD:
            return DedupResult(
                verdict="link", stage="embedding_match",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
            )
        if best_score > REVIEW_THRESHOLD:
            return DedupResult(
                verdict="review", stage="embedding_review",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
            )
        return await self._check_cross_regulation(embedding, DedupResult(
            verdict="new", stage="embedding_below_threshold",
            similarity_score=best_score,
            details={"threshold": REVIEW_THRESHOLD},
        ), pattern_id=pattern_id)

    async def _check_cross_regulation(
        self,
        embedding: list[float],
        intra_result: DedupResult,
        pattern_id: Optional[str] = None,
    ) -> DedupResult:
        """Second pass: cross-regulation linking for controls deemed 'new'.

        Searches Qdrant WITHOUT pattern_id filter and links only above
        CROSS_REG_LINK_THRESHOLD.

        Args:
            embedding: Candidate vector from the intra-pattern pass.
            intra_result: Verdict of the intra-pattern pass; returned
                unchanged unless a cross-regulation link is found.
            pattern_id: Pattern of the candidate. Hits carrying the same
                pattern_id are skipped: the intra-pattern pass has already
                judged them, and re-linking them here would override its
                hard gates (e.g. an action mismatch).

        Returns:
            A "link" result with link_type "cross_regulation", or
            ``intra_result``.
        """
        if intra_result.verdict != "new" or not embedding:
            return intra_result

        cross_results = await qdrant_search_cross_regulation(embedding, top_k=5)
        if not cross_results:
            return intra_result

        foreign_hits = []
        for hit in cross_results:
            payload = hit.get("payload") or {}
            if pattern_id and payload.get("pattern_id") == pattern_id:
                continue
            foreign_hits.append((hit, payload))
        if not foreign_hits:
            return intra_result

        # Hits are taken in the order returned; like the intra-pattern pass,
        # the first one is treated as the best match.
        best, best_payload = foreign_hits[0]
        best_score = best.get("score", 0.0)
        if best_score > CROSS_REG_LINK_THRESHOLD:
            return DedupResult(
                verdict="link",
                stage="cross_regulation",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
                link_type="cross_regulation",
                details={
                    "cross_reg_score": best_score,
                    "cross_reg_threshold": CROSS_REG_LINK_THRESHOLD,
                },
            )

        return intra_result

    def add_parent_link(
        self,
        control_uuid: str,
        parent_control_uuid: str,
        link_type: str = "dedup_merge",
        confidence: float = 0.0,
        source_regulation: Optional[str] = None,
        source_article: Optional[str] = None,
        obligation_candidate_id: Optional[str] = None,
    ) -> None:
        """Add a parent link to an existing atomic control.

        Inserts one row into ``control_parent_links`` and commits. An
        existing (control_uuid, parent_control_uuid) pair is left untouched.
        """
        from sqlalchemy import text

        # CAST(... AS uuid) instead of ":oci::uuid": a "::" directly after a
        # bind parameter is ambiguous for text()'s parameter parsing.
        self.db.execute(text("""
            INSERT INTO control_parent_links
                (control_uuid, parent_control_uuid, link_type, confidence,
                 source_regulation, source_article, obligation_candidate_id)
            VALUES (:cu, :pu, :lt, :conf, :sr, :sa, CAST(:oci AS uuid))
            ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
        """), {
            "cu": control_uuid,
            "pu": parent_control_uuid,
            "lt": link_type,
            "conf": confidence,
            "sr": source_regulation,
            "sa": source_article,
            "oci": obligation_candidate_id,
        })
        self.db.commit()

    def write_review(
        self,
        candidate_control_id: str,
        candidate_title: str,
        candidate_objective: str,
        result: DedupResult,
        parent_control_uuid: Optional[str] = None,
        obligation_candidate_id: Optional[str] = None,
    ) -> None:
        """Write a dedup review queue entry.

        Inserts one row into ``control_dedup_reviews`` describing the
        candidate and the match found by the dedup check, then commits.
        ``result.details`` is stored as JSON.
        """
        import json

        from sqlalchemy import text

        # CAST(... AS type) instead of ":param::type": a "::" directly after
        # a bind parameter is ambiguous for text()'s parameter parsing.
        self.db.execute(text("""
            INSERT INTO control_dedup_reviews
                (candidate_control_id, candidate_title, candidate_objective,
                 matched_control_uuid, matched_control_id,
                 similarity_score, dedup_stage, dedup_details,
                 parent_control_uuid, obligation_candidate_id)
            VALUES (:ccid, :ct, :co, CAST(:mcu AS uuid), :mci, :ss, :ds,
                    CAST(:dd AS jsonb), CAST(:pcu AS uuid), :oci)
        """), {
            "ccid": candidate_control_id,
            "ct": candidate_title,
            "co": candidate_objective,
            "mcu": result.matched_control_uuid,
            "mci": result.matched_control_id,
            "ss": result.similarity_score,
            "ds": result.stage,
            "dd": json.dumps(result.details),
            "pcu": parent_control_uuid,
            "oci": obligation_candidate_id,
        })
        self.db.commit()

    async def index_control(
        self,
        control_uuid: str,
        control_id: str,
        title: str,
        action: str,
        obj: str,
        pattern_id: str,
    ) -> bool:
        """Index a new atomic control in Qdrant for future dedup checks.

        Also drops the cached DB lookup for ``pattern_id`` so that the
        pattern gate re-reads the database on the next check instead of
        reusing a result from before this control existed.

        Returns:
            ``True`` when the point was upserted, ``False`` when no
            embedding could be computed or the upsert failed.
        """
        self._cache.pop(pattern_id, None)

        norm_action = normalize_action(action)
        norm_object = normalize_object(obj)
        canonical = canonicalize_text(action, obj, title)
        embedding = await self._embed(canonical)
        if not embedding:
            return False
        return await qdrant_upsert(
            point_id=control_uuid,
            embedding=embedding,
            payload={
                "control_uuid": control_uuid,
                "control_id": control_id,
                "title": title,
                "pattern_id": pattern_id,
                "action_normalized": norm_action,
                "object_normalized": norm_object,
                "canonical_text": canonical,
            },
        )
|