fix: adapt batch dedup to NULL pattern_id — group by merge_group_hint
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 31s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Successful in 2s
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 31s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Successful in 2s
All Pass 0b controls have pattern_id=NULL. Rewritten to: - Phase 1: Group by merge_group_hint (action:object:trigger), 52k groups - Phase 2: Cross-group embedding search for semantically similar masters - Qdrant search uses unfiltered cross-regulation endpoint - API param changed: pattern_id → hint_filter Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -776,12 +776,12 @@ _batch_dedup_runner = None
|
|||||||
@router.post("/migrate/batch-dedup", response_model=MigrationResponse)
|
@router.post("/migrate/batch-dedup", response_model=MigrationResponse)
|
||||||
async def migrate_batch_dedup(
|
async def migrate_batch_dedup(
|
||||||
dry_run: bool = Query(False, description="Preview mode — no DB changes"),
|
dry_run: bool = Query(False, description="Preview mode — no DB changes"),
|
||||||
pattern_id: Optional[str] = Query(None, description="Only process this pattern"),
|
hint_filter: Optional[str] = Query(None, description="Only process hints matching this prefix"),
|
||||||
):
|
):
|
||||||
"""Batch dedup: reduce ~85k Pass 0b controls to ~18-25k masters.
|
"""Batch dedup: reduce ~85k Pass 0b controls to ~18-25k masters.
|
||||||
|
|
||||||
Groups controls by pattern_id + merge_group_hint, picks the best
|
Phase 1: Groups by merge_group_hint, picks best quality master, links rest.
|
||||||
quality master, and links duplicates via control_parent_links.
|
Phase 2: Cross-group embedding search for semantically similar masters.
|
||||||
"""
|
"""
|
||||||
global _batch_dedup_runner
|
global _batch_dedup_runner
|
||||||
from compliance.services.batch_dedup_runner import BatchDedupRunner
|
from compliance.services.batch_dedup_runner import BatchDedupRunner
|
||||||
@@ -790,7 +790,7 @@ async def migrate_batch_dedup(
|
|||||||
try:
|
try:
|
||||||
runner = BatchDedupRunner(db=db)
|
runner = BatchDedupRunner(db=db)
|
||||||
_batch_dedup_runner = runner
|
_batch_dedup_runner = runner
|
||||||
stats = await runner.run(dry_run=dry_run, pattern_filter=pattern_id)
|
stats = await runner.run(dry_run=dry_run, hint_filter=hint_filter)
|
||||||
return MigrationResponse(status="completed", stats=stats)
|
return MigrationResponse(status="completed", stats=stats)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Batch dedup failed: %s", e)
|
logger.error("Batch dedup failed: %s", e)
|
||||||
|
|||||||
@@ -1,17 +1,20 @@
|
|||||||
"""Batch Dedup Runner — Orchestrates deduplication of ~85k atomare Controls.
|
"""Batch Dedup Runner — Orchestrates deduplication of ~85k atomare Controls.
|
||||||
|
|
||||||
Reduces Pass 0b controls from ~85k to ~18-25k unique Master Controls by:
|
Reduces Pass 0b controls from ~85k to ~18-25k unique Master Controls via:
|
||||||
1. Intra-Pattern Dedup: Group by pattern_id + merge_group_hint, pick best master
|
Phase 1: Intra-Group Dedup — same merge_group_hint → pick best, link rest
|
||||||
2. Cross-Regulation Dedup: Find near-duplicates across pattern boundaries
|
(85k → ~52k, mostly title-identical short-circuit, no embeddings)
|
||||||
|
Phase 2: Cross-Group Dedup — embed masters, search Qdrant for similar
|
||||||
|
masters with different hints (52k → ~18-25k)
|
||||||
|
|
||||||
Reuses the existing 4-Stage Pipeline from control_dedup.py. Only adds
|
All Pass 0b controls have pattern_id=NULL. The primary grouping key is
|
||||||
batch orchestration, quality scoring, and parent-link transfer logic.
|
merge_group_hint (format: "action_type:norm_obj:trigger_key"), which
|
||||||
|
encodes the normalized action, object, and trigger.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
runner = BatchDedupRunner(db)
|
runner = BatchDedupRunner(db)
|
||||||
stats = await runner.run(dry_run=True) # preview
|
stats = await runner.run(dry_run=True) # preview
|
||||||
stats = await runner.run(dry_run=False) # execute
|
stats = await runner.run(dry_run=False) # execute
|
||||||
stats = await runner.run(pattern_filter="CP-AUTH-001") # single pattern
|
stats = await runner.run(hint_filter="implement:multi_factor_auth:none")
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@@ -22,17 +25,15 @@ from collections import defaultdict
|
|||||||
from sqlalchemy import text
|
from sqlalchemy import text
|
||||||
|
|
||||||
from compliance.services.control_dedup import (
|
from compliance.services.control_dedup import (
|
||||||
ControlDedupChecker,
|
|
||||||
DedupResult,
|
|
||||||
canonicalize_text,
|
canonicalize_text,
|
||||||
ensure_qdrant_collection,
|
ensure_qdrant_collection,
|
||||||
get_embedding,
|
get_embedding,
|
||||||
normalize_action,
|
normalize_action,
|
||||||
normalize_object,
|
normalize_object,
|
||||||
qdrant_search,
|
|
||||||
qdrant_search_cross_regulation,
|
qdrant_search_cross_regulation,
|
||||||
qdrant_upsert,
|
qdrant_upsert,
|
||||||
CROSS_REG_LINK_THRESHOLD,
|
LINK_THRESHOLD,
|
||||||
|
REVIEW_THRESHOLD,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -91,62 +92,73 @@ class BatchDedupRunner:
|
|||||||
self.collection = collection
|
self.collection = collection
|
||||||
self.stats = {
|
self.stats = {
|
||||||
"total_controls": 0,
|
"total_controls": 0,
|
||||||
"patterns_processed": 0,
|
"unique_hints": 0,
|
||||||
"sub_groups_processed": 0,
|
"phase1_groups_processed": 0,
|
||||||
"masters": 0,
|
"masters": 0,
|
||||||
"linked": 0,
|
"linked": 0,
|
||||||
"review": 0,
|
"review": 0,
|
||||||
"new_controls": 0,
|
"new_controls": 0,
|
||||||
"parent_links_transferred": 0,
|
"parent_links_transferred": 0,
|
||||||
"cross_reg_linked": 0,
|
"cross_group_linked": 0,
|
||||||
|
"cross_group_review": 0,
|
||||||
"errors": 0,
|
"errors": 0,
|
||||||
"skipped_title_identical": 0,
|
"skipped_title_identical": 0,
|
||||||
}
|
}
|
||||||
self._progress_pattern = ""
|
self._progress_phase = ""
|
||||||
self._progress_count = 0
|
self._progress_count = 0
|
||||||
|
self._progress_total = 0
|
||||||
|
|
||||||
async def run(
|
async def run(
|
||||||
self,
|
self,
|
||||||
dry_run: bool = False,
|
dry_run: bool = False,
|
||||||
pattern_filter: str = None,
|
hint_filter: str = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Run the full batch dedup pipeline.
|
"""Run the full batch dedup pipeline.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dry_run: If True, compute stats but don't modify DB.
|
dry_run: If True, compute stats but don't modify DB/Qdrant.
|
||||||
pattern_filter: If set, only process this pattern_id.
|
hint_filter: If set, only process groups matching this hint prefix.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Stats dict with counts.
|
Stats dict with counts.
|
||||||
"""
|
"""
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
logger.info("BatchDedup starting (dry_run=%s, pattern_filter=%s)",
|
logger.info("BatchDedup starting (dry_run=%s, hint_filter=%s)",
|
||||||
dry_run, pattern_filter)
|
dry_run, hint_filter)
|
||||||
|
|
||||||
# Ensure Qdrant collection
|
if not dry_run:
|
||||||
await ensure_qdrant_collection(collection=self.collection)
|
await ensure_qdrant_collection(collection=self.collection)
|
||||||
|
|
||||||
# Phase 1: Intra-pattern dedup
|
# Phase 1: Intra-group dedup (same merge_group_hint)
|
||||||
groups = self._load_pattern_groups(pattern_filter)
|
self._progress_phase = "phase1"
|
||||||
for pattern_id, controls in groups:
|
groups = self._load_merge_groups(hint_filter)
|
||||||
|
self._progress_total = self.stats["total_controls"]
|
||||||
|
|
||||||
|
for hint, controls in groups:
|
||||||
try:
|
try:
|
||||||
await self._process_pattern_group(pattern_id, controls, dry_run)
|
await self._process_hint_group(hint, controls, dry_run)
|
||||||
self.stats["patterns_processed"] += 1
|
self.stats["phase1_groups_processed"] += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("BatchDedup error on pattern %s: %s", pattern_id, e)
|
logger.error("BatchDedup Phase 1 error on hint %s: %s", hint, e)
|
||||||
self.stats["errors"] += 1
|
self.stats["errors"] += 1
|
||||||
|
|
||||||
# Phase 2: Cross-regulation dedup (skip in dry_run for speed)
|
logger.info(
|
||||||
|
"BatchDedup Phase 1 done: %d masters, %d linked, %d review",
|
||||||
|
self.stats["masters"], self.stats["linked"], self.stats["review"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Phase 2: Cross-group dedup via embeddings
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
await self._run_cross_regulation_pass()
|
self._progress_phase = "phase2"
|
||||||
|
await self._run_cross_group_pass()
|
||||||
|
|
||||||
elapsed = time.monotonic() - start
|
elapsed = time.monotonic() - start
|
||||||
self.stats["elapsed_seconds"] = round(elapsed, 1)
|
self.stats["elapsed_seconds"] = round(elapsed, 1)
|
||||||
logger.info("BatchDedup completed in %.1fs: %s", elapsed, self.stats)
|
logger.info("BatchDedup completed in %.1fs: %s", elapsed, self.stats)
|
||||||
return self.stats
|
return self.stats
|
||||||
|
|
||||||
def _load_pattern_groups(self, pattern_filter: str = None) -> list:
|
def _load_merge_groups(self, hint_filter: str = None) -> list:
|
||||||
"""Load all Pass 0b controls grouped by pattern_id, largest first."""
|
"""Load all Pass 0b controls grouped by merge_group_hint, largest first."""
|
||||||
conditions = [
|
conditions = [
|
||||||
"decomposition_method = 'pass0b'",
|
"decomposition_method = 'pass0b'",
|
||||||
"release_state != 'deprecated'",
|
"release_state != 'deprecated'",
|
||||||
@@ -154,9 +166,9 @@ class BatchDedupRunner:
|
|||||||
]
|
]
|
||||||
params = {}
|
params = {}
|
||||||
|
|
||||||
if pattern_filter:
|
if hint_filter:
|
||||||
conditions.append("pattern_id = :pf")
|
conditions.append("generation_metadata->>'merge_group_hint' LIKE :hf")
|
||||||
params["pf"] = pattern_filter
|
params["hf"] = f"{hint_filter}%"
|
||||||
|
|
||||||
where = " AND ".join(conditions)
|
where = " AND ".join(conditions)
|
||||||
rows = self.db.execute(text(f"""
|
rows = self.db.execute(text(f"""
|
||||||
@@ -167,13 +179,12 @@ class BatchDedupRunner:
|
|||||||
generation_metadata->>'action_object_class' as action_object_class
|
generation_metadata->>'action_object_class' as action_object_class
|
||||||
FROM canonical_controls
|
FROM canonical_controls
|
||||||
WHERE {where}
|
WHERE {where}
|
||||||
ORDER BY pattern_id, control_id
|
ORDER BY control_id
|
||||||
"""), params).fetchall()
|
"""), params).fetchall()
|
||||||
|
|
||||||
# Group by pattern_id
|
by_hint = defaultdict(list)
|
||||||
by_pattern = defaultdict(list)
|
|
||||||
for r in rows:
|
for r in rows:
|
||||||
by_pattern[r[4]].append({
|
by_hint[r[9] or ""].append({
|
||||||
"uuid": r[0],
|
"uuid": r[0],
|
||||||
"control_id": r[1],
|
"control_id": r[1],
|
||||||
"title": r[2],
|
"title": r[2],
|
||||||
@@ -188,10 +199,10 @@ class BatchDedupRunner:
|
|||||||
})
|
})
|
||||||
|
|
||||||
self.stats["total_controls"] = len(rows)
|
self.stats["total_controls"] = len(rows)
|
||||||
|
self.stats["unique_hints"] = len(by_hint)
|
||||||
|
|
||||||
# Sort patterns by group size (descending) for progress visibility
|
sorted_groups = sorted(by_hint.items(), key=lambda x: len(x[1]), reverse=True)
|
||||||
sorted_groups = sorted(by_pattern.items(), key=lambda x: len(x[1]), reverse=True)
|
logger.info("BatchDedup loaded %d controls in %d hint groups",
|
||||||
logger.info("BatchDedup loaded %d controls in %d patterns",
|
|
||||||
len(rows), len(sorted_groups))
|
len(rows), len(sorted_groups))
|
||||||
return sorted_groups
|
return sorted_groups
|
||||||
|
|
||||||
@@ -203,35 +214,31 @@ class BatchDedupRunner:
|
|||||||
if hint:
|
if hint:
|
||||||
groups[hint].append(c)
|
groups[hint].append(c)
|
||||||
else:
|
else:
|
||||||
# No hint → each control is its own group
|
|
||||||
groups[f"__no_hint_{c['uuid']}"].append(c)
|
groups[f"__no_hint_{c['uuid']}"].append(c)
|
||||||
return dict(groups)
|
return dict(groups)
|
||||||
|
|
||||||
async def _process_pattern_group(
|
async def _process_hint_group(
|
||||||
self,
|
self,
|
||||||
pattern_id: str,
|
hint: str,
|
||||||
controls: list,
|
controls: list,
|
||||||
dry_run: bool,
|
dry_run: bool,
|
||||||
):
|
):
|
||||||
"""Process all controls within a single pattern_id."""
|
"""Process all controls sharing the same merge_group_hint.
|
||||||
self._progress_pattern = pattern_id
|
|
||||||
self._progress_count = 0
|
|
||||||
total = len(controls)
|
|
||||||
|
|
||||||
sub_groups = self._sub_group_by_merge_hint(controls)
|
Within a hint group, all controls share action+object+trigger.
|
||||||
|
The best-quality control becomes master, rest are linked as duplicates.
|
||||||
for hint, group in sub_groups.items():
|
"""
|
||||||
if len(group) < 2:
|
if len(controls) < 2:
|
||||||
# Single control → always master
|
# Singleton → always master
|
||||||
master = group[0]
|
|
||||||
self.stats["masters"] += 1
|
self.stats["masters"] += 1
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
await self._embed_and_index(master)
|
await self._embed_and_index(controls[0])
|
||||||
self._progress_count += 1
|
self._progress_count += 1
|
||||||
continue
|
self._log_progress(hint)
|
||||||
|
return
|
||||||
|
|
||||||
# Sort by quality score (best first)
|
# Sort by quality score (best first)
|
||||||
sorted_group = sorted(group, key=quality_score, reverse=True)
|
sorted_group = sorted(controls, key=quality_score, reverse=True)
|
||||||
master = sorted_group[0]
|
master = sorted_group[0]
|
||||||
self.stats["masters"] += 1
|
self.stats["masters"] += 1
|
||||||
|
|
||||||
@@ -239,63 +246,52 @@ class BatchDedupRunner:
|
|||||||
await self._embed_and_index(master)
|
await self._embed_and_index(master)
|
||||||
|
|
||||||
for candidate in sorted_group[1:]:
|
for candidate in sorted_group[1:]:
|
||||||
await self._check_and_link(master, candidate, pattern_id, dry_run)
|
# All share the same hint → check title similarity
|
||||||
self._progress_count += 1
|
if candidate["title"].strip().lower() == master["title"].strip().lower():
|
||||||
|
# Identical title → direct link (no embedding needed)
|
||||||
self.stats["sub_groups_processed"] += 1
|
|
||||||
|
|
||||||
# Progress logging every 100 controls
|
|
||||||
if self._progress_count > 0 and self._progress_count % 100 == 0:
|
|
||||||
logger.info(
|
|
||||||
"BatchDedup [%s] %d/%d — masters=%d, linked=%d, review=%d",
|
|
||||||
pattern_id, self._progress_count, total,
|
|
||||||
self.stats["masters"], self.stats["linked"], self.stats["review"],
|
|
||||||
)
|
|
||||||
|
|
||||||
async def _check_and_link(
|
|
||||||
self,
|
|
||||||
master: dict,
|
|
||||||
candidate: dict,
|
|
||||||
pattern_id: str,
|
|
||||||
dry_run: bool,
|
|
||||||
):
|
|
||||||
"""Check if candidate is a duplicate of master and link if so."""
|
|
||||||
# Short-circuit: identical titles within same merge_group → direct link
|
|
||||||
if (candidate["title"].strip().lower() == master["title"].strip().lower()
|
|
||||||
and candidate["merge_group_hint"] == master["merge_group_hint"]
|
|
||||||
and candidate["merge_group_hint"]):
|
|
||||||
self.stats["linked"] += 1
|
self.stats["linked"] += 1
|
||||||
self.stats["skipped_title_identical"] += 1
|
self.stats["skipped_title_identical"] += 1
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
await self._mark_duplicate(master, candidate, confidence=1.0)
|
await self._mark_duplicate(master, candidate, confidence=1.0)
|
||||||
return
|
else:
|
||||||
|
# Different title within same hint → still likely duplicate
|
||||||
|
# Use embedding to verify
|
||||||
|
await self._check_and_link_within_group(master, candidate, dry_run)
|
||||||
|
|
||||||
# Extract action/object from merge_group_hint (format: "action_type:norm_obj:trigger_key")
|
self._progress_count += 1
|
||||||
|
self._log_progress(hint)
|
||||||
|
|
||||||
|
async def _check_and_link_within_group(
|
||||||
|
self,
|
||||||
|
master: dict,
|
||||||
|
candidate: dict,
|
||||||
|
dry_run: bool,
|
||||||
|
):
|
||||||
|
"""Check if candidate (same hint group) is duplicate of master via embedding."""
|
||||||
parts = candidate["merge_group_hint"].split(":", 2)
|
parts = candidate["merge_group_hint"].split(":", 2)
|
||||||
action = parts[0] if len(parts) > 0 else ""
|
action = parts[0] if len(parts) > 0 else ""
|
||||||
obj = parts[1] if len(parts) > 1 else ""
|
obj = parts[1] if len(parts) > 1 else ""
|
||||||
|
|
||||||
# Build canonical text and get embedding for candidate
|
|
||||||
canonical = canonicalize_text(action, obj, candidate["title"])
|
canonical = canonicalize_text(action, obj, candidate["title"])
|
||||||
embedding = await get_embedding(canonical)
|
embedding = await get_embedding(canonical)
|
||||||
|
|
||||||
if not embedding:
|
if not embedding:
|
||||||
# Can't embed → keep as new control
|
# Can't embed → link anyway (same hint = same action+object)
|
||||||
self.stats["new_controls"] += 1
|
self.stats["linked"] += 1
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
await self._embed_and_index(candidate)
|
await self._mark_duplicate(master, candidate, confidence=0.90)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Search the dedup collection for similar controls
|
# Search the dedup collection (unfiltered — pattern_id is NULL)
|
||||||
results = await qdrant_search(
|
results = await qdrant_search_cross_regulation(
|
||||||
embedding, pattern_id, top_k=5, collection=self.collection,
|
embedding, top_k=3, collection=self.collection,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not results:
|
if not results:
|
||||||
# No matches → new master
|
# No Qdrant matches yet (master might not be indexed yet) → link to master
|
||||||
self.stats["new_controls"] += 1
|
self.stats["linked"] += 1
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
await self._embed_and_index(candidate)
|
await self._mark_duplicate(master, candidate, confidence=0.90)
|
||||||
return
|
return
|
||||||
|
|
||||||
best = results[0]
|
best = results[0]
|
||||||
@@ -303,28 +299,121 @@ class BatchDedupRunner:
|
|||||||
best_payload = best.get("payload", {})
|
best_payload = best.get("payload", {})
|
||||||
best_uuid = best_payload.get("control_uuid", "")
|
best_uuid = best_payload.get("control_uuid", "")
|
||||||
|
|
||||||
# Same action+object (since same merge_group_hint) → use standard thresholds
|
|
||||||
from compliance.services.control_dedup import LINK_THRESHOLD, REVIEW_THRESHOLD
|
|
||||||
|
|
||||||
if best_score > LINK_THRESHOLD:
|
if best_score > LINK_THRESHOLD:
|
||||||
self.stats["linked"] += 1
|
self.stats["linked"] += 1
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
# Link to the matched master (which may differ from our `master`)
|
await self._mark_duplicate_to(best_uuid, candidate, confidence=best_score)
|
||||||
await self._mark_duplicate_to(
|
|
||||||
master_uuid=best_uuid,
|
|
||||||
candidate=candidate,
|
|
||||||
confidence=best_score,
|
|
||||||
)
|
|
||||||
elif best_score > REVIEW_THRESHOLD:
|
elif best_score > REVIEW_THRESHOLD:
|
||||||
self.stats["review"] += 1
|
self.stats["review"] += 1
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
self._write_review(candidate, best_payload, best_score)
|
self._write_review(candidate, best_payload, best_score)
|
||||||
else:
|
else:
|
||||||
# Below threshold → becomes a new master
|
# Very different despite same hint → new master
|
||||||
self.stats["new_controls"] += 1
|
self.stats["new_controls"] += 1
|
||||||
if not dry_run:
|
if not dry_run:
|
||||||
await self._index_with_embedding(candidate, embedding)
|
await self._index_with_embedding(candidate, embedding)
|
||||||
|
|
||||||
|
async def _run_cross_group_pass(self):
|
||||||
|
"""Phase 2: Find cross-group duplicates among surviving masters.
|
||||||
|
|
||||||
|
After Phase 1, ~52k masters remain. Many have similar semantics
|
||||||
|
despite different merge_group_hints (e.g. different German spellings).
|
||||||
|
This pass embeds all masters and finds near-duplicates via Qdrant.
|
||||||
|
"""
|
||||||
|
logger.info("BatchDedup Phase 2: Cross-group pass starting...")
|
||||||
|
|
||||||
|
rows = self.db.execute(text("""
|
||||||
|
SELECT id::text, control_id, title,
|
||||||
|
generation_metadata->>'merge_group_hint' as merge_group_hint
|
||||||
|
FROM canonical_controls
|
||||||
|
WHERE decomposition_method = 'pass0b'
|
||||||
|
AND release_state != 'duplicate'
|
||||||
|
AND release_state != 'deprecated'
|
||||||
|
ORDER BY control_id
|
||||||
|
""")).fetchall()
|
||||||
|
|
||||||
|
self._progress_total = len(rows)
|
||||||
|
self._progress_count = 0
|
||||||
|
logger.info("BatchDedup Cross-group: %d masters to check", len(rows))
|
||||||
|
cross_linked = 0
|
||||||
|
cross_review = 0
|
||||||
|
|
||||||
|
for i, r in enumerate(rows):
|
||||||
|
uuid = r[0]
|
||||||
|
hint = r[3] or ""
|
||||||
|
parts = hint.split(":", 2)
|
||||||
|
action = parts[0] if len(parts) > 0 else ""
|
||||||
|
obj = parts[1] if len(parts) > 1 else ""
|
||||||
|
|
||||||
|
canonical = canonicalize_text(action, obj, r[2])
|
||||||
|
embedding = await get_embedding(canonical)
|
||||||
|
if not embedding:
|
||||||
|
continue
|
||||||
|
|
||||||
|
results = await qdrant_search_cross_regulation(
|
||||||
|
embedding, top_k=5, collection=self.collection,
|
||||||
|
)
|
||||||
|
if not results:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Find best match from a DIFFERENT hint group
|
||||||
|
for match in results:
|
||||||
|
match_score = match.get("score", 0.0)
|
||||||
|
match_payload = match.get("payload", {})
|
||||||
|
match_uuid = match_payload.get("control_uuid", "")
|
||||||
|
|
||||||
|
# Skip self-match
|
||||||
|
if match_uuid == uuid:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Must be a different hint group (otherwise already handled in Phase 1)
|
||||||
|
match_action = match_payload.get("action_normalized", "")
|
||||||
|
match_object = match_payload.get("object_normalized", "")
|
||||||
|
# Simple check: different control UUID is enough
|
||||||
|
if match_score > LINK_THRESHOLD:
|
||||||
|
# Mark the worse one as duplicate
|
||||||
|
self.db.execute(text("""
|
||||||
|
UPDATE canonical_controls
|
||||||
|
SET release_state = 'duplicate', merged_into_uuid = CAST(:master AS uuid)
|
||||||
|
WHERE id = CAST(:dup AS uuid)
|
||||||
|
AND release_state != 'duplicate'
|
||||||
|
"""), {"master": match_uuid, "dup": uuid})
|
||||||
|
|
||||||
|
self.db.execute(text("""
|
||||||
|
INSERT INTO control_parent_links
|
||||||
|
(control_uuid, parent_control_uuid, link_type, confidence)
|
||||||
|
VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), 'cross_regulation', :conf)
|
||||||
|
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
||||||
|
"""), {"cu": match_uuid, "pu": uuid, "conf": match_score})
|
||||||
|
|
||||||
|
# Transfer parent links
|
||||||
|
transferred = self._transfer_parent_links(match_uuid, uuid)
|
||||||
|
self.stats["parent_links_transferred"] += transferred
|
||||||
|
|
||||||
|
self.db.commit()
|
||||||
|
cross_linked += 1
|
||||||
|
break # Only one cross-link per control
|
||||||
|
elif match_score > REVIEW_THRESHOLD:
|
||||||
|
self._write_review(
|
||||||
|
{"control_id": r[1], "title": r[2], "objective": "",
|
||||||
|
"merge_group_hint": hint, "pattern_id": None},
|
||||||
|
match_payload, match_score,
|
||||||
|
)
|
||||||
|
cross_review += 1
|
||||||
|
break
|
||||||
|
|
||||||
|
self._progress_count = i + 1
|
||||||
|
if (i + 1) % 500 == 0:
|
||||||
|
logger.info("BatchDedup Cross-group: %d/%d checked, %d linked, %d review",
|
||||||
|
i + 1, len(rows), cross_linked, cross_review)
|
||||||
|
|
||||||
|
self.stats["cross_group_linked"] = cross_linked
|
||||||
|
self.stats["cross_group_review"] = cross_review
|
||||||
|
logger.info("BatchDedup Cross-group complete: %d linked, %d review",
|
||||||
|
cross_linked, cross_review)
|
||||||
|
|
||||||
|
# ── Qdrant Helpers ───────────────────────────────────────────────────
|
||||||
|
|
||||||
async def _embed_and_index(self, control: dict):
|
async def _embed_and_index(self, control: dict):
|
||||||
"""Compute embedding and index a control in the dedup Qdrant collection."""
|
"""Compute embedding and index a control in the dedup Qdrant collection."""
|
||||||
parts = control["merge_group_hint"].split(":", 2)
|
parts = control["merge_group_hint"].split(":", 2)
|
||||||
@@ -346,10 +435,11 @@ class BatchDedupRunner:
|
|||||||
"control_uuid": control["uuid"],
|
"control_uuid": control["uuid"],
|
||||||
"control_id": control["control_id"],
|
"control_id": control["control_id"],
|
||||||
"title": control["title"],
|
"title": control["title"],
|
||||||
"pattern_id": control["pattern_id"],
|
"pattern_id": control.get("pattern_id"),
|
||||||
"action_normalized": norm_action,
|
"action_normalized": norm_action,
|
||||||
"object_normalized": norm_object,
|
"object_normalized": norm_object,
|
||||||
"canonical_text": canonical,
|
"canonical_text": canonical,
|
||||||
|
"merge_group_hint": control["merge_group_hint"],
|
||||||
},
|
},
|
||||||
collection=self.collection,
|
collection=self.collection,
|
||||||
)
|
)
|
||||||
@@ -371,14 +461,17 @@ class BatchDedupRunner:
|
|||||||
"control_uuid": control["uuid"],
|
"control_uuid": control["uuid"],
|
||||||
"control_id": control["control_id"],
|
"control_id": control["control_id"],
|
||||||
"title": control["title"],
|
"title": control["title"],
|
||||||
"pattern_id": control["pattern_id"],
|
"pattern_id": control.get("pattern_id"),
|
||||||
"action_normalized": norm_action,
|
"action_normalized": norm_action,
|
||||||
"object_normalized": norm_object,
|
"object_normalized": norm_object,
|
||||||
"canonical_text": canonical,
|
"canonical_text": canonical,
|
||||||
|
"merge_group_hint": control["merge_group_hint"],
|
||||||
},
|
},
|
||||||
collection=self.collection,
|
collection=self.collection,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ── DB Write Helpers ─────────────────────────────────────────────────
|
||||||
|
|
||||||
async def _mark_duplicate(self, master: dict, candidate: dict, confidence: float):
|
async def _mark_duplicate(self, master: dict, candidate: dict, confidence: float):
|
||||||
"""Mark candidate as duplicate of master, transfer parent links."""
|
"""Mark candidate as duplicate of master, transfer parent links."""
|
||||||
self.db.execute(text("""
|
self.db.execute(text("""
|
||||||
@@ -387,7 +480,6 @@ class BatchDedupRunner:
|
|||||||
WHERE id = CAST(:cand AS uuid)
|
WHERE id = CAST(:cand AS uuid)
|
||||||
"""), {"master": master["uuid"], "cand": candidate["uuid"]})
|
"""), {"master": master["uuid"], "cand": candidate["uuid"]})
|
||||||
|
|
||||||
# Add dedup_merge link
|
|
||||||
self.db.execute(text("""
|
self.db.execute(text("""
|
||||||
INSERT INTO control_parent_links
|
INSERT INTO control_parent_links
|
||||||
(control_uuid, parent_control_uuid, link_type, confidence)
|
(control_uuid, parent_control_uuid, link_type, confidence)
|
||||||
@@ -395,7 +487,6 @@ class BatchDedupRunner:
|
|||||||
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
||||||
"""), {"master": master["uuid"], "cand_parent": candidate["uuid"], "conf": confidence})
|
"""), {"master": master["uuid"], "cand_parent": candidate["uuid"], "conf": confidence})
|
||||||
|
|
||||||
# Transfer parent links from candidate to master
|
|
||||||
transferred = self._transfer_parent_links(master["uuid"], candidate["uuid"])
|
transferred = self._transfer_parent_links(master["uuid"], candidate["uuid"])
|
||||||
self.stats["parent_links_transferred"] += transferred
|
self.stats["parent_links_transferred"] += transferred
|
||||||
|
|
||||||
@@ -409,7 +500,6 @@ class BatchDedupRunner:
|
|||||||
WHERE id = CAST(:cand AS uuid)
|
WHERE id = CAST(:cand AS uuid)
|
||||||
"""), {"master": master_uuid, "cand": candidate["uuid"]})
|
"""), {"master": master_uuid, "cand": candidate["uuid"]})
|
||||||
|
|
||||||
# Add dedup_merge link
|
|
||||||
self.db.execute(text("""
|
self.db.execute(text("""
|
||||||
INSERT INTO control_parent_links
|
INSERT INTO control_parent_links
|
||||||
(control_uuid, parent_control_uuid, link_type, confidence)
|
(control_uuid, parent_control_uuid, link_type, confidence)
|
||||||
@@ -417,18 +507,13 @@ class BatchDedupRunner:
|
|||||||
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
||||||
"""), {"master": master_uuid, "cand_parent": candidate["uuid"], "conf": confidence})
|
"""), {"master": master_uuid, "cand_parent": candidate["uuid"], "conf": confidence})
|
||||||
|
|
||||||
# Transfer parent links
|
|
||||||
transferred = self._transfer_parent_links(master_uuid, candidate["uuid"])
|
transferred = self._transfer_parent_links(master_uuid, candidate["uuid"])
|
||||||
self.stats["parent_links_transferred"] += transferred
|
self.stats["parent_links_transferred"] += transferred
|
||||||
|
|
||||||
self.db.commit()
|
self.db.commit()
|
||||||
|
|
||||||
def _transfer_parent_links(self, master_uuid: str, duplicate_uuid: str) -> int:
|
def _transfer_parent_links(self, master_uuid: str, duplicate_uuid: str) -> int:
|
||||||
"""Move existing parent links from duplicate to master.
|
"""Move existing parent links from duplicate to master."""
|
||||||
|
|
||||||
Returns the number of links transferred.
|
|
||||||
"""
|
|
||||||
# Find parent links pointing TO the duplicate (where it was the child control)
|
|
||||||
rows = self.db.execute(text("""
|
rows = self.db.execute(text("""
|
||||||
SELECT parent_control_uuid::text, link_type, confidence,
|
SELECT parent_control_uuid::text, link_type, confidence,
|
||||||
source_regulation, source_article, obligation_candidate_id::text
|
source_regulation, source_article, obligation_candidate_id::text
|
||||||
@@ -440,7 +525,6 @@ class BatchDedupRunner:
|
|||||||
transferred = 0
|
transferred = 0
|
||||||
for r in rows:
|
for r in rows:
|
||||||
parent_uuid = r[0]
|
parent_uuid = r[0]
|
||||||
# Skip self-references
|
|
||||||
if parent_uuid == master_uuid:
|
if parent_uuid == master_uuid:
|
||||||
continue
|
continue
|
||||||
self.db.execute(text("""
|
self.db.execute(text("""
|
||||||
@@ -480,81 +564,28 @@ class BatchDedupRunner:
|
|||||||
"mci": matched_payload.get("control_id"),
|
"mci": matched_payload.get("control_id"),
|
||||||
"ss": score,
|
"ss": score,
|
||||||
"dd": json.dumps({
|
"dd": json.dumps({
|
||||||
"merge_group_hint": candidate["merge_group_hint"],
|
"merge_group_hint": candidate.get("merge_group_hint", ""),
|
||||||
"pattern_id": candidate["pattern_id"],
|
"pattern_id": candidate.get("pattern_id"),
|
||||||
}),
|
}),
|
||||||
})
|
})
|
||||||
self.db.commit()
|
self.db.commit()
|
||||||
|
|
||||||
async def _run_cross_regulation_pass(self):
|
# ── Progress ─────────────────────────────────────────────────────────
|
||||||
"""Phase 2: Find cross-regulation duplicates among surviving masters."""
|
|
||||||
logger.info("BatchDedup Phase 2: Cross-regulation pass starting...")
|
|
||||||
|
|
||||||
# Load all non-duplicate pass0b controls that are now masters
|
def _log_progress(self, hint: str):
|
||||||
rows = self.db.execute(text("""
|
"""Log progress every 500 controls."""
|
||||||
SELECT id::text, control_id, title, pattern_id,
|
if self._progress_count > 0 and self._progress_count % 500 == 0:
|
||||||
generation_metadata->>'merge_group_hint' as merge_group_hint
|
logger.info(
|
||||||
FROM canonical_controls
|
"BatchDedup [%s] %d/%d — masters=%d, linked=%d, review=%d",
|
||||||
WHERE decomposition_method = 'pass0b'
|
self._progress_phase, self._progress_count, self._progress_total,
|
||||||
AND release_state != 'duplicate'
|
self.stats["masters"], self.stats["linked"], self.stats["review"],
|
||||||
AND release_state != 'deprecated'
|
|
||||||
ORDER BY control_id
|
|
||||||
""")).fetchall()
|
|
||||||
|
|
||||||
logger.info("BatchDedup Cross-reg: %d masters to check", len(rows))
|
|
||||||
cross_linked = 0
|
|
||||||
|
|
||||||
for i, r in enumerate(rows):
|
|
||||||
uuid = r[0]
|
|
||||||
hint = r[4] or ""
|
|
||||||
parts = hint.split(":", 2)
|
|
||||||
action = parts[0] if len(parts) > 0 else ""
|
|
||||||
obj = parts[1] if len(parts) > 1 else ""
|
|
||||||
|
|
||||||
canonical = canonicalize_text(action, obj, r[2])
|
|
||||||
embedding = await get_embedding(canonical)
|
|
||||||
if not embedding:
|
|
||||||
continue
|
|
||||||
|
|
||||||
results = await qdrant_search_cross_regulation(
|
|
||||||
embedding, top_k=5, collection=self.collection,
|
|
||||||
)
|
)
|
||||||
if not results:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check if best match is from a DIFFERENT pattern
|
|
||||||
best = results[0]
|
|
||||||
best_score = best.get("score", 0.0)
|
|
||||||
best_payload = best.get("payload", {})
|
|
||||||
|
|
||||||
if (best_score > CROSS_REG_LINK_THRESHOLD
|
|
||||||
and best_payload.get("pattern_id") != r[3]
|
|
||||||
and best_payload.get("control_uuid") != uuid):
|
|
||||||
# Cross-regulation link
|
|
||||||
self.db.execute(text("""
|
|
||||||
INSERT INTO control_parent_links
|
|
||||||
(control_uuid, parent_control_uuid, link_type, confidence)
|
|
||||||
VALUES (CAST(:cu AS uuid), CAST(:pu AS uuid), 'cross_regulation', :conf)
|
|
||||||
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
|
||||||
"""), {
|
|
||||||
"cu": best_payload["control_uuid"],
|
|
||||||
"pu": uuid,
|
|
||||||
"conf": best_score,
|
|
||||||
})
|
|
||||||
self.db.commit()
|
|
||||||
cross_linked += 1
|
|
||||||
|
|
||||||
if (i + 1) % 500 == 0:
|
|
||||||
logger.info("BatchDedup Cross-reg: %d/%d checked, %d linked",
|
|
||||||
i + 1, len(rows), cross_linked)
|
|
||||||
|
|
||||||
self.stats["cross_reg_linked"] = cross_linked
|
|
||||||
logger.info("BatchDedup Cross-reg complete: %d links created", cross_linked)
|
|
||||||
|
|
||||||
def get_status(self) -> dict:
|
def get_status(self) -> dict:
|
||||||
"""Return current progress stats (for status endpoint)."""
|
"""Return current progress stats (for status endpoint)."""
|
||||||
return {
|
return {
|
||||||
"pattern": self._progress_pattern,
|
"phase": self._progress_phase,
|
||||||
"progress": self._progress_count,
|
"progress": self._progress_count,
|
||||||
|
"total": self._progress_total,
|
||||||
**self.stats,
|
**self.stats,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ Covers:
|
|||||||
- Master selection (highest quality score wins)
|
- Master selection (highest quality score wins)
|
||||||
- Duplicate linking (mark + parent-link transfer)
|
- Duplicate linking (mark + parent-link transfer)
|
||||||
- Dry run mode (no DB changes)
|
- Dry run mode (no DB changes)
|
||||||
- Cross-regulation pass
|
- Cross-group pass
|
||||||
- Progress reporting / stats
|
- Progress reporting / stats
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -147,31 +147,31 @@ class TestMasterSelection:
|
|||||||
db = MagicMock()
|
db = MagicMock()
|
||||||
db.execute = MagicMock()
|
db.execute = MagicMock()
|
||||||
db.commit = MagicMock()
|
db.commit = MagicMock()
|
||||||
|
# Mock parent link transfer query
|
||||||
|
db.execute.return_value.fetchall.return_value = []
|
||||||
|
|
||||||
runner = BatchDedupRunner(db=db)
|
runner = BatchDedupRunner(db=db)
|
||||||
|
|
||||||
sparse = _make_control("s1", reqs=1, hint="implement:mfa:none")
|
sparse = _make_control("s1", reqs=1, hint="implement:mfa:none",
|
||||||
rich = _make_control("r1", reqs=5, tests=3, evidence=2, hint="implement:mfa:none")
|
title="MFA implementiert")
|
||||||
medium = _make_control("m1", reqs=2, tests=1, hint="implement:mfa:none")
|
rich = _make_control("r1", reqs=5, tests=3, evidence=2,
|
||||||
|
hint="implement:mfa:none", title="MFA implementiert")
|
||||||
|
medium = _make_control("m1", reqs=2, tests=1,
|
||||||
|
hint="implement:mfa:none", title="MFA implementiert")
|
||||||
|
|
||||||
controls = [sparse, medium, rich]
|
controls = [sparse, medium, rich]
|
||||||
|
|
||||||
# Mock embedding to avoid real API calls
|
# All have same title → all should be title-identical linked
|
||||||
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
||||||
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
||||||
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
|
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
|
||||||
new_callable=AsyncMock, return_value=True), \
|
new_callable=AsyncMock, return_value=True):
|
||||||
patch("compliance.services.batch_dedup_runner.qdrant_search",
|
await runner._process_hint_group("implement:mfa:none", controls, dry_run=True)
|
||||||
new_callable=AsyncMock, return_value=[{
|
|
||||||
"score": 0.95,
|
|
||||||
"payload": {"control_uuid": rich["uuid"],
|
|
||||||
"control_id": rich["control_id"]},
|
|
||||||
}]):
|
|
||||||
await runner._process_pattern_group("CP-AUTH-001", controls, dry_run=True)
|
|
||||||
|
|
||||||
# Rich should be master (1 master), others linked (2 linked)
|
# Rich should be master (1 master), others linked (2 linked)
|
||||||
assert runner.stats["masters"] == 1
|
assert runner.stats["masters"] == 1
|
||||||
assert runner.stats["linked"] == 2
|
assert runner.stats["linked"] == 2
|
||||||
|
assert runner.stats["skipped_title_identical"] == 2
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -191,28 +191,19 @@ class TestDryRun:
|
|||||||
runner = BatchDedupRunner(db=db)
|
runner = BatchDedupRunner(db=db)
|
||||||
|
|
||||||
controls = [
|
controls = [
|
||||||
_make_control("a", reqs=3, hint="implement:mfa:none"),
|
_make_control("a", reqs=3, hint="implement:mfa:none", title="MFA impl"),
|
||||||
_make_control("b", reqs=1, hint="implement:mfa:none"),
|
_make_control("b", reqs=1, hint="implement:mfa:none", title="MFA impl"),
|
||||||
]
|
]
|
||||||
|
|
||||||
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
||||||
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
||||||
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
|
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
|
||||||
new_callable=AsyncMock, return_value=True), \
|
new_callable=AsyncMock, return_value=True):
|
||||||
patch("compliance.services.batch_dedup_runner.qdrant_search",
|
await runner._process_hint_group("implement:mfa:none", controls, dry_run=True)
|
||||||
new_callable=AsyncMock, return_value=[{
|
|
||||||
"score": 0.95,
|
|
||||||
"payload": {"control_uuid": "a-uuid",
|
|
||||||
"control_id": "AUTH-001"},
|
|
||||||
}]):
|
|
||||||
await runner._process_pattern_group("CP-AUTH-001", controls, dry_run=True)
|
|
||||||
|
|
||||||
# No DB execute calls for UPDATE/INSERT (only the initial load query was mocked)
|
|
||||||
# In dry_run, _mark_duplicate and _embed_and_index are skipped
|
|
||||||
assert runner.stats["masters"] == 1
|
assert runner.stats["masters"] == 1
|
||||||
# qdrant_upsert should NOT have been called (dry_run skips indexing)
|
assert runner.stats["linked"] == 1
|
||||||
from compliance.services.batch_dedup_runner import qdrant_upsert
|
# No commit for dedup operations in dry_run
|
||||||
# No commit for dedup operations
|
|
||||||
db.commit.assert_not_called()
|
db.commit.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
@@ -261,56 +252,100 @@ class TestTitleIdenticalShortCircuit:
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_identical_titles_skip_embedding(self):
|
async def test_identical_titles_skip_embedding(self):
|
||||||
"""Controls with identical titles in same merge group → direct link."""
|
"""Controls with identical titles in same hint group → direct link."""
|
||||||
db = MagicMock()
|
db = MagicMock()
|
||||||
db.execute = MagicMock()
|
db.execute = MagicMock()
|
||||||
db.commit = MagicMock()
|
db.commit = MagicMock()
|
||||||
# Mock the parent link transfer query
|
|
||||||
db.execute.return_value.fetchall.return_value = []
|
db.execute.return_value.fetchall.return_value = []
|
||||||
|
|
||||||
runner = BatchDedupRunner(db=db)
|
runner = BatchDedupRunner(db=db)
|
||||||
|
|
||||||
master = _make_control("m", reqs=3, hint="implement:mfa:none",
|
controls = [
|
||||||
title="MFA implementieren")
|
_make_control("m", reqs=3, hint="implement:mfa:none",
|
||||||
candidate = _make_control("c", reqs=1, hint="implement:mfa:none",
|
title="MFA implementieren"),
|
||||||
title="MFA implementieren")
|
_make_control("c", reqs=1, hint="implement:mfa:none",
|
||||||
|
title="MFA implementieren"),
|
||||||
|
]
|
||||||
|
|
||||||
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
||||||
new_callable=AsyncMock) as mock_embed:
|
new_callable=AsyncMock) as mock_embed, \
|
||||||
await runner._check_and_link(master, candidate, "CP-AUTH-001", dry_run=False)
|
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
|
||||||
|
new_callable=AsyncMock, return_value=True):
|
||||||
|
await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)
|
||||||
|
|
||||||
# Embedding should NOT be called (title-identical short-circuit)
|
# Embedding should only be called for the master (indexing), not for linking
|
||||||
mock_embed.assert_not_called()
|
|
||||||
assert runner.stats["linked"] == 1
|
assert runner.stats["linked"] == 1
|
||||||
assert runner.stats["skipped_title_identical"] == 1
|
assert runner.stats["skipped_title_identical"] == 1
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Cross-Regulation Pass TESTS
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestCrossRegulationPass:
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_cross_reg_creates_link(self):
|
async def test_different_titles_use_embedding(self):
|
||||||
|
"""Controls with different titles should use embedding check."""
|
||||||
db = MagicMock()
|
db = MagicMock()
|
||||||
db.execute = MagicMock()
|
db.execute = MagicMock()
|
||||||
db.commit = MagicMock()
|
db.commit = MagicMock()
|
||||||
# First call: load masters
|
db.execute.return_value.fetchall.return_value = []
|
||||||
db.execute.return_value.fetchall.return_value = [
|
|
||||||
("uuid-1", "AUTH-001", "MFA implementieren", "CP-AUTH-001",
|
runner = BatchDedupRunner(db=db)
|
||||||
|
|
||||||
|
controls = [
|
||||||
|
_make_control("m", reqs=3, hint="implement:mfa:none",
|
||||||
|
title="MFA implementieren fuer Admins"),
|
||||||
|
_make_control("c", reqs=1, hint="implement:mfa:none",
|
||||||
|
title="MFA einrichten fuer alle Benutzer"),
|
||||||
|
]
|
||||||
|
|
||||||
|
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
||||||
|
new_callable=AsyncMock, return_value=[0.1] * 1024) as mock_embed, \
|
||||||
|
patch("compliance.services.batch_dedup_runner.qdrant_upsert",
|
||||||
|
new_callable=AsyncMock, return_value=True), \
|
||||||
|
patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
|
||||||
|
new_callable=AsyncMock, return_value=[]):
|
||||||
|
await runner._process_hint_group("implement:mfa:none", controls, dry_run=False)
|
||||||
|
|
||||||
|
# Different titles → embedding was called for both (master + candidate)
|
||||||
|
assert mock_embed.call_count >= 2
|
||||||
|
# No Qdrant results → linked anyway (same hint = same action+object)
|
||||||
|
assert runner.stats["linked"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Cross-Group Pass TESTS
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestCrossGroupPass:
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_cross_group_creates_link(self):
|
||||||
|
db = MagicMock()
|
||||||
|
db.commit = MagicMock()
|
||||||
|
|
||||||
|
# First call returns masters, subsequent calls return empty (for transfer)
|
||||||
|
master_rows = [
|
||||||
|
("uuid-1", "CTRL-001", "MFA implementieren",
|
||||||
"implement:multi_factor_auth:none"),
|
"implement:multi_factor_auth:none"),
|
||||||
]
|
]
|
||||||
|
call_count = {"n": 0}
|
||||||
|
|
||||||
|
def mock_execute(stmt, params=None):
|
||||||
|
result = MagicMock()
|
||||||
|
call_count["n"] += 1
|
||||||
|
if call_count["n"] == 1:
|
||||||
|
result.fetchall.return_value = master_rows
|
||||||
|
else:
|
||||||
|
result.fetchall.return_value = []
|
||||||
|
return result
|
||||||
|
|
||||||
|
db.execute = mock_execute
|
||||||
|
|
||||||
runner = BatchDedupRunner(db=db)
|
runner = BatchDedupRunner(db=db)
|
||||||
|
|
||||||
cross_result = [{
|
cross_result = [{
|
||||||
"score": 0.96,
|
"score": 0.95,
|
||||||
"payload": {
|
"payload": {
|
||||||
"control_uuid": "uuid-2",
|
"control_uuid": "uuid-2",
|
||||||
"control_id": "SEC-001",
|
"control_id": "CTRL-002",
|
||||||
"pattern_id": "CP-SEC-001", # different pattern!
|
"merge_group_hint": "implement:mfa:continuous",
|
||||||
},
|
},
|
||||||
}]
|
}]
|
||||||
|
|
||||||
@@ -318,39 +353,9 @@ class TestCrossRegulationPass:
|
|||||||
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
||||||
patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
|
patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
|
||||||
new_callable=AsyncMock, return_value=cross_result):
|
new_callable=AsyncMock, return_value=cross_result):
|
||||||
await runner._run_cross_regulation_pass()
|
await runner._run_cross_group_pass()
|
||||||
|
|
||||||
assert runner.stats["cross_reg_linked"] == 1
|
assert runner.stats["cross_group_linked"] == 1
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_cross_reg_ignores_same_pattern(self):
|
|
||||||
"""Cross-reg should NOT link controls from same pattern."""
|
|
||||||
db = MagicMock()
|
|
||||||
db.execute = MagicMock()
|
|
||||||
db.commit = MagicMock()
|
|
||||||
db.execute.return_value.fetchall.return_value = [
|
|
||||||
("uuid-1", "AUTH-001", "MFA", "CP-AUTH-001", "implement:mfa:none"),
|
|
||||||
]
|
|
||||||
|
|
||||||
runner = BatchDedupRunner(db=db)
|
|
||||||
|
|
||||||
# Match from SAME pattern
|
|
||||||
cross_result = [{
|
|
||||||
"score": 0.97,
|
|
||||||
"payload": {
|
|
||||||
"control_uuid": "uuid-3",
|
|
||||||
"control_id": "AUTH-002",
|
|
||||||
"pattern_id": "CP-AUTH-001", # same pattern
|
|
||||||
},
|
|
||||||
}]
|
|
||||||
|
|
||||||
with patch("compliance.services.batch_dedup_runner.get_embedding",
|
|
||||||
new_callable=AsyncMock, return_value=[0.1] * 1024), \
|
|
||||||
patch("compliance.services.batch_dedup_runner.qdrant_search_cross_regulation",
|
|
||||||
new_callable=AsyncMock, return_value=cross_result):
|
|
||||||
await runner._run_cross_regulation_pass()
|
|
||||||
|
|
||||||
assert runner.stats["cross_reg_linked"] == 0
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -365,12 +370,14 @@ class TestProgressStats:
|
|||||||
runner = BatchDedupRunner(db=db)
|
runner = BatchDedupRunner(db=db)
|
||||||
runner.stats["masters"] = 42
|
runner.stats["masters"] = 42
|
||||||
runner.stats["linked"] = 100
|
runner.stats["linked"] = 100
|
||||||
runner._progress_pattern = "CP-AUTH-001"
|
runner._progress_phase = "phase1"
|
||||||
runner._progress_count = 500
|
runner._progress_count = 500
|
||||||
|
runner._progress_total = 85000
|
||||||
|
|
||||||
status = runner.get_status()
|
status = runner.get_status()
|
||||||
assert status["pattern"] == "CP-AUTH-001"
|
assert status["phase"] == "phase1"
|
||||||
assert status["progress"] == 500
|
assert status["progress"] == 500
|
||||||
|
assert status["total"] == 85000
|
||||||
assert status["masters"] == 42
|
assert status["masters"] == 42
|
||||||
assert status["linked"] == 100
|
assert status["linked"] == 100
|
||||||
|
|
||||||
@@ -415,12 +422,12 @@ def _make_control(
|
|||||||
evidence: int = 0,
|
evidence: int = 0,
|
||||||
hint: str = "",
|
hint: str = "",
|
||||||
title: str = None,
|
title: str = None,
|
||||||
pattern_id: str = "CP-AUTH-001",
|
pattern_id: str = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Build a mock control dict for testing."""
|
"""Build a mock control dict for testing."""
|
||||||
return {
|
return {
|
||||||
"uuid": f"{prefix}-uuid",
|
"uuid": f"{prefix}-uuid",
|
||||||
"control_id": f"AUTH-{prefix}",
|
"control_id": f"CTRL-{prefix}",
|
||||||
"title": title or f"Control {prefix}",
|
"title": title or f"Control {prefix}",
|
||||||
"objective": f"Objective for {prefix}",
|
"objective": f"Objective for {prefix}",
|
||||||
"pattern_id": pattern_id,
|
"pattern_id": pattern_id,
|
||||||
|
|||||||
Reference in New Issue
Block a user