feat: Batch Dedup Runner — ~85k → ~18-25k Master Controls
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 32s
CI/CD / test-python-backend-compliance (push) Successful in 30s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 16s
CI/CD / validate-canonical-controls (push) Successful in 9s
CI/CD / Deploy (push) Successful in 1s

Adds batch orchestration for deduplicating ~85k Pass 0b atomic controls
into ~18-25k unique masters with M:N parent linking.

New files:
- migrations/078_batch_dedup.sql: adds the merged_into_uuid column and
  performance indexes, and extends the link_type CHECK constraint to
  accept cross_regulation
- batch_dedup_runner.py: BatchDedupRunner with quality scoring, merge-hint
  grouping, title-identical short-circuit, parent-link transfer, and
  cross-regulation pass
- tests/test_batch_dedup_runner.py: 21 tests (all passing)

Modified:
- control_dedup.py: optional collection param on Qdrant functions
- crosswalk_routes.py: POST/GET batch-dedup endpoints

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-24 07:06:38 +01:00
parent cce2707c03
commit 35784c35eb
5 changed files with 1126 additions and 10 deletions

View File

@@ -317,10 +317,12 @@ async def qdrant_search(
embedding: list[float],
pattern_id: str,
top_k: int = 10,
collection: Optional[str] = None,
) -> list[dict]:
"""Search Qdrant for similar atomic controls, filtered by pattern_id."""
if not embedding:
return []
coll = collection or QDRANT_COLLECTION
body: dict = {
"vector": embedding,
"limit": top_k,
@@ -334,7 +336,7 @@ async def qdrant_search(
try:
async with httpx.AsyncClient(timeout=10.0) as client:
resp = await client.post(
f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search",
f"{QDRANT_URL}/collections/{coll}/points/search",
json=body,
)
if resp.status_code != 200:
@@ -349,6 +351,7 @@ async def qdrant_search(
async def qdrant_search_cross_regulation(
embedding: list[float],
top_k: int = 5,
collection: Optional[str] = None,
) -> list[dict]:
"""Search Qdrant for similar controls across ALL regulations (no pattern_id filter).
@@ -356,6 +359,7 @@ async def qdrant_search_cross_regulation(
"""
if not embedding:
return []
coll = collection or QDRANT_COLLECTION
body: dict = {
"vector": embedding,
"limit": top_k,
@@ -364,7 +368,7 @@ async def qdrant_search_cross_regulation(
try:
async with httpx.AsyncClient(timeout=10.0) as client:
resp = await client.post(
f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search",
f"{QDRANT_URL}/collections/{coll}/points/search",
json=body,
)
if resp.status_code != 200:
@@ -380,10 +384,12 @@ async def qdrant_upsert(
point_id: str,
embedding: list[float],
payload: dict,
collection: Optional[str] = None,
) -> bool:
"""Upsert a single point into the atomic_controls Qdrant collection."""
"""Upsert a single point into a Qdrant collection."""
if not embedding:
return False
coll = collection or QDRANT_COLLECTION
body = {
"points": [{
"id": point_id,
@@ -394,7 +400,7 @@ async def qdrant_upsert(
try:
async with httpx.AsyncClient(timeout=10.0) as client:
resp = await client.put(
f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points",
f"{QDRANT_URL}/collections/{coll}/points",
json=body,
)
return resp.status_code == 200
@@ -403,27 +409,31 @@ async def qdrant_upsert(
return False
async def ensure_qdrant_collection(vector_size: int = 1024) -> bool:
"""Create the Qdrant collection if it doesn't exist (idempotent)."""
async def ensure_qdrant_collection(
vector_size: int = 1024,
collection: Optional[str] = None,
) -> bool:
"""Create a Qdrant collection if it doesn't exist (idempotent)."""
coll = collection or QDRANT_COLLECTION
try:
async with httpx.AsyncClient(timeout=10.0) as client:
# Check if exists
resp = await client.get(f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}")
resp = await client.get(f"{QDRANT_URL}/collections/{coll}")
if resp.status_code == 200:
return True
# Create
resp = await client.put(
f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}",
f"{QDRANT_URL}/collections/{coll}",
json={
"vectors": {"size": vector_size, "distance": "Cosine"},
},
)
if resp.status_code == 200:
logger.info("Created Qdrant collection: %s", QDRANT_COLLECTION)
logger.info("Created Qdrant collection: %s", coll)
# Create payload indexes
for field_name in ["pattern_id", "action_normalized", "object_normalized", "control_id"]:
await client.put(
f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/index",
f"{QDRANT_URL}/collections/{coll}/index",
json={"field_name": field_name, "field_schema": "keyword"},
)
return True
@@ -710,6 +720,7 @@ class ControlDedupChecker:
action: str,
obj: str,
pattern_id: str,
collection: Optional[str] = None,
) -> bool:
"""Index a new atomic control in Qdrant for future dedup checks."""
norm_action = normalize_action(action)
@@ -730,4 +741,5 @@ class ControlDedupChecker:
"object_normalized": norm_object,
"canonical_text": canonical,
},
collection=collection,
)