8510af46eb
Phase 0: Quality Audit script (Claude Sonnet, 1750 samples) Phase 1: Object ontology expanded 31 → 74 tokens with descriptions + boundaries Phase 2: 174K controls re-classified via Haiku (10 batches, $50) - Generic tokens removed (documentation, procedure, process) - L2 sub-topics added (108K + 64K controls) - Bad subtopics fixed (stakeholder_*, escalation fragments) Phase 3: Re-clustering K=18704 (37K objects → 16.7K groups) Phase 4: Direct MC generation from canonical tokens (gpre2_direct_mc.py) Phase 5: Regulation-source split (gpre3, dry-run tested) New features: - Tenant-isolated document upload API (rag-service) - BAuA crawler (Playwright, 131 PDFs downloaded) - OSHA Technical Manual crawler (23 chapters) - CE obligation extractor (6141 obligations from Qdrant) RAG ingestion: - 126 BAuA PDFs (TRBS/TRGS/ASR): 27,664 chunks - OSHA Technical Manual: 7,241 chunks - OSHA 1910 Subpart O (full): 745 chunks - EuGH C-588/21 P: 216 chunks - EU 2018/1725: 842 chunks Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
85 lines
2.7 KiB
Python
85 lines
2.7 KiB
Python
"""Shared embedding + sub-clustering utilities for the control pipeline."""
|
|
|
|
import logging
|
|
import os
|
|
from collections import defaultdict
|
|
|
|
import httpx
|
|
import numpy as np
|
|
from sklearn.cluster import MiniBatchKMeans
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base URL of the embedding service. The EMBEDDING_SERVICE_URL environment
# variable overrides the default below when it is set.
EMBEDDING_URL = os.environ.get(
    "EMBEDDING_SERVICE_URL",
    "http://embedding-service:8087",
)
|
|
|
|
|
|
def embed_texts(texts: list[str]) -> np.ndarray | None:
|
|
"""Embed texts via the embedding-service in batches of 64."""
|
|
try:
|
|
result = np.zeros((len(texts), 1024), dtype=np.float32)
|
|
batch_size = 64
|
|
for i in range(0, len(texts), batch_size):
|
|
batch = texts[i : i + batch_size]
|
|
for attempt in range(3):
|
|
try:
|
|
with httpx.Client(
|
|
timeout=httpx.Timeout(60.0, connect=10.0)
|
|
) as client:
|
|
resp = client.post(
|
|
f"{EMBEDDING_URL}/embed", json={"texts": batch}
|
|
)
|
|
resp.raise_for_status()
|
|
embs = resp.json().get("embeddings", [])
|
|
end = min(i + len(embs), len(texts))
|
|
result[i:end] = np.array(embs, dtype=np.float32)
|
|
break
|
|
except Exception as e:
|
|
if attempt == 2:
|
|
logger.error("Embed batch %d failed: %s", i, e)
|
|
import time
|
|
time.sleep(2)
|
|
return result
|
|
except Exception as e:
|
|
logger.error("Embedding failed: %s", e)
|
|
return None
|
|
|
|
|
|
def subcluster_controls(
    controls: list[dict], target_size: int = 50
) -> list[list[dict]]:
    """Group controls into clusters of similar items using their embeddings.

    Inputs with at most ``target_size`` controls are returned unchanged as a
    single cluster. Otherwise each control is represented by its ``"title"``
    (or its ``"control_id"`` when the title is missing or empty), embedded
    via ``embed_texts``, L2-normalised and clustered with ``MiniBatchKMeans``.
    The number of clusters is ``len(controls) // target_size`` clamped to the
    range 2..30.

    Args:
        controls: Control records; only ``"title"`` and ``"control_id"`` are
            read here.
        target_size: Size threshold below which no clustering happens, and
            the chunk length used by the fallback.

    Returns:
        A list of clusters, each a list of the original control dicts. When
        ``embed_texts`` returns ``None`` the controls are instead split into
        consecutive chunks of ``target_size``.
    """
    total = len(controls)
    if total <= target_size:
        return [controls]

    # Text used to represent each control in embedding space.
    descriptions = []
    for ctrl in controls:
        descriptions.append(ctrl.get("title", "") or ctrl.get("control_id", ""))

    vectors = embed_texts(descriptions)
    if vectors is None:
        # Embedding unavailable: fall back to fixed-size consecutive chunks.
        chunks = []
        for start in range(0, total, target_size):
            chunks.append(controls[start : start + target_size])
        return chunks

    # Scale every row to unit length; zero rows keep a divisor of 1 so the
    # division below stays well-defined.
    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    lengths[lengths == 0] = 1
    unit_vectors = vectors / lengths

    n_clusters = max(2, min(total // target_size, 30))
    model = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=min(100, total),
        max_iter=50,
        random_state=42,
    )
    assignments = model.fit_predict(unit_vectors)

    # Bucket controls by assigned label, keeping their input order.
    grouped: dict[int, list[dict]] = defaultdict(list)
    for label, ctrl in zip(assignments, controls):
        grouped[int(label)].append(ctrl)
    return list(grouped.values())
|