feat(pipeline): MC Quality Overhaul — 74.5% → 92.8% accuracy, 5.3K → 13.6K MCs
Phase 0: Quality Audit script (Claude Sonnet, 1750 samples)
Phase 1: Object ontology expanded 31 → 74 tokens with descriptions + boundaries
Phase 2: 174K controls re-classified via Haiku (10 batches, $50)
- Generic tokens removed (documentation, procedure, process)
- L2 sub-topics added (108K + 64K controls)
- Bad subtopics fixed (stakeholder_*, escalation fragments)
Phase 3: Re-clustering K=18704 (37K objects → 16.7K groups)
Phase 4: Direct MC generation from canonical tokens (gpre2_direct_mc.py)
Phase 5: Regulation-source split (gpre3, dry-run tested)

New features:
- Tenant-isolated document upload API (rag-service)
- BAuA crawler (Playwright, 131 PDFs downloaded)
- OSHA Technical Manual crawler (23 chapters)
- CE obligation extractor (6141 obligations from Qdrant)

RAG ingestion:
- 126 BAuA PDFs (TRBS/TRGS/ASR): 27,664 chunks
- OSHA Technical Manual: 7,241 chunks
- OSHA 1910 Subpart O (full): 745 chunks
- EuGH C-588/21 P: 216 chunks
- EU 2018/1725: 842 chunks

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -235,6 +235,74 @@ class QdrantClientWrapper:
|
||||
logger.info("Deleted points from '%s' with filter %s", collection, filter_conditions)
|
||||
return True
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tenant document helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_unique_documents(self, collection: str) -> list[dict]:
    """Return one summary record per distinct document stored in *collection*.

    Every point in the collection is visited via paginated ``scroll`` calls
    (100 points per page) and grouped by its ``document_id`` payload value.

    Args:
        collection: Name of the collection to scan.

    Returns:
        A list of dicts, one per distinct ``document_id``, in first-seen
        order. Each dict has the keys ``id``, ``filename``, ``file_size``,
        ``status`` (always ``"indexed"``), ``chunk_count`` (number of points
        carrying that ``document_id``) and ``collection``. ``filename`` and
        ``file_size`` are taken from the first point seen for the document.
        Points without a truthy ``document_id`` are ignored. An empty list
        is returned when the collection lookup raises.
    """
    try:
        self.client.get_collection(collection)
    except Exception:
        # Best-effort contract: any lookup failure is reported to the caller
        # as "no documents". Leave a debug trace so that a connection error
        # can still be told apart from a collection that does not exist.
        logger.debug(
            "Collection '%s' unavailable; returning no documents",
            collection,
            exc_info=True,
        )
        return []

    docs: dict[str, dict] = {}
    offset = None
    while True:
        points, next_offset = self.client.scroll(
            collection_name=collection,
            scroll_filter=None,
            limit=100,
            offset=offset,
            # Only these payload keys are read below; requesting them
            # explicitly avoids transferring the rest of each payload.
            with_payload=["document_id", "filename", "file_size"],
            with_vectors=False,
        )
        for pt in points:
            payload = pt.payload or {}
            doc_id = payload.get("document_id", "")
            if not doc_id:
                continue

            entry = docs.get(doc_id)
            if entry is None:
                entry = docs[doc_id] = {
                    "id": doc_id,
                    "filename": payload.get("filename", ""),
                    "file_size": payload.get("file_size", 0),
                    "status": "indexed",
                    "chunk_count": 0,
                    "collection": collection,
                }
            entry["chunk_count"] += 1

        # A None offset from scroll marks the last page.
        if next_offset is None:
            break
        offset = next_offset

    return list(docs.values())
|
||||
|
||||
async def count_by_filter(
    self, collection: str, filter_conditions: dict[str, Any]
) -> int:
    """Return the exact number of points that satisfy every given condition.

    Args:
        collection: Name of the collection to count in.
        filter_conditions: Mapping of payload key to the value it must
            match; each pair becomes one ``must`` clause of the filter.

    Returns:
        The ``count`` reported by the client for the combined filter, or
        ``0`` when the collection lookup raises.
    """
    try:
        self.client.get_collection(collection)
    except Exception:
        return 0

    # One exact-match clause per key/value pair; all of them must hold.
    required_matches = [
        qmodels.FieldCondition(
            key=field, match=qmodels.MatchValue(value=wanted)
        )
        for field, wanted in filter_conditions.items()
    ]
    combined_filter = qmodels.Filter(must=required_matches)

    tally = self.client.count(
        collection_name=collection,
        count_filter=combined_filter,
        exact=True,
    )
    return tally.count
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Info
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user