feat(pipeline): MC Quality Overhaul — 74.5% → 92.8% accuracy, 5.3K → 13.6K MCs

Phase 0: Quality Audit script (Claude Sonnet, 1750 samples)
Phase 1: Object ontology expanded 31 → 74 tokens with descriptions + boundaries
Phase 2: 174K controls re-classified via Haiku (10 batches, $50)
  - Generic tokens removed (documentation, procedure, process)
  - L2 sub-topics added (108K + 64K controls)
  - Bad subtopics fixed (stakeholder_*, escalation fragments)
Phase 3: Re-clustering K=18704 (37K objects → 16.7K groups)
Phase 4: Direct MC generation from canonical tokens (gpre2_direct_mc.py)
Phase 5: Regulation-source split (gpre3, dry-run tested)

New features:
- Tenant-isolated document upload API (rag-service)
- BAuA crawler (Playwright, 131 PDFs downloaded)
- OSHA Technical Manual crawler (23 chapters)
- CE obligation extractor (6141 obligations from Qdrant)

RAG ingestion:
- 126 BAuA PDFs (TRBS/TRGS/ASR): 27,664 chunks
- OSHA Technical Manual: 7,241 chunks
- OSHA 1910 Subpart O (full): 745 chunks
- EuGH C-588/21 P: 216 chunks
- EU 2018/1725: 842 chunks

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-10 15:08:15 +02:00
parent 81db904b3e
commit 8510af46eb
19 changed files with 3173 additions and 6 deletions
+2
View File
@@ -3,9 +3,11 @@ from fastapi import APIRouter
from api.collections import router as collections_router
from api.documents import router as documents_router
from api.search import router as search_router
from api.tenant_documents import router as tenant_documents_router
# Top-level API router that aggregates the feature routers imported above.
router = APIRouter()
# Sub-routers are included in this order: Collections, Documents,
# Tenant Documents, Search. Each include attaches its own tag.
router.include_router(collections_router, tags=["Collections"])
router.include_router(documents_router, tags=["Documents"])
router.include_router(tenant_documents_router, tags=["Tenant Documents"])
router.include_router(search_router, tags=["Search"])
+289
View File
@@ -0,0 +1,289 @@
"""
Tenant-isolated document upload, listing, and deletion.
Each tenant gets their own Qdrant collection (bp_docs_tenant_{short_id}).
Documents are stored in MinIO under tenant-specific paths.
No data crosses tenant boundaries.
Endpoints:
POST /api/v1/tenant/documents - Upload + process PDF
GET /api/v1/tenant/documents - List tenant's documents
DELETE /api/v1/tenant/documents/{doc_id} - Delete document + vectors
GET /api/v1/tenant/documents/{doc_id}/status - Processing status
"""
import json
import logging
import uuid
from typing import Optional
from fastapi import APIRouter, File, Form, HTTPException, Header, Request, UploadFile
from pydantic import BaseModel
from api.auth import optional_jwt_auth
from embedding_client import embedding_client
from html_utils import decode_html_bytes, looks_like_html, strip_html
from minio_client_wrapper import minio_wrapper
from qdrant_client_wrapper import qdrant_wrapper
# Module-level logger for the tenant document endpoints.
logger = logging.getLogger("rag-service.api.tenant-documents")
# Every route declared in this module lives under /api/v1/tenant/documents.
router = APIRouter(prefix="/api/v1/tenant/documents")
# Vector size passed to create_collection() for tenant collections.
VECTOR_DIM = 1024  # bge-m3 dimension
# Upper bound (in bytes) enforced on the uploaded file's length.
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB
# NOTE(review): ALLOWED_TYPES is not referenced anywhere in the visible part
# of this module; confirm whether the upload endpoint is meant to enforce it.
ALLOWED_TYPES = {"application/pdf", "text/html", "text/plain"}
# Leading bytes expected in uploads whose filename ends with ".pdf".
PDF_MAGIC = b"%PDF"
def _collection_name(tenant_id: str) -> str:
"""Derive tenant-specific Qdrant collection name."""
short = tenant_id.replace("-", "")[:12]
return f"bp_docs_tenant_{short}"
def _storage_path(tenant_id: str, document_id: str, filename: str) -> str:
"""Derive tenant-isolated storage path."""
short = tenant_id.replace("-", "")[:12]
return f"tenant_docs/{short}/{document_id}/{filename}"
def _extract_tenant_id(
    request: Request,
    x_tenant_id: Optional[str] = Header(None),
) -> str:
    """Extract and validate the tenant ID from the ``X-Tenant-ID`` header.

    Args:
        request: Incoming request; its headers are the fallback source.
        x_tenant_id: Header value already resolved by the caller, if any.

    Returns:
        The tenant ID with surrounding whitespace removed.

    Raises:
        HTTPException: 400 when the header is missing/blank or contains
            characters other than ASCII letters, digits, ``-``, ``_``, ``.``
            (or does not start with a letter or digit).
    """
    tid = (x_tenant_id or request.headers.get("x-tenant-id", "")).strip()
    if not tid:
        raise HTTPException(status_code=400, detail="X-Tenant-ID header required")
    # The tenant ID is interpolated into collection names and storage
    # prefixes, so reject anything that could act as a path separator or a
    # "."/".." segment. Requiring a leading alphanumeric guarantees the
    # derived short ID is never empty.
    is_safe = (
        tid.isascii()
        and tid[0].isalnum()
        and all(ch.isalnum() or ch in "-_." for ch in tid)
    )
    if not is_safe:
        raise HTTPException(status_code=400, detail="Invalid X-Tenant-ID header")
    return tid
# ── Response models ────────────────────────────────────────────────
class DocumentResponse(BaseModel):
    # API representation of one tenant document.
    id: str  # document ID (the upload endpoint generates a UUID4 string)
    filename: str  # file name of the uploaded document
    file_size: int  # size in bytes (upload reports len() of the file content)
    status: str  # processing state; the visible code only ever sets "indexed"
    chunk_count: int  # number of chunks indexed for the document
    collection: str  # Qdrant collection holding the document's vectors
    # Not populated by any code visible in this module; defaults to None.
    created_at: Optional[str] = None
class DocumentListResponse(BaseModel):
    # Response envelope for the tenant document listing endpoint.
    documents: list[DocumentResponse]  # one entry per document
    total: int  # number of entries in `documents` (set to len(docs) by the list endpoint)
# ── Endpoints ──────────────────────────────────────────────────────
def _parse_extra_metadata(metadata_json: Optional[str]) -> dict:
    """Parse the optional ``metadata_json`` form field into a dict (best effort)."""
    if not metadata_json:
        return {}
    try:
        parsed = json.loads(metadata_json)
    except json.JSONDecodeError as exc:
        logger.warning("Ignoring invalid metadata_json: %s", exc)
        return {}
    if not isinstance(parsed, dict):
        # Anything other than a JSON object cannot be merged into a payload.
        logger.warning(
            "Ignoring metadata_json: expected a JSON object, got %s",
            type(parsed).__name__,
        )
        return {}
    return parsed


@router.post("", response_model=DocumentResponse)
async def upload_tenant_document(
    request: Request,
    file: UploadFile = File(...),
    x_tenant_id: Optional[str] = Header(None),
    chunk_size: int = Form(default=512),
    chunk_overlap: int = Form(default=50),
    metadata_json: Optional[str] = Form(default=None),
):
    """Upload a document, process it, and index it in the tenant's collection.

    Steps: validate the upload, store the raw file in MinIO, extract text,
    chunk it, embed the chunks and index them in the tenant-specific Qdrant
    collection.

    Args:
        request: Incoming request (used for auth and the tenant header).
        file: The uploaded file.
        x_tenant_id: Value of the ``X-Tenant-ID`` header.
        chunk_size: Chunk size forwarded to the chunking service.
        chunk_overlap: Chunk overlap forwarded to the chunking service.
        metadata_json: Optional JSON object whose keys are added to every
            chunk payload. Invalid or non-object JSON is ignored.

    Returns:
        A ``DocumentResponse`` describing the indexed document.

    Raises:
        HTTPException: 400 for an empty file, a ".pdf" upload without the
            PDF magic bytes, invalid chunk parameters, no extractable text
            or zero chunks; 413 when the file exceeds ``MAX_FILE_SIZE``;
            500 when storage or text extraction fails.
    """
    optional_jwt_auth(request)
    tenant_id = _extract_tenant_id(request, x_tenant_id)

    # Reject nonsensical chunking parameters before doing any work.
    if chunk_size < 1 or chunk_overlap < 0 or chunk_overlap > chunk_size:
        raise HTTPException(
            status_code=400,
            detail="chunk_size must be >= 1 and chunk_overlap between 0 and chunk_size",
        )

    # Read + validate
    file_bytes = await file.read()
    file_size = len(file_bytes)
    if file_size == 0:
        raise HTTPException(status_code=400, detail="Empty file")
    if file_size > MAX_FILE_SIZE:
        raise HTTPException(status_code=413, detail=f"File too large (max {MAX_FILE_SIZE // 1024 // 1024} MB)")

    filename = file.filename or "document.pdf"
    content_type = file.content_type or "application/octet-stream"

    # PDF magic bytes check
    if filename.lower().endswith(".pdf") and not file_bytes[:4].startswith(PDF_MAGIC):
        raise HTTPException(status_code=400, detail="File claims to be PDF but magic bytes don't match")

    document_id = str(uuid.uuid4())
    collection = _collection_name(tenant_id)
    object_name = _storage_path(tenant_id, document_id, filename)

    # Ensure collection exists
    await qdrant_wrapper.create_collection(collection, VECTOR_DIM)

    # Store in MinIO
    # NOTE(review): the object is stored before extraction/indexing; a later
    # failure leaves it behind in storage. Confirm whether cleanup is wanted.
    try:
        await minio_wrapper.upload_document(
            object_name=object_name,
            data=file_bytes,
            content_type=content_type,
            metadata={"document_id": document_id, "tenant_id": tenant_id},
        )
    except Exception as exc:
        logger.error("MinIO upload failed for tenant %s: %s", tenant_id, exc)
        raise HTTPException(status_code=500, detail="Storage failed") from exc

    # Extract text
    try:
        text = await _extract_text(file_bytes, filename, content_type)
    except Exception as exc:
        logger.error("Text extraction failed: %s", exc)
        raise HTTPException(status_code=500, detail=f"Text extraction failed: {exc}") from exc
    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="No text could be extracted")

    # Chunk
    chunk_result = await embedding_client.chunk_text(
        text=text, strategy="recursive",
        chunk_size=chunk_size, overlap=chunk_overlap,
    )
    chunks = chunk_result.chunks
    chunks_meta = chunk_result.chunks_with_metadata
    if not chunks:
        raise HTTPException(status_code=400, detail="Chunking produced zero chunks")

    # Embed
    embeddings = await embedding_client.generate_embeddings(chunks)

    extra_metadata = _parse_extra_metadata(metadata_json)

    # Build payloads with tenant isolation
    _STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
    payloads = []
    for i, chunk in enumerate(chunks):
        # Client metadata is spread FIRST so it can never overwrite the
        # identity fields (document_id, tenant_id, ...) set below.
        payload = {
            **extra_metadata,
            "document_id": document_id,
            "tenant_id": tenant_id,
            "filename": filename,
            # Stored so the document listing can report the file size.
            "file_size": file_size,
            "chunk_index": i,
            "chunk_text": chunk,
        }
        if i < len(chunks_meta):
            for field in _STRUCT_FIELDS:
                value = chunks_meta[i].get(field)
                if value is not None and value != "":
                    payload[field] = value
        payloads.append(payload)

    # Index in tenant collection
    indexed = await qdrant_wrapper.index_documents(
        collection=collection, vectors=embeddings, payloads=payloads,
    )
    logger.info(
        "Tenant %s: uploaded %s (%d chunks, %d vectors) to %s",
        tenant_id[:8], filename, len(chunks), indexed, collection,
    )
    return DocumentResponse(
        id=document_id, filename=filename,
        file_size=file_size, status="indexed",
        chunk_count=len(chunks), collection=collection,
    )
@router.get("", response_model=DocumentListResponse)
async def list_tenant_documents(
    request: Request,
    x_tenant_id: Optional[str] = Header(None),
):
    """List all documents stored in this tenant's collection.

    Args:
        request: Incoming request (used for auth and the tenant header).
        x_tenant_id: Value of the ``X-Tenant-ID`` header.

    Returns:
        A ``DocumentListResponse``; the list is empty when the lookup fails.
    """
    optional_jwt_auth(request)
    tenant_id = _extract_tenant_id(request, x_tenant_id)
    collection = _collection_name(tenant_id)
    try:
        # Get unique document_ids from Qdrant
        docs = await qdrant_wrapper.get_unique_documents(collection)
    except Exception:
        # Still answer with an empty list (best effort), but record the
        # failure so a backend outage is not mistaken for "no documents".
        logger.exception(
            "Listing documents failed for tenant %s (collection %s)",
            tenant_id[:8], collection,
        )
        docs = []
    return DocumentListResponse(documents=docs, total=len(docs))
@router.delete("/{doc_id}")
async def delete_tenant_document(
    doc_id: str,
    request: Request,
    x_tenant_id: Optional[str] = Header(None),
):
    """Delete a document's vectors and stored file for this tenant.

    Both deletion steps are attempted independently; a failure in one does
    not prevent the other.

    Args:
        doc_id: Identifier of the document to delete.
        request: Incoming request (used for auth and the tenant header).
        x_tenant_id: Value of the ``X-Tenant-ID`` header.

    Returns:
        ``{"deleted": True, "document_id": ...}`` on full success. When a
        step failed, the dict also carries ``"warnings"`` and ``"deleted"``
        is ``False`` only if every step failed.
    """
    optional_jwt_auth(request)
    tenant_id = _extract_tenant_id(request, x_tenant_id)
    collection = _collection_name(tenant_id)
    total_steps = 2  # Qdrant vectors + MinIO objects
    errors = []

    # Delete vectors from Qdrant
    try:
        await qdrant_wrapper.delete_by_filter(
            collection=collection,
            filter_conditions={"document_id": doc_id},
        )
    except Exception as exc:
        errors.append(f"Qdrant: {exc}")

    # Delete file from MinIO
    try:
        prefix = f"tenant_docs/{tenant_id.replace('-', '')[:12]}/{doc_id}/"
        await minio_wrapper.delete_by_prefix(prefix)
    except Exception as exc:
        errors.append(f"MinIO: {exc}")

    if errors:
        logger.warning("Partial delete for %s/%s: %s", tenant_id[:8], doc_id[:8], errors)
        return {
            # Do not claim success when nothing at all was removed.
            "deleted": len(errors) < total_steps,
            "document_id": doc_id,
            "warnings": errors,
        }

    logger.info("Tenant %s: deleted document %s", tenant_id[:8], doc_id[:8])
    return {"deleted": True, "document_id": doc_id}
@router.get("/{doc_id}/status")
async def document_status(
    doc_id: str,
    request: Request,
    x_tenant_id: Optional[str] = Header(None),
):
    """Report whether a document has indexed chunks in the tenant collection.

    Args:
        doc_id: Identifier of the document to look up.
        request: Incoming request (used for auth and the tenant header).
        x_tenant_id: Value of the ``X-Tenant-ID`` header.

    Returns:
        A dict with ``document_id``, ``status`` ("indexed" when at least one
        chunk matches, otherwise "not_found") and ``chunk_count``.
    """
    optional_jwt_auth(request)
    tenant_id = _extract_tenant_id(request, x_tenant_id)
    collection = _collection_name(tenant_id)
    try:
        count = await qdrant_wrapper.count_by_filter(
            collection=collection,
            filter_conditions={"document_id": doc_id},
        )
    except Exception:
        # Keep the "not_found" answer (best effort), but log the failure so
        # backend errors are distinguishable from a missing document.
        logger.exception(
            "Status lookup failed for %s/%s", tenant_id[:8], doc_id[:8],
        )
        count = 0
    status = "indexed" if count > 0 else "not_found"
    return {"document_id": doc_id, "status": status, "chunk_count": count}
# ── Helpers ────────────────────────────────────────────────────────
async def _extract_text(file_bytes: bytes, filename: str, content_type: str) -> str:
    """Extract text from PDF, HTML, or plain-text content.

    The format is chosen from the declared media type or the file extension:
    PDF goes to the embedding service's PDF extractor, HTML is decoded and
    stripped of markup, anything else is decoded as UTF-8 (undecodable bytes
    replaced) and stripped of markup only if it looks like HTML.

    Args:
        file_bytes: Raw file content.
        filename: Name of the uploaded file.
        content_type: Declared content type; parameters such as
            ``; charset=...`` and letter case are ignored.

    Returns:
        The extracted text.
    """
    # Compare only the bare media type: "text/html; charset=utf-8" -> "text/html".
    media_type = content_type.split(";", 1)[0].strip().lower()
    lowered_name = filename.lower()

    if media_type == "application/pdf" or lowered_name.endswith(".pdf"):
        return await embedding_client.extract_pdf(file_bytes)

    # Honour the declared HTML type as well as the extension, mirroring the
    # PDF branch, so HTML uploads always go through decode_html_bytes().
    if media_type == "text/html" or lowered_name.endswith((".html", ".htm")):
        return strip_html(decode_html_bytes(file_bytes))

    text = file_bytes.decode("utf-8", errors="replace")
    if looks_like_html(text):
        return strip_html(text)
    return text
+10
View File
@@ -122,6 +122,16 @@ class MinioClientWrapper:
logger.error("Failed to delete '%s': %s", object_name, exc)
raise
async def delete_by_prefix(self, prefix: str) -> int:
    """Remove all objects under a prefix.

    Args:
        prefix: Object-name prefix; must be non-empty.

    Returns:
        The number of objects removed.

    Raises:
        ValueError: If ``prefix`` is empty.
        Exception: Any error from listing or removing objects is logged
            and re-raised.
    """
    # An empty prefix matches every object in the bucket; refuse it rather
    # than wiping the whole bucket by accident.
    if not prefix:
        raise ValueError("prefix must not be empty")

    count = 0
    try:
        objects = self.client.list_objects(settings.MINIO_BUCKET, prefix=prefix, recursive=True)
        for obj in objects:
            self.client.remove_object(settings.MINIO_BUCKET, obj.object_name)
            count += 1
    except Exception as exc:
        # Log how far the deletion got before re-raising to the caller.
        logger.error(
            "Failed to delete objects with prefix '%s' after %d deletions: %s",
            prefix, count, exc,
        )
        raise
    logger.info("Deleted %d objects with prefix '%s'", count, prefix)
    return count
# ------------------------------------------------------------------
# Presigned URL
# ------------------------------------------------------------------
+68
View File
@@ -235,6 +235,74 @@ class QdrantClientWrapper:
logger.info("Deleted points from '%s' with filter %s", collection, filter_conditions)
return True
# ------------------------------------------------------------------
# Tenant document helpers
# ------------------------------------------------------------------
async def get_unique_documents(self, collection: str) -> list[dict]:
    """Get unique documents from a collection by scrolling and grouping.

    Points are scrolled in pages of 100 and grouped by their
    ``document_id`` payload field; points without one are skipped.

    Args:
        collection: Name of the collection to scan.

    Returns:
        One dict per document with the keys ``id``, ``filename``,
        ``file_size``, ``status``, ``chunk_count`` and ``collection``, in
        first-seen order. Empty when the collection cannot be fetched.
    """
    try:
        self.client.get_collection(collection)
    except Exception:
        return []

    docs: dict[str, dict] = {}
    offset = None
    while True:
        points, next_offset = self.client.scroll(
            collection_name=collection,
            scroll_filter=None,
            limit=100,
            offset=offset,
            # Only these fields are read below; skipping the rest avoids
            # transferring every chunk's full payload.
            with_payload=["document_id", "filename", "file_size"],
            with_vectors=False,
        )
        for pt in points:
            payload = pt.payload or {}
            doc_id = payload.get("document_id", "")
            if not doc_id:
                continue
            entry = docs.get(doc_id)
            if entry is None:
                # The first point seen for a document supplies its metadata.
                entry = docs[doc_id] = {
                    "id": doc_id,
                    "filename": payload.get("filename", ""),
                    "file_size": payload.get("file_size", 0),
                    "status": "indexed",
                    "chunk_count": 0,
                    "collection": collection,
                }
            entry["chunk_count"] += 1
        if next_offset is None:
            break
        offset = next_offset
    return list(docs.values())
async def count_by_filter(
    self, collection: str, filter_conditions: dict[str, Any]
) -> int:
    """Return the exact number of points matching every given condition.

    Each ``key: value`` pair in ``filter_conditions`` becomes an exact-match
    condition. Yields 0 when the collection cannot be fetched.
    """
    try:
        self.client.get_collection(collection)
    except Exception:
        return 0

    conditions = [
        qmodels.FieldCondition(key=field, match=qmodels.MatchValue(value=wanted))
        for field, wanted in filter_conditions.items()
    ]
    response = self.client.count(
        collection_name=collection,
        count_filter=qmodels.Filter(must=conditions),
        exact=True,
    )
    return response.count
# ------------------------------------------------------------------
# Info
# ------------------------------------------------------------------