feat(pipeline): MC Quality Overhaul — 74.5% → 92.8% accuracy, 5.3K → 13.6K MCs

Phase 0: Quality Audit script (Claude Sonnet, 1750 samples) Phase 1: Object ontology expanded 31 → 74 tokens with descriptions + boundaries Phase 2: 174K controls re-classified via Haiku (10 batches, $50) - Generic tokens removed (documentation, procedure, process) - L2 sub-topics added (108K + 64K controls) - Bad subtopics fixed (stakeholder_*, escalation fragments) Phase 3: Re-clustering K=18704 (37K objects → 16.7K groups) Phase 4: Direct MC generation from canonical tokens (gpre2_direct_mc.py) Phase 5: Regulation-source split (gpre3, dry-run tested) New features: - Tenant-isolated document upload API (rag-service) - BAuA crawler (Playwright, 131 PDFs downloaded) - OSHA Technical Manual crawler (23 chapters) - CE obligation extractor (6141 obligations from Qdrant) RAG ingestion: - 126 BAuA PDFs (TRBS/TRGS/ASR): 27,664 chunks - OSHA Technical Manual: 7,241 chunks - OSHA 1910 Subpart O (full): 745 chunks - EuGH C-588/21 P: 216 chunks - EU 2018/1725: 842 chunks Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-10 15:08:15 +02:00
parent 81db904b3e
commit 8510af46eb
19 changed files with 3173 additions and 6 deletions
@@ -0,0 +1,289 @@
+"""
+Tenant-isolated document upload, listing, and deletion.
+
+Each tenant gets their own Qdrant collection (bp_docs_tenant_{short_id}).
+Documents are stored in MinIO under tenant-specific paths.
+No data crosses tenant boundaries.
+
+Endpoints:
+  POST   /api/v1/tenant/documents          - Upload + process PDF
+  GET    /api/v1/tenant/documents          - List tenant's documents
+  DELETE /api/v1/tenant/documents/{doc_id} - Delete document + vectors
+  GET    /api/v1/tenant/documents/{doc_id}/status - Processing status
+"""
+
+import json
+import logging
+import uuid
+from typing import Optional
+
+from fastapi import APIRouter, File, Form, HTTPException, Header, Request, UploadFile
+from pydantic import BaseModel
+
+from api.auth import optional_jwt_auth
+from embedding_client import embedding_client
+from html_utils import decode_html_bytes, looks_like_html, strip_html
+from minio_client_wrapper import minio_wrapper
+from qdrant_client_wrapper import qdrant_wrapper
+
+logger = logging.getLogger("rag-service.api.tenant-documents")
+
+router = APIRouter(prefix="/api/v1/tenant/documents")
+
+VECTOR_DIM = 1024  # bge-m3 dimension
+MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB
+ALLOWED_TYPES = {"application/pdf", "text/html", "text/plain"}
+PDF_MAGIC = b"%PDF"
+
+
+def _collection_name(tenant_id: str) -> str:
+    """Derive tenant-specific Qdrant collection name."""
+    short = tenant_id.replace("-", "")[:12]
+    return f"bp_docs_tenant_{short}"
+
+
+def _storage_path(tenant_id: str, document_id: str, filename: str) -> str:
+    """Derive tenant-isolated storage path."""
+    short = tenant_id.replace("-", "")[:12]
+    return f"tenant_docs/{short}/{document_id}/{filename}"
+
+
+def _extract_tenant_id(
+    request: Request,
+    x_tenant_id: Optional[str] = Header(None),
+) -> str:
+    """Extract tenant ID from header. Required for all tenant endpoints."""
+    tid = x_tenant_id or request.headers.get("x-tenant-id", "")
+    if not tid:
+        raise HTTPException(status_code=400, detail="X-Tenant-ID header required")
+    return tid
+
+
+# ── Response models ────────────────────────────────────────────────
+
+class DocumentResponse(BaseModel):
+    id: str
+    filename: str
+    file_size: int
+    status: str
+    chunk_count: int
+    collection: str
+    created_at: Optional[str] = None
+
+
+class DocumentListResponse(BaseModel):
+    documents: list[DocumentResponse]
+    total: int
+
+
+# ── Endpoints ──────────────────────────────────────────────────────
+
+@router.post("", response_model=DocumentResponse)
+async def upload_tenant_document(
+    request: Request,
+    file: UploadFile = File(...),
+    x_tenant_id: Optional[str] = Header(None),
+    chunk_size: int = Form(default=512),
+    chunk_overlap: int = Form(default=50),
+    metadata_json: Optional[str] = Form(default=None),
+):
+    """Upload a document, process it, and index in tenant-specific collection."""
+    optional_jwt_auth(request)
+    tenant_id = _extract_tenant_id(request, x_tenant_id)
+
+    # Read + validate
+    file_bytes = await file.read()
+    if len(file_bytes) == 0:
+        raise HTTPException(status_code=400, detail="Empty file")
+    if len(file_bytes) > MAX_FILE_SIZE:
+        raise HTTPException(status_code=413, detail=f"File too large (max {MAX_FILE_SIZE // 1024 // 1024} MB)")
+
+    filename = file.filename or "document.pdf"
+    content_type = file.content_type or "application/octet-stream"
+
+    # PDF magic bytes check
+    if filename.lower().endswith(".pdf") and not file_bytes[:4].startswith(PDF_MAGIC):
+        raise HTTPException(status_code=400, detail="File claims to be PDF but magic bytes don't match")
+
+    document_id = str(uuid.uuid4())
+    collection = _collection_name(tenant_id)
+    object_name = _storage_path(tenant_id, document_id, filename)
+
+    # Ensure collection exists
+    await qdrant_wrapper.create_collection(collection, VECTOR_DIM)
+
+    # Store in MinIO
+    try:
+        await minio_wrapper.upload_document(
+            object_name=object_name,
+            data=file_bytes,
+            content_type=content_type,
+            metadata={"document_id": document_id, "tenant_id": tenant_id},
+        )
+    except Exception as exc:
+        logger.error("MinIO upload failed for tenant %s: %s", tenant_id, exc)
+        raise HTTPException(status_code=500, detail="Storage failed")
+
+    # Extract text
+    try:
+        text = await _extract_text(file_bytes, filename, content_type)
+    except Exception as exc:
+        logger.error("Text extraction failed: %s", exc)
+        raise HTTPException(status_code=500, detail=f"Text extraction failed: {exc}")
+
+    if not text or not text.strip():
+        raise HTTPException(status_code=400, detail="No text could be extracted")
+
+    # Chunk
+    chunk_result = await embedding_client.chunk_text(
+        text=text, strategy="recursive",
+        chunk_size=chunk_size, overlap=chunk_overlap,
+    )
+    chunks = chunk_result.chunks
+    chunks_meta = chunk_result.chunks_with_metadata
+
+    if not chunks:
+        raise HTTPException(status_code=400, detail="Chunking produced zero chunks")
+
+    # Embed
+    embeddings = await embedding_client.generate_embeddings(chunks)
+
+    # Parse extra metadata
+    extra_metadata = {}
+    if metadata_json:
+        try:
+            extra_metadata = json.loads(metadata_json)
+        except json.JSONDecodeError:
+            pass
+
+    # Build payloads with tenant isolation
+    _STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
+    payloads = []
+    for i, chunk in enumerate(chunks):
+        payload = {
+            "document_id": document_id,
+            "tenant_id": tenant_id,
+            "filename": filename,
+            "chunk_index": i,
+            "chunk_text": chunk,
+            **extra_metadata,
+        }
+        if i < len(chunks_meta):
+            for field in _STRUCT_FIELDS:
+                value = chunks_meta[i].get(field)
+                if value is not None and value != "":
+                    payload[field] = value
+        payloads.append(payload)
+
+    # Index in tenant collection
+    indexed = await qdrant_wrapper.index_documents(
+        collection=collection, vectors=embeddings, payloads=payloads,
+    )
+
+    logger.info(
+        "Tenant %s: uploaded %s (%d chunks, %d vectors) to %s",
+        tenant_id[:8], filename, len(chunks), indexed, collection,
+    )
+
+    return DocumentResponse(
+        id=document_id, filename=filename,
+        file_size=len(file_bytes), status="indexed",
+        chunk_count=len(chunks), collection=collection,
+    )
+
+
+@router.get("", response_model=DocumentListResponse)
+async def list_tenant_documents(
+    request: Request,
+    x_tenant_id: Optional[str] = Header(None),
+):
+    """List all documents for this tenant."""
+    optional_jwt_auth(request)
+    tenant_id = _extract_tenant_id(request, x_tenant_id)
+
+    collection = _collection_name(tenant_id)
+
+    try:
+        # Get unique document_ids from Qdrant
+        docs = await qdrant_wrapper.get_unique_documents(collection)
+    except Exception:
+        # Collection doesn't exist yet → no documents
+        docs = []
+
+    return DocumentListResponse(documents=docs, total=len(docs))
+
+
+@router.delete("/{doc_id}")
+async def delete_tenant_document(
+    doc_id: str,
+    request: Request,
+    x_tenant_id: Optional[str] = Header(None),
+):
+    """Delete a document and all its vectors from tenant collection."""
+    optional_jwt_auth(request)
+    tenant_id = _extract_tenant_id(request, x_tenant_id)
+
+    collection = _collection_name(tenant_id)
+    errors = []
+
+    # Delete vectors from Qdrant
+    try:
+        await qdrant_wrapper.delete_by_filter(
+            collection=collection,
+            filter_conditions={"document_id": doc_id},
+        )
+    except Exception as exc:
+        errors.append(f"Qdrant: {exc}")
+
+    # Delete file from MinIO
+    try:
+        prefix = f"tenant_docs/{tenant_id.replace('-', '')[:12]}/{doc_id}/"
+        await minio_wrapper.delete_by_prefix(prefix)
+    except Exception as exc:
+        errors.append(f"MinIO: {exc}")
+
+    if errors:
+        logger.warning("Partial delete for %s/%s: %s", tenant_id[:8], doc_id[:8], errors)
+        return {"deleted": True, "warnings": errors}
+
+    logger.info("Tenant %s: deleted document %s", tenant_id[:8], doc_id[:8])
+    return {"deleted": True, "document_id": doc_id}
+
+
+@router.get("/{doc_id}/status")
+async def document_status(
+    doc_id: str,
+    request: Request,
+    x_tenant_id: Optional[str] = Header(None),
+):
+    """Get processing status for a document."""
+    optional_jwt_auth(request)
+    tenant_id = _extract_tenant_id(request, x_tenant_id)
+
+    collection = _collection_name(tenant_id)
+    try:
+        count = await qdrant_wrapper.count_by_filter(
+            collection=collection,
+            filter_conditions={"document_id": doc_id},
+        )
+        status = "indexed" if count > 0 else "not_found"
+    except Exception:
+        count = 0
+        status = "not_found"
+
+    return {"document_id": doc_id, "status": status, "chunk_count": count}
+
+
+# ── Helpers ────────────────────────────────────────────────────────
+
+async def _extract_text(file_bytes: bytes, filename: str, content_type: str) -> str:
+    """Extract text from PDF, HTML, or plain text."""
+    if content_type == "application/pdf" or filename.lower().endswith(".pdf"):
+        return await embedding_client.extract_pdf(file_bytes)
+    if filename.lower().endswith((".html", ".htm")):
+        text = decode_html_bytes(file_bytes)
+        return strip_html(text)
+    text = file_bytes.decode("utf-8", errors="replace")
+    if looks_like_html(text):
+        return strip_html(text)
+    return text