feat(pipeline): MC Quality Overhaul — 74.5% → 92.8% accuracy, 5.3K → 13.6K MCs
Phase 0: Quality Audit script (Claude Sonnet, 1750 samples) Phase 1: Object ontology expanded 31 → 74 tokens with descriptions + boundaries Phase 2: 174K controls re-classified via Haiku (10 batches, $50) - Generic tokens removed (documentation, procedure, process) - L2 sub-topics added (108K + 64K controls) - Bad subtopics fixed (stakeholder_*, escalation fragments) Phase 3: Re-clustering K=18704 (37K objects → 16.7K groups) Phase 4: Direct MC generation from canonical tokens (gpre2_direct_mc.py) Phase 5: Regulation-source split (gpre3, dry-run tested) New features: - Tenant-isolated document upload API (rag-service) - BAuA crawler (Playwright, 131 PDFs downloaded) - OSHA Technical Manual crawler (23 chapters) - CE obligation extractor (6141 obligations from Qdrant) RAG ingestion: - 126 BAuA PDFs (TRBS/TRGS/ASR): 27,664 chunks - OSHA Technical Manual: 7,241 chunks - OSHA 1910 Subpart O (full): 745 chunks - EuGH C-588/21 P: 216 chunks - EU 2018/1725: 842 chunks Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,289 @@
|
||||
"""
|
||||
Tenant-isolated document upload, listing, and deletion.
|
||||
|
||||
Each tenant gets their own Qdrant collection (bp_docs_tenant_{short_id}).
|
||||
Documents are stored in MinIO under tenant-specific paths.
|
||||
No data crosses tenant boundaries.
|
||||
|
||||
Endpoints:
|
||||
POST /api/v1/tenant/documents - Upload + process PDF
|
||||
GET /api/v1/tenant/documents - List tenant's documents
|
||||
DELETE /api/v1/tenant/documents/{doc_id} - Delete document + vectors
|
||||
GET /api/v1/tenant/documents/{doc_id}/status - Processing status
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, File, Form, HTTPException, Header, Request, UploadFile
|
||||
from pydantic import BaseModel
|
||||
|
||||
from api.auth import optional_jwt_auth
|
||||
from embedding_client import embedding_client
|
||||
from html_utils import decode_html_bytes, looks_like_html, strip_html
|
||||
from minio_client_wrapper import minio_wrapper
|
||||
from qdrant_client_wrapper import qdrant_wrapper
|
||||
|
||||
logger = logging.getLogger("rag-service.api.tenant-documents")
|
||||
|
||||
router = APIRouter(prefix="/api/v1/tenant/documents")
|
||||
|
||||
VECTOR_DIM = 1024 # bge-m3 dimension
|
||||
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB
|
||||
ALLOWED_TYPES = {"application/pdf", "text/html", "text/plain"}
|
||||
PDF_MAGIC = b"%PDF"
|
||||
|
||||
|
||||
def _collection_name(tenant_id: str) -> str:
|
||||
"""Derive tenant-specific Qdrant collection name."""
|
||||
short = tenant_id.replace("-", "")[:12]
|
||||
return f"bp_docs_tenant_{short}"
|
||||
|
||||
|
||||
def _storage_path(tenant_id: str, document_id: str, filename: str) -> str:
|
||||
"""Derive tenant-isolated storage path."""
|
||||
short = tenant_id.replace("-", "")[:12]
|
||||
return f"tenant_docs/{short}/{document_id}/{filename}"
|
||||
|
||||
|
||||
def _extract_tenant_id(
|
||||
request: Request,
|
||||
x_tenant_id: Optional[str] = Header(None),
|
||||
) -> str:
|
||||
"""Extract tenant ID from header. Required for all tenant endpoints."""
|
||||
tid = x_tenant_id or request.headers.get("x-tenant-id", "")
|
||||
if not tid:
|
||||
raise HTTPException(status_code=400, detail="X-Tenant-ID header required")
|
||||
return tid
|
||||
|
||||
|
||||
# ── Response models ────────────────────────────────────────────────
|
||||
|
||||
class DocumentResponse(BaseModel):
|
||||
id: str
|
||||
filename: str
|
||||
file_size: int
|
||||
status: str
|
||||
chunk_count: int
|
||||
collection: str
|
||||
created_at: Optional[str] = None
|
||||
|
||||
|
||||
class DocumentListResponse(BaseModel):
|
||||
documents: list[DocumentResponse]
|
||||
total: int
|
||||
|
||||
|
||||
# ── Endpoints ──────────────────────────────────────────────────────
|
||||
|
||||
@router.post("", response_model=DocumentResponse)
|
||||
async def upload_tenant_document(
|
||||
request: Request,
|
||||
file: UploadFile = File(...),
|
||||
x_tenant_id: Optional[str] = Header(None),
|
||||
chunk_size: int = Form(default=512),
|
||||
chunk_overlap: int = Form(default=50),
|
||||
metadata_json: Optional[str] = Form(default=None),
|
||||
):
|
||||
"""Upload a document, process it, and index in tenant-specific collection."""
|
||||
optional_jwt_auth(request)
|
||||
tenant_id = _extract_tenant_id(request, x_tenant_id)
|
||||
|
||||
# Read + validate
|
||||
file_bytes = await file.read()
|
||||
if len(file_bytes) == 0:
|
||||
raise HTTPException(status_code=400, detail="Empty file")
|
||||
if len(file_bytes) > MAX_FILE_SIZE:
|
||||
raise HTTPException(status_code=413, detail=f"File too large (max {MAX_FILE_SIZE // 1024 // 1024} MB)")
|
||||
|
||||
filename = file.filename or "document.pdf"
|
||||
content_type = file.content_type or "application/octet-stream"
|
||||
|
||||
# PDF magic bytes check
|
||||
if filename.lower().endswith(".pdf") and not file_bytes[:4].startswith(PDF_MAGIC):
|
||||
raise HTTPException(status_code=400, detail="File claims to be PDF but magic bytes don't match")
|
||||
|
||||
document_id = str(uuid.uuid4())
|
||||
collection = _collection_name(tenant_id)
|
||||
object_name = _storage_path(tenant_id, document_id, filename)
|
||||
|
||||
# Ensure collection exists
|
||||
await qdrant_wrapper.create_collection(collection, VECTOR_DIM)
|
||||
|
||||
# Store in MinIO
|
||||
try:
|
||||
await minio_wrapper.upload_document(
|
||||
object_name=object_name,
|
||||
data=file_bytes,
|
||||
content_type=content_type,
|
||||
metadata={"document_id": document_id, "tenant_id": tenant_id},
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error("MinIO upload failed for tenant %s: %s", tenant_id, exc)
|
||||
raise HTTPException(status_code=500, detail="Storage failed")
|
||||
|
||||
# Extract text
|
||||
try:
|
||||
text = await _extract_text(file_bytes, filename, content_type)
|
||||
except Exception as exc:
|
||||
logger.error("Text extraction failed: %s", exc)
|
||||
raise HTTPException(status_code=500, detail=f"Text extraction failed: {exc}")
|
||||
|
||||
if not text or not text.strip():
|
||||
raise HTTPException(status_code=400, detail="No text could be extracted")
|
||||
|
||||
# Chunk
|
||||
chunk_result = await embedding_client.chunk_text(
|
||||
text=text, strategy="recursive",
|
||||
chunk_size=chunk_size, overlap=chunk_overlap,
|
||||
)
|
||||
chunks = chunk_result.chunks
|
||||
chunks_meta = chunk_result.chunks_with_metadata
|
||||
|
||||
if not chunks:
|
||||
raise HTTPException(status_code=400, detail="Chunking produced zero chunks")
|
||||
|
||||
# Embed
|
||||
embeddings = await embedding_client.generate_embeddings(chunks)
|
||||
|
||||
# Parse extra metadata
|
||||
extra_metadata = {}
|
||||
if metadata_json:
|
||||
try:
|
||||
extra_metadata = json.loads(metadata_json)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Build payloads with tenant isolation
|
||||
_STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
|
||||
payloads = []
|
||||
for i, chunk in enumerate(chunks):
|
||||
payload = {
|
||||
"document_id": document_id,
|
||||
"tenant_id": tenant_id,
|
||||
"filename": filename,
|
||||
"chunk_index": i,
|
||||
"chunk_text": chunk,
|
||||
**extra_metadata,
|
||||
}
|
||||
if i < len(chunks_meta):
|
||||
for field in _STRUCT_FIELDS:
|
||||
value = chunks_meta[i].get(field)
|
||||
if value is not None and value != "":
|
||||
payload[field] = value
|
||||
payloads.append(payload)
|
||||
|
||||
# Index in tenant collection
|
||||
indexed = await qdrant_wrapper.index_documents(
|
||||
collection=collection, vectors=embeddings, payloads=payloads,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Tenant %s: uploaded %s (%d chunks, %d vectors) to %s",
|
||||
tenant_id[:8], filename, len(chunks), indexed, collection,
|
||||
)
|
||||
|
||||
return DocumentResponse(
|
||||
id=document_id, filename=filename,
|
||||
file_size=len(file_bytes), status="indexed",
|
||||
chunk_count=len(chunks), collection=collection,
|
||||
)
|
||||
|
||||
|
||||
@router.get("", response_model=DocumentListResponse)
|
||||
async def list_tenant_documents(
|
||||
request: Request,
|
||||
x_tenant_id: Optional[str] = Header(None),
|
||||
):
|
||||
"""List all documents for this tenant."""
|
||||
optional_jwt_auth(request)
|
||||
tenant_id = _extract_tenant_id(request, x_tenant_id)
|
||||
|
||||
collection = _collection_name(tenant_id)
|
||||
|
||||
try:
|
||||
# Get unique document_ids from Qdrant
|
||||
docs = await qdrant_wrapper.get_unique_documents(collection)
|
||||
except Exception:
|
||||
# Collection doesn't exist yet → no documents
|
||||
docs = []
|
||||
|
||||
return DocumentListResponse(documents=docs, total=len(docs))
|
||||
|
||||
|
||||
@router.delete("/{doc_id}")
|
||||
async def delete_tenant_document(
|
||||
doc_id: str,
|
||||
request: Request,
|
||||
x_tenant_id: Optional[str] = Header(None),
|
||||
):
|
||||
"""Delete a document and all its vectors from tenant collection."""
|
||||
optional_jwt_auth(request)
|
||||
tenant_id = _extract_tenant_id(request, x_tenant_id)
|
||||
|
||||
collection = _collection_name(tenant_id)
|
||||
errors = []
|
||||
|
||||
# Delete vectors from Qdrant
|
||||
try:
|
||||
await qdrant_wrapper.delete_by_filter(
|
||||
collection=collection,
|
||||
filter_conditions={"document_id": doc_id},
|
||||
)
|
||||
except Exception as exc:
|
||||
errors.append(f"Qdrant: {exc}")
|
||||
|
||||
# Delete file from MinIO
|
||||
try:
|
||||
prefix = f"tenant_docs/{tenant_id.replace('-', '')[:12]}/{doc_id}/"
|
||||
await minio_wrapper.delete_by_prefix(prefix)
|
||||
except Exception as exc:
|
||||
errors.append(f"MinIO: {exc}")
|
||||
|
||||
if errors:
|
||||
logger.warning("Partial delete for %s/%s: %s", tenant_id[:8], doc_id[:8], errors)
|
||||
return {"deleted": True, "warnings": errors}
|
||||
|
||||
logger.info("Tenant %s: deleted document %s", tenant_id[:8], doc_id[:8])
|
||||
return {"deleted": True, "document_id": doc_id}
|
||||
|
||||
|
||||
@router.get("/{doc_id}/status")
|
||||
async def document_status(
|
||||
doc_id: str,
|
||||
request: Request,
|
||||
x_tenant_id: Optional[str] = Header(None),
|
||||
):
|
||||
"""Get processing status for a document."""
|
||||
optional_jwt_auth(request)
|
||||
tenant_id = _extract_tenant_id(request, x_tenant_id)
|
||||
|
||||
collection = _collection_name(tenant_id)
|
||||
try:
|
||||
count = await qdrant_wrapper.count_by_filter(
|
||||
collection=collection,
|
||||
filter_conditions={"document_id": doc_id},
|
||||
)
|
||||
status = "indexed" if count > 0 else "not_found"
|
||||
except Exception:
|
||||
count = 0
|
||||
status = "not_found"
|
||||
|
||||
return {"document_id": doc_id, "status": status, "chunk_count": count}
|
||||
|
||||
|
||||
# ── Helpers ────────────────────────────────────────────────────────
|
||||
|
||||
async def _extract_text(file_bytes: bytes, filename: str, content_type: str) -> str:
|
||||
"""Extract text from PDF, HTML, or plain text."""
|
||||
if content_type == "application/pdf" or filename.lower().endswith(".pdf"):
|
||||
return await embedding_client.extract_pdf(file_bytes)
|
||||
if filename.lower().endswith((".html", ".htm")):
|
||||
text = decode_html_bytes(file_bytes)
|
||||
return strip_html(text)
|
||||
text = file_bytes.decode("utf-8", errors="replace")
|
||||
if looks_like_html(text):
|
||||
return strip_html(text)
|
||||
return text
|
||||
Reference in New Issue
Block a user