8510af46eb
Phase 0: Quality Audit script (Claude Sonnet, 1750 samples) Phase 1: Object ontology expanded 31 → 74 tokens with descriptions + boundaries Phase 2: 174K controls re-classified via Haiku (10 batches, $50) - Generic tokens removed (documentation, procedure, process) - L2 sub-topics added (108K + 64K controls) - Bad subtopics fixed (stakeholder_*, escalation fragments) Phase 3: Re-clustering K=18704 (37K objects → 16.7K groups) Phase 4: Direct MC generation from canonical tokens (gpre2_direct_mc.py) Phase 5: Regulation-source split (gpre3, dry-run tested) New features: - Tenant-isolated document upload API (rag-service) - BAuA crawler (Playwright, 131 PDFs downloaded) - OSHA Technical Manual crawler (23 chapters) - CE obligation extractor (6141 obligations from Qdrant) RAG ingestion: - 126 BAuA PDFs (TRBS/TRGS/ASR): 27,664 chunks - OSHA Technical Manual: 7,241 chunks - OSHA 1910 Subpart O (full): 745 chunks - EuGH C-588/21 P: 216 chunks - EU 2018/1725: 842 chunks Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
290 lines
9.7 KiB
Python
290 lines
9.7 KiB
Python
"""
|
|
Tenant-isolated document upload, listing, and deletion.
|
|
|
|
Each tenant gets their own Qdrant collection (bp_docs_tenant_{short_id}).
|
|
Documents are stored in MinIO under tenant-specific paths.
|
|
No data crosses tenant boundaries.
|
|
|
|
Endpoints:
|
|
POST /api/v1/tenant/documents - Upload + process a document (PDF, HTML, or plain text)
|
|
GET /api/v1/tenant/documents - List tenant's documents
|
|
DELETE /api/v1/tenant/documents/{doc_id} - Delete document + vectors
|
|
GET /api/v1/tenant/documents/{doc_id}/status - Processing status
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import uuid
|
|
from typing import Optional
|
|
|
|
from fastapi import APIRouter, File, Form, HTTPException, Header, Request, UploadFile
|
|
from pydantic import BaseModel
|
|
|
|
from api.auth import optional_jwt_auth
|
|
from embedding_client import embedding_client
|
|
from html_utils import decode_html_bytes, looks_like_html, strip_html
|
|
from minio_client_wrapper import minio_wrapper
|
|
from qdrant_client_wrapper import qdrant_wrapper
|
|
|
|
# Logger used by every endpoint and helper in this module.
logger = logging.getLogger("rag-service.api.tenant-documents")

# All routes declared in this module are registered under this URL prefix.
router = APIRouter(prefix="/api/v1/tenant/documents")
|
|
|
|
# Vector size used when creating tenant collections (bge-m3 dimension).
VECTOR_DIM = 1024

# Upload size ceiling in bytes (50 MB).
MAX_FILE_SIZE = 50 * 1024 ** 2

# MIME types declared as acceptable for uploads.
ALLOWED_TYPES = {"text/plain", "text/html", "application/pdf"}

# Leading bytes expected at the start of a PDF file.
PDF_MAGIC = b"%PDF"
|
|
|
|
|
|
def _collection_name(tenant_id: str) -> str:
|
|
"""Derive tenant-specific Qdrant collection name."""
|
|
short = tenant_id.replace("-", "")[:12]
|
|
return f"bp_docs_tenant_{short}"
|
|
|
|
|
|
def _storage_path(tenant_id: str, document_id: str, filename: str) -> str:
|
|
"""Derive tenant-isolated storage path."""
|
|
short = tenant_id.replace("-", "")[:12]
|
|
return f"tenant_docs/{short}/{document_id}/{filename}"
|
|
|
|
|
|
def _extract_tenant_id(
    request: Request,
    x_tenant_id: Optional[str] = Header(None),
) -> str:
    """Return the tenant ID supplied via the ``X-Tenant-ID`` header.

    Args:
        request: Incoming request; its headers are the fallback source.
        x_tenant_id: Header value already resolved by the caller, if any.

    Returns:
        The tenant ID with surrounding whitespace removed.

    Raises:
        HTTPException: 400 when the header is missing/blank, or when the
            value cannot be mapped to a tenant-specific namespace.
    """
    tid = (x_tenant_id or request.headers.get("x-tenant-id", "")).strip()
    if not tid:
        raise HTTPException(status_code=400, detail="X-Tenant-ID header required")

    # Collection names and storage prefixes are derived from the ID with its
    # hyphens removed. An ID made only of hyphens would collapse to an empty
    # short ID shared by every such caller, and an ID containing a path
    # separator would nest inside another tenant's storage prefix.
    if not tid.replace("-", "") or "/" in tid or "\\" in tid:
        raise HTTPException(status_code=400, detail="Invalid X-Tenant-ID header")

    return tid
|
|
|
|
|
|
# ── Response models ────────────────────────────────────────────────
|
|
|
|
class DocumentResponse(BaseModel):
    """Response body describing a single tenant document."""

    # Identifier of the document.
    id: str
    # File name of the document.
    filename: str
    # Size of the file in bytes.
    file_size: int
    # Processing state; the upload endpoint reports "indexed".
    status: str
    # Number of chunks produced for the document.
    chunk_count: int
    # Name of the tenant collection holding the document's vectors.
    collection: str
    # Optional creation timestamp; defaults to None when not provided.
    created_at: Optional[str] = None
|
|
|
|
|
|
class DocumentListResponse(BaseModel):
    """Response body for the document listing endpoint."""

    # Documents found for the tenant.
    documents: list[DocumentResponse]
    # Number of entries in ``documents``.
    total: int
|
|
|
|
|
|
# ── Endpoints ──────────────────────────────────────────────────────
|
|
|
|
def _parse_extra_metadata(metadata_json: Optional[str]) -> dict:
    """Decode the optional ``metadata_json`` form field into a dict.

    Invalid JSON and JSON that is not an object are ignored (with a warning)
    and yield an empty dict, so a bad metadata field never fails the upload.
    """
    if not metadata_json:
        return {}
    try:
        parsed = json.loads(metadata_json)
    except json.JSONDecodeError:
        logger.warning("Ignoring metadata_json: not valid JSON")
        return {}
    if not isinstance(parsed, dict):
        # Only a JSON object can be merged into a payload; a list or scalar
        # would raise TypeError when unpacked with ``**``.
        logger.warning(
            "Ignoring metadata_json: expected a JSON object, got %s",
            type(parsed).__name__,
        )
        return {}
    return parsed


def _build_chunk_payloads(
    *,
    tenant_id: str,
    document_id: str,
    filename: str,
    chunks: list,
    chunks_meta: list,
    extra_metadata: dict,
) -> list[dict]:
    """Build one vector payload per chunk.

    Client-supplied ``extra_metadata`` is applied first so it can never
    overwrite the reserved keys (``document_id``, ``tenant_id``, ``filename``,
    ``chunk_index``, ``chunk_text``) that tenant isolation and
    delete-by-document rely on. Structural fields reported by the chunker are
    applied last and are skipped when ``None`` or empty.
    """
    struct_fields = ("section", "section_title", "paragraph", "paragraph_num", "page")
    payloads = []
    for index, chunk in enumerate(chunks):
        payload = {
            **extra_metadata,
            "document_id": document_id,
            "tenant_id": tenant_id,
            "filename": filename,
            "chunk_index": index,
            "chunk_text": chunk,
        }
        if index < len(chunks_meta):
            for field in struct_fields:
                value = chunks_meta[index].get(field)
                if value is not None and value != "":
                    payload[field] = value
        payloads.append(payload)
    return payloads


async def _rollback_upload(tenant_id: str, collection: str, document_id: str) -> None:
    """Best-effort removal of whatever a failed upload left behind.

    Deletes the stored object(s) and any vectors already written for
    ``document_id``. Failures are logged and never raised, so the original
    error is the one the client sees.
    """
    prefix = f"tenant_docs/{tenant_id.replace('-', '')[:12]}/{document_id}/"
    try:
        await minio_wrapper.delete_by_prefix(prefix)
    except Exception as exc:
        logger.warning("Rollback: could not remove %s: %s", prefix, exc)
    try:
        await qdrant_wrapper.delete_by_filter(
            collection=collection,
            filter_conditions={"document_id": document_id},
        )
    except Exception as exc:
        logger.warning(
            "Rollback: could not remove vectors for %s: %s", document_id[:8], exc
        )


@router.post("", response_model=DocumentResponse)
async def upload_tenant_document(
    request: Request,
    file: UploadFile = File(...),
    x_tenant_id: Optional[str] = Header(None),
    chunk_size: int = Form(default=512),
    chunk_overlap: int = Form(default=50),
    metadata_json: Optional[str] = Form(default=None),
):
    """Upload a document, process it, and index it in the tenant's collection.

    Steps: validate the request, store the raw file, extract text, chunk,
    embed, and index the chunks. If anything fails after the file has been
    stored, the stored object and any written vectors are removed again.

    Args:
        request: Incoming request (used for auth and header fallback).
        file: The uploaded file.
        x_tenant_id: Tenant ID header.
        chunk_size: Chunk size passed to the chunker; must be positive.
        chunk_overlap: Overlap passed to the chunker; must satisfy
            ``0 <= chunk_overlap < chunk_size``.
        metadata_json: Optional JSON object merged into every chunk payload.

    Returns:
        A ``DocumentResponse`` with status ``"indexed"``.

    Raises:
        HTTPException: 400 for invalid input (empty file, bad chunk settings,
            PDF magic mismatch, no extractable text, zero chunks), 413 when
            the file exceeds ``MAX_FILE_SIZE``, 500 when storage or text
            extraction fails.
    """
    # NOTE(review): called without ``await`` and its result is ignored -
    # confirm it is synchronous and raises on an invalid token.
    optional_jwt_auth(request)
    tenant_id = _extract_tenant_id(request, x_tenant_id)

    if chunk_size <= 0 or not 0 <= chunk_overlap < chunk_size:
        raise HTTPException(
            status_code=400,
            detail="chunk_size must be positive and 0 <= chunk_overlap < chunk_size",
        )

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without loading an arbitrarily large body into memory.
    file_bytes = await file.read(MAX_FILE_SIZE + 1)
    if not file_bytes:
        raise HTTPException(status_code=400, detail="Empty file")
    if len(file_bytes) > MAX_FILE_SIZE:
        raise HTTPException(status_code=413, detail=f"File too large (max {MAX_FILE_SIZE // 1024 // 1024} MB)")

    filename = file.filename or "document.pdf"
    content_type = file.content_type or "application/octet-stream"
    # NOTE(review): ALLOWED_TYPES is not enforced here - confirm whether
    # uploads with other content types should be rejected.

    # Anything that will be treated as a PDF (by declared type or by
    # extension) must actually start with the PDF magic bytes.
    media_type = content_type.split(";", 1)[0].strip().lower()
    claims_pdf = media_type == "application/pdf" or filename.lower().endswith(".pdf")
    if claims_pdf and not file_bytes.startswith(PDF_MAGIC):
        raise HTTPException(status_code=400, detail="File claims to be PDF but magic bytes don't match")

    extra_metadata = _parse_extra_metadata(metadata_json)

    document_id = str(uuid.uuid4())
    collection = _collection_name(tenant_id)
    object_name = _storage_path(tenant_id, document_id, filename)

    # Ensure collection exists
    await qdrant_wrapper.create_collection(collection, VECTOR_DIM)

    # Store in MinIO
    try:
        await minio_wrapper.upload_document(
            object_name=object_name,
            data=file_bytes,
            content_type=content_type,
            metadata={"document_id": document_id, "tenant_id": tenant_id},
        )
    except Exception as exc:
        logger.error("MinIO upload failed for tenant %s: %s", tenant_id, exc)
        raise HTTPException(status_code=500, detail="Storage failed") from exc

    # From here on the raw file exists in storage; any failure must undo that,
    # because the client never learns the document ID of a failed upload.
    try:
        # Extract text
        try:
            text = await _extract_text(file_bytes, filename, content_type)
        except Exception as exc:
            logger.error("Text extraction failed: %s", exc)
            raise HTTPException(status_code=500, detail=f"Text extraction failed: {exc}") from exc

        if not text or not text.strip():
            raise HTTPException(status_code=400, detail="No text could be extracted")

        # Chunk
        chunk_result = await embedding_client.chunk_text(
            text=text, strategy="recursive",
            chunk_size=chunk_size, overlap=chunk_overlap,
        )
        chunks = chunk_result.chunks
        chunks_meta = chunk_result.chunks_with_metadata or []

        if not chunks:
            raise HTTPException(status_code=400, detail="Chunking produced zero chunks")

        # Embed
        embeddings = await embedding_client.generate_embeddings(chunks)

        # Build payloads with tenant isolation
        payloads = _build_chunk_payloads(
            tenant_id=tenant_id,
            document_id=document_id,
            filename=filename,
            chunks=chunks,
            chunks_meta=chunks_meta,
            extra_metadata=extra_metadata,
        )

        # Index in tenant collection
        indexed = await qdrant_wrapper.index_documents(
            collection=collection, vectors=embeddings, payloads=payloads,
        )
    except Exception:
        await _rollback_upload(tenant_id, collection, document_id)
        raise

    logger.info(
        "Tenant %s: uploaded %s (%d chunks, %d vectors) to %s",
        tenant_id[:8], filename, len(chunks), indexed, collection,
    )

    return DocumentResponse(
        id=document_id, filename=filename,
        file_size=len(file_bytes), status="indexed",
        chunk_count=len(chunks), collection=collection,
    )
|
|
|
|
|
|
@router.get("", response_model=DocumentListResponse)
async def list_tenant_documents(
    request: Request,
    x_tenant_id: Optional[str] = Header(None),
):
    """List all documents stored for the calling tenant.

    Args:
        request: Incoming request (used for auth and header fallback).
        x_tenant_id: Tenant ID header.

    Returns:
        A ``DocumentListResponse``; empty when the lookup fails, which
        includes the case where the tenant's collection does not exist yet.
    """
    optional_jwt_auth(request)
    tenant_id = _extract_tenant_id(request, x_tenant_id)
    collection = _collection_name(tenant_id)

    try:
        # Get unique document_ids from Qdrant
        docs = await qdrant_wrapper.get_unique_documents(collection)
    except Exception as exc:
        # A missing collection (tenant has not uploaded yet) lands here, but
        # so does a real backend failure. Keep the empty-list contract and
        # leave a trace so outages are not mistaken for "no documents".
        logger.warning(
            "Tenant %s: listing %s failed, returning empty list: %s",
            tenant_id[:8], collection, exc,
        )
        docs = []

    return DocumentListResponse(documents=docs, total=len(docs))
|
|
|
|
|
|
@router.delete("/{doc_id}")
async def delete_tenant_document(
    doc_id: str,
    request: Request,
    x_tenant_id: Optional[str] = Header(None),
):
    """Delete a document's vectors and stored file for the calling tenant.

    Both deletions are attempted independently, so one failing backend does
    not prevent cleanup in the other.

    Args:
        doc_id: Identifier of the document to delete.
        request: Incoming request (used for auth and header fallback).
        x_tenant_id: Tenant ID header.

    Returns:
        ``{"deleted": True, "document_id": ...}`` on success. When one step
        fails, ``"warnings"`` lists the errors and ``deleted`` stays ``True``.
        When both steps fail nothing was removed, so ``deleted`` is ``False``.
    """
    optional_jwt_auth(request)
    tenant_id = _extract_tenant_id(request, x_tenant_id)

    collection = _collection_name(tenant_id)
    errors = []
    vectors_removed = False
    files_removed = False

    # Delete vectors from Qdrant
    try:
        await qdrant_wrapper.delete_by_filter(
            collection=collection,
            filter_conditions={"document_id": doc_id},
        )
        vectors_removed = True
    except Exception as exc:
        errors.append(f"Qdrant: {exc}")

    # Delete file from MinIO
    try:
        prefix = f"tenant_docs/{tenant_id.replace('-', '')[:12]}/{doc_id}/"
        await minio_wrapper.delete_by_prefix(prefix)
        files_removed = True
    except Exception as exc:
        errors.append(f"MinIO: {exc}")

    if not vectors_removed and not files_removed:
        # Nothing was deleted at all; do not report success.
        logger.error("Delete failed for %s/%s: %s", tenant_id[:8], doc_id[:8], errors)
        return {"deleted": False, "document_id": doc_id, "warnings": errors}

    if errors:
        logger.warning("Partial delete for %s/%s: %s", tenant_id[:8], doc_id[:8], errors)
        return {"deleted": True, "document_id": doc_id, "warnings": errors}

    logger.info("Tenant %s: deleted document %s", tenant_id[:8], doc_id[:8])
    return {"deleted": True, "document_id": doc_id}
|
|
|
|
|
|
@router.get("/{doc_id}/status")
async def document_status(
    doc_id: str,
    request: Request,
    x_tenant_id: Optional[str] = Header(None),
):
    """Report whether a document is indexed in the tenant's collection.

    Args:
        doc_id: Identifier of the document to look up.
        request: Incoming request (used for auth and header fallback).
        x_tenant_id: Tenant ID header.

    Returns:
        A dict with ``document_id``, ``status`` (``"indexed"`` when at least
        one vector carries this document ID, otherwise ``"not_found"``) and
        ``chunk_count``.
    """
    optional_jwt_auth(request)
    tenant_id = _extract_tenant_id(request, x_tenant_id)

    collection = _collection_name(tenant_id)
    try:
        count = await qdrant_wrapper.count_by_filter(
            collection=collection,
            filter_conditions={"document_id": doc_id},
        )
    except Exception as exc:
        # The lookup failing is reported as "not_found" to the client, but
        # is logged so a backend problem can be told apart from a truly
        # unknown document.
        logger.warning(
            "Tenant %s: status lookup for %s failed: %s",
            tenant_id[:8], doc_id[:8], exc,
        )
        count = 0

    status = "indexed" if count > 0 else "not_found"
    return {"document_id": doc_id, "status": status, "chunk_count": count}
|
|
|
|
|
|
# ── Helpers ────────────────────────────────────────────────────────
|
|
|
|
async def _extract_text(file_bytes: bytes, filename: str, content_type: str) -> str:
    """Extract text from PDF, HTML, or plain text.

    The format is chosen from the declared content type (parameters such as
    ``; charset=...`` and letter case are ignored) or from the file extension.
    Anything else is decoded as UTF-8, with undecodable bytes replaced, and
    stripped of markup if it looks like HTML.

    Args:
        file_bytes: Raw file content.
        filename: File name; its extension is used for format detection.
        content_type: Declared MIME type of the upload.

    Returns:
        The extracted text.
    """
    # "text/html; charset=utf-8" and "Application/PDF" must match as well.
    media_type = (content_type or "").split(";", 1)[0].strip().lower()
    lowered_name = filename.lower()

    if media_type == "application/pdf" or lowered_name.endswith(".pdf"):
        return await embedding_client.extract_pdf(file_bytes)

    # Declared HTML takes the same decoding path as files with an HTML
    # extension instead of the plain UTF-8 fallback below.
    if media_type == "text/html" or lowered_name.endswith((".html", ".htm")):
        return strip_html(decode_html_bytes(file_bytes))

    text = file_bytes.decode("utf-8", errors="replace")
    if looks_like_html(text):
        return strip_html(text)
    return text
|