Initial commit: breakpilot-core - Shared Infrastructure
Docker Compose with 24+ services: - PostgreSQL (PostGIS), Valkey, MinIO, Qdrant - Vault (PKI/TLS), Nginx (Reverse Proxy) - Backend Core API, Consent Service, Billing Service - RAG Service, Embedding Service - Gitea, Woodpecker CI/CD - Night Scheduler, Health Aggregator - Jitsi (Web/XMPP/JVB/Jicofo), Mailpit Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
11
rag-service/api/__init__.py
Normal file
11
rag-service/api/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
||||
"""Aggregate API router for the RAG service.

Combines the collections, documents, and search sub-routers into a single
router that the application can mount.
"""

from fastapi import APIRouter

from api.collections import router as collections_router
from api.documents import router as documents_router
from api.search import router as search_router

# Parent router. Each sub-router already carries its own URL prefix;
# only the OpenAPI tags are assigned here.
router = APIRouter()

router.include_router(collections_router, tags=["Collections"])
router.include_router(documents_router, tags=["Documents"])
router.include_router(search_router, tags=["Search"])
|
||||
46
rag-service/api/auth.py
Normal file
46
rag-service/api/auth.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""Optional JWT authentication helper.
|
||||
|
||||
If JWT_SECRET is configured and an Authorization header is present, the token
|
||||
is verified. If no header is present or JWT_SECRET is empty, the request is
|
||||
allowed through (public access).
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import HTTPException, Request
|
||||
from jose import JWTError, jwt
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger("rag-service.auth")
|
||||
|
||||
|
||||
def optional_jwt_auth(request: Request) -> Optional[dict]:
    """
    Validate the JWT from the Authorization header if present.

    Returns the decoded token payload, or None if no auth was provided
    (no header, or no JWT_SECRET configured).
    Raises HTTPException 401 if a token IS provided but is invalid.
    """
    auth_header: Optional[str] = request.headers.get("authorization")

    # No credentials supplied: allow the request through (public access).
    if not auth_header:
        return None

    if not settings.JWT_SECRET:
        # No secret configured -- skip validation
        return None

    # Expect "Bearer <token>"
    parts = auth_header.split()
    if len(parts) != 2 or parts[0].lower() != "bearer":
        raise HTTPException(status_code=401, detail="Invalid Authorization header format")

    token = parts[1]
    try:
        payload = jwt.decode(token, settings.JWT_SECRET, algorithms=["HS256"])
    except JWTError as exc:
        logger.warning("JWT verification failed: %s", exc)
        # Chain the cause so tracebacks show the underlying JWT error.
        raise HTTPException(status_code=401, detail="Invalid or expired token") from exc
    return payload
|
||||
77
rag-service/api/collections.py
Normal file
77
rag-service/api/collections.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
from pydantic import BaseModel
|
||||
|
||||
from api.auth import optional_jwt_auth
|
||||
from qdrant_client_wrapper import qdrant_wrapper, ALL_DEFAULT_COLLECTIONS
|
||||
|
||||
logger = logging.getLogger("rag-service.api.collections")
|
||||
|
||||
router = APIRouter(prefix="/api/v1/collections")
|
||||
|
||||
|
||||
# ---- Request / Response models --------------------------------------------
|
||||
|
||||
class CreateCollectionRequest(BaseModel):
    """Request body for POST /api/v1/collections."""

    # Collection name as it will appear in Qdrant.
    name: str
    # Embedding dimensionality for the new collection (default 1536;
    # presumably matches the embedding service's output size -- confirm).
    vector_size: int = 1536
|
||||
|
||||
|
||||
class CollectionInfoResponse(BaseModel):
    """Stats for a single Qdrant collection; fields are None when unavailable.

    NOTE(review): not referenced by the endpoints visible in this module --
    possibly consumed elsewhere; verify before removing.
    """

    name: str
    vectors_count: Optional[int] = None
    points_count: Optional[int] = None
    status: Optional[str] = None
    vector_size: Optional[int] = None
|
||||
|
||||
|
||||
# ---- Endpoints ------------------------------------------------------------
|
||||
|
||||
@router.post("", status_code=201)
async def create_collection(body: CreateCollectionRequest, request: Request):
    """Create a new Qdrant collection.

    Returns the collection name, its vector size, and the wrapper's
    "created" flag. Raises HTTP 500 if the Qdrant call fails.
    """
    optional_jwt_auth(request)
    try:
        created = await qdrant_wrapper.create_collection(body.name, body.vector_size)
    except Exception as exc:
        logger.error("Failed to create collection '%s': %s", body.name, exc)
        # Chain the cause so the original Qdrant error is preserved.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return {
        "collection": body.name,
        "vector_size": body.vector_size,
        "created": created,
    }
|
||||
|
||||
|
||||
@router.get("")
async def list_collections(request: Request):
    """List all Qdrant collections.

    Returns {"collections": [names...], "count": N}.
    Raises HTTP 500 if Qdrant cannot be reached.
    """
    optional_jwt_auth(request)
    try:
        # NOTE(review): this is a synchronous client call inside an async
        # handler; it blocks the event loop while Qdrant responds -- check
        # whether the wrapper offers an async variant.
        result = qdrant_wrapper.client.get_collections()
    except Exception as exc:
        logger.error("Failed to list collections: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    names = [c.name for c in result.collections]
    return {"collections": names, "count": len(names)}
|
||||
|
||||
|
||||
@router.get("/defaults")
async def list_default_collections(request: Request):
    """Expose the pre-configured default collections and their dimensions."""
    # Auth is optional: no token passes through, a bad token yields 401.
    optional_jwt_auth(request)
    response = {"defaults": ALL_DEFAULT_COLLECTIONS}
    return response
|
||||
|
||||
|
||||
@router.get("/{collection_name}")
async def get_collection_info(collection_name: str, request: Request):
    """Get stats for a single collection.

    Returns whatever the Qdrant wrapper reports for the collection.
    Raises HTTP 404 if the collection is missing or the lookup fails.
    """
    optional_jwt_auth(request)
    try:
        info = await qdrant_wrapper.get_collection_info(collection_name)
    except Exception as exc:
        logger.error("Failed to get collection info for '%s': %s", collection_name, exc)
        # Chain the cause so the original lookup error is preserved.
        raise HTTPException(status_code=404, detail=f"Collection '{collection_name}' not found or error: {exc}") from exc
    return info
|
||||
246
rag-service/api/documents.py
Normal file
246
rag-service/api/documents.py
Normal file
@@ -0,0 +1,246 @@
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
|
||||
from pydantic import BaseModel
|
||||
|
||||
from api.auth import optional_jwt_auth
|
||||
from embedding_client import embedding_client
|
||||
from minio_client_wrapper import minio_wrapper
|
||||
from qdrant_client_wrapper import qdrant_wrapper
|
||||
|
||||
logger = logging.getLogger("rag-service.api.documents")
|
||||
|
||||
router = APIRouter(prefix="/api/v1/documents")
|
||||
|
||||
|
||||
# ---- Request / Response models --------------------------------------------
|
||||
|
||||
class DocumentUploadResponse(BaseModel):
    """Response for POST /api/v1/documents/upload."""

    # Server-generated UUID identifying this upload.
    document_id: str
    # Object key under which the original file was stored in MinIO.
    object_name: str
    # Number of text chunks produced from the document.
    chunks_count: int
    # Number of vectors reported indexed by the Qdrant wrapper.
    vectors_indexed: int
    # Qdrant collection the chunks were indexed into.
    collection: str
|
||||
|
||||
|
||||
class DocumentDeleteRequest(BaseModel):
    """Request body for DELETE /api/v1/documents."""

    # MinIO object key of the document to remove.
    object_name: str
    # Qdrant collection holding the document's vectors.
    collection: str
|
||||
|
||||
|
||||
# ---- Endpoints ------------------------------------------------------------
|
||||
|
||||
@router.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(
    request: Request,
    file: UploadFile = File(...),
    collection: str = Form(default="bp_eh"),
    data_type: str = Form(default="eh"),
    bundesland: str = Form(default="niedersachsen"),
    use_case: str = Form(default="general"),
    year: str = Form(default="2024"),
    chunk_strategy: str = Form(default="recursive"),
    chunk_size: int = Form(default=512),
    chunk_overlap: int = Form(default=50),
    metadata_json: Optional[str] = Form(default=None),
):
    """
    Upload a document:
    1. Store original file in MinIO
    2. Extract text (if PDF) via embedding-service
    3. Chunk the text via embedding-service
    4. Generate embeddings for each chunk
    5. Index chunks + embeddings in Qdrant

    Raises:
        HTTPException 400: unreadable/empty file, no extractable text,
            or zero chunks produced.
        HTTPException 500: MinIO, extraction, chunking, embedding, or
            Qdrant indexing failure.
    """
    optional_jwt_auth(request)

    document_id = str(uuid.uuid4())

    # --- Read file bytes ---
    try:
        file_bytes = await file.read()
    except Exception as exc:
        raise HTTPException(status_code=400, detail=f"Could not read uploaded file: {exc}") from exc

    if len(file_bytes) == 0:
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    filename = file.filename or f"{document_id}.bin"
    content_type = file.content_type or "application/octet-stream"

    # --- Store in MinIO ---
    object_name = minio_wrapper.get_minio_path(
        data_type=data_type,
        bundesland=bundesland,
        use_case=use_case,
        year=year,
        filename=filename,
    )

    try:
        minio_meta = {
            "document_id": document_id,
            "original_filename": filename,
        }
        await minio_wrapper.upload_document(
            object_name=object_name,
            data=file_bytes,
            content_type=content_type,
            metadata=minio_meta,
        )
    except Exception as exc:
        logger.error("MinIO upload failed: %s", exc)
        raise HTTPException(status_code=500, detail=f"Failed to store file in MinIO: {exc}") from exc

    # --- Extract text ---
    try:
        if content_type == "application/pdf" or filename.lower().endswith(".pdf"):
            text = await embedding_client.extract_pdf(file_bytes)
        else:
            # Try to decode as text
            text = file_bytes.decode("utf-8", errors="replace")
    except Exception as exc:
        logger.error("Text extraction failed: %s", exc)
        raise HTTPException(status_code=500, detail=f"Text extraction failed: {exc}") from exc

    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="Could not extract any text from the document")

    # --- Chunk ---
    try:
        chunks = await embedding_client.chunk_text(
            text=text,
            strategy=chunk_strategy,
            chunk_size=chunk_size,
            overlap=chunk_overlap,
        )
    except Exception as exc:
        logger.error("Chunking failed: %s", exc)
        raise HTTPException(status_code=500, detail=f"Chunking failed: {exc}") from exc

    if not chunks:
        raise HTTPException(status_code=400, detail="Chunking produced zero chunks")

    # --- Embed ---
    try:
        embeddings = await embedding_client.generate_embeddings(chunks)
    except Exception as exc:
        logger.error("Embedding generation failed: %s", exc)
        raise HTTPException(status_code=500, detail=f"Embedding generation failed: {exc}") from exc

    # --- Parse extra metadata ---
    extra_metadata: dict = {}
    if metadata_json:
        import json

        try:
            parsed = json.loads(metadata_json)
        except json.JSONDecodeError:
            logger.warning("Invalid metadata_json, ignoring")
        else:
            # Valid JSON that is not an object (e.g. a list or scalar) would
            # make the ``**extra_metadata`` merge below raise TypeError and
            # surface as an unhandled 500 -- ignore it like malformed JSON.
            if isinstance(parsed, dict):
                extra_metadata = parsed
            else:
                logger.warning("metadata_json is not a JSON object, ignoring")

    # --- Build payloads ---
    payloads = []
    for i, chunk in enumerate(chunks):
        payload = {
            "document_id": document_id,
            "object_name": object_name,
            "filename": filename,
            "chunk_index": i,
            "chunk_text": chunk,
            "data_type": data_type,
            "bundesland": bundesland,
            "use_case": use_case,
            "year": year,
            # NOTE(review): caller-supplied metadata is merged last, so it
            # can override the core fields above -- confirm this is intended.
            **extra_metadata,
        }
        payloads.append(payload)

    # --- Index in Qdrant ---
    try:
        indexed = await qdrant_wrapper.index_documents(
            collection=collection,
            vectors=embeddings,
            payloads=payloads,
        )
    except Exception as exc:
        logger.error("Qdrant indexing failed: %s", exc)
        raise HTTPException(status_code=500, detail=f"Qdrant indexing failed: {exc}") from exc

    return DocumentUploadResponse(
        document_id=document_id,
        object_name=object_name,
        chunks_count=len(chunks),
        vectors_indexed=indexed,
        collection=collection,
    )
|
||||
|
||||
|
||||
@router.delete("")
async def delete_document(body: DocumentDeleteRequest, request: Request):
    """Delete a document from both MinIO and Qdrant.

    Best-effort: each backend is attempted independently; failures are
    collected and reported in the response instead of aborting on the
    first error.
    """
    optional_jwt_auth(request)

    errors: list[str] = []

    # Delete from MinIO
    try:
        await minio_wrapper.delete_document(body.object_name)
    except Exception as exc:
        # Log as well as report: previously these failures were only
        # returned to the client and never reached the server logs.
        logger.error("MinIO delete failed for '%s': %s", body.object_name, exc)
        errors.append(f"MinIO delete failed: {exc}")

    # Delete vectors from Qdrant
    try:
        await qdrant_wrapper.delete_by_filter(
            collection=body.collection,
            filter_conditions={"object_name": body.object_name},
        )
    except Exception as exc:
        logger.error("Qdrant delete failed for '%s': %s", body.object_name, exc)
        errors.append(f"Qdrant delete failed: {exc}")

    if errors:
        return {"deleted": False, "errors": errors}

    return {"deleted": True, "object_name": body.object_name, "collection": body.collection}
|
||||
|
||||
|
||||
@router.get("/list")
async def list_documents(
    request: Request,
    prefix: Optional[str] = None,
):
    """List documents stored in MinIO, optionally restricted to a prefix.

    Returns {"documents": [...], "count": N}.
    Raises HTTP 500 on storage errors.
    """
    optional_jwt_auth(request)
    try:
        docs = await minio_wrapper.list_documents(prefix=prefix)
        return {"documents": docs, "count": len(docs)}
    except Exception as exc:
        logger.error("Failed to list documents: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
||||
|
||||
|
||||
@router.get("/download/{object_name:path}")
async def download_document(object_name: str, request: Request):
    """Get a presigned download URL for a document.

    The ``:path`` converter lets object keys contain slashes.
    Raises HTTP 404 if the object is unknown or URL generation fails.
    """
    optional_jwt_auth(request)
    try:
        url = await minio_wrapper.get_presigned_url(object_name)
    except Exception as exc:
        logger.error("Failed to generate presigned URL for '%s': %s", object_name, exc)
        # Chain the cause so the MinIO error is preserved in tracebacks.
        raise HTTPException(status_code=404, detail=f"Document not found: {exc}") from exc
    return {"url": url, "object_name": object_name}
|
||||
|
||||
|
||||
@router.get("/stats")
async def storage_stats(
    request: Request,
    prefix: Optional[str] = None,
):
    """Get storage stats (size, count) for a given prefix.

    Returns the wrapper's stats object as-is.
    Raises HTTP 500 on storage errors.
    """
    optional_jwt_auth(request)
    try:
        stats = await minio_wrapper.get_storage_stats(prefix=prefix)
    except Exception as exc:
        logger.error("Failed to get storage stats: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return stats
|
||||
200
rag-service/api/search.py
Normal file
200
rag-service/api/search.py
Normal file
@@ -0,0 +1,200 @@
|
||||
import logging
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from api.auth import optional_jwt_auth
|
||||
from embedding_client import embedding_client
|
||||
from qdrant_client_wrapper import qdrant_wrapper
|
||||
|
||||
logger = logging.getLogger("rag-service.api.search")
|
||||
|
||||
router = APIRouter(prefix="/api/v1")
|
||||
|
||||
|
||||
# ---- Request / Response models --------------------------------------------
|
||||
|
||||
class SemanticSearchRequest(BaseModel):
    """Request body for pure vector search (POST /api/v1/search)."""

    # Natural-language query; embedded before searching.
    query: str
    # Target Qdrant collection.
    collection: str = "bp_eh"
    # Maximum number of hits to return (1-100).
    limit: int = Field(default=10, ge=1, le=100)
    # Optional filter conditions passed through to the Qdrant wrapper;
    # exact schema defined by the wrapper.
    filters: Optional[dict[str, Any]] = None
    # Passed through to the wrapper; presumably a minimum similarity
    # cutoff (None = no threshold) -- confirm against the wrapper.
    score_threshold: Optional[float] = None
|
||||
|
||||
|
||||
class HybridSearchRequest(BaseModel):
    """Request body for hybrid search (POST /api/v1/search/hybrid)."""

    # Natural-language query; embedded AND keyword-matched.
    query: str
    collection: str = "bp_eh"
    # Final number of hits returned (1-100).
    limit: int = Field(default=10, ge=1, le=100)
    filters: Optional[dict[str, Any]] = None
    score_threshold: Optional[float] = None
    # Weight (0-1) of the keyword-match fraction added on top of the
    # vector similarity score.
    keyword_boost: float = Field(default=0.3, ge=0.0, le=1.0)
    # Whether to re-rank the top results via the embedding service.
    rerank: bool = True
    # How many of the top vector results are sent to the re-ranker.
    rerank_top_k: int = Field(default=10, ge=1, le=50)
|
||||
|
||||
|
||||
class RerankRequest(BaseModel):
    """Request body for standalone re-ranking (POST /api/v1/rerank)."""

    # Query the candidate documents are scored against.
    query: str
    # Candidate document texts to re-rank.
    documents: list[str]
    # Number of top-scoring documents to return (1-100).
    top_k: int = Field(default=10, ge=1, le=100)
|
||||
|
||||
|
||||
class SearchResult(BaseModel):
    """A single search hit: point id, similarity score, and stored payload."""

    id: str
    score: float
    # default_factory is the conventional pydantic spelling for mutable
    # defaults (instead of a shared literal ``{}``).
    payload: dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class SearchResponse(BaseModel):
    """Common response envelope for /search and /search/hybrid."""

    # Hits, best first.
    results: list[SearchResult]
    # Convenience: len(results).
    count: int
    # Echo of the original query.
    query: str
    # Echo of the collection that was searched.
    collection: str
|
||||
|
||||
|
||||
# ---- Endpoints ------------------------------------------------------------
|
||||
|
||||
@router.post("/search", response_model=SearchResponse)
async def semantic_search(body: SemanticSearchRequest, request: Request):
    """
    Pure semantic (vector) search.
    Embeds the query, then searches Qdrant for nearest neighbours.

    Raises:
        HTTPException 502: the embedding service failed to embed the query.
        HTTPException 500: the Qdrant search failed.
    """
    optional_jwt_auth(request)

    # Generate query embedding
    try:
        query_vector = await embedding_client.generate_single_embedding(body.query)
    except Exception as exc:
        logger.error("Failed to embed query: %s", exc)
        raise HTTPException(status_code=502, detail=f"Embedding service error: {exc}") from exc

    # Search Qdrant
    try:
        results = await qdrant_wrapper.search(
            collection=body.collection,
            query_vector=query_vector,
            limit=body.limit,
            filters=body.filters,
            score_threshold=body.score_threshold,
        )
    except Exception as exc:
        logger.error("Qdrant search failed: %s", exc)
        raise HTTPException(status_code=500, detail=f"Vector search failed: {exc}") from exc

    # Each result dict is expected to carry "id", "score", "payload".
    return SearchResponse(
        results=[SearchResult(**r) for r in results],
        count=len(results),
        query=body.query,
        collection=body.collection,
    )
|
||||
|
||||
|
||||
@router.post("/search/hybrid", response_model=SearchResponse)
async def hybrid_search(body: HybridSearchRequest, request: Request):
    """
    Hybrid search: vector search + keyword filtering + optional re-ranking.

    1. Embed query and do vector search with a higher initial limit
    2. Apply keyword matching on chunk_text to boost relevant results
    3. Optionally re-rank the top results via the embedding service

    Raises:
        HTTPException 502: embedding the query failed.
        HTTPException 500: the Qdrant search failed.
    Re-ranking failures are non-fatal: the vector+keyword ordering is used.
    """
    optional_jwt_auth(request)

    # --- Step 1: Vector search (fetch more than needed for re-ranking) ---
    fetch_limit = max(body.limit * 3, 30)

    try:
        query_vector = await embedding_client.generate_single_embedding(body.query)
    except Exception as exc:
        logger.error("Failed to embed query: %s", exc)
        raise HTTPException(status_code=502, detail=f"Embedding service error: {exc}") from exc

    try:
        vector_results = await qdrant_wrapper.search(
            collection=body.collection,
            query_vector=query_vector,
            limit=fetch_limit,
            filters=body.filters,
            score_threshold=body.score_threshold,
        )
    except Exception as exc:
        logger.error("Qdrant search failed: %s", exc)
        raise HTTPException(status_code=500, detail=f"Vector search failed: {exc}") from exc

    if not vector_results:
        return SearchResponse(
            results=[],
            count=0,
            query=body.query,
            collection=body.collection,
        )

    # --- Step 2: Keyword boost ---
    # Fraction of whitespace-split query terms found (as substrings) in the
    # chunk text, scaled by keyword_boost, is added to the vector score.
    # NOTE(review): assumes payload["chunk_text"] is a string when present;
    # a None value would break .lower() -- confirm the indexing guarantees.
    query_terms = body.query.lower().split()
    for result in vector_results:
        chunk_text = result.get("payload", {}).get("chunk_text", "").lower()
        keyword_hits = sum(1 for term in query_terms if term in chunk_text)
        keyword_score = (keyword_hits / max(len(query_terms), 1)) * body.keyword_boost
        result["score"] = result["score"] + keyword_score

    # Sort by boosted score
    vector_results.sort(key=lambda x: x["score"], reverse=True)

    # --- Step 3: Optional re-ranking ---
    if body.rerank and len(vector_results) > 1:
        try:
            # documents[i] corresponds to vector_results[i] (prefix slice),
            # so the re-ranker's "index" maps straight back into
            # vector_results below.
            documents = [
                r.get("payload", {}).get("chunk_text", "")
                for r in vector_results[: body.rerank_top_k]
            ]
            reranked = await embedding_client.rerank_documents(
                query=body.query,
                documents=documents,
                top_k=body.limit,
            )
            # Rebuild results in re-ranked order
            reranked_results = []
            for item in reranked:
                idx = item.get("index", 0)
                if idx < len(vector_results):
                    entry = vector_results[idx].copy()
                    entry["score"] = item.get("score", entry["score"])
                    reranked_results.append(entry)
            vector_results = reranked_results
        except Exception as exc:
            # Best-effort: keep the vector+keyword ordering from Step 2.
            logger.warning("Re-ranking failed, falling back to vector+keyword scores: %s", exc)

    # Trim to requested limit
    final_results = vector_results[: body.limit]

    return SearchResponse(
        results=[SearchResult(**r) for r in final_results],
        count=len(final_results),
        query=body.query,
        collection=body.collection,
    )
|
||||
|
||||
|
||||
@router.post("/rerank")
async def rerank(body: RerankRequest, request: Request):
    """
    Standalone re-ranking endpoint.
    Sends query + documents to the embedding service for re-ranking.

    An empty document list short-circuits to an empty result without
    calling the embedding service.
    Raises HTTP 502 when the embedding service call fails.
    """
    optional_jwt_auth(request)

    if not body.documents:
        return {"results": [], "count": 0}

    try:
        results = await embedding_client.rerank_documents(
            query=body.query,
            documents=body.documents,
            top_k=body.top_k,
        )
    except Exception as exc:
        logger.error("Re-ranking failed: %s", exc)
        raise HTTPException(status_code=502, detail=f"Re-ranking failed: {exc}") from exc
    return {"results": results, "count": len(results), "query": body.query}
|
||||
Reference in New Issue
Block a user