""" Legal Corpus API - Corpus Routes Endpoints for the RAG page in admin-v2: - GET /status - Collection status with chunk counts - GET /search - Semantic search - POST /ingest - Trigger ingestion - GET /ingestion-status - Ingestion status - GET /regulations - List regulations - GET /custom-documents - List custom docs - POST /upload - Upload document - POST /add-link - Add link for ingestion - DELETE /custom-documents/{id} - Delete custom doc - GET /traceability - Traceability info Extracted from legal_corpus_api.py to keep files under 500 LOC. """ import os import httpx import uuid import shutil from datetime import datetime from typing import Optional, List, Dict, Any from fastapi import APIRouter, HTTPException, Query, BackgroundTasks, UploadFile, File, Form from pydantic import BaseModel import logging from legal_corpus_ingest_tasks import ( ingest_uploaded_document, ingest_link_document, run_ingestion, ) logger = logging.getLogger(__name__) # Configuration QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333") EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://embedding-service:8087") COLLECTION_NAME = "bp_legal_corpus" # All regulations for status endpoint REGULATIONS = [ {"code": "GDPR", "name": "DSGVO", "fullName": "Datenschutz-Grundverordnung", "type": "eu_regulation"}, {"code": "EPRIVACY", "name": "ePrivacy-Richtlinie", "fullName": "Richtlinie 2002/58/EG", "type": "eu_directive"}, {"code": "TDDDG", "name": "TDDDG", "fullName": "Telekommunikation-Digitale-Dienste-Datenschutz-Gesetz", "type": "de_law"}, {"code": "SCC", "name": "Standardvertragsklauseln", "fullName": "2021/914/EU", "type": "eu_regulation"}, {"code": "DPF", "name": "EU-US Data Privacy Framework", "fullName": "Angemessenheitsbeschluss", "type": "eu_regulation"}, {"code": "AIACT", "name": "EU AI Act", "fullName": "Verordnung (EU) 2024/1689", "type": "eu_regulation"}, {"code": "CRA", "name": "Cyber Resilience Act", "fullName": "Verordnung (EU) 2024/2847", "type": "eu_regulation"}, {"code": "NIS2", "name": "NIS2-Richtlinie", "fullName": "Richtlinie (EU) 2022/2555", "type": "eu_directive"}, {"code": "EUCSA", "name": "EU Cybersecurity Act", "fullName": "Verordnung (EU) 2019/881", "type": "eu_regulation"}, {"code": "DATAACT", "name": "Data Act", "fullName": "Verordnung (EU) 2023/2854", "type": "eu_regulation"}, {"code": "DGA", "name": "Data Governance Act", "fullName": "Verordnung (EU) 2022/868", "type": "eu_regulation"}, {"code": "DSA", "name": "Digital Services Act", "fullName": "Verordnung (EU) 2022/2065", "type": "eu_regulation"}, {"code": "EAA", "name": "European Accessibility Act", "fullName": "Richtlinie (EU) 2019/882", "type": "eu_directive"}, {"code": "DSM", "name": "DSM-Urheberrechtsrichtlinie", "fullName": "Richtlinie (EU) 2019/790", "type": "eu_directive"}, {"code": "PLD", "name": "Produkthaftungsrichtlinie", "fullName": "Richtlinie 85/374/EWG", "type": "eu_directive"}, {"code": "GPSR", "name": "General Product Safety", "fullName": "Verordnung (EU) 2023/988", "type": "eu_regulation"}, {"code": "BSI-TR-03161-1", "name": "BSI-TR Teil 1", "fullName": "BSI TR-03161 Teil 1 - Mobile Anwendungen", "type": "bsi_standard"}, {"code": "BSI-TR-03161-2", "name": "BSI-TR Teil 2", "fullName": "BSI TR-03161 Teil 2 - Web-Anwendungen", "type": "bsi_standard"}, {"code": "BSI-TR-03161-3", "name": "BSI-TR Teil 3", "fullName": "BSI TR-03161 Teil 3 - Hintergrundsysteme", "type": "bsi_standard"}, ] # Ingestion state (in-memory for now) ingestion_state = { "running": False, "completed": False, "current_regulation": None, "processed": 0, "total": len(REGULATIONS), "error": None, } class SearchRequest(BaseModel): query: str regulations: Optional[List[str]] = None top_k: int = 5 class IngestRequest(BaseModel): force: bool = False regulations: Optional[List[str]] = None class AddLinkRequest(BaseModel): url: str title: str code: str document_type: str = "custom" # Store for custom documents (in-memory for now) custom_documents: List[Dict[str, Any]] = [] router = APIRouter(prefix="/api/v1/admin/legal-corpus", tags=["legal-corpus"]) @router.get("/status") async def get_legal_corpus_status(): """Get status of the legal corpus collection including chunk counts per regulation.""" async with httpx.AsyncClient(timeout=30.0) as client: try: collection_res = await client.get(f"{QDRANT_URL}/collections/{COLLECTION_NAME}") if collection_res.status_code != 200: return { "collection": COLLECTION_NAME, "totalPoints": 0, "vectorSize": 1024, "status": "not_found", "regulations": {}, } collection_data = collection_res.json() result = collection_data.get("result", {}) regulation_counts = {} for reg in REGULATIONS: count_res = await client.post( f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/count", json={ "filter": { "must": [{"key": "regulation_code", "match": {"value": reg["code"]}}] } }, ) if count_res.status_code == 200: count_data = count_res.json() regulation_counts[reg["code"]] = count_data.get("result", {}).get("count", 0) else: regulation_counts[reg["code"]] = 0 return { "collection": COLLECTION_NAME, "totalPoints": result.get("points_count", 0), "vectorSize": result.get("config", {}).get("params", {}).get("vectors", {}).get("size", 1024), "status": result.get("status", "unknown"), "regulations": regulation_counts, } except httpx.RequestError as e: logger.error(f"Failed to get Qdrant status: {e}") raise HTTPException(status_code=503, detail=f"Qdrant not available: {str(e)}") @router.get("/search") async def search_legal_corpus( query: str = Query(..., description="Search query"), top_k: int = Query(5, ge=1, le=20, description="Number of results"), regulations: Optional[str] = Query(None, description="Comma-separated regulation codes to filter"), ): """Semantic search in legal corpus using BGE-M3 embeddings.""" async with httpx.AsyncClient(timeout=60.0) as client: try: embed_res = await client.post( f"{EMBEDDING_SERVICE_URL}/embed", json={"texts": [query]}, ) if embed_res.status_code != 200: raise HTTPException(status_code=500, detail="Embedding service error") embed_data = embed_res.json() query_vector = embed_data["embeddings"][0] search_request = { "vector": query_vector, "limit": top_k, "with_payload": True, } if regulations: reg_codes = [r.strip() for r in regulations.split(",")] search_request["filter"] = { "should": [ {"key": "regulation_code", "match": {"value": code}} for code in reg_codes ] } search_res = await client.post( f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/search", json=search_request, ) if search_res.status_code != 200: raise HTTPException(status_code=500, detail="Search failed") search_data = search_res.json() results = [] for point in search_data.get("result", []): payload = point.get("payload", {}) results.append({ "text": payload.get("text", ""), "regulation_code": payload.get("regulation_code", ""), "regulation_name": payload.get("regulation_name", ""), "article": payload.get("article"), "paragraph": payload.get("paragraph"), "source_url": payload.get("source_url", ""), "score": point.get("score", 0), }) return {"results": results, "query": query, "count": len(results)} except httpx.RequestError as e: logger.error(f"Search failed: {e}") raise HTTPException(status_code=503, detail=f"Service not available: {str(e)}") @router.post("/ingest") async def trigger_ingestion(request: IngestRequest, background_tasks: BackgroundTasks): """Trigger legal corpus ingestion in background.""" global ingestion_state if ingestion_state["running"]: raise HTTPException(status_code=409, detail="Ingestion already running") ingestion_state = { "running": True, "completed": False, "current_regulation": None, "processed": 0, "total": len(REGULATIONS), "error": None, } background_tasks.add_task(run_ingestion, request.force, request.regulations, ingestion_state, REGULATIONS) return { "status": "started", "job_id": "manual-trigger", "message": f"Ingestion started for {len(REGULATIONS)} regulations", } @router.get("/ingestion-status") async def get_ingestion_status(): """Get current ingestion status.""" return ingestion_state @router.get("/regulations") async def get_regulations(): """Get list of all supported regulations.""" return {"regulations": REGULATIONS} @router.get("/custom-documents") async def get_custom_documents(): """Get list of custom documents added by user.""" return {"documents": custom_documents} @router.post("/upload") async def upload_document( background_tasks: BackgroundTasks, file: UploadFile = File(...), title: str = Form(...), code: str = Form(...), document_type: str = Form("custom"), ): """Upload a document (PDF) for ingestion into the legal corpus.""" global custom_documents if not file.filename.endswith(('.pdf', '.PDF')): raise HTTPException(status_code=400, detail="Only PDF files are supported") upload_dir = "/tmp/legal_corpus_uploads" os.makedirs(upload_dir, exist_ok=True) doc_id = str(uuid.uuid4())[:8] safe_filename = f"{doc_id}_{file.filename}" file_path = os.path.join(upload_dir, safe_filename) try: with open(file_path, "wb") as buffer: shutil.copyfileobj(file.file, buffer) except Exception as e: logger.error(f"Failed to save uploaded file: {e}") raise HTTPException(status_code=500, detail=f"Failed to save file: {str(e)}") doc_record = { "id": doc_id, "code": code, "title": title, "filename": file.filename, "file_path": file_path, "document_type": document_type, "uploaded_at": datetime.now().isoformat(), "status": "uploaded", "chunk_count": 0, } custom_documents.append(doc_record) background_tasks.add_task(ingest_uploaded_document, doc_record) return { "status": "uploaded", "document_id": doc_id, "message": f"Document '{title}' uploaded and queued for ingestion", "document": doc_record, } @router.post("/add-link") async def add_link(request: AddLinkRequest, background_tasks: BackgroundTasks): """Add a URL/link for ingestion into the legal corpus.""" global custom_documents doc_id = str(uuid.uuid4())[:8] doc_record = { "id": doc_id, "code": request.code, "title": request.title, "url": request.url, "document_type": request.document_type, "uploaded_at": datetime.now().isoformat(), "status": "queued", "chunk_count": 0, } custom_documents.append(doc_record) background_tasks.add_task(ingest_link_document, doc_record) return { "status": "queued", "document_id": doc_id, "message": f"Link '{request.title}' queued for ingestion", "document": doc_record, } @router.delete("/custom-documents/{doc_id}") async def delete_custom_document(doc_id: str): """Delete a custom document from the list.""" global custom_documents doc = next((d for d in custom_documents if d["id"] == doc_id), None) if not doc: raise HTTPException(status_code=404, detail="Document not found") custom_documents = [d for d in custom_documents if d["id"] != doc_id] return {"status": "deleted", "document_id": doc_id} @router.get("/traceability") async def get_traceability( chunk_id: str = Query(..., description="Chunk ID or identifier"), regulation: str = Query(..., description="Regulation code"), ): """Get traceability information for a specific chunk.""" async with httpx.AsyncClient(timeout=30.0) as client: try: return { "chunk_id": chunk_id, "regulation": regulation, "requirements": [], "controls": [], "message": "Traceability-Daten werden verfuegbar sein, sobald die Requirements-Extraktion und Control-Ableitung implementiert sind." } except Exception as e: logger.error(f"Failed to get traceability: {e}") raise HTTPException(status_code=500, detail=f"Traceability lookup failed: {str(e)}")