"""
Legal Corpus API - Endpoints for RAG page in admin-v2

Provides endpoints for:
- GET /api/v1/admin/legal-corpus/status - Collection status with chunk counts
- GET /api/v1/admin/legal-corpus/search - Semantic search
- POST /api/v1/admin/legal-corpus/ingest - Trigger ingestion
- GET /api/v1/admin/legal-corpus/ingestion-status - Ingestion status
- POST /api/v1/admin/legal-corpus/upload - Upload document
- POST /api/v1/admin/legal-corpus/add-link - Add link for ingestion
- POST /api/v1/admin/pipeline/start - Start compliance pipeline
"""

import asyncio
import logging
import os
import shutil
import subprocess
import sys
import tempfile
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any

import httpx
from fastapi import APIRouter, HTTPException, Query, BackgroundTasks, UploadFile, File, Form
from pydantic import BaseModel

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/admin/legal-corpus", tags=["legal-corpus"])

# Configuration
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
COLLECTION_NAME = "bp_legal_corpus"

# All regulations for status endpoint
REGULATIONS = [
    {"code": "GDPR", "name": "DSGVO", "fullName": "Datenschutz-Grundverordnung", "type": "eu_regulation"},
    {"code": "EPRIVACY", "name": "ePrivacy-Richtlinie", "fullName": "Richtlinie 2002/58/EG", "type": "eu_directive"},
    {"code": "TDDDG", "name": "TDDDG", "fullName": "Telekommunikation-Digitale-Dienste-Datenschutz-Gesetz", "type": "de_law"},
    {"code": "SCC", "name": "Standardvertragsklauseln", "fullName": "2021/914/EU", "type": "eu_regulation"},
    {"code": "DPF", "name": "EU-US Data Privacy Framework", "fullName": "Angemessenheitsbeschluss", "type": "eu_regulation"},
    {"code": "AIACT", "name": "EU AI Act", "fullName": "Verordnung (EU) 2024/1689", "type": "eu_regulation"},
    {"code": "CRA", "name": "Cyber Resilience Act", "fullName": "Verordnung (EU) 2024/2847", "type": "eu_regulation"},
    {"code": "NIS2", "name": "NIS2-Richtlinie", "fullName": "Richtlinie (EU) 2022/2555", "type": "eu_directive"},
    {"code": "EUCSA", "name": "EU Cybersecurity Act", "fullName": "Verordnung (EU) 2019/881", "type": "eu_regulation"},
    {"code": "DATAACT", "name": "Data Act", "fullName": "Verordnung (EU) 2023/2854", "type": "eu_regulation"},
    {"code": "DGA", "name": "Data Governance Act", "fullName": "Verordnung (EU) 2022/868", "type": "eu_regulation"},
    {"code": "DSA", "name": "Digital Services Act", "fullName": "Verordnung (EU) 2022/2065", "type": "eu_regulation"},
    {"code": "EAA", "name": "European Accessibility Act", "fullName": "Richtlinie (EU) 2019/882", "type": "eu_directive"},
    {"code": "DSM", "name": "DSM-Urheberrechtsrichtlinie", "fullName": "Richtlinie (EU) 2019/790", "type": "eu_directive"},
    {"code": "PLD", "name": "Produkthaftungsrichtlinie", "fullName": "Richtlinie 85/374/EWG", "type": "eu_directive"},
    {"code": "GPSR", "name": "General Product Safety", "fullName": "Verordnung (EU) 2023/988", "type": "eu_regulation"},
    {"code": "BSI-TR-03161-1", "name": "BSI-TR Teil 1", "fullName": "BSI TR-03161 Teil 1 - Mobile Anwendungen", "type": "bsi_standard"},
    {"code": "BSI-TR-03161-2", "name": "BSI-TR Teil 2", "fullName": "BSI TR-03161 Teil 2 - Web-Anwendungen", "type": "bsi_standard"},
    {"code": "BSI-TR-03161-3", "name": "BSI-TR Teil 3", "fullName": "BSI TR-03161 Teil 3 - Hintergrundsysteme", "type": "bsi_standard"},
]


def _new_ingestion_state(total: int, running: bool = False) -> Dict[str, Any]:
    """Build a fresh ingestion-progress record for ``total`` regulations."""
    return {
        "running": running,
        "completed": False,
        "current_regulation": None,
        "processed": 0,
        "total": total,
        "error": None,
        # Codes whose ingestion raised; the run continues past them (best effort).
        "failed_regulations": [],
    }


# Ingestion state (in-memory for now)
ingestion_state = _new_ingestion_state(len(REGULATIONS))


class SearchRequest(BaseModel):
    query: str
    regulations: Optional[List[str]] = None
    top_k: int = 5


class IngestRequest(BaseModel):
    force: bool = False
    regulations: Optional[List[str]] = None


class AddLinkRequest(BaseModel):
    url: str
    title: str
    code: str  # Regulation code (e.g. "CUSTOM-1")
    document_type: str = "custom"  # custom, eu_regulation, eu_directive, de_law, bsi_standard


class StartPipelineRequest(BaseModel):
    force_reindex: bool = False
    skip_ingestion: bool = False


# Store for custom documents (in-memory for now, should be persisted)
custom_documents: List[Dict[str, Any]] = []


async def get_qdrant_client():
    """Get async HTTP client for Qdrant."""
    return httpx.AsyncClient(timeout=30.0)


async def _count_regulation_chunks(client: httpx.AsyncClient, code: str) -> int:
    """Return the number of points whose ``regulation_code`` payload equals ``code``.

    A non-200 answer from Qdrant is reported as 0 rather than raised.
    """
    count_res = await client.post(
        f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/count",
        json={
            "filter": {
                "must": [{"key": "regulation_code", "match": {"value": code}}]
            }
        },
    )
    if count_res.status_code != 200:
        return 0
    return count_res.json().get("result", {}).get("count", 0)


@router.get("/status")
async def get_legal_corpus_status():
    """
    Get status of the legal corpus collection including chunk counts per regulation.

    Returns a "not_found" placeholder when the collection lookup does not
    answer 200. Raises HTTP 503 when Qdrant cannot be reached.
    """
    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            # Get collection info
            collection_res = await client.get(f"{QDRANT_URL}/collections/{COLLECTION_NAME}")
            if collection_res.status_code != 200:
                return {
                    "collection": COLLECTION_NAME,
                    "totalPoints": 0,
                    "vectorSize": 1024,
                    "status": "not_found",
                    "regulations": {},
                }

            result = collection_res.json().get("result", {})

            # Get chunk counts per regulation. The count requests are
            # independent, so issue them concurrently instead of one by one.
            codes = [reg["code"] for reg in REGULATIONS]
            counts = await asyncio.gather(
                *(_count_regulation_chunks(client, code) for code in codes)
            )
            regulation_counts = dict(zip(codes, counts))

            return {
                "collection": COLLECTION_NAME,
                "totalPoints": result.get("points_count", 0),
                "vectorSize": result.get("config", {}).get("params", {}).get("vectors", {}).get("size", 1024),
                "status": result.get("status", "unknown"),
                "regulations": regulation_counts,
            }

        except httpx.RequestError as e:
            logger.error(f"Failed to get Qdrant status: {e}")
            raise HTTPException(status_code=503, detail=f"Qdrant not available: {str(e)}") from e


@router.get("/search")
async def search_legal_corpus(
    query: str = Query(..., description="Search query"),
    top_k: int = Query(5, ge=1, le=20, description="Number of results"),
    regulations: Optional[str] = Query(None, description="Comma-separated regulation codes to filter"),
):
    """
    Semantic search in legal corpus using BGE-M3 embeddings.

    Raises HTTP 500 when the embedding service or the Qdrant search answers
    with an error, and HTTP 503 when either service cannot be reached.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        try:
            # Generate embedding for query
            embed_res = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": [query]},
            )
            if embed_res.status_code != 200:
                raise HTTPException(status_code=500, detail="Embedding service error")

            try:
                query_vector = embed_res.json()["embeddings"][0]
            except (KeyError, IndexError, TypeError, ValueError) as e:
                # A 200 answer without a usable embedding is still a service error.
                raise HTTPException(status_code=500, detail="Embedding service error") from e

            # Build Qdrant search request
            search_request = {
                "vector": query_vector,
                "limit": top_k,
                "with_payload": True,
            }

            # Add regulation filter if specified. Blank entries (e.g. from a
            # trailing comma) are dropped so they never become filter clauses.
            if regulations:
                reg_codes = [code for code in (r.strip() for r in regulations.split(",")) if code]
                if reg_codes:
                    search_request["filter"] = {
                        "should": [
                            {"key": "regulation_code", "match": {"value": code}}
                            for code in reg_codes
                        ]
                    }

            # Search Qdrant
            search_res = await client.post(
                f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/search",
                json=search_request,
            )

            if search_res.status_code != 200:
                raise HTTPException(status_code=500, detail="Search failed")

            results = []
            for point in search_res.json().get("result", []):
                payload = point.get("payload", {})
                results.append({
                    "text": payload.get("text", ""),
                    "regulation_code": payload.get("regulation_code", ""),
                    "regulation_name": payload.get("regulation_name", ""),
                    "article": payload.get("article"),
                    "paragraph": payload.get("paragraph"),
                    "source_url": payload.get("source_url", ""),
                    "score": point.get("score", 0),
                })

            return {"results": results, "query": query, "count": len(results)}

        except httpx.RequestError as e:
            logger.error(f"Search failed: {e}")
            raise HTTPException(status_code=503, detail=f"Service not available: {str(e)}") from e


@router.post("/ingest")
async def trigger_ingestion(request: IngestRequest, background_tasks: BackgroundTasks):
    """
    Trigger legal corpus ingestion in background.

    Raises HTTP 409 when an ingestion run is already marked as running.
    """
    if ingestion_state["running"]:
        raise HTTPException(status_code=409, detail="Ingestion already running")

    # Report the number of regulations that will actually be processed, which
    # is the requested subset when one is given.
    target_count = len(request.regulations) if request.regulations else len(REGULATIONS)

    # Reset state in place so every holder of the dict sees the new run.
    ingestion_state.update(_new_ingestion_state(target_count, running=True))

    # Start ingestion in background
    background_tasks.add_task(run_ingestion, request.force, request.regulations)

    return {
        "status": "started",
        "job_id": "manual-trigger",
        "message": f"Ingestion started for {target_count} regulations",
    }


async def run_ingestion(force: bool, regulations: Optional[List[str]]):
    """Background task for running ingestion.

    Processes ``regulations`` (or every entry of REGULATIONS when none are
    given) one after another and mirrors progress into ``ingestion_state``.
    A failure of a single regulation is logged and recorded under
    ``failed_regulations``; the run then continues with the next one.
    """
    try:
        # Import ingestion module lazily so this module loads without it.
        from legal_corpus_ingestion import LegalCorpusIngestion
        ingestion = LegalCorpusIngestion()

        # Filter regulations if specified
        regs_to_process = regulations or [r["code"] for r in REGULATIONS]
        ingestion_state["total"] = len(regs_to_process)

        for i, reg_code in enumerate(regs_to_process):
            ingestion_state["current_regulation"] = reg_code
            ingestion_state["processed"] = i

            try:
                await ingestion.ingest_single(reg_code, force=force)
            except Exception as e:
                logger.error(f"Failed to ingest {reg_code}: {e}")
                ingestion_state.setdefault("failed_regulations", []).append(reg_code)

        ingestion_state["completed"] = True
        ingestion_state["processed"] = len(regs_to_process)

    except Exception as e:
        logger.error(f"Ingestion failed: {e}")
        ingestion_state["error"] = str(e)

    finally:
        ingestion_state["running"] = False


@router.get("/ingestion-status")
async def get_ingestion_status():
    """
    Get current ingestion status.
    """
    return ingestion_state


@router.get("/regulations")
async def get_regulations():
    """
    Get list of all supported regulations.
    """
    return {"regulations": REGULATIONS}


@router.get("/custom-documents")
async def get_custom_documents():
    """
    Get list of custom documents added by user.
    """
    return {"documents": custom_documents}


def _extract_pdf_text(pdf_path: str) -> str:
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    The document handle is closed even when text extraction raises.
    """
    import fitz  # PyMuPDF

    pdf_doc = fitz.open(pdf_path)
    try:
        return "".join(page.get_text() for page in pdf_doc)
    finally:
        pdf_doc.close()


@router.post("/upload")
async def upload_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    title: str = Form(...),
    code: str = Form(...),
    document_type: str = Form("custom"),
):
    """
    Upload a document (PDF) for ingestion into the legal corpus.
    The document will be saved and queued for processing.

    Raises HTTP 400 for a missing or non-PDF filename and HTTP 500 when the
    file cannot be written to the upload directory.
    """
    # The filename is client-controlled: reduce it to its final path component
    # (for both "/" and "\" separators) so it cannot escape the upload directory.
    original_name = file.filename or ""
    base_name = os.path.basename(original_name.replace("\\", "/"))

    # Validate file type
    if not base_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # Create upload directory if needed
    upload_dir = "/tmp/legal_corpus_uploads"
    os.makedirs(upload_dir, exist_ok=True)

    # Save file with unique name
    doc_id = str(uuid.uuid4())[:8]
    safe_filename = f"{doc_id}_{base_name}"
    file_path = os.path.join(upload_dir, safe_filename)

    try:
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    except Exception as e:
        logger.error(f"Failed to save uploaded file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to save file: {str(e)}") from e

    # Create document record
    doc_record = {
        "id": doc_id,
        "code": code,
        "title": title,
        "filename": file.filename,
        "file_path": file_path,
        "document_type": document_type,
        "uploaded_at": datetime.now().isoformat(),
        "status": "uploaded",
        "chunk_count": 0,
    }

    custom_documents.append(doc_record)

    # Queue for background ingestion
    background_tasks.add_task(ingest_uploaded_document, doc_record)

    return {
        "status": "uploaded",
        "document_id": doc_id,
        "message": f"Document '{title}' uploaded and queued for ingestion",
        "document": doc_record,
    }


async def ingest_uploaded_document(doc_record: Dict[str, Any]):
    """Background task to ingest an uploaded document.

    Progress and failures are written back into ``doc_record`` through its
    "status", "error" and "chunk_count" keys; nothing is raised to the caller.
    """
    try:
        doc_record["status"] = "processing"

        from legal_corpus_ingestion import LegalCorpusIngestion
        ingestion = LegalCorpusIngestion()

        # Read PDF and extract text
        full_text = _extract_pdf_text(doc_record["file_path"])

        if not full_text.strip():
            doc_record["status"] = "error"
            doc_record["error"] = "No text could be extracted from PDF"
            return

        # Chunk the text
        chunks = ingestion.chunk_text(full_text, doc_record["code"])

        # Add metadata
        for chunk in chunks:
            chunk["regulation_code"] = doc_record["code"]
            chunk["regulation_name"] = doc_record["title"]
            chunk["document_type"] = doc_record["document_type"]
            chunk["source_url"] = f"upload://{doc_record['filename']}"

        # Generate embeddings and upsert to Qdrant
        if chunks:
            await ingestion.embed_and_upsert(chunks)
            doc_record["chunk_count"] = len(chunks)
            doc_record["status"] = "indexed"
            logger.info(f"Ingested {len(chunks)} chunks from uploaded document {doc_record['code']}")
        else:
            doc_record["status"] = "error"
            doc_record["error"] = "No chunks generated from document"

    except Exception as e:
        logger.error(f"Failed to ingest uploaded document: {e}")
        doc_record["status"] = "error"
        doc_record["error"] = str(e)


@router.post("/add-link")
async def add_link(request: AddLinkRequest, background_tasks: BackgroundTasks):
    """
    Add a URL/link for ingestion into the legal corpus.
    The content will be fetched, extracted, and indexed.
    """
    # Create document record
    doc_id = str(uuid.uuid4())[:8]
    doc_record = {
        "id": doc_id,
        "code": request.code,
        "title": request.title,
        "url": request.url,
        "document_type": request.document_type,
        "uploaded_at": datetime.now().isoformat(),
        "status": "queued",
        "chunk_count": 0,
    }

    custom_documents.append(doc_record)

    # Queue for background ingestion
    background_tasks.add_task(ingest_link_document, doc_record)

    return {
        "status": "queued",
        "document_id": doc_id,
        "message": f"Link '{request.title}' queued for ingestion",
        "document": doc_record,
    }


async def ingest_link_document(doc_record: Dict[str, Any]):
    """Background task to ingest content from a URL.

    The response is handled as PDF, HTML or plain text depending on its
    Content-Type header. Progress and failures are written back into
    ``doc_record``; nothing is raised to the caller.
    """
    # NOTE(review): the URL is caller-supplied and fetched from inside the
    # service network (redirects followed) - consider an allow-list if this
    # endpoint is ever exposed beyond trusted admins.
    try:
        doc_record["status"] = "fetching"

        async with httpx.AsyncClient(timeout=60.0) as client:
            # Fetch the URL
            response = await client.get(doc_record["url"], follow_redirects=True)
            response.raise_for_status()

        # Media types are case-insensitive, so compare in lower case.
        content_type = response.headers.get("content-type", "").lower()

        if "application/pdf" in content_type:
            # Save PDF and process
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
                f.write(response.content)
                pdf_path = f.name
            try:
                full_text = _extract_pdf_text(pdf_path)
            finally:
                # Remove the temp file even when the PDF cannot be parsed.
                os.unlink(pdf_path)

        elif "text/html" in content_type:
            # Extract text from HTML
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")

            # Remove script, style and page-chrome elements
            for element in soup(["script", "style", "nav", "footer", "header"]):
                element.decompose()

            full_text = soup.get_text(separator="\n", strip=True)
        else:
            # Try to use as plain text
            full_text = response.text

        if not full_text.strip():
            doc_record["status"] = "error"
            doc_record["error"] = "No text could be extracted from URL"
            return

        doc_record["status"] = "processing"

        from legal_corpus_ingestion import LegalCorpusIngestion
        ingestion = LegalCorpusIngestion()

        # Chunk the text
        chunks = ingestion.chunk_text(full_text, doc_record["code"])

        # Add metadata
        for chunk in chunks:
            chunk["regulation_code"] = doc_record["code"]
            chunk["regulation_name"] = doc_record["title"]
            chunk["document_type"] = doc_record["document_type"]
            chunk["source_url"] = doc_record["url"]

        # Generate embeddings and upsert to Qdrant
        if chunks:
            await ingestion.embed_and_upsert(chunks)
            doc_record["chunk_count"] = len(chunks)
            doc_record["status"] = "indexed"
            logger.info(f"Ingested {len(chunks)} chunks from URL {doc_record['url']}")
        else:
            doc_record["status"] = "error"
            doc_record["error"] = "No chunks generated from content"

    except httpx.HTTPError as e:
        logger.error(f"Failed to fetch URL: {e}")
        doc_record["status"] = "error"
        doc_record["error"] = f"Failed to fetch URL: {str(e)}"
    except Exception as e:
        logger.error(f"Failed to ingest URL content: {e}")
        doc_record["status"] = "error"
        doc_record["error"] = str(e)


@router.delete("/custom-documents/{doc_id}")
async def delete_custom_document(doc_id: str):
    """
    Delete a custom document from the list.
    Note: This does not remove the chunks from Qdrant yet.

    Raises HTTP 404 when no document with ``doc_id`` is registered.
    """
    if not any(d["id"] == doc_id for d in custom_documents):
        raise HTTPException(status_code=404, detail="Document not found")

    # Replace the contents in place so the module-level list object is kept.
    custom_documents[:] = [d for d in custom_documents if d["id"] != doc_id]

    # TODO: Also remove chunks from Qdrant by filtering on code

    return {"status": "deleted", "document_id": doc_id}


# ========== Pipeline Checkpoints ==========

# Create a separate router for pipeline-related endpoints
pipeline_router = APIRouter(prefix="/api/v1/admin/pipeline", tags=["pipeline"])


@pipeline_router.get("/checkpoints")
async def get_pipeline_checkpoints():
    """
    Get current pipeline checkpoint state.

    Returns the current state of the compliance pipeline including:
    - Pipeline ID and overall status
    - Start and completion times
    - All checkpoints with their validations and metrics
    - Summary data
    """
    from pipeline_checkpoints import CheckpointManager

    state = CheckpointManager.load_state()

    if state is None:
        return {
            "status": "no_data",
            "message": "No pipeline run data available yet.",
            "pipeline_id": None,
            "checkpoints": [],
            "summary": {}
        }

    # Enrich with validation summary
    validation_summary = {
        "passed": 0,
        "warning": 0,
        "failed": 0,
        "total": 0
    }

    for checkpoint in state.get("checkpoints", []):
        for validation in checkpoint.get("validations", []):
            validation_summary["total"] += 1
            status = validation.get("status", "not_run")
            # Statuses without a counter (e.g. "not_run") only add to "total".
            if status in validation_summary:
                validation_summary[status] += 1

    state["validation_summary"] = validation_summary

    return state


@pipeline_router.get("/checkpoints/history")
async def get_pipeline_history():
    """
    Get list of previous pipeline runs (if stored).
    For now, returns only current run.
    """
    from pipeline_checkpoints import CheckpointManager

    state = CheckpointManager.load_state()

    if state is None:
        return {"runs": []}

    return {
        "runs": [{
            "pipeline_id": state.get("pipeline_id"),
            "status": state.get("status"),
            "started_at": state.get("started_at"),
            "completed_at": state.get("completed_at"),
        }]
    }


# Pipeline state for start/stop
pipeline_process_state = {
    "running": False,
    "pid": None,
    "started_at": None,
}


@pipeline_router.post("/start")
async def start_pipeline(request: StartPipelineRequest, background_tasks: BackgroundTasks):
    """
    Start the compliance pipeline in the background.

    This runs the full_compliance_pipeline.py script which:
    1. Ingests all legal documents (unless skip_ingestion=True)
    2. Extracts requirements and controls
    3. Generates compliance measures
    4. Creates checkpoint data for monitoring

    Raises HTTP 409 when the checkpoint state or the in-process flag reports
    a pipeline that is already running.
    """
    # Check if already running
    from pipeline_checkpoints import CheckpointManager
    state = CheckpointManager.load_state()

    if state and state.get("status") == "running":
        raise HTTPException(
            status_code=409,
            detail="Pipeline is already running"
        )

    if pipeline_process_state["running"]:
        raise HTTPException(
            status_code=409,
            detail="Pipeline start already in progress"
        )

    pipeline_process_state["running"] = True
    pipeline_process_state["started_at"] = datetime.now().isoformat()

    # Start pipeline in background
    background_tasks.add_task(
        run_pipeline_background,
        request.force_reindex,
        request.skip_ingestion
    )

    return {
        "status": "starting",
        "message": "Compliance pipeline is starting in background",
        "started_at": pipeline_process_state["started_at"],
    }


async def run_pipeline_background(force_reindex: bool, skip_ingestion: bool):
    """Background task to run the compliance pipeline.

    Launches full_compliance_pipeline.py with the current interpreter from
    this module's directory, waits for it to exit and logs the outcome.
    ``pipeline_process_state`` is reset afterwards in every case.
    """
    try:
        # Build command
        cmd = [sys.executable, "full_compliance_pipeline.py"]
        if force_reindex:
            cmd.append("--force-reindex")
        if skip_ingestion:
            cmd.append("--skip-ingestion")

        # Run as subprocess
        logger.info(f"Starting pipeline: {' '.join(cmd)}")

        process = subprocess.Popen(
            cmd,
            cwd=os.path.dirname(os.path.abspath(__file__)),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )

        pipeline_process_state["pid"] = process.pid

        # communicate() drains the pipe while waiting. Polling without reading
        # would stall the child once the OS pipe buffer fills up. It blocks,
        # so it runs in a worker thread to keep the event loop responsive.
        loop = asyncio.get_running_loop()
        output, _ = await loop.run_in_executor(None, process.communicate)

        return_code = process.returncode
        if return_code != 0:
            logger.error(f"Pipeline failed with code {return_code}: {output or ''}")
        else:
            logger.info("Pipeline completed successfully")

    except Exception as e:
        logger.error(f"Failed to run pipeline: {e}")

    finally:
        pipeline_process_state["running"] = False
        pipeline_process_state["pid"] = None


@pipeline_router.get("/status")
async def get_pipeline_status():
    """
    Get current pipeline running status.
    """
    from pipeline_checkpoints import CheckpointManager

    state = CheckpointManager.load_state()
    checkpoint_status = state.get("status") if state else "no_data"

    return {
        "process_running": pipeline_process_state["running"],
        "process_pid": pipeline_process_state["pid"],
        "process_started_at": pipeline_process_state["started_at"],
        "checkpoint_status": checkpoint_status,
        "current_phase": state.get("current_phase") if state else None,
    }


# ========== Traceability / Quality Endpoints ==========

@router.get("/traceability")
async def get_traceability(
    chunk_id: str = Query(..., description="Chunk ID or identifier"),
    regulation: str = Query(..., description="Regulation code"),
):
    """
    Get traceability information for a specific chunk.

    Returns:
    - The chunk details
    - Requirements extracted from this chunk
    - Controls derived from those requirements

    Note: This is a placeholder that will be enhanced once the
    requirements extraction pipeline is fully implemented.
    """
    # For now, return placeholder structure.
    # The actual implementation will query:
    # 1. The chunk from Qdrant
    # 2. Requirements from a requirements collection/table
    # 3. Controls from a controls collection/table
    return {
        "chunk_id": chunk_id,
        "regulation": regulation,
        "requirements": [],
        "controls": [],
        "message": "Traceability-Daten werden verfuegbar sein, sobald die Requirements-Extraktion und Control-Ableitung implementiert sind."
    }