backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
369 lines
13 KiB
Python
369 lines
13 KiB
Python
"""
|
|
Legal Corpus API - Corpus Routes
|
|
|
|
Endpoints for the RAG page in admin-v2:
|
|
- GET /status - Collection status with chunk counts
|
|
- GET /search - Semantic search
|
|
- POST /ingest - Trigger ingestion
|
|
- GET /ingestion-status - Ingestion status
|
|
- GET /regulations - List regulations
|
|
- GET /custom-documents - List custom docs
|
|
- POST /upload - Upload document
|
|
- POST /add-link - Add link for ingestion
|
|
- DELETE /custom-documents/{id} - Delete custom doc
|
|
- GET /traceability - Traceability info
|
|
|
|
Extracted from legal_corpus_api.py to keep files under 500 LOC.
|
|
"""
|
|
|
|
import os
|
|
import httpx
|
|
import uuid
|
|
import shutil
|
|
from datetime import datetime
|
|
from typing import Optional, List, Dict, Any
|
|
from fastapi import APIRouter, HTTPException, Query, BackgroundTasks, UploadFile, File, Form
|
|
from pydantic import BaseModel
|
|
import logging
|
|
|
|
from legal_corpus_ingest_tasks import (
|
|
ingest_uploaded_document,
|
|
ingest_link_document,
|
|
run_ingestion,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration — external service endpoints, overridable via environment.
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
# Qdrant collection that holds all legal-corpus chunks.
COLLECTION_NAME = "bp_legal_corpus"
|
|
|
|
# All regulations tracked by the corpus. Used by /status (per-regulation chunk
# counts), by /regulations, and as the work list passed to run_ingestion.
# Each entry: code (filter key in Qdrant payloads), short name, full legal
# name, and a type tag (eu_regulation / eu_directive / de_law / bsi_standard).
REGULATIONS = [
    {"code": "GDPR", "name": "DSGVO", "fullName": "Datenschutz-Grundverordnung", "type": "eu_regulation"},
    {"code": "EPRIVACY", "name": "ePrivacy-Richtlinie", "fullName": "Richtlinie 2002/58/EG", "type": "eu_directive"},
    {"code": "TDDDG", "name": "TDDDG", "fullName": "Telekommunikation-Digitale-Dienste-Datenschutz-Gesetz", "type": "de_law"},
    {"code": "SCC", "name": "Standardvertragsklauseln", "fullName": "2021/914/EU", "type": "eu_regulation"},
    {"code": "DPF", "name": "EU-US Data Privacy Framework", "fullName": "Angemessenheitsbeschluss", "type": "eu_regulation"},
    {"code": "AIACT", "name": "EU AI Act", "fullName": "Verordnung (EU) 2024/1689", "type": "eu_regulation"},
    {"code": "CRA", "name": "Cyber Resilience Act", "fullName": "Verordnung (EU) 2024/2847", "type": "eu_regulation"},
    {"code": "NIS2", "name": "NIS2-Richtlinie", "fullName": "Richtlinie (EU) 2022/2555", "type": "eu_directive"},
    {"code": "EUCSA", "name": "EU Cybersecurity Act", "fullName": "Verordnung (EU) 2019/881", "type": "eu_regulation"},
    {"code": "DATAACT", "name": "Data Act", "fullName": "Verordnung (EU) 2023/2854", "type": "eu_regulation"},
    {"code": "DGA", "name": "Data Governance Act", "fullName": "Verordnung (EU) 2022/868", "type": "eu_regulation"},
    {"code": "DSA", "name": "Digital Services Act", "fullName": "Verordnung (EU) 2022/2065", "type": "eu_regulation"},
    {"code": "EAA", "name": "European Accessibility Act", "fullName": "Richtlinie (EU) 2019/882", "type": "eu_directive"},
    {"code": "DSM", "name": "DSM-Urheberrechtsrichtlinie", "fullName": "Richtlinie (EU) 2019/790", "type": "eu_directive"},
    {"code": "PLD", "name": "Produkthaftungsrichtlinie", "fullName": "Richtlinie 85/374/EWG", "type": "eu_directive"},
    {"code": "GPSR", "name": "General Product Safety", "fullName": "Verordnung (EU) 2023/988", "type": "eu_regulation"},
    {"code": "BSI-TR-03161-1", "name": "BSI-TR Teil 1", "fullName": "BSI TR-03161 Teil 1 - Mobile Anwendungen", "type": "bsi_standard"},
    {"code": "BSI-TR-03161-2", "name": "BSI-TR Teil 2", "fullName": "BSI TR-03161 Teil 2 - Web-Anwendungen", "type": "bsi_standard"},
    {"code": "BSI-TR-03161-3", "name": "BSI-TR Teil 3", "fullName": "BSI TR-03161 Teil 3 - Hintergrundsysteme", "type": "bsi_standard"},
]
|
|
|
|
# Ingestion state (in-memory for now — lost on restart; NOTE(review): not
# shared across worker processes, so multi-worker deployments would each see
# their own copy). Reset by POST /ingest, mutated by the background
# run_ingestion task, and returned verbatim by GET /ingestion-status.
ingestion_state = {
    "running": False,            # True while a background run is in flight
    "completed": False,          # True once a run has finished
    "current_regulation": None,  # code of the regulation currently processed
    "processed": 0,              # regulations done so far in this run
    "total": len(REGULATIONS),   # total regulations in the work list
    "error": None,               # last error message, if any
}
|
|
|
|
|
|
class SearchRequest(BaseModel):
    """Request body for semantic search over the legal corpus."""

    # Free-text query to embed and match against corpus chunks.
    query: str
    # Optional list of regulation codes to restrict the search to.
    regulations: Optional[List[str]] = None
    # Number of top-scoring results to return.
    top_k: int = 5
|
|
|
|
|
|
class IngestRequest(BaseModel):
    """Request body for triggering a corpus ingestion run (POST /ingest)."""

    # Passed through to run_ingestion; presumably forces re-ingestion of
    # already-present data — confirm against run_ingestion's implementation.
    force: bool = False
    # Optional subset of regulation codes to ingest; None means all.
    regulations: Optional[List[str]] = None
|
|
|
|
|
|
class AddLinkRequest(BaseModel):
    """Request body for queuing a URL for ingestion (POST /add-link)."""

    # URL of the document to fetch and ingest.
    url: str
    # Human-readable title shown in the custom-documents list.
    title: str
    # Regulation-style code under which the document is indexed.
    code: str
    # Document category tag; defaults to "custom".
    document_type: str = "custom"
|
|
|
|
|
|
# Store for custom documents (in-memory for now — lost on restart and not
# shared across worker processes). Appended to by /upload and /add-link,
# filtered by DELETE /custom-documents/{id}.
custom_documents: List[Dict[str, Any]] = []


# All routes below are mounted under the admin legal-corpus prefix.
router = APIRouter(prefix="/api/v1/admin/legal-corpus", tags=["legal-corpus"])
|
|
|
|
|
|
@router.get("/status")
|
|
async def get_legal_corpus_status():
|
|
"""Get status of the legal corpus collection including chunk counts per regulation."""
|
|
async with httpx.AsyncClient(timeout=30.0) as client:
|
|
try:
|
|
collection_res = await client.get(f"{QDRANT_URL}/collections/{COLLECTION_NAME}")
|
|
if collection_res.status_code != 200:
|
|
return {
|
|
"collection": COLLECTION_NAME,
|
|
"totalPoints": 0,
|
|
"vectorSize": 1024,
|
|
"status": "not_found",
|
|
"regulations": {},
|
|
}
|
|
|
|
collection_data = collection_res.json()
|
|
result = collection_data.get("result", {})
|
|
|
|
regulation_counts = {}
|
|
for reg in REGULATIONS:
|
|
count_res = await client.post(
|
|
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/count",
|
|
json={
|
|
"filter": {
|
|
"must": [{"key": "regulation_code", "match": {"value": reg["code"]}}]
|
|
}
|
|
},
|
|
)
|
|
if count_res.status_code == 200:
|
|
count_data = count_res.json()
|
|
regulation_counts[reg["code"]] = count_data.get("result", {}).get("count", 0)
|
|
else:
|
|
regulation_counts[reg["code"]] = 0
|
|
|
|
return {
|
|
"collection": COLLECTION_NAME,
|
|
"totalPoints": result.get("points_count", 0),
|
|
"vectorSize": result.get("config", {}).get("params", {}).get("vectors", {}).get("size", 1024),
|
|
"status": result.get("status", "unknown"),
|
|
"regulations": regulation_counts,
|
|
}
|
|
|
|
except httpx.RequestError as e:
|
|
logger.error(f"Failed to get Qdrant status: {e}")
|
|
raise HTTPException(status_code=503, detail=f"Qdrant not available: {str(e)}")
|
|
|
|
|
|
@router.get("/search")
|
|
async def search_legal_corpus(
|
|
query: str = Query(..., description="Search query"),
|
|
top_k: int = Query(5, ge=1, le=20, description="Number of results"),
|
|
regulations: Optional[str] = Query(None, description="Comma-separated regulation codes to filter"),
|
|
):
|
|
"""Semantic search in legal corpus using BGE-M3 embeddings."""
|
|
async with httpx.AsyncClient(timeout=60.0) as client:
|
|
try:
|
|
embed_res = await client.post(
|
|
f"{EMBEDDING_SERVICE_URL}/embed",
|
|
json={"texts": [query]},
|
|
)
|
|
if embed_res.status_code != 200:
|
|
raise HTTPException(status_code=500, detail="Embedding service error")
|
|
|
|
embed_data = embed_res.json()
|
|
query_vector = embed_data["embeddings"][0]
|
|
|
|
search_request = {
|
|
"vector": query_vector,
|
|
"limit": top_k,
|
|
"with_payload": True,
|
|
}
|
|
|
|
if regulations:
|
|
reg_codes = [r.strip() for r in regulations.split(",")]
|
|
search_request["filter"] = {
|
|
"should": [
|
|
{"key": "regulation_code", "match": {"value": code}}
|
|
for code in reg_codes
|
|
]
|
|
}
|
|
|
|
search_res = await client.post(
|
|
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/search",
|
|
json=search_request,
|
|
)
|
|
|
|
if search_res.status_code != 200:
|
|
raise HTTPException(status_code=500, detail="Search failed")
|
|
|
|
search_data = search_res.json()
|
|
results = []
|
|
for point in search_data.get("result", []):
|
|
payload = point.get("payload", {})
|
|
results.append({
|
|
"text": payload.get("text", ""),
|
|
"regulation_code": payload.get("regulation_code", ""),
|
|
"regulation_name": payload.get("regulation_name", ""),
|
|
"article": payload.get("article"),
|
|
"paragraph": payload.get("paragraph"),
|
|
"source_url": payload.get("source_url", ""),
|
|
"score": point.get("score", 0),
|
|
})
|
|
|
|
return {"results": results, "query": query, "count": len(results)}
|
|
|
|
except httpx.RequestError as e:
|
|
logger.error(f"Search failed: {e}")
|
|
raise HTTPException(status_code=503, detail=f"Service not available: {str(e)}")
|
|
|
|
|
|
@router.post("/ingest")
|
|
async def trigger_ingestion(request: IngestRequest, background_tasks: BackgroundTasks):
|
|
"""Trigger legal corpus ingestion in background."""
|
|
global ingestion_state
|
|
|
|
if ingestion_state["running"]:
|
|
raise HTTPException(status_code=409, detail="Ingestion already running")
|
|
|
|
ingestion_state = {
|
|
"running": True,
|
|
"completed": False,
|
|
"current_regulation": None,
|
|
"processed": 0,
|
|
"total": len(REGULATIONS),
|
|
"error": None,
|
|
}
|
|
|
|
background_tasks.add_task(run_ingestion, request.force, request.regulations, ingestion_state, REGULATIONS)
|
|
|
|
return {
|
|
"status": "started",
|
|
"job_id": "manual-trigger",
|
|
"message": f"Ingestion started for {len(REGULATIONS)} regulations",
|
|
}
|
|
|
|
|
|
@router.get("/ingestion-status")
|
|
async def get_ingestion_status():
|
|
"""Get current ingestion status."""
|
|
return ingestion_state
|
|
|
|
|
|
@router.get("/regulations")
|
|
async def get_regulations():
|
|
"""Get list of all supported regulations."""
|
|
return {"regulations": REGULATIONS}
|
|
|
|
|
|
@router.get("/custom-documents")
|
|
async def get_custom_documents():
|
|
"""Get list of custom documents added by user."""
|
|
return {"documents": custom_documents}
|
|
|
|
|
|
@router.post("/upload")
|
|
async def upload_document(
|
|
background_tasks: BackgroundTasks,
|
|
file: UploadFile = File(...),
|
|
title: str = Form(...),
|
|
code: str = Form(...),
|
|
document_type: str = Form("custom"),
|
|
):
|
|
"""Upload a document (PDF) for ingestion into the legal corpus."""
|
|
global custom_documents
|
|
|
|
if not file.filename.endswith(('.pdf', '.PDF')):
|
|
raise HTTPException(status_code=400, detail="Only PDF files are supported")
|
|
|
|
upload_dir = "/tmp/legal_corpus_uploads"
|
|
os.makedirs(upload_dir, exist_ok=True)
|
|
|
|
doc_id = str(uuid.uuid4())[:8]
|
|
safe_filename = f"{doc_id}_{file.filename}"
|
|
file_path = os.path.join(upload_dir, safe_filename)
|
|
|
|
try:
|
|
with open(file_path, "wb") as buffer:
|
|
shutil.copyfileobj(file.file, buffer)
|
|
except Exception as e:
|
|
logger.error(f"Failed to save uploaded file: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Failed to save file: {str(e)}")
|
|
|
|
doc_record = {
|
|
"id": doc_id,
|
|
"code": code,
|
|
"title": title,
|
|
"filename": file.filename,
|
|
"file_path": file_path,
|
|
"document_type": document_type,
|
|
"uploaded_at": datetime.now().isoformat(),
|
|
"status": "uploaded",
|
|
"chunk_count": 0,
|
|
}
|
|
|
|
custom_documents.append(doc_record)
|
|
background_tasks.add_task(ingest_uploaded_document, doc_record)
|
|
|
|
return {
|
|
"status": "uploaded",
|
|
"document_id": doc_id,
|
|
"message": f"Document '{title}' uploaded and queued for ingestion",
|
|
"document": doc_record,
|
|
}
|
|
|
|
|
|
|
|
@router.post("/add-link")
|
|
async def add_link(request: AddLinkRequest, background_tasks: BackgroundTasks):
|
|
"""Add a URL/link for ingestion into the legal corpus."""
|
|
global custom_documents
|
|
|
|
doc_id = str(uuid.uuid4())[:8]
|
|
doc_record = {
|
|
"id": doc_id,
|
|
"code": request.code,
|
|
"title": request.title,
|
|
"url": request.url,
|
|
"document_type": request.document_type,
|
|
"uploaded_at": datetime.now().isoformat(),
|
|
"status": "queued",
|
|
"chunk_count": 0,
|
|
}
|
|
|
|
custom_documents.append(doc_record)
|
|
background_tasks.add_task(ingest_link_document, doc_record)
|
|
|
|
return {
|
|
"status": "queued",
|
|
"document_id": doc_id,
|
|
"message": f"Link '{request.title}' queued for ingestion",
|
|
"document": doc_record,
|
|
}
|
|
|
|
|
|
|
|
@router.delete("/custom-documents/{doc_id}")
|
|
async def delete_custom_document(doc_id: str):
|
|
"""Delete a custom document from the list."""
|
|
global custom_documents
|
|
|
|
doc = next((d for d in custom_documents if d["id"] == doc_id), None)
|
|
if not doc:
|
|
raise HTTPException(status_code=404, detail="Document not found")
|
|
|
|
custom_documents = [d for d in custom_documents if d["id"] != doc_id]
|
|
|
|
return {"status": "deleted", "document_id": doc_id}
|
|
|
|
|
|
@router.get("/traceability")
|
|
async def get_traceability(
|
|
chunk_id: str = Query(..., description="Chunk ID or identifier"),
|
|
regulation: str = Query(..., description="Regulation code"),
|
|
):
|
|
"""Get traceability information for a specific chunk."""
|
|
async with httpx.AsyncClient(timeout=30.0) as client:
|
|
try:
|
|
return {
|
|
"chunk_id": chunk_id,
|
|
"regulation": regulation,
|
|
"requirements": [],
|
|
"controls": [],
|
|
"message": "Traceability-Daten werden verfuegbar sein, sobald die Requirements-Extraktion und Control-Ableitung implementiert sind."
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to get traceability: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Traceability lookup failed: {str(e)}")
|