[split-required] Split 700-870 LOC files across all services

backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions

View File

@@ -0,0 +1,368 @@
"""
Legal Corpus API - Corpus Routes
Endpoints for the RAG page in admin-v2:
- GET /status - Collection status with chunk counts
- GET /search - Semantic search
- POST /ingest - Trigger ingestion
- GET /ingestion-status - Ingestion status
- GET /regulations - List regulations
- GET /custom-documents - List custom docs
- POST /upload - Upload document
- POST /add-link - Add link for ingestion
- DELETE /custom-documents/{id} - Delete custom doc
- GET /traceability - Traceability info
Extracted from legal_corpus_api.py to keep files under 500 LOC.
"""
import asyncio
import logging
import os
import shutil
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any

import httpx
from fastapi import APIRouter, HTTPException, Query, BackgroundTasks, UploadFile, File, Form
from pydantic import BaseModel

from legal_corpus_ingest_tasks import (
    ingest_uploaded_document,
    ingest_link_document,
    run_ingestion,
)
logger = logging.getLogger(__name__)
# Configuration (overridable via environment for docker-compose / k8s)
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
COLLECTION_NAME = "bp_legal_corpus"
# All regulations for status endpoint.  `code` is the canonical key used in
# Qdrant payload filters (`regulation_code`); `type` groups by legal source.
REGULATIONS = [
    {"code": "GDPR", "name": "DSGVO", "fullName": "Datenschutz-Grundverordnung", "type": "eu_regulation"},
    {"code": "EPRIVACY", "name": "ePrivacy-Richtlinie", "fullName": "Richtlinie 2002/58/EG", "type": "eu_directive"},
    {"code": "TDDDG", "name": "TDDDG", "fullName": "Telekommunikation-Digitale-Dienste-Datenschutz-Gesetz", "type": "de_law"},
    {"code": "SCC", "name": "Standardvertragsklauseln", "fullName": "2021/914/EU", "type": "eu_regulation"},
    {"code": "DPF", "name": "EU-US Data Privacy Framework", "fullName": "Angemessenheitsbeschluss", "type": "eu_regulation"},
    {"code": "AIACT", "name": "EU AI Act", "fullName": "Verordnung (EU) 2024/1689", "type": "eu_regulation"},
    {"code": "CRA", "name": "Cyber Resilience Act", "fullName": "Verordnung (EU) 2024/2847", "type": "eu_regulation"},
    {"code": "NIS2", "name": "NIS2-Richtlinie", "fullName": "Richtlinie (EU) 2022/2555", "type": "eu_directive"},
    {"code": "EUCSA", "name": "EU Cybersecurity Act", "fullName": "Verordnung (EU) 2019/881", "type": "eu_regulation"},
    {"code": "DATAACT", "name": "Data Act", "fullName": "Verordnung (EU) 2023/2854", "type": "eu_regulation"},
    {"code": "DGA", "name": "Data Governance Act", "fullName": "Verordnung (EU) 2022/868", "type": "eu_regulation"},
    {"code": "DSA", "name": "Digital Services Act", "fullName": "Verordnung (EU) 2022/2065", "type": "eu_regulation"},
    {"code": "EAA", "name": "European Accessibility Act", "fullName": "Richtlinie (EU) 2019/882", "type": "eu_directive"},
    {"code": "DSM", "name": "DSM-Urheberrechtsrichtlinie", "fullName": "Richtlinie (EU) 2019/790", "type": "eu_directive"},
    {"code": "PLD", "name": "Produkthaftungsrichtlinie", "fullName": "Richtlinie 85/374/EWG", "type": "eu_directive"},
    {"code": "GPSR", "name": "General Product Safety", "fullName": "Verordnung (EU) 2023/988", "type": "eu_regulation"},
    {"code": "BSI-TR-03161-1", "name": "BSI-TR Teil 1", "fullName": "BSI TR-03161 Teil 1 - Mobile Anwendungen", "type": "bsi_standard"},
    {"code": "BSI-TR-03161-2", "name": "BSI-TR Teil 2", "fullName": "BSI TR-03161 Teil 2 - Web-Anwendungen", "type": "bsi_standard"},
    {"code": "BSI-TR-03161-3", "name": "BSI-TR Teil 3", "fullName": "BSI TR-03161 Teil 3 - Hintergrundsysteme", "type": "bsi_standard"},
]
# Ingestion state (in-memory for now).  Rebound wholesale by /ingest and
# mutated by the background task; /ingestion-status returns it verbatim.
# NOTE(review): lost on process restart and not shared across workers —
# a single-process deployment is presumably assumed.
ingestion_state: Dict[str, Any] = {
    "running": False,
    "completed": False,
    "current_regulation": None,
    "processed": 0,
    "total": len(REGULATIONS),
    "error": None,
}
class SearchRequest(BaseModel):
    """Request body for semantic search (currently unused by the GET /search route, which takes query params)."""
    query: str
    # Optional regulation-code filter; None = search the whole corpus.
    regulations: Optional[List[str]] = None
    # Number of results to return.
    top_k: int = 5
class IngestRequest(BaseModel):
    """Request body for POST /ingest."""
    # Re-ingest even if chunks already exist (passed through to run_ingestion).
    force: bool = False
    # Restrict ingestion to these regulation codes; None = all.
    regulations: Optional[List[str]] = None
class AddLinkRequest(BaseModel):
    """Request body for POST /add-link: a URL to fetch and ingest."""
    url: str
    title: str
    # Short identifier stored as the document's regulation/document code.
    code: str
    document_type: str = "custom"
# Store for custom documents (in-memory for now).  Rebound by the delete
# endpoint and appended to by upload/add-link; not persisted across restarts.
custom_documents: List[Dict[str, Any]] = []
router = APIRouter(prefix="/api/v1/admin/legal-corpus", tags=["legal-corpus"])
@router.get("/status")
async def get_legal_corpus_status():
    """Get status of the legal corpus collection including chunk counts per regulation.

    Returns the collection name, total point count, vector size, Qdrant
    collection status and a mapping of regulation code -> chunk count.
    Returns a zeroed "not_found" payload when the collection does not exist.

    Raises:
        HTTPException 503: when Qdrant is unreachable.
    """
    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            collection_res = await client.get(f"{QDRANT_URL}/collections/{COLLECTION_NAME}")
            if collection_res.status_code != 200:
                # Collection not created yet - report an empty corpus instead of erroring.
                return {
                    "collection": COLLECTION_NAME,
                    "totalPoints": 0,
                    "vectorSize": 1024,
                    "status": "not_found",
                    "regulations": {},
                }
            result = collection_res.json().get("result", {})

            async def _count_chunks(reg: Dict[str, Any]) -> int:
                """Count points tagged with this regulation code; 0 on a non-200 reply."""
                count_res = await client.post(
                    f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/count",
                    json={
                        "filter": {
                            "must": [{"key": "regulation_code", "match": {"value": reg["code"]}}]
                        }
                    },
                )
                if count_res.status_code != 200:
                    return 0
                return count_res.json().get("result", {}).get("count", 0)

            # Issue all per-regulation count queries concurrently instead of
            # one sequential round trip per regulation (19 serial requests
            # in the original implementation).
            counts = await asyncio.gather(*(_count_chunks(reg) for reg in REGULATIONS))
            regulation_counts = {reg["code"]: n for reg, n in zip(REGULATIONS, counts)}
            return {
                "collection": COLLECTION_NAME,
                "totalPoints": result.get("points_count", 0),
                "vectorSize": result.get("config", {}).get("params", {}).get("vectors", {}).get("size", 1024),
                "status": result.get("status", "unknown"),
                "regulations": regulation_counts,
            }
        except httpx.RequestError as e:
            logger.error(f"Failed to get Qdrant status: {e}")
            raise HTTPException(status_code=503, detail=f"Qdrant not available: {str(e)}")
@router.get("/search")
async def search_legal_corpus(
    query: str = Query(..., description="Search query"),
    top_k: int = Query(5, ge=1, le=20, description="Number of results"),
    regulations: Optional[str] = Query(None, description="Comma-separated regulation codes to filter"),
):
    """Semantic search in legal corpus using BGE-M3 embeddings.

    Embeds the query via the embedding service, runs a vector search in
    Qdrant (optionally filtered to the given regulation codes) and returns
    the matching chunks with their payload fields and similarity scores.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        try:
            # Step 1: turn the query text into an embedding vector.
            embedding_response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": [query]},
            )
            if embedding_response.status_code != 200:
                raise HTTPException(status_code=500, detail="Embedding service error")
            query_vector = embedding_response.json()["embeddings"][0]

            # Step 2: assemble the Qdrant search request.
            body: Dict[str, Any] = {
                "vector": query_vector,
                "limit": top_k,
                "with_payload": True,
            }
            if regulations:
                # "should" = OR-match across the requested regulation codes.
                body["filter"] = {
                    "should": [
                        {"key": "regulation_code", "match": {"value": code.strip()}}
                        for code in regulations.split(",")
                    ]
                }

            qdrant_response = await client.post(
                f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/search",
                json=body,
            )
            if qdrant_response.status_code != 200:
                raise HTTPException(status_code=500, detail="Search failed")

            # Step 3: flatten Qdrant points into the API response shape.
            hits = []
            for point in qdrant_response.json().get("result", []):
                payload = point.get("payload", {})
                hits.append({
                    "text": payload.get("text", ""),
                    "regulation_code": payload.get("regulation_code", ""),
                    "regulation_name": payload.get("regulation_name", ""),
                    "article": payload.get("article"),
                    "paragraph": payload.get("paragraph"),
                    "source_url": payload.get("source_url", ""),
                    "score": point.get("score", 0),
                })
            return {"results": hits, "query": query, "count": len(hits)}
        except httpx.RequestError as e:
            logger.error(f"Search failed: {e}")
            raise HTTPException(status_code=503, detail=f"Service not available: {str(e)}")
@router.post("/ingest")
async def trigger_ingestion(request: IngestRequest, background_tasks: BackgroundTasks):
    """Trigger legal corpus ingestion in background.

    Rejects with 409 when an ingestion run is already in flight; otherwise
    resets the shared progress state and schedules `run_ingestion`.
    """
    global ingestion_state
    # Only one ingestion run may be active at a time.
    if ingestion_state["running"]:
        raise HTTPException(status_code=409, detail="Ingestion already running")
    # Fresh progress tracker; the background task mutates it as it goes.
    ingestion_state = dict(
        running=True,
        completed=False,
        current_regulation=None,
        processed=0,
        total=len(REGULATIONS),
        error=None,
    )
    background_tasks.add_task(run_ingestion, request.force, request.regulations, ingestion_state, REGULATIONS)
    return {
        "status": "started",
        "job_id": "manual-trigger",
        "message": f"Ingestion started for {len(REGULATIONS)} regulations",
    }
@router.get("/ingestion-status")
async def get_ingestion_status():
    """Get current ingestion status (the module-level progress dict updated by the background task)."""
    return ingestion_state
@router.get("/regulations")
async def get_regulations():
    """Get list of all supported regulations (the static REGULATIONS table)."""
    return {"regulations": REGULATIONS}
@router.get("/custom-documents")
async def get_custom_documents():
    """Get list of custom documents added by user (in-memory; cleared on restart)."""
    return {"documents": custom_documents}
@router.post("/upload")
async def upload_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    title: str = Form(...),
    code: str = Form(...),
    document_type: str = Form("custom"),
):
    """Upload a document (PDF) for ingestion into the legal corpus.

    Saves the file under /tmp/legal_corpus_uploads with a short random id
    prefix, records it in the in-memory custom document list and queues
    background ingestion.

    Raises:
        HTTPException 400: when the upload is not a PDF.
        HTTPException 500: when the file cannot be written to disk.
    """
    global custom_documents
    # Guard against a missing filename and accept any capitalisation of
    # ".pdf" — the original endswith(('.pdf', '.PDF')) check rejected
    # mixed-case names such as "doc.Pdf" and crashed on filename=None.
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")
    upload_dir = "/tmp/legal_corpus_uploads"
    os.makedirs(upload_dir, exist_ok=True)
    doc_id = str(uuid.uuid4())[:8]
    # basename() strips any client-supplied directory components so a
    # filename like "../../etc/x.pdf" cannot escape the upload directory.
    safe_filename = f"{doc_id}_{os.path.basename(file.filename)}"
    file_path = os.path.join(upload_dir, safe_filename)
    try:
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    except Exception as e:
        logger.error(f"Failed to save uploaded file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to save file: {str(e)}")
    doc_record = {
        "id": doc_id,
        "code": code,
        "title": title,
        "filename": file.filename,
        "file_path": file_path,
        "document_type": document_type,
        "uploaded_at": datetime.now().isoformat(),
        "status": "uploaded",
        "chunk_count": 0,
    }
    custom_documents.append(doc_record)
    # Chunking/embedding happens asynchronously; the request returns at once.
    background_tasks.add_task(ingest_uploaded_document, doc_record)
    return {
        "status": "uploaded",
        "document_id": doc_id,
        "message": f"Document '{title}' uploaded and queued for ingestion",
        "document": doc_record,
    }
@router.post("/add-link")
async def add_link(request: AddLinkRequest, background_tasks: BackgroundTasks):
    """Add a URL/link for ingestion into the legal corpus.

    Registers the link in the in-memory custom document list and schedules
    background ingestion of its content.
    """
    global custom_documents
    record: Dict[str, Any] = {
        "id": str(uuid.uuid4())[:8],
        "code": request.code,
        "title": request.title,
        "url": request.url,
        "document_type": request.document_type,
        "uploaded_at": datetime.now().isoformat(),
        "status": "queued",
        "chunk_count": 0,
    }
    custom_documents.append(record)
    # Fetch + ingestion run asynchronously; the request returns immediately.
    background_tasks.add_task(ingest_link_document, record)
    return {
        "status": "queued",
        "document_id": record["id"],
        "message": f"Link '{request.title}' queued for ingestion",
        "document": record,
    }
@router.delete("/custom-documents/{doc_id}")
async def delete_custom_document(doc_id: str):
    """Delete a custom document from the list.

    Raises:
        HTTPException 404: when no document with the given id exists.
    """
    global custom_documents
    # 404 unless the id is present in the in-memory registry.
    if not any(entry["id"] == doc_id for entry in custom_documents):
        raise HTTPException(status_code=404, detail="Document not found")
    # Rebind the module global to a filtered copy rather than mutating in place.
    custom_documents = [entry for entry in custom_documents if entry["id"] != doc_id]
    return {"status": "deleted", "document_id": doc_id}
@router.get("/traceability")
async def get_traceability(
    chunk_id: str = Query(..., description="Chunk ID or identifier"),
    regulation: str = Query(..., description="Regulation code"),
):
    """Get traceability information for a specific chunk.

    Placeholder endpoint: requirements extraction and control derivation are
    not implemented yet, so the requirement/control lists are always empty.
    """
    # The original opened an httpx.AsyncClient it never used and wrapped this
    # static return in a try/except that could not trigger; both were dead
    # code and were removed without changing the response.
    return {
        "chunk_id": chunk_id,
        "regulation": regulation,
        "requirements": [],
        "controls": [],
        "message": "Traceability-Daten werden verfuegbar sein, sobald die Requirements-Extraktion und Control-Ableitung implementiert sind."
    }