backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
369 lines
13 KiB
Python
369 lines
13 KiB
Python
"""
|
|
Legal Corpus API - Corpus Routes
|
|
|
|
Endpoints for the RAG page in admin-v2:
|
|
- GET /status - Collection status with chunk counts
|
|
- GET /search - Semantic search
|
|
- POST /ingest - Trigger ingestion
|
|
- GET /ingestion-status - Ingestion status
|
|
- GET /regulations - List regulations
|
|
- GET /custom-documents - List custom docs
|
|
- POST /upload - Upload document
|
|
- POST /add-link - Add link for ingestion
|
|
- DELETE /custom-documents/{id} - Delete custom doc
|
|
- GET /traceability - Traceability info
|
|
|
|
Extracted from legal_corpus_api.py to keep files under 500 LOC.
|
|
"""
|
|
|
|
import os
|
|
import httpx
|
|
import uuid
|
|
import shutil
|
|
from datetime import datetime
|
|
from typing import Optional, List, Dict, Any
|
|
from fastapi import APIRouter, HTTPException, Query, BackgroundTasks, UploadFile, File, Form
|
|
from pydantic import BaseModel
|
|
import logging
|
|
|
|
from legal_corpus_ingest_tasks import (
|
|
ingest_uploaded_document,
|
|
ingest_link_document,
|
|
run_ingestion,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration — external service endpoints, overridable via environment.
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
# Qdrant collection that holds all legal-corpus chunks.
COLLECTION_NAME = "bp_legal_corpus"
|
|
|
|
# All regulations tracked by the corpus. Used by /status (per-regulation chunk
# counts), by /regulations, and as the work list passed to run_ingestion.
# Each entry: code (filter key in Qdrant payloads), short name, full legal
# name, and a type tag (eu_regulation / eu_directive / de_law / bsi_standard).
REGULATIONS = [
    {"code": "GDPR", "name": "DSGVO", "fullName": "Datenschutz-Grundverordnung", "type": "eu_regulation"},
    {"code": "EPRIVACY", "name": "ePrivacy-Richtlinie", "fullName": "Richtlinie 2002/58/EG", "type": "eu_directive"},
    {"code": "TDDDG", "name": "TDDDG", "fullName": "Telekommunikation-Digitale-Dienste-Datenschutz-Gesetz", "type": "de_law"},
    {"code": "SCC", "name": "Standardvertragsklauseln", "fullName": "2021/914/EU", "type": "eu_regulation"},
    {"code": "DPF", "name": "EU-US Data Privacy Framework", "fullName": "Angemessenheitsbeschluss", "type": "eu_regulation"},
    {"code": "AIACT", "name": "EU AI Act", "fullName": "Verordnung (EU) 2024/1689", "type": "eu_regulation"},
    {"code": "CRA", "name": "Cyber Resilience Act", "fullName": "Verordnung (EU) 2024/2847", "type": "eu_regulation"},
    {"code": "NIS2", "name": "NIS2-Richtlinie", "fullName": "Richtlinie (EU) 2022/2555", "type": "eu_directive"},
    {"code": "EUCSA", "name": "EU Cybersecurity Act", "fullName": "Verordnung (EU) 2019/881", "type": "eu_regulation"},
    {"code": "DATAACT", "name": "Data Act", "fullName": "Verordnung (EU) 2023/2854", "type": "eu_regulation"},
    {"code": "DGA", "name": "Data Governance Act", "fullName": "Verordnung (EU) 2022/868", "type": "eu_regulation"},
    {"code": "DSA", "name": "Digital Services Act", "fullName": "Verordnung (EU) 2022/2065", "type": "eu_regulation"},
    {"code": "EAA", "name": "European Accessibility Act", "fullName": "Richtlinie (EU) 2019/882", "type": "eu_directive"},
    {"code": "DSM", "name": "DSM-Urheberrechtsrichtlinie", "fullName": "Richtlinie (EU) 2019/790", "type": "eu_directive"},
    {"code": "PLD", "name": "Produkthaftungsrichtlinie", "fullName": "Richtlinie 85/374/EWG", "type": "eu_directive"},
    {"code": "GPSR", "name": "General Product Safety", "fullName": "Verordnung (EU) 2023/988", "type": "eu_regulation"},
    {"code": "BSI-TR-03161-1", "name": "BSI-TR Teil 1", "fullName": "BSI TR-03161 Teil 1 - Mobile Anwendungen", "type": "bsi_standard"},
    {"code": "BSI-TR-03161-2", "name": "BSI-TR Teil 2", "fullName": "BSI TR-03161 Teil 2 - Web-Anwendungen", "type": "bsi_standard"},
    {"code": "BSI-TR-03161-3", "name": "BSI-TR Teil 3", "fullName": "BSI TR-03161 Teil 3 - Hintergrundsysteme", "type": "bsi_standard"},
]
|
|
|
|
# Ingestion state (in-memory for now — lost on restart; NOTE(review): not
# shared across worker processes, so multi-worker deployments would each see
# their own copy). Reset by POST /ingest, mutated by the background
# run_ingestion task, and returned verbatim by GET /ingestion-status.
ingestion_state = {
    "running": False,            # True while a background run is in flight
    "completed": False,          # True once a run has finished
    "current_regulation": None,  # code of the regulation currently processed
    "processed": 0,              # regulations done so far in this run
    "total": len(REGULATIONS),   # total regulations in the work list
    "error": None,               # last error message, if any
}
|
|
|
|
|
|
class SearchRequest(BaseModel):
    """Request body for semantic search over the legal corpus."""

    # Free-text query to embed and match against corpus chunks.
    query: str
    # Optional list of regulation codes to restrict the search to.
    regulations: Optional[List[str]] = None
    # Number of top-scoring results to return.
    top_k: int = 5
|
|
|
|
|
|
class IngestRequest(BaseModel):
    """Request body for triggering a corpus ingestion run (POST /ingest)."""

    # Passed through to run_ingestion; presumably forces re-ingestion of
    # already-present data — confirm against run_ingestion's implementation.
    force: bool = False
    # Optional subset of regulation codes to ingest; None means all.
    regulations: Optional[List[str]] = None
|
|
|
|
|
|
class AddLinkRequest(BaseModel):
    """Request body for queuing a URL for ingestion (POST /add-link)."""

    # URL of the document to fetch and ingest.
    url: str
    # Human-readable title shown in the custom-documents list.
    title: str
    # Regulation-style code under which the document is indexed.
    code: str
    # Document category tag; defaults to "custom".
    document_type: str = "custom"
|
|
|
|
|
|
# Store for custom documents (in-memory for now — lost on restart and not
# shared across worker processes). Appended to by /upload and /add-link,
# filtered by DELETE /custom-documents/{id}.
custom_documents: List[Dict[str, Any]] = []


# All routes below are mounted under the admin legal-corpus prefix.
router = APIRouter(prefix="/api/v1/admin/legal-corpus", tags=["legal-corpus"])
|
|
|
|
|
|
@router.get("/status")
|
|
async def get_legal_corpus_status():
|
|
"""Get status of the legal corpus collection including chunk counts per regulation."""
|
|
async with httpx.AsyncClient(timeout=30.0) as client:
|
|
try:
|
|
collection_res = await client.get(f"{QDRANT_URL}/collections/{COLLECTION_NAME}")
|
|
if collection_res.status_code != 200:
|
|
return {
|
|
"collection": COLLECTION_NAME,
|
|
"totalPoints": 0,
|
|
"vectorSize": 1024,
|
|
"status": "not_found",
|
|
"regulations": {},
|
|
}
|
|
|
|
collection_data = collection_res.json()
|
|
result = collection_data.get("result", {})
|
|
|
|
regulation_counts = {}
|
|
for reg in REGULATIONS:
|
|
count_res = await client.post(
|
|
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/count",
|
|
json={
|
|
"filter": {
|
|
"must": [{"key": "regulation_code", "match": {"value": reg["code"]}}]
|
|
}
|
|
},
|
|
)
|
|
if count_res.status_code == 200:
|
|
count_data = count_res.json()
|
|
regulation_counts[reg["code"]] = count_data.get("result", {}).get("count", 0)
|
|
else:
|
|
regulation_counts[reg["code"]] = 0
|
|
|
|
return {
|
|
"collection": COLLECTION_NAME,
|
|
"totalPoints": result.get("points_count", 0),
|
|
"vectorSize": result.get("config", {}).get("params", {}).get("vectors", {}).get("size", 1024),
|
|
"status": result.get("status", "unknown"),
|
|
"regulations": regulation_counts,
|
|
}
|
|
|
|
except httpx.RequestError as e:
|
|
logger.error(f"Failed to get Qdrant status: {e}")
|
|
raise HTTPException(status_code=503, detail=f"Qdrant not available: {str(e)}")
|
|
|
|
|
|
@router.get("/search")
|
|
async def search_legal_corpus(
|
|
query: str = Query(..., description="Search query"),
|
|
top_k: int = Query(5, ge=1, le=20, description="Number of results"),
|
|
regulations: Optional[str] = Query(None, description="Comma-separated regulation codes to filter"),
|
|
):
|
|
"""Semantic search in legal corpus using BGE-M3 embeddings."""
|
|
async with httpx.AsyncClient(timeout=60.0) as client:
|
|
try:
|
|
embed_res = await client.post(
|
|
f"{EMBEDDING_SERVICE_URL}/embed",
|
|
json={"texts": [query]},
|
|
)
|
|
if embed_res.status_code != 200:
|
|
raise HTTPException(status_code=500, detail="Embedding service error")
|
|
|
|
embed_data = embed_res.json()
|
|
query_vector = embed_data["embeddings"][0]
|
|
|
|
search_request = {
|
|
"vector": query_vector,
|
|
"limit": top_k,
|
|
"with_payload": True,
|
|
}
|
|
|
|
if regulations:
|
|
reg_codes = [r.strip() for r in regulations.split(",")]
|
|
search_request["filter"] = {
|
|
"should": [
|
|
{"key": "regulation_code", "match": {"value": code}}
|
|
for code in reg_codes
|
|
]
|
|
}
|
|
|
|
search_res = await client.post(
|
|
f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/search",
|
|
json=search_request,
|
|
)
|
|
|
|
if search_res.status_code != 200:
|
|
raise HTTPException(status_code=500, detail="Search failed")
|
|
|
|
search_data = search_res.json()
|
|
results = []
|
|
for point in search_data.get("result", []):
|
|
payload = point.get("payload", {})
|
|
results.append({
|
|
"text": payload.get("text", ""),
|
|
"regulation_code": payload.get("regulation_code", ""),
|
|
"regulation_name": payload.get("regulation_name", ""),
|
|
"article": payload.get("article"),
|
|
"paragraph": payload.get("paragraph"),
|
|
"source_url": payload.get("source_url", ""),
|
|
"score": point.get("score", 0),
|
|
})
|
|
|
|
return {"results": results, "query": query, "count": len(results)}
|
|
|
|
except httpx.RequestError as e:
|
|
logger.error(f"Search failed: {e}")
|
|
raise HTTPException(status_code=503, detail=f"Service not available: {str(e)}")
|
|
|
|
|
|
@router.post("/ingest")
|
|
async def trigger_ingestion(request: IngestRequest, background_tasks: BackgroundTasks):
|
|
"""Trigger legal corpus ingestion in background."""
|
|
global ingestion_state
|
|
|
|
if ingestion_state["running"]:
|
|
raise HTTPException(status_code=409, detail="Ingestion already running")
|
|
|
|
ingestion_state = {
|
|
"running": True,
|
|
"completed": False,
|
|
"current_regulation": None,
|
|
"processed": 0,
|
|
"total": len(REGULATIONS),
|
|
"error": None,
|
|
}
|
|
|
|
background_tasks.add_task(run_ingestion, request.force, request.regulations, ingestion_state, REGULATIONS)
|
|
|
|
return {
|
|
"status": "started",
|
|
"job_id": "manual-trigger",
|
|
"message": f"Ingestion started for {len(REGULATIONS)} regulations",
|
|
}
|
|
|
|
|
|
@router.get("/ingestion-status")
|
|
async def get_ingestion_status():
|
|
"""Get current ingestion status."""
|
|
return ingestion_state
|
|
|
|
|
|
@router.get("/regulations")
|
|
async def get_regulations():
|
|
"""Get list of all supported regulations."""
|
|
return {"regulations": REGULATIONS}
|
|
|
|
|
|
@router.get("/custom-documents")
|
|
async def get_custom_documents():
|
|
"""Get list of custom documents added by user."""
|
|
return {"documents": custom_documents}
|
|
|
|
|
|
@router.post("/upload")
|
|
async def upload_document(
|
|
background_tasks: BackgroundTasks,
|
|
file: UploadFile = File(...),
|
|
title: str = Form(...),
|
|
code: str = Form(...),
|
|
document_type: str = Form("custom"),
|
|
):
|
|
"""Upload a document (PDF) for ingestion into the legal corpus."""
|
|
global custom_documents
|
|
|
|
if not file.filename.endswith(('.pdf', '.PDF')):
|
|
raise HTTPException(status_code=400, detail="Only PDF files are supported")
|
|
|
|
upload_dir = "/tmp/legal_corpus_uploads"
|
|
os.makedirs(upload_dir, exist_ok=True)
|
|
|
|
doc_id = str(uuid.uuid4())[:8]
|
|
safe_filename = f"{doc_id}_{file.filename}"
|
|
file_path = os.path.join(upload_dir, safe_filename)
|
|
|
|
try:
|
|
with open(file_path, "wb") as buffer:
|
|
shutil.copyfileobj(file.file, buffer)
|
|
except Exception as e:
|
|
logger.error(f"Failed to save uploaded file: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Failed to save file: {str(e)}")
|
|
|
|
doc_record = {
|
|
"id": doc_id,
|
|
"code": code,
|
|
"title": title,
|
|
"filename": file.filename,
|
|
"file_path": file_path,
|
|
"document_type": document_type,
|
|
"uploaded_at": datetime.now().isoformat(),
|
|
"status": "uploaded",
|
|
"chunk_count": 0,
|
|
}
|
|
|
|
custom_documents.append(doc_record)
|
|
background_tasks.add_task(ingest_uploaded_document, doc_record)
|
|
|
|
return {
|
|
"status": "uploaded",
|
|
"document_id": doc_id,
|
|
"message": f"Document '{title}' uploaded and queued for ingestion",
|
|
"document": doc_record,
|
|
}
|
|
|
|
|
|
|
|
@router.post("/add-link")
|
|
async def add_link(request: AddLinkRequest, background_tasks: BackgroundTasks):
|
|
"""Add a URL/link for ingestion into the legal corpus."""
|
|
global custom_documents
|
|
|
|
doc_id = str(uuid.uuid4())[:8]
|
|
doc_record = {
|
|
"id": doc_id,
|
|
"code": request.code,
|
|
"title": request.title,
|
|
"url": request.url,
|
|
"document_type": request.document_type,
|
|
"uploaded_at": datetime.now().isoformat(),
|
|
"status": "queued",
|
|
"chunk_count": 0,
|
|
}
|
|
|
|
custom_documents.append(doc_record)
|
|
background_tasks.add_task(ingest_link_document, doc_record)
|
|
|
|
return {
|
|
"status": "queued",
|
|
"document_id": doc_id,
|
|
"message": f"Link '{request.title}' queued for ingestion",
|
|
"document": doc_record,
|
|
}
|
|
|
|
|
|
|
|
@router.delete("/custom-documents/{doc_id}")
|
|
async def delete_custom_document(doc_id: str):
|
|
"""Delete a custom document from the list."""
|
|
global custom_documents
|
|
|
|
doc = next((d for d in custom_documents if d["id"] == doc_id), None)
|
|
if not doc:
|
|
raise HTTPException(status_code=404, detail="Document not found")
|
|
|
|
custom_documents = [d for d in custom_documents if d["id"] != doc_id]
|
|
|
|
return {"status": "deleted", "document_id": doc_id}
|
|
|
|
|
|
@router.get("/traceability")
|
|
async def get_traceability(
|
|
chunk_id: str = Query(..., description="Chunk ID or identifier"),
|
|
regulation: str = Query(..., description="Regulation code"),
|
|
):
|
|
"""Get traceability information for a specific chunk."""
|
|
async with httpx.AsyncClient(timeout=30.0) as client:
|
|
try:
|
|
return {
|
|
"chunk_id": chunk_id,
|
|
"regulation": regulation,
|
|
"requirements": [],
|
|
"controls": [],
|
|
"message": "Traceability-Daten werden verfuegbar sein, sobald die Requirements-Extraktion und Control-Ableitung implementiert sind."
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to get traceability: {e}")
|
|
raise HTTPException(status_code=500, detail=f"Traceability lookup failed: {str(e)}")
|