[split-required] Split 700-870 LOC files across all services

backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions

View File

@@ -0,0 +1,368 @@
"""
Legal Corpus API - Corpus Routes
Endpoints for the RAG page in admin-v2:
- GET /status - Collection status with chunk counts
- GET /search - Semantic search
- POST /ingest - Trigger ingestion
- GET /ingestion-status - Ingestion status
- GET /regulations - List regulations
- GET /custom-documents - List custom docs
- POST /upload - Upload document
- POST /add-link - Add link for ingestion
- DELETE /custom-documents/{id} - Delete custom doc
- GET /traceability - Traceability info
Extracted from legal_corpus_api.py to keep files under 500 LOC.
"""
import asyncio
import logging
import os
import shutil
import uuid
from datetime import datetime
from typing import Optional, List, Dict, Any

import httpx
from fastapi import APIRouter, HTTPException, Query, BackgroundTasks, UploadFile, File, Form
from pydantic import BaseModel

from legal_corpus_ingest_tasks import (
    ingest_uploaded_document,
    ingest_link_document,
    run_ingestion,
)
logger = logging.getLogger(__name__)
# Configuration (overridable via environment for docker-compose / k8s)
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
EMBEDDING_SERVICE_URL = os.getenv("EMBEDDING_SERVICE_URL", "http://embedding-service:8087")
COLLECTION_NAME = "bp_legal_corpus"
# All regulations for status endpoint.  `code` is the canonical key used in
# Qdrant payload filters (`regulation_code`); `type` groups by legal source.
REGULATIONS = [
    {"code": "GDPR", "name": "DSGVO", "fullName": "Datenschutz-Grundverordnung", "type": "eu_regulation"},
    {"code": "EPRIVACY", "name": "ePrivacy-Richtlinie", "fullName": "Richtlinie 2002/58/EG", "type": "eu_directive"},
    {"code": "TDDDG", "name": "TDDDG", "fullName": "Telekommunikation-Digitale-Dienste-Datenschutz-Gesetz", "type": "de_law"},
    {"code": "SCC", "name": "Standardvertragsklauseln", "fullName": "2021/914/EU", "type": "eu_regulation"},
    {"code": "DPF", "name": "EU-US Data Privacy Framework", "fullName": "Angemessenheitsbeschluss", "type": "eu_regulation"},
    {"code": "AIACT", "name": "EU AI Act", "fullName": "Verordnung (EU) 2024/1689", "type": "eu_regulation"},
    {"code": "CRA", "name": "Cyber Resilience Act", "fullName": "Verordnung (EU) 2024/2847", "type": "eu_regulation"},
    {"code": "NIS2", "name": "NIS2-Richtlinie", "fullName": "Richtlinie (EU) 2022/2555", "type": "eu_directive"},
    {"code": "EUCSA", "name": "EU Cybersecurity Act", "fullName": "Verordnung (EU) 2019/881", "type": "eu_regulation"},
    {"code": "DATAACT", "name": "Data Act", "fullName": "Verordnung (EU) 2023/2854", "type": "eu_regulation"},
    {"code": "DGA", "name": "Data Governance Act", "fullName": "Verordnung (EU) 2022/868", "type": "eu_regulation"},
    {"code": "DSA", "name": "Digital Services Act", "fullName": "Verordnung (EU) 2022/2065", "type": "eu_regulation"},
    {"code": "EAA", "name": "European Accessibility Act", "fullName": "Richtlinie (EU) 2019/882", "type": "eu_directive"},
    {"code": "DSM", "name": "DSM-Urheberrechtsrichtlinie", "fullName": "Richtlinie (EU) 2019/790", "type": "eu_directive"},
    {"code": "PLD", "name": "Produkthaftungsrichtlinie", "fullName": "Richtlinie 85/374/EWG", "type": "eu_directive"},
    {"code": "GPSR", "name": "General Product Safety", "fullName": "Verordnung (EU) 2023/988", "type": "eu_regulation"},
    {"code": "BSI-TR-03161-1", "name": "BSI-TR Teil 1", "fullName": "BSI TR-03161 Teil 1 - Mobile Anwendungen", "type": "bsi_standard"},
    {"code": "BSI-TR-03161-2", "name": "BSI-TR Teil 2", "fullName": "BSI TR-03161 Teil 2 - Web-Anwendungen", "type": "bsi_standard"},
    {"code": "BSI-TR-03161-3", "name": "BSI-TR Teil 3", "fullName": "BSI TR-03161 Teil 3 - Hintergrundsysteme", "type": "bsi_standard"},
]
# Ingestion state (in-memory for now).  Rebound wholesale by /ingest and
# mutated by the background task; /ingestion-status returns it verbatim.
# NOTE(review): lost on process restart and not shared across workers —
# a single-process deployment is presumably assumed.
ingestion_state: Dict[str, Any] = {
    "running": False,
    "completed": False,
    "current_regulation": None,
    "processed": 0,
    "total": len(REGULATIONS),
    "error": None,
}
class SearchRequest(BaseModel):
    """Request body for semantic search (currently unused by the GET /search route, which takes query params)."""
    query: str
    # Optional regulation-code filter; None = search the whole corpus.
    regulations: Optional[List[str]] = None
    # Number of results to return.
    top_k: int = 5
class IngestRequest(BaseModel):
    """Request body for POST /ingest."""
    # Re-ingest even if chunks already exist (passed through to run_ingestion).
    force: bool = False
    # Restrict ingestion to these regulation codes; None = all.
    regulations: Optional[List[str]] = None
class AddLinkRequest(BaseModel):
    """Request body for POST /add-link: a URL to fetch and ingest."""
    url: str
    title: str
    # Short identifier stored as the document's regulation/document code.
    code: str
    document_type: str = "custom"
# Store for custom documents (in-memory for now).  Rebound by the delete
# endpoint and appended to by upload/add-link; not persisted across restarts.
custom_documents: List[Dict[str, Any]] = []
router = APIRouter(prefix="/api/v1/admin/legal-corpus", tags=["legal-corpus"])
@router.get("/status")
async def get_legal_corpus_status():
    """Get status of the legal corpus collection including chunk counts per regulation.

    Returns the collection name, total point count, vector size, Qdrant
    collection status and a mapping of regulation code -> chunk count.
    Returns a zeroed "not_found" payload when the collection does not exist.

    Raises:
        HTTPException 503: when Qdrant is unreachable.
    """
    async with httpx.AsyncClient(timeout=30.0) as client:
        try:
            collection_res = await client.get(f"{QDRANT_URL}/collections/{COLLECTION_NAME}")
            if collection_res.status_code != 200:
                # Collection not created yet - report an empty corpus instead of erroring.
                return {
                    "collection": COLLECTION_NAME,
                    "totalPoints": 0,
                    "vectorSize": 1024,
                    "status": "not_found",
                    "regulations": {},
                }
            result = collection_res.json().get("result", {})

            async def _count_chunks(reg: Dict[str, Any]) -> int:
                """Count points tagged with this regulation code; 0 on a non-200 reply."""
                count_res = await client.post(
                    f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/count",
                    json={
                        "filter": {
                            "must": [{"key": "regulation_code", "match": {"value": reg["code"]}}]
                        }
                    },
                )
                if count_res.status_code != 200:
                    return 0
                return count_res.json().get("result", {}).get("count", 0)

            # Issue all per-regulation count queries concurrently instead of
            # one sequential round trip per regulation (19 serial requests
            # in the original implementation).
            counts = await asyncio.gather(*(_count_chunks(reg) for reg in REGULATIONS))
            regulation_counts = {reg["code"]: n for reg, n in zip(REGULATIONS, counts)}
            return {
                "collection": COLLECTION_NAME,
                "totalPoints": result.get("points_count", 0),
                "vectorSize": result.get("config", {}).get("params", {}).get("vectors", {}).get("size", 1024),
                "status": result.get("status", "unknown"),
                "regulations": regulation_counts,
            }
        except httpx.RequestError as e:
            logger.error(f"Failed to get Qdrant status: {e}")
            raise HTTPException(status_code=503, detail=f"Qdrant not available: {str(e)}")
@router.get("/search")
async def search_legal_corpus(
    query: str = Query(..., description="Search query"),
    top_k: int = Query(5, ge=1, le=20, description="Number of results"),
    regulations: Optional[str] = Query(None, description="Comma-separated regulation codes to filter"),
):
    """Semantic search in legal corpus using BGE-M3 embeddings.

    Embeds the query via the embedding service, runs a vector search in
    Qdrant (optionally filtered to the given regulation codes) and returns
    the matching chunks with their payload fields and similarity scores.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        try:
            # Step 1: turn the query text into an embedding vector.
            embedding_response = await client.post(
                f"{EMBEDDING_SERVICE_URL}/embed",
                json={"texts": [query]},
            )
            if embedding_response.status_code != 200:
                raise HTTPException(status_code=500, detail="Embedding service error")
            query_vector = embedding_response.json()["embeddings"][0]

            # Step 2: assemble the Qdrant search request.
            body: Dict[str, Any] = {
                "vector": query_vector,
                "limit": top_k,
                "with_payload": True,
            }
            if regulations:
                # "should" = OR-match across the requested regulation codes.
                body["filter"] = {
                    "should": [
                        {"key": "regulation_code", "match": {"value": code.strip()}}
                        for code in regulations.split(",")
                    ]
                }

            qdrant_response = await client.post(
                f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points/search",
                json=body,
            )
            if qdrant_response.status_code != 200:
                raise HTTPException(status_code=500, detail="Search failed")

            # Step 3: flatten Qdrant points into the API response shape.
            hits = []
            for point in qdrant_response.json().get("result", []):
                payload = point.get("payload", {})
                hits.append({
                    "text": payload.get("text", ""),
                    "regulation_code": payload.get("regulation_code", ""),
                    "regulation_name": payload.get("regulation_name", ""),
                    "article": payload.get("article"),
                    "paragraph": payload.get("paragraph"),
                    "source_url": payload.get("source_url", ""),
                    "score": point.get("score", 0),
                })
            return {"results": hits, "query": query, "count": len(hits)}
        except httpx.RequestError as e:
            logger.error(f"Search failed: {e}")
            raise HTTPException(status_code=503, detail=f"Service not available: {str(e)}")
@router.post("/ingest")
async def trigger_ingestion(request: IngestRequest, background_tasks: BackgroundTasks):
    """Trigger legal corpus ingestion in background.

    Rejects with 409 when an ingestion run is already in flight; otherwise
    resets the shared progress state and schedules `run_ingestion`.
    """
    global ingestion_state
    # Only one ingestion run may be active at a time.
    if ingestion_state["running"]:
        raise HTTPException(status_code=409, detail="Ingestion already running")
    # Fresh progress tracker; the background task mutates it as it goes.
    ingestion_state = dict(
        running=True,
        completed=False,
        current_regulation=None,
        processed=0,
        total=len(REGULATIONS),
        error=None,
    )
    background_tasks.add_task(run_ingestion, request.force, request.regulations, ingestion_state, REGULATIONS)
    return {
        "status": "started",
        "job_id": "manual-trigger",
        "message": f"Ingestion started for {len(REGULATIONS)} regulations",
    }
@router.get("/ingestion-status")
async def get_ingestion_status():
    """Get current ingestion status (the module-level progress dict updated by the background task)."""
    return ingestion_state
@router.get("/regulations")
async def get_regulations():
    """Get list of all supported regulations (the static REGULATIONS table)."""
    return {"regulations": REGULATIONS}
@router.get("/custom-documents")
async def get_custom_documents():
    """Get list of custom documents added by user (in-memory; cleared on restart)."""
    return {"documents": custom_documents}
@router.post("/upload")
async def upload_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    title: str = Form(...),
    code: str = Form(...),
    document_type: str = Form("custom"),
):
    """Upload a document (PDF) for ingestion into the legal corpus.

    Saves the file under /tmp/legal_corpus_uploads with a short random id
    prefix, records it in the in-memory custom document list and queues
    background ingestion.

    Raises:
        HTTPException 400: when the upload is not a PDF.
        HTTPException 500: when the file cannot be written to disk.
    """
    global custom_documents
    # Guard against a missing filename and accept any capitalisation of
    # ".pdf" — the original endswith(('.pdf', '.PDF')) check rejected
    # mixed-case names such as "doc.Pdf" and crashed on filename=None.
    if not file.filename or not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")
    upload_dir = "/tmp/legal_corpus_uploads"
    os.makedirs(upload_dir, exist_ok=True)
    doc_id = str(uuid.uuid4())[:8]
    # basename() strips any client-supplied directory components so a
    # filename like "../../etc/x.pdf" cannot escape the upload directory.
    safe_filename = f"{doc_id}_{os.path.basename(file.filename)}"
    file_path = os.path.join(upload_dir, safe_filename)
    try:
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    except Exception as e:
        logger.error(f"Failed to save uploaded file: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to save file: {str(e)}")
    doc_record = {
        "id": doc_id,
        "code": code,
        "title": title,
        "filename": file.filename,
        "file_path": file_path,
        "document_type": document_type,
        "uploaded_at": datetime.now().isoformat(),
        "status": "uploaded",
        "chunk_count": 0,
    }
    custom_documents.append(doc_record)
    # Chunking/embedding happens asynchronously; the request returns at once.
    background_tasks.add_task(ingest_uploaded_document, doc_record)
    return {
        "status": "uploaded",
        "document_id": doc_id,
        "message": f"Document '{title}' uploaded and queued for ingestion",
        "document": doc_record,
    }
@router.post("/add-link")
async def add_link(request: AddLinkRequest, background_tasks: BackgroundTasks):
    """Add a URL/link for ingestion into the legal corpus.

    Registers the link in the in-memory custom document list and schedules
    background ingestion of its content.
    """
    global custom_documents
    record: Dict[str, Any] = {
        "id": str(uuid.uuid4())[:8],
        "code": request.code,
        "title": request.title,
        "url": request.url,
        "document_type": request.document_type,
        "uploaded_at": datetime.now().isoformat(),
        "status": "queued",
        "chunk_count": 0,
    }
    custom_documents.append(record)
    # Fetch + ingestion run asynchronously; the request returns immediately.
    background_tasks.add_task(ingest_link_document, record)
    return {
        "status": "queued",
        "document_id": record["id"],
        "message": f"Link '{request.title}' queued for ingestion",
        "document": record,
    }
@router.delete("/custom-documents/{doc_id}")
async def delete_custom_document(doc_id: str):
    """Delete a custom document from the list.

    Raises:
        HTTPException 404: when no document with the given id exists.
    """
    global custom_documents
    # 404 unless the id is present in the in-memory registry.
    if not any(entry["id"] == doc_id for entry in custom_documents):
        raise HTTPException(status_code=404, detail="Document not found")
    # Rebind the module global to a filtered copy rather than mutating in place.
    custom_documents = [entry for entry in custom_documents if entry["id"] != doc_id]
    return {"status": "deleted", "document_id": doc_id}
@router.get("/traceability")
async def get_traceability(
    chunk_id: str = Query(..., description="Chunk ID or identifier"),
    regulation: str = Query(..., description="Regulation code"),
):
    """Get traceability information for a specific chunk.

    Placeholder endpoint: requirements extraction and control derivation are
    not implemented yet, so the requirement/control lists are always empty.
    """
    # The original opened an httpx.AsyncClient it never used and wrapped this
    # static return in a try/except that could not trigger; both were dead
    # code and were removed without changing the response.
    return {
        "chunk_id": chunk_id,
        "regulation": regulation,
        "requirements": [],
        "controls": [],
        "message": "Traceability-Daten werden verfuegbar sein, sobald die Requirements-Extraktion und Control-Ableitung implementiert sind."
    }