Files
breakpilot-lehrer/klausur-service/backend/zeugnis_api.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

538 lines
18 KiB
Python

"""
Zeugnis Rights-Aware Crawler - API Endpoints
FastAPI router for managing zeugnis sources, documents, and crawler operations.
"""
from datetime import datetime, timedelta
from typing import Optional, List
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
from pydantic import BaseModel
from zeugnis_models import (
ZeugnisSource, ZeugnisSourceCreate, ZeugnisSourceVerify,
SeedUrl, SeedUrlCreate,
ZeugnisDocument, ZeugnisStats,
CrawlerStatus, CrawlRequest, CrawlQueueItem,
UsageEvent, AuditExport,
LicenseType, CrawlStatus, DocType, EventType,
BUNDESLAENDER, TRAINING_PERMISSIONS,
generate_id, get_training_allowed, get_bundesland_name, get_license_for_bundesland,
)
from zeugnis_crawler import (
start_crawler, stop_crawler, get_crawler_status,
)
from metrics_db import (
get_zeugnis_sources, upsert_zeugnis_source,
get_zeugnis_documents, get_zeugnis_stats,
log_zeugnis_event, get_pool,
)
# Router that collects every endpoint defined in this module; all routes are
# registered under the "/api/v1/admin/zeugnis" prefix with the "Zeugnis Crawler" tag.
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
# =============================================================================
# Sources Endpoints
# =============================================================================
@router.get("/sources", response_model=List[dict])
async def list_sources():
    """Return the stored zeugnis sources, or per-Bundesland defaults.

    When ``get_zeugnis_sources`` yields nothing, one placeholder entry is
    synthesized for every code in ``BUNDESLAENDER``; all persisted fields
    (id, URLs, verification and timestamps) are ``None`` in that case.
    """
    stored = await get_zeugnis_sources()
    if stored:
        return stored

    # Nothing persisted yet: derive one default entry per Bundesland.
    defaults = []
    for code, info in BUNDESLAENDER.items():
        defaults.append({
            "id": None,
            "bundesland": code,
            "name": info["name"],
            "base_url": None,
            "license_type": str(get_license_for_bundesland(code).value),
            "training_allowed": get_training_allowed(code),
            "verified_by": None,
            "verified_at": None,
            "created_at": None,
            "updated_at": None,
        })
    return defaults
@router.post("/sources", response_model=dict)
async def create_source(source: ZeugnisSourceCreate):
    """Create or update a zeugnis source from the request payload.

    A fresh id is generated for every call and handed to
    ``upsert_zeugnis_source`` together with the payload fields.

    Raises:
        HTTPException: 500 when the upsert reports failure.
    """
    new_id = generate_id()
    stored = await upsert_zeugnis_source(
        id=new_id,
        bundesland=source.bundesland,
        name=source.name,
        license_type=source.license_type.value,
        training_allowed=source.training_allowed,
        base_url=source.base_url,
    )
    if stored:
        return {"id": new_id, "success": True}
    raise HTTPException(status_code=500, detail="Failed to create source")
@router.put("/sources/{source_id}/verify", response_model=dict)
async def verify_source(source_id: str, verification: ZeugnisSourceVerify):
    """Record a license verification for an existing source.

    Updates ``license_type``, ``training_allowed`` and ``verified_by`` on the
    row with the given id and stamps ``verified_at`` / ``updated_at`` with the
    database's ``NOW()``.

    Returns:
        ``{"success": True, "source_id": source_id}`` when a row was updated.

    Raises:
        HTTPException: 503 when no database pool is available, 404 when no
            source with ``source_id`` exists, 500 on any database error.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            # RETURNING id lets us tell "updated" apart from "no such row";
            # a plain UPDATE succeeds silently when nothing matches.
            updated_id = await conn.fetchval(
                """
                UPDATE zeugnis_sources
                SET license_type = $2,
                    training_allowed = $3,
                    verified_by = $4,
                    verified_at = NOW(),
                    updated_at = NOW()
                WHERE id = $1
                RETURNING id
                """,
                source_id, verification.license_type.value,
                verification.training_allowed, verification.verified_by
            )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    if updated_id is None:
        # Do not claim a verification happened when no source was touched.
        raise HTTPException(status_code=404, detail=f"Source not found: {source_id}")
    return {"success": True, "source_id": source_id}
def _default_source_info(bundesland: str) -> dict:
    """Build the fallback source description for a known Bundesland code.

    Raises:
        HTTPException: 404 when ``bundesland`` is not a key of ``BUNDESLAENDER``.
    """
    if bundesland not in BUNDESLAENDER:
        raise HTTPException(status_code=404, detail=f"Bundesland not found: {bundesland}")
    return {
        "bundesland": bundesland,
        "name": get_bundesland_name(bundesland),
        "training_allowed": get_training_allowed(bundesland),
        "license_type": get_license_for_bundesland(bundesland).value,
        "document_count": 0,
    }


@router.get("/sources/{bundesland}", response_model=dict)
async def get_source_by_bundesland(bundesland: str):
    """Get source details for a specific Bundesland.

    If a matching row exists in ``zeugnis_sources`` it is returned together
    with a ``document_count`` of the documents reachable through the source's
    seed URLs. Otherwise (no database pool, or no stored row) default values
    derived from ``BUNDESLAENDER`` are returned.

    Raises:
        HTTPException: 404 when there is no stored source and the code is not
            a known Bundesland (now applied consistently whether or not a
            database is available), 500 on any database error.
    """
    pool = await get_pool()
    if not pool:
        return _default_source_info(bundesland)
    try:
        async with pool.acquire() as conn:
            source = await conn.fetchrow(
                "SELECT * FROM zeugnis_sources WHERE bundesland = $1",
                bundesland
            )
            doc_count = None
            if source:
                doc_count = await conn.fetchval(
                    """
                    SELECT COUNT(*) FROM zeugnis_documents d
                    JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                    WHERE u.source_id = $1
                    """,
                    source["id"]
                )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    if source:
        return {**dict(source), "document_count": doc_count or 0}
    # No stored row: fall back to defaults, rejecting unknown codes with 404.
    return _default_source_info(bundesland)
# =============================================================================
# Seed URLs Endpoints
# =============================================================================
@router.get("/sources/{source_id}/urls", response_model=List[dict])
async def list_seed_urls(source_id: str):
    """List the seed URLs of one source, ordered by ``created_at``.

    Returns an empty list when no database pool is available.

    Raises:
        HTTPException: 500 on any database error.
    """
    db = await get_pool()
    if not db:
        return []
    sql = "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 ORDER BY created_at"
    try:
        async with db.acquire() as conn:
            records = await conn.fetch(sql, source_id)
            return list(map(dict, records))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/sources/{source_id}/urls", response_model=dict)
async def add_seed_url(source_id: str, seed_url: SeedUrlCreate):
    """Insert a new seed URL for a source with status ``'pending'``.

    Returns:
        ``{"id": <generated id>, "success": True}``.

    Raises:
        HTTPException: 503 when no database pool is available, 500 on any
            database error.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")
    new_id = generate_id()
    insert_sql = """
        INSERT INTO zeugnis_seed_urls (id, source_id, url, doc_type, status)
        VALUES ($1, $2, $3, $4, 'pending')
        """
    try:
        async with db.acquire() as conn:
            await conn.execute(
                insert_sql,
                new_id,
                source_id,
                seed_url.url,
                seed_url.doc_type.value,
            )
            return {"id": new_id, "success": True}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@router.delete("/urls/{url_id}", response_model=dict)
async def delete_seed_url(url_id: str):
    """Delete the seed URL with the given id.

    The call reports success whether or not a row matched.

    Raises:
        HTTPException: 503 when no database pool is available, 500 on any
            database error.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")
    delete_sql = "DELETE FROM zeugnis_seed_urls WHERE id = $1"
    try:
        async with db.acquire() as conn:
            await conn.execute(delete_sql, url_id)
            return {"success": True}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
# =============================================================================
# Documents Endpoints
# =============================================================================
@router.get("/documents", response_model=List[dict])
async def list_documents(
    bundesland: Optional[str] = None,
    limit: int = Query(100, le=500),
    offset: int = 0,
):
    """Return zeugnis documents, optionally filtered by Bundesland.

    Filtering and paging are delegated to ``get_zeugnis_documents``.
    """
    return await get_zeugnis_documents(
        bundesland=bundesland,
        limit=limit,
        offset=offset,
    )
@router.get("/documents/{document_id}", response_model=dict)
async def get_document(document_id: str):
    """Fetch one document joined with its source's Bundesland and name.

    A ``VIEWED`` event is logged for every successful lookup.

    Raises:
        HTTPException: 503 when no database pool is available, 404 when the
            document does not exist, 500 on any other error.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")
    lookup_sql = """
        SELECT d.*, s.bundesland, s.name as source_name
        FROM zeugnis_documents d
        JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
        JOIN zeugnis_sources s ON u.source_id = s.id
        WHERE d.id = $1
        """
    try:
        async with db.acquire() as conn:
            record = await conn.fetchrow(lookup_sql, document_id)
            if record:
                # Record the access before handing the document back.
                await log_zeugnis_event(document_id, EventType.VIEWED.value)
                return dict(record)
            raise HTTPException(status_code=404, detail="Document not found")
    except HTTPException:
        # Keep the 404 intact instead of rewrapping it as a 500.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@router.get("/documents/{document_id}/versions", response_model=List[dict])
async def get_document_versions(document_id: str):
    """Return all stored versions of a document, newest version first.

    Raises:
        HTTPException: 503 when no database pool is available, 500 on any
            database error.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")
    versions_sql = """
        SELECT * FROM zeugnis_document_versions
        WHERE document_id = $1
        ORDER BY version DESC
        """
    try:
        async with db.acquire() as conn:
            records = await conn.fetch(versions_sql, document_id)
            return list(map(dict, records))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
# =============================================================================
# Crawler Control Endpoints
# =============================================================================
@router.get("/crawler/status", response_model=dict)
async def crawler_status():
    """Report the crawler's current status as returned by ``get_crawler_status``."""
    current = get_crawler_status()
    return current
@router.post("/crawler/start", response_model=dict)
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    """Start the crawler for the requested Bundesland and/or source.

    Raises:
        HTTPException: 409 when ``start_crawler`` reports it could not start
            (surfaced to the client as "Crawler already running").
    """
    started = await start_crawler(
        bundesland=request.bundesland,
        source_id=request.source_id,
    )
    if started:
        return {"success": True, "message": "Crawler started"}
    raise HTTPException(status_code=409, detail="Crawler already running")
@router.post("/crawler/stop", response_model=dict)
async def stop_crawl():
    """Stop the crawler.

    Raises:
        HTTPException: 409 when ``stop_crawler`` reports nothing was stopped
            (surfaced to the client as "Crawler not running").
    """
    stopped = await stop_crawler()
    if stopped:
        return {"success": True, "message": "Crawler stopped"}
    raise HTTPException(status_code=409, detail="Crawler not running")
@router.get("/crawler/queue", response_model=List[dict])
async def get_queue():
    """Return the crawler queue joined with each entry's source details.

    Entries are ordered by descending priority, then by creation time.
    Returns an empty list when no database pool is available.

    Raises:
        HTTPException: 500 on any database error.
    """
    db = await get_pool()
    if not db:
        return []
    queue_sql = """
        SELECT q.*, s.bundesland, s.name as source_name
        FROM zeugnis_crawler_queue q
        JOIN zeugnis_sources s ON q.source_id = s.id
        ORDER BY q.priority DESC, q.created_at
        """
    try:
        async with db.acquire() as conn:
            records = await conn.fetch(queue_sql)
            return list(map(dict, records))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/crawler/queue", response_model=dict)
async def add_to_queue(request: CrawlRequest):
    """Queue a source for crawling with status ``'pending'``.

    The source is taken from ``request.source_id``; when that is empty and a
    Bundesland is given, the source id is looked up by Bundesland instead.

    Raises:
        HTTPException: 503 when no database pool is available, 400 when no
            source could be resolved, 500 on any database error.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")
    entry_id = generate_id()
    try:
        async with db.acquire() as conn:
            target_source = request.source_id
            # Fall back to resolving the source through its Bundesland code.
            if request.bundesland and not target_source:
                match = await conn.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    request.bundesland,
                )
                if match:
                    target_source = match["id"]
            if not target_source:
                raise HTTPException(status_code=400, detail="Source not found")
            await conn.execute(
                """
                INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
                VALUES ($1, $2, $3, 'pending')
                """,
                entry_id,
                target_source,
                request.priority,
            )
            return {"id": entry_id, "success": True}
    except HTTPException:
        # Keep the 400 intact instead of rewrapping it as a 500.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
# =============================================================================
# Statistics Endpoints
# =============================================================================
@router.get("/stats", response_model=dict)
async def get_stats():
    """Return the overall crawler statistics from ``get_zeugnis_stats``."""
    return await get_zeugnis_stats()
@router.get("/stats/bundesland", response_model=List[dict])
async def get_bundesland_stats():
    """Get statistics per Bundesland.

    One entry is produced for every code in ``BUNDESLAENDER``. Each entry
    starts with zero counts and ``last_crawled=None`` and is enriched with
    database figures when they can be read. Database access is best-effort:
    a failure for one Bundesland (or for the connection as a whole) is logged
    and leaves the affected entries at their defaults.

    Returns:
        A list of dicts with the keys ``bundesland``, ``name``,
        ``training_allowed``, ``document_count``, ``indexed_count`` and
        ``last_crawled`` (ISO-8601 string or ``None``).
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)
    pool = await get_pool()
    # Start from defaults so the response shape is the same with or without DB data.
    stats = [
        {
            "bundesland": code,
            "name": info["name"],
            "training_allowed": get_training_allowed(code),
            "document_count": 0,
            "indexed_count": 0,
            "last_crawled": None,
        }
        for code, info in BUNDESLAENDER.items()
    ]
    if not pool:
        return stats
    try:
        # Acquire a single connection for all lookups instead of one per Bundesland.
        async with pool.acquire() as conn:
            for stat in stats:
                try:
                    row = await conn.fetchrow(
                        """
                        SELECT
                            COUNT(d.id) as doc_count,
                            COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) as indexed_count,
                            MAX(u.last_crawled) as last_crawled
                        FROM zeugnis_sources s
                        LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
                        LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
                        WHERE s.bundesland = $1
                        GROUP BY s.id
                        """,
                        stat["bundesland"]
                    )
                    if row:
                        stat["document_count"] = row["doc_count"] or 0
                        stat["indexed_count"] = row["indexed_count"] or 0
                        stat["last_crawled"] = (
                            row["last_crawled"].isoformat() if row["last_crawled"] else None
                        )
                except Exception:
                    # Best-effort: keep the defaults for this entry, but leave a trace.
                    log.warning(
                        "Failed to load zeugnis stats for Bundesland %s",
                        stat["bundesland"],
                        exc_info=True,
                    )
    except Exception:
        # Connection acquire/release failed; fall back to whatever was collected.
        log.warning("Database unavailable while collecting Bundesland stats", exc_info=True)
    return stats
# =============================================================================
# Audit Endpoints
# =============================================================================
@router.get("/audit/events", response_model=List[dict])
async def get_audit_events(
    document_id: Optional[str] = None,
    event_type: Optional[str] = None,
    limit: int = Query(100, ge=0, le=1000),
    days: int = Query(30, le=365),
):
    """Get audit events with optional filtering.

    Events from the last ``days`` days are returned newest first, optionally
    restricted to one document and/or one event type, capped at ``limit``
    rows. Returns an empty list when no database pool is available.

    ``limit`` is validated as non-negative at the API boundary: a negative
    value would otherwise reach the SQL ``LIMIT`` clause and come back as a
    500 database error instead of a validation error.

    Raises:
        HTTPException: 500 on any database error.
    """
    pool = await get_pool()
    if not pool:
        return []
    try:
        since = datetime.now() - timedelta(days=days)
        conditions = ["created_at >= $1"]
        params = [since]
        # Column names are fixed literals; only the values are user-supplied
        # and they are always passed as bound parameters.
        for column, value in (("document_id", document_id), ("event_type", event_type)):
            if value:
                params.append(value)
                conditions.append(f"{column} = ${len(params)}")
        params.append(limit)
        query = (
            "SELECT * FROM zeugnis_usage_events WHERE "
            + " AND ".join(conditions)
            + f" ORDER BY created_at DESC LIMIT ${len(params)}"
        )
        async with pool.acquire() as conn:
            rows = await conn.fetch(query, *params)
        return [dict(r) for r in rows]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/audit/export", response_model=dict)
async def export_audit(
    days: int = Query(30, le=365),
    requested_by: str = Query(..., description="User requesting the export"),
):
    """Export the usage events of the last ``days`` days (GDPR audit export).

    The result contains all events newest first, the number of distinct
    documents they refer to, the requesting user and the covered date range.

    Raises:
        HTTPException: 503 when no database pool is available, 500 on any
            database error.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")
    events_sql = """
        SELECT * FROM zeugnis_usage_events
        WHERE created_at >= $1
        ORDER BY created_at DESC
        """
    distinct_docs_sql = (
        "SELECT COUNT(DISTINCT document_id) FROM zeugnis_usage_events WHERE created_at >= $1"
    )
    try:
        window_start = datetime.now() - timedelta(days=days)
        async with db.acquire() as conn:
            event_rows = await conn.fetch(events_sql, window_start)
            distinct_docs = await conn.fetchval(distinct_docs_sql, window_start)
            return {
                "export_date": datetime.now().isoformat(),
                "requested_by": requested_by,
                "events": list(map(dict, event_rows)),
                "document_count": distinct_docs or 0,
                "date_range_start": window_start.isoformat(),
                "date_range_end": datetime.now().isoformat(),
            }
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
# =============================================================================
# Initialization Endpoint
# =============================================================================
@router.post("/init", response_model=dict)
async def initialize_sources():
    """Upsert one default source for every entry in ``BUNDESLAENDER``.

    Returns:
        ``{"success": True, "sources_created": <number of successful upserts>}``.

    Raises:
        HTTPException: 503 when no database pool is available, 500 when an
            upsert raises.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")
    created = 0
    try:
        for code, info in BUNDESLAENDER.items():
            stored = await upsert_zeugnis_source(
                id=generate_id(),
                bundesland=code,
                name=info["name"],
                license_type=get_license_for_bundesland(code).value,
                training_allowed=get_training_allowed(code),
            )
            # Count only the upserts that reported success.
            created += 1 if stored else 0
        return {"success": True, "sources_created": created}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))