klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
322 lines
11 KiB
Python
322 lines
11 KiB
Python
"""
|
|
Zeugnis API Docs — documents, crawler control, statistics, audit endpoints.
|
|
|
|
Extracted from zeugnis_api.py for modularity.
|
|
"""
|
|
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional, List
|
|
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
|
|
|
|
from zeugnis_models import (
|
|
CrawlRequest, EventType,
|
|
BUNDESLAENDER,
|
|
generate_id, get_training_allowed, get_license_for_bundesland,
|
|
)
|
|
from zeugnis_crawler import (
|
|
start_crawler, stop_crawler, get_crawler_status,
|
|
)
|
|
from metrics_db import (
|
|
get_zeugnis_documents, get_zeugnis_stats,
|
|
log_zeugnis_event, get_pool,
|
|
)
|
|
|
|
|
|
# Router for the endpoints defined in this module; every route registered on
# it shares the "/api/v1/admin/zeugnis" prefix and the "Zeugnis Crawler" tag.
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
|
|
|
|
|
|
# =============================================================================
|
|
# Documents Endpoints
|
|
# =============================================================================
|
|
|
|
@router.get("/documents", response_model=List[dict])
async def list_documents(
    bundesland: Optional[str] = None,
    limit: int = Query(100, le=500),
    offset: int = 0,
):
    """Return zeugnis documents, optionally restricted to one Bundesland.

    The filter and paging values are forwarded unchanged to
    ``get_zeugnis_documents``; ``limit`` is capped at 500 by the query
    parameter declaration.
    """
    return await get_zeugnis_documents(
        bundesland=bundesland,
        limit=limit,
        offset=offset,
    )
|
|
|
|
|
|
@router.get("/documents/{document_id}", response_model=dict)
async def get_document(document_id: str):
    """Return a single document joined with its source's Bundesland and name.

    A successful lookup is recorded as a ``VIEWED`` event through
    ``log_zeugnis_event``.

    Args:
        document_id: Primary key of the row in ``zeugnis_documents``.

    Returns:
        The joined row converted to a plain ``dict``.

    Raises:
        HTTPException: 503 when no database pool is available, 404 when the
            document does not exist, 500 for any other failure.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")

    try:
        async with pool.acquire() as conn:
            doc = await conn.fetchrow(
                """
                SELECT d.*, s.bundesland, s.name as source_name
                FROM zeugnis_documents d
                JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                JOIN zeugnis_sources s ON u.source_id = s.id
                WHERE d.id = $1
                """,
                document_id,
            )

        if not doc:
            raise HTTPException(status_code=404, detail="Document not found")

        # Log the view event only after the connection has been returned to
        # the pool, so this request never holds a connection while awaiting
        # the logger (which presumably needs its own — TODO confirm).
        await log_zeugnis_event(document_id, EventType.VIEWED.value)

        return dict(doc)
    except HTTPException:
        # Deliberate HTTP errors (404 above) pass through untouched.
        raise
    except Exception as e:
        # Chain the original error so the traceback keeps the root cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|
|
|
@router.get("/documents/{document_id}/versions", response_model=List[dict])
async def get_document_versions(document_id: str):
    """Return every stored version of a document, newest version first.

    Raises:
        HTTPException: 503 when no database pool is available, 500 when the
            query fails.
    """
    db_pool = await get_pool()
    if not db_pool:
        raise HTTPException(status_code=503, detail="Database not available")

    versions_sql = """
                SELECT * FROM zeugnis_document_versions
                WHERE document_id = $1
                ORDER BY version DESC
                """
    try:
        async with db_pool.acquire() as connection:
            version_rows = await connection.fetch(versions_sql, document_id)
            history = []
            for version_row in version_rows:
                history.append(dict(version_row))
            return history
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
|
|
|
|
|
|
# =============================================================================
|
|
# Crawler Control Endpoints
|
|
# =============================================================================
|
|
|
|
@router.get("/crawler/status", response_model=dict)
async def crawler_status():
    """Report whatever ``get_crawler_status`` currently returns."""
    current_status = get_crawler_status()
    return current_status
|
|
|
|
|
|
@router.post("/crawler/start", response_model=dict)
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    """Ask the crawler to start for the requested Bundesland / source.

    ``background_tasks`` is accepted but not used by this handler.

    Raises:
        HTTPException: 409 when ``start_crawler`` reports failure.
    """
    started = await start_crawler(
        bundesland=request.bundesland,
        source_id=request.source_id,
    )
    if started:
        return {"success": True, "message": "Crawler started"}
    raise HTTPException(status_code=409, detail="Crawler already running")
|
|
|
|
|
|
@router.post("/crawler/stop", response_model=dict)
async def stop_crawl():
    """Ask the crawler to stop.

    Raises:
        HTTPException: 409 when ``stop_crawler`` reports failure.
    """
    stopped = await stop_crawler()
    if stopped:
        return {"success": True, "message": "Crawler stopped"}
    raise HTTPException(status_code=409, detail="Crawler not running")
|
|
|
|
|
|
@router.get("/crawler/queue", response_model=List[dict])
async def get_queue():
    """Return all crawler queue entries with their source's Bundesland and name.

    Entries are ordered by descending priority, then by creation time. An
    empty list is returned when no database pool is available.

    Raises:
        HTTPException: 500 when the query fails.
    """
    db_pool = await get_pool()
    if not db_pool:
        return []

    queue_sql = """
                SELECT q.*, s.bundesland, s.name as source_name
                FROM zeugnis_crawler_queue q
                JOIN zeugnis_sources s ON q.source_id = s.id
                ORDER BY q.priority DESC, q.created_at
                """
    try:
        async with db_pool.acquire() as connection:
            queue_rows = await connection.fetch(queue_sql)
            entries = []
            for queue_row in queue_rows:
                entries.append(dict(queue_row))
            return entries
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
|
|
|
|
|
|
@router.post("/crawler/queue", response_model=dict)
async def add_to_queue(request: CrawlRequest):
    """Insert a pending entry for a source into the crawler queue.

    The source is taken from ``request.source_id``; if that is empty and a
    Bundesland is given, the source is looked up by Bundesland instead.

    Returns:
        ``{"id": <new queue id>, "success": True}``.

    Raises:
        HTTPException: 503 when no database pool is available, 400 when no
            source could be determined, 500 for any other failure.
    """
    db_pool = await get_pool()
    if not db_pool:
        raise HTTPException(status_code=503, detail="Database not available")

    new_entry_id = generate_id()
    insert_sql = """
                INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
                VALUES ($1, $2, $3, 'pending')
                """
    try:
        async with db_pool.acquire() as connection:
            target_source = request.source_id

            # Fall back to a Bundesland lookup only when no explicit source id was sent.
            if not target_source and request.bundesland:
                match = await connection.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    request.bundesland,
                )
                if match:
                    target_source = match["id"]

            if not target_source:
                raise HTTPException(status_code=400, detail="Source not found")

            await connection.execute(
                insert_sql,
                new_entry_id,
                target_source,
                request.priority,
            )
            return {"id": new_entry_id, "success": True}
    except HTTPException:
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
|
|
|
|
|
|
# =============================================================================
|
|
# Statistics Endpoints
|
|
# =============================================================================
|
|
|
|
@router.get("/stats", response_model=dict)
async def get_stats():
    """Return the statistics produced by ``get_zeugnis_stats``."""
    return await get_zeugnis_stats()
|
|
|
|
|
|
@router.get("/stats/bundesland", response_model=List[dict])
async def get_bundesland_stats():
    """Return document statistics for every entry in ``BUNDESLAENDER``.

    Each result carries the Bundesland code, its name, the training
    permission from ``get_training_allowed``, and document/indexed counts plus
    the latest crawl timestamp from the database.

    The database part is best-effort: when no pool is available or the query
    fails, counts stay at 0 and ``last_crawled`` stays ``None``; a query
    failure is logged instead of being silently discarded.

    Returns:
        One dict per Bundesland, in ``BUNDESLAENDER`` iteration order.
    """
    import logging

    # Start from defaults so every Bundesland is present even without DB data.
    stats_by_code = {
        code: {
            "bundesland": code,
            "name": info["name"],
            "training_allowed": get_training_allowed(code),
            "document_count": 0,
            "indexed_count": 0,
            "last_crawled": None,
        }
        for code, info in BUNDESLAENDER.items()
    }

    pool = await get_pool()
    if pool:
        try:
            # One connection and one query for all Bundeslaender. Grouping by
            # bundesland (not by source id) also sums across several sources
            # that belong to the same Bundesland.
            async with pool.acquire() as conn:
                rows = await conn.fetch(
                    """
                    SELECT
                        s.bundesland,
                        COUNT(d.id) as doc_count,
                        COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) as indexed_count,
                        MAX(u.last_crawled) as last_crawled
                    FROM zeugnis_sources s
                    LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
                    LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
                    GROUP BY s.bundesland
                    """
                )

            for row in rows:
                stat = stats_by_code.get(row["bundesland"])
                if stat is None:
                    # Source rows for codes not listed in BUNDESLAENDER are ignored.
                    continue
                stat["document_count"] = row["doc_count"] or 0
                stat["indexed_count"] = row["indexed_count"] or 0
                stat["last_crawled"] = (
                    row["last_crawled"].isoformat() if row["last_crawled"] else None
                )
        except Exception:
            # Keep the endpoint best-effort, but leave a trace of the failure.
            logging.getLogger(__name__).warning(
                "Failed to load per-Bundesland statistics", exc_info=True
            )

    return list(stats_by_code.values())
|
|
|
|
|
|
# =============================================================================
|
|
# Audit Endpoints
|
|
# =============================================================================
|
|
|
|
@router.get("/audit/events", response_model=List[dict])
async def get_audit_events(
    document_id: Optional[str] = None,
    event_type: Optional[str] = None,
    limit: int = Query(100, le=1000),
    days: int = Query(30, le=365),
):
    """Return recent usage events, newest first.

    Only events created within the last ``days`` days are considered; the
    result can be narrowed by ``document_id`` and/or ``event_type`` and is cut
    off after ``limit`` rows. An empty list is returned when no database pool
    is available.

    Raises:
        HTTPException: 500 when the query fails.
    """
    db_pool = await get_pool()
    if not db_pool:
        return []

    try:
        cutoff = datetime.now() - timedelta(days=days)
        async with db_pool.acquire() as connection:
            sql = """
                SELECT * FROM zeugnis_usage_events
                WHERE created_at >= $1
                """
            args = [cutoff]

            # Each optional filter takes the next free positional placeholder.
            if document_id:
                args.append(document_id)
                sql += f" AND document_id = ${len(args)}"
            if event_type:
                args.append(event_type)
                sql += f" AND event_type = ${len(args)}"

            args.append(limit)
            sql += f" ORDER BY created_at DESC LIMIT ${len(args)}"

            event_rows = await connection.fetch(sql, *args)
            return [dict(event_row) for event_row in event_rows]
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
|
|
|
|
|
|
@router.get("/audit/export", response_model=dict)
async def export_audit(
    days: int = Query(30, le=365),
    requested_by: str = Query(..., description="User requesting the export"),
):
    """Export audit data for GDPR compliance.

    Collects every usage event created within the last ``days`` days.

    Args:
        days: Size of the export window in days (at most 365).
        requested_by: Identifier of the requesting user; echoed in the result.

    Returns:
        A dict with ``export_date``, ``requested_by``, ``events`` (newest
        first), ``document_count`` (distinct non-null document ids among the
        exported events), ``date_range_start`` and ``date_range_end``.

    Raises:
        HTTPException: 503 when no database pool is available, 500 when the
            query fails.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")

    try:
        # Take the timestamp once so export_date, the range end and the range
        # start are all derived from the same instant.
        now = datetime.now()
        since = now - timedelta(days=days)

        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT * FROM zeugnis_usage_events
                WHERE created_at >= $1
                ORDER BY created_at DESC
                """,
                since,
            )

        events = [dict(r) for r in rows]

        # Count distinct documents from the exported rows themselves, so the
        # count always matches ``events``. NULL ids are skipped, mirroring
        # SQL COUNT(DISTINCT ...).
        distinct_documents = {
            event["document_id"]
            for event in events
            if event["document_id"] is not None
        }

        now_iso = now.isoformat()
        return {
            "export_date": now_iso,
            "requested_by": requested_by,
            "events": events,
            "document_count": len(distinct_documents),
            "date_range_start": since.isoformat(),
            "date_range_end": now_iso,
        }
    except Exception as e:
        # Chain the original error so the traceback keeps the root cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
|