Restructure: Move 52 files into 7 domain packages
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 23s

korrektur/ zeugnis/ admin/ compliance/ worksheet/ training/ metrics/
52 shims, relative imports, RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 22:10:48 +02:00
parent 0504d22b8e
commit 165c493d1e
111 changed files with 11859 additions and 11609 deletions

View File

@@ -0,0 +1,321 @@
"""
Zeugnis API Docs — documents, crawler control, statistics, audit endpoints.
Extracted from zeugnis_api.py for modularity.
"""
from datetime import datetime, timedelta
from typing import Optional, List
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
from .models import (
CrawlRequest, EventType,
BUNDESLAENDER,
generate_id, get_training_allowed, get_license_for_bundesland,
)
from .crawler import (
start_crawler, stop_crawler, get_crawler_status,
)
from metrics_db import (
get_zeugnis_documents, get_zeugnis_stats,
log_zeugnis_event, get_pool,
)
# All endpoints in this module are mounted under /api/v1/admin/zeugnis.
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
# =============================================================================
# Documents Endpoints
# =============================================================================
@router.get("/documents", response_model=List[dict])
async def list_documents(
    bundesland: Optional[str] = None,
    limit: int = Query(100, le=500),
    offset: int = 0,
):
    """Return zeugnis documents, optionally filtered by Bundesland, paginated."""
    return await get_zeugnis_documents(bundesland=bundesland, limit=limit, offset=offset)
@router.get("/documents/{document_id}", response_model=dict)
async def get_document(document_id: str):
    """Return full details for one document, joined with its source metadata.

    Side effect: logs a VIEWED audit event for the document.

    Raises:
        HTTPException: 503 when the DB pool is unavailable, 404 when the
            document does not exist, 500 on unexpected database errors.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            doc = await conn.fetchrow(
                """
                SELECT d.*, s.bundesland, s.name as source_name
                FROM zeugnis_documents d
                JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                JOIN zeugnis_sources s ON u.source_id = s.id
                WHERE d.id = $1
                """,
                document_id
            )
            if not doc:
                raise HTTPException(status_code=404, detail="Document not found")
            # Record the access in the audit trail before returning.
            await log_zeugnis_event(document_id, EventType.VIEWED.value)
            return dict(doc)
    except HTTPException:
        # Re-raise our own 404 (and any other HTTPException) untouched.
        raise
    except Exception as e:
        # Chain the original exception so server logs keep the real cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.get("/documents/{document_id}/versions", response_model=List[dict])
async def get_document_versions(document_id: str):
    """Return a document's version history, newest version first."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            versions = await conn.fetch(
                """
                SELECT * FROM zeugnis_document_versions
                WHERE document_id = $1
                ORDER BY version DESC
                """,
                document_id
            )
        return [dict(version) for version in versions]
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
# =============================================================================
# Crawler Control Endpoints
# =============================================================================
@router.get("/crawler/status", response_model=dict)
async def crawler_status():
    """Return the crawler's current status payload."""
    status = get_crawler_status()
    return status
@router.post("/crawler/start", response_model=dict)
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    """Start the crawler; 409 if it is already running."""
    # NOTE(review): background_tasks is currently unused — kept for interface
    # compatibility; confirm whether it can be dropped.
    started = await start_crawler(
        bundesland=request.bundesland,
        source_id=request.source_id,
    )
    if started:
        return {"success": True, "message": "Crawler started"}
    raise HTTPException(status_code=409, detail="Crawler already running")
@router.post("/crawler/stop", response_model=dict)
async def stop_crawl():
    """Stop the crawler; 409 if it is not running."""
    stopped = await stop_crawler()
    if stopped:
        return {"success": True, "message": "Crawler stopped"}
    raise HTTPException(status_code=409, detail="Crawler not running")
@router.get("/crawler/queue", response_model=List[dict])
async def get_queue():
    """Return queued crawl jobs, highest priority first (FIFO within priority)."""
    pool = await get_pool()
    if not pool:
        # No database configured: report an empty queue rather than erroring.
        return []
    try:
        async with pool.acquire() as conn:
            records = await conn.fetch(
                """
                SELECT q.*, s.bundesland, s.name as source_name
                FROM zeugnis_crawler_queue q
                JOIN zeugnis_sources s ON q.source_id = s.id
                ORDER BY q.priority DESC, q.created_at
                """
            )
        return [dict(record) for record in records]
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/crawler/queue", response_model=dict)
async def add_to_queue(request: CrawlRequest):
    """Queue a crawl job for a source (by id, or resolved from Bundesland)."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    queue_id = generate_id()
    try:
        async with pool.acquire() as conn:
            source_id = request.source_id
            # Resolve the source from the Bundesland when no explicit id given.
            if not source_id and request.bundesland:
                source = await conn.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    request.bundesland
                )
                if source:
                    source_id = source["id"]
            if not source_id:
                raise HTTPException(status_code=400, detail="Source not found")
            await conn.execute(
                """
                INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
                VALUES ($1, $2, $3, 'pending')
                """,
                queue_id, source_id, request.priority
            )
        return {"id": queue_id, "success": True}
    except HTTPException:
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
# =============================================================================
# Statistics Endpoints
# =============================================================================
@router.get("/stats", response_model=dict)
async def get_stats():
    """Return aggregate zeugnis crawler statistics."""
    return await get_zeugnis_stats()
@router.get("/stats/bundesland", response_model=List[dict])
async def get_bundesland_stats():
    """Return per-Bundesland document/index statistics.

    Always yields one entry per configured Bundesland. Counts default to
    zero when the database is unavailable or a query fails — deliberately
    best-effort so this dashboard endpoint never 500s.
    """
    pool = await get_pool()
    # Start with zeroed defaults so every Bundesland appears even without a DB.
    stats = [
        {
            "bundesland": code,
            "name": info["name"],
            "training_allowed": get_training_allowed(code),
            "document_count": 0,
            "indexed_count": 0,
            "last_crawled": None,
        }
        for code, info in BUNDESLAENDER.items()
    ]
    if not pool:
        return stats
    try:
        # One connection for all Bundeslaender instead of re-acquiring the
        # pool connection inside the loop (previously up to 16 acquires).
        async with pool.acquire() as conn:
            for stat in stats:
                try:
                    row = await conn.fetchrow(
                        """
                        SELECT
                            COUNT(d.id) as doc_count,
                            COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) as indexed_count,
                            MAX(u.last_crawled) as last_crawled
                        FROM zeugnis_sources s
                        LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
                        LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
                        WHERE s.bundesland = $1
                        GROUP BY s.id
                        """,
                        stat["bundesland"]
                    )
                except Exception:
                    # Best-effort: keep the zeroed defaults for this row.
                    continue
                if row:
                    stat["document_count"] = row["doc_count"] or 0
                    stat["indexed_count"] = row["indexed_count"] or 0
                    stat["last_crawled"] = (
                        row["last_crawled"].isoformat() if row["last_crawled"] else None
                    )
    except Exception:
        # Acquiring the connection itself failed — return the defaults.
        pass
    return stats
# =============================================================================
# Audit Endpoints
# =============================================================================
@router.get("/audit/events", response_model=List[dict])
async def get_audit_events(
    document_id: Optional[str] = None,
    event_type: Optional[str] = None,
    limit: int = Query(100, le=1000),
    days: int = Query(30, le=365),
):
    """Return audit events from the last ``days`` days, newest first.

    Optional filters: document_id and event_type. Returns [] (not 503)
    when the database is unavailable.
    """
    pool = await get_pool()
    if not pool:
        return []
    try:
        # NOTE(review): naive local time — confirm created_at is stored in the
        # same timezone as the app server.
        since = datetime.now() - timedelta(days=days)
        query = """
            SELECT * FROM zeugnis_usage_events
            WHERE created_at >= $1
        """
        params = [since]
        if document_id:
            # Derive the placeholder index from params (was a hardcoded $2) so
            # all clauses stay in sync if filters are added or reordered.
            query += f" AND document_id = ${len(params) + 1}"
            params.append(document_id)
        if event_type:
            query += f" AND event_type = ${len(params) + 1}"
            params.append(event_type)
        query += f" ORDER BY created_at DESC LIMIT ${len(params) + 1}"
        params.append(limit)
        async with pool.acquire() as conn:
            rows = await conn.fetch(query, *params)
        return [dict(r) for r in rows]
    except Exception as e:
        # Chain the original exception so server logs keep the real cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.get("/audit/export", response_model=dict)
async def export_audit(
    days: int = Query(30, le=365),
    requested_by: str = Query(..., description="User requesting the export"),
):
    """Export audit data for GDPR compliance.

    Returns all usage events from the last ``days`` days plus the count of
    distinct documents involved, stamped with who requested the export.

    Raises:
        HTTPException: 503 when the DB pool is unavailable, 500 on
            unexpected database errors.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        # Single timestamp so export_date, date_range_end and the query cutoff
        # all refer to the same instant (previously three separate now() calls).
        now = datetime.now()
        since = now - timedelta(days=days)
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT * FROM zeugnis_usage_events
                WHERE created_at >= $1
                ORDER BY created_at DESC
                """,
                since
            )
            doc_count = await conn.fetchval(
                "SELECT COUNT(DISTINCT document_id) FROM zeugnis_usage_events WHERE created_at >= $1",
                since
            )
        return {
            "export_date": now.isoformat(),
            "requested_by": requested_by,
            "events": [dict(r) for r in rows],
            "document_count": doc_count or 0,
            "date_range_start": since.isoformat(),
            "date_range_end": now.isoformat(),
        }
    except Exception as e:
        # Chain the original exception so server logs keep the real cause.
        raise HTTPException(status_code=500, detail=str(e)) from e