Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
538 lines
18 KiB
Python
538 lines
18 KiB
Python
"""
|
|
Zeugnis Rights-Aware Crawler - API Endpoints
|
|
|
|
FastAPI router for managing zeugnis sources, documents, and crawler operations.
|
|
"""
|
|
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional, List
|
|
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
|
|
from pydantic import BaseModel
|
|
|
|
from zeugnis_models import (
|
|
ZeugnisSource, ZeugnisSourceCreate, ZeugnisSourceVerify,
|
|
SeedUrl, SeedUrlCreate,
|
|
ZeugnisDocument, ZeugnisStats,
|
|
CrawlerStatus, CrawlRequest, CrawlQueueItem,
|
|
UsageEvent, AuditExport,
|
|
LicenseType, CrawlStatus, DocType, EventType,
|
|
BUNDESLAENDER, TRAINING_PERMISSIONS,
|
|
generate_id, get_training_allowed, get_bundesland_name, get_license_for_bundesland,
|
|
)
|
|
from zeugnis_crawler import (
|
|
start_crawler, stop_crawler, get_crawler_status,
|
|
)
|
|
from metrics_db import (
|
|
get_zeugnis_sources, upsert_zeugnis_source,
|
|
get_zeugnis_documents, get_zeugnis_stats,
|
|
log_zeugnis_event, get_pool,
|
|
)
|
|
|
|
|
|
# Shared router for the endpoints in this module; routes registered on it are
# served under the /api/v1/admin/zeugnis prefix and tagged "Zeugnis Crawler".
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
|
|
|
|
|
|
# =============================================================================
|
|
# Sources Endpoints
|
|
# =============================================================================
|
|
|
|
@router.get("/sources", response_model=List[dict])
async def list_sources():
    """Return the stored zeugnis sources, or placeholders when none exist.

    If ``get_zeugnis_sources`` yields a non-empty result it is returned
    unchanged. Otherwise one placeholder entry is built per key of
    ``BUNDESLAENDER`` with the license type and training flag derived from
    the Bundesland code and every database-backed field set to ``None``.
    """
    stored = await get_zeugnis_sources()
    if stored:
        return stored

    # Nothing persisted yet: synthesize one unsaved entry per Bundesland.
    placeholders = []
    for code, info in BUNDESLAENDER.items():
        placeholders.append(
            {
                "id": None,
                "bundesland": code,
                "name": info["name"],
                "base_url": None,
                "license_type": str(get_license_for_bundesland(code).value),
                "training_allowed": get_training_allowed(code),
                "verified_by": None,
                "verified_at": None,
                "created_at": None,
                "updated_at": None,
            }
        )
    return placeholders
|
|
|
|
|
|
@router.post("/sources", response_model=dict)
async def create_source(source: ZeugnisSourceCreate):
    """Create or update a zeugnis source from the request payload.

    A fresh id is generated and passed, together with the payload fields, to
    ``upsert_zeugnis_source``.

    Returns:
        ``{"id": <generated id>, "success": True}`` when the upsert reports
        success.

    Raises:
        HTTPException: 500 when the upsert reports failure.
    """
    new_id = generate_id()
    upsert_args = dict(
        id=new_id,
        bundesland=source.bundesland,
        name=source.name,
        license_type=source.license_type.value,
        training_allowed=source.training_allowed,
        base_url=source.base_url,
    )
    stored = await upsert_zeugnis_source(**upsert_args)
    if stored:
        return {"id": new_id, "success": True}
    raise HTTPException(status_code=500, detail="Failed to create source")
|
|
|
|
|
|
@router.put("/sources/{source_id}/verify", response_model=dict)
async def verify_source(source_id: str, verification: ZeugnisSourceVerify):
    """Record a license verification for an existing source.

    Updates ``license_type``, ``training_allowed`` and ``verified_by`` on the
    ``zeugnis_sources`` row whose ``id`` equals ``source_id`` and stamps
    ``verified_at`` and ``updated_at`` with the database's ``NOW()``.

    Args:
        source_id: Id of the source row to update.
        verification: Payload carrying the verified license data.

    Returns:
        ``{"success": True, "source_id": source_id}`` after the update.

    Raises:
        HTTPException: 503 if no database pool is available; 404 if the
            UPDATE reports that no row matched ``source_id``; 500 for any
            other failure, with the error text passed through as ``detail``.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")

    try:
        async with pool.acquire() as conn:
            status = await conn.execute(
                """
                UPDATE zeugnis_sources
                SET license_type = $2,
                    training_allowed = $3,
                    verified_by = $4,
                    verified_at = NOW(),
                    updated_at = NOW()
                WHERE id = $1
                """,
                source_id,
                verification.license_type.value,
                verification.training_allowed,
                verification.verified_by,
            )

        # The driver is expected to return the command tag ("UPDATE <n>").
        # A zero count means the id does not exist; without this check the
        # endpoint would claim success for a source that was never touched.
        # Any other return value is left alone so unusual drivers keep working.
        if isinstance(status, str) and status.split() == ["UPDATE", "0"]:
            raise HTTPException(
                status_code=404, detail=f"Source not found: {source_id}"
            )
        return {"success": True, "source_id": source_id}
    except HTTPException:
        # Let the deliberate 404 through instead of rewrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|
|
|
def _default_source_info(bundesland: str) -> dict:
    """Build the fallback source description for a known Bundesland code.

    Raises:
        HTTPException: 404 when ``bundesland`` is not a key of
            ``BUNDESLAENDER``.
    """
    if bundesland not in BUNDESLAENDER:
        raise HTTPException(status_code=404, detail=f"Bundesland not found: {bundesland}")
    return {
        "bundesland": bundesland,
        "name": get_bundesland_name(bundesland),
        "training_allowed": get_training_allowed(bundesland),
        "license_type": get_license_for_bundesland(bundesland).value,
        "document_count": 0,
    }


@router.get("/sources/{bundesland}", response_model=dict)
async def get_source_by_bundesland(bundesland: str):
    """Return source details for one Bundesland.

    If a ``zeugnis_sources`` row exists for ``bundesland`` it is returned
    together with ``document_count``, the number of documents linked to it
    through its seed URLs. Without a database pool, or without a stored row,
    a default description derived from the Bundesland code is returned.

    Args:
        bundesland: Bundesland code taken from the URL path.

    Raises:
        HTTPException: 404 when no stored row exists and the code is not in
            ``BUNDESLAENDER``; 500 for any other failure, with the error
            text passed through as ``detail``.
    """
    pool = await get_pool()
    if not pool:
        return _default_source_info(bundesland)

    try:
        async with pool.acquire() as conn:
            source = await conn.fetchrow(
                "SELECT * FROM zeugnis_sources WHERE bundesland = $1",
                bundesland,
            )
            if source:
                doc_count = await conn.fetchval(
                    """
                    SELECT COUNT(*) FROM zeugnis_documents d
                    JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                    WHERE u.source_id = $1
                    """,
                    source["id"],
                )
                return {**dict(source), "document_count": doc_count or 0}

        # No stored row: fall back to defaults. Unknown codes get the same 404
        # as the no-database branch rather than a made-up default entry.
        return _default_source_info(bundesland)
    except HTTPException:
        # Keep the 404 from the fallback intact instead of turning it into 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|
|
|
# =============================================================================
|
|
# Seed URLs Endpoints
|
|
# =============================================================================
|
|
|
|
@router.get("/sources/{source_id}/urls", response_model=List[dict])
async def list_seed_urls(source_id: str):
    """List the seed URLs of one source, ordered by ``created_at``.

    Returns an empty list when no database pool is available.

    Raises:
        HTTPException: 500 when the query fails; the error text is passed
            through as ``detail``.
    """
    db = await get_pool()
    if not db:
        return []

    sql = "SELECT * FROM zeugnis_seed_urls WHERE source_id = $1 ORDER BY created_at"
    try:
        async with db.acquire() as connection:
            records = await connection.fetch(sql, source_id)
            return list(map(dict, records))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@router.post("/sources/{source_id}/urls", response_model=dict)
async def add_seed_url(source_id: str, seed_url: SeedUrlCreate):
    """Attach a new seed URL to a source.

    Inserts a ``zeugnis_seed_urls`` row with a freshly generated id and the
    status ``'pending'``.

    Returns:
        ``{"id": <generated id>, "success": True}``.

    Raises:
        HTTPException: 503 if no database pool is available; 500 when the
            insert fails, with the error text passed through as ``detail``.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    new_id = generate_id()
    insert_sql = """
        INSERT INTO zeugnis_seed_urls (id, source_id, url, doc_type, status)
        VALUES ($1, $2, $3, $4, 'pending')
    """
    try:
        async with db.acquire() as connection:
            await connection.execute(
                insert_sql,
                new_id,
                source_id,
                seed_url.url,
                seed_url.doc_type.value,
            )
            return {"id": new_id, "success": True}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@router.delete("/urls/{url_id}", response_model=dict)
async def delete_seed_url(url_id: str):
    """Delete the seed URL with the given id.

    Returns:
        ``{"success": True}`` once the DELETE statement has run.

    Raises:
        HTTPException: 503 if no database pool is available; 500 when the
            statement fails, with the error text passed through as ``detail``.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    delete_sql = "DELETE FROM zeugnis_seed_urls WHERE id = $1"
    try:
        async with db.acquire() as connection:
            await connection.execute(delete_sql, url_id)
            return {"success": True}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
# =============================================================================
|
|
# Documents Endpoints
|
|
# =============================================================================
|
|
|
|
@router.get("/documents", response_model=List[dict])
async def list_documents(
    bundesland: Optional[str] = None,
    limit: int = Query(100, le=500),
    offset: int = 0,
):
    """List zeugnis documents, optionally restricted to one Bundesland.

    The filter and paging arguments are forwarded unchanged to
    ``get_zeugnis_documents``, whose result is returned as-is.
    """
    return await get_zeugnis_documents(
        bundesland=bundesland,
        limit=limit,
        offset=offset,
    )
|
|
|
|
|
|
@router.get("/documents/{document_id}", response_model=dict)
async def get_document(document_id: str):
    """Fetch one document together with its source's Bundesland and name.

    A successful lookup is logged as a ``VIEWED`` event through
    ``log_zeugnis_event`` before the row is returned as a dict.

    Raises:
        HTTPException: 503 if no database pool is available; 404 if no
            document matches; 500 for any other failure, with the error
            text passed through as ``detail``.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    lookup_sql = """
        SELECT d.*, s.bundesland, s.name as source_name
        FROM zeugnis_documents d
        JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
        JOIN zeugnis_sources s ON u.source_id = s.id
        WHERE d.id = $1
    """
    try:
        async with db.acquire() as connection:
            record = await connection.fetchrow(lookup_sql, document_id)
            if record:
                # Record the access before handing the document back.
                await log_zeugnis_event(document_id, EventType.VIEWED.value)
                return dict(record)
            raise HTTPException(status_code=404, detail="Document not found")
    except HTTPException:
        # The 404 above must reach the client unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@router.get("/documents/{document_id}/versions", response_model=List[dict])
async def get_document_versions(document_id: str):
    """Return the stored versions of a document, highest version first.

    Raises:
        HTTPException: 503 if no database pool is available; 500 when the
            query fails, with the error text passed through as ``detail``.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    versions_sql = """
        SELECT * FROM zeugnis_document_versions
        WHERE document_id = $1
        ORDER BY version DESC
    """
    try:
        async with db.acquire() as connection:
            records = await connection.fetch(versions_sql, document_id)
            return list(map(dict, records))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
# =============================================================================
|
|
# Crawler Control Endpoints
|
|
# =============================================================================
|
|
|
|
@router.get("/crawler/status", response_model=dict)
async def crawler_status():
    """Report the crawler's current status as given by ``get_crawler_status``."""
    current = get_crawler_status()
    return current
|
|
|
|
|
|
@router.post("/crawler/start", response_model=dict)
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    """Start the crawler for the requested Bundesland and/or source.

    ``background_tasks`` is accepted as part of the signature but is not used
    by this handler.

    Raises:
        HTTPException: 409 when ``start_crawler`` reports failure.
    """
    started = await start_crawler(
        bundesland=request.bundesland,
        source_id=request.source_id,
    )
    if started:
        return {"success": True, "message": "Crawler started"}
    raise HTTPException(status_code=409, detail="Crawler already running")
|
|
|
|
|
|
@router.post("/crawler/stop", response_model=dict)
async def stop_crawl():
    """Stop the crawler.

    Raises:
        HTTPException: 409 when ``stop_crawler`` reports failure.
    """
    stopped = await stop_crawler()
    if stopped:
        return {"success": True, "message": "Crawler stopped"}
    raise HTTPException(status_code=409, detail="Crawler not running")
|
|
|
|
|
|
@router.get("/crawler/queue", response_model=List[dict])
async def get_queue():
    """List crawler queue entries joined with their source's Bundesland and name.

    Entries are ordered by descending priority, then by ``created_at``.
    Returns an empty list when no database pool is available.

    Raises:
        HTTPException: 500 when the query fails; the error text is passed
            through as ``detail``.
    """
    db = await get_pool()
    if not db:
        return []

    queue_sql = """
        SELECT q.*, s.bundesland, s.name as source_name
        FROM zeugnis_crawler_queue q
        JOIN zeugnis_sources s ON q.source_id = s.id
        ORDER BY q.priority DESC, q.created_at
    """
    try:
        async with db.acquire() as connection:
            records = await connection.fetch(queue_sql)
            return list(map(dict, records))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@router.post("/crawler/queue", response_model=dict)
async def add_to_queue(request: CrawlRequest):
    """Queue a source for crawling.

    The source is taken from ``request.source_id``; when that is empty and a
    Bundesland is given, the source id is looked up by Bundesland. The queue
    entry is inserted with the request's priority and status ``'pending'``.

    Returns:
        ``{"id": <generated queue id>, "success": True}``.

    Raises:
        HTTPException: 503 if no database pool is available; 400 if no source
            id could be determined; 500 for any other failure, with the error
            text passed through as ``detail``.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    entry_id = generate_id()
    insert_sql = """
        INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
        VALUES ($1, $2, $3, 'pending')
    """
    try:
        async with db.acquire() as connection:
            target_id = request.source_id

            # Resolve the source through its Bundesland only when no explicit
            # id was supplied.
            if not target_id and request.bundesland:
                match = await connection.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    request.bundesland,
                )
                if match:
                    target_id = match["id"]

            if not target_id:
                raise HTTPException(status_code=400, detail="Source not found")

            await connection.execute(insert_sql, entry_id, target_id, request.priority)
            return {"id": entry_id, "success": True}
    except HTTPException:
        # The 400 above must reach the client unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
# =============================================================================
|
|
# Statistics Endpoints
|
|
# =============================================================================
|
|
|
|
@router.get("/stats", response_model=dict)
async def get_stats():
    """Return the overall crawler statistics produced by ``get_zeugnis_stats``."""
    return await get_zeugnis_stats()
|
|
|
|
|
|
@router.get("/stats/bundesland", response_model=List[dict])
async def get_bundesland_stats():
    """Return document statistics for every Bundesland.

    One entry is produced per key of ``BUNDESLAENDER``, starting from zeroed
    defaults. When a database pool is available, each entry is filled in with
    the document count, the count of documents flagged ``indexed_in_qdrant``
    and the latest ``last_crawled`` value of the matching source.

    The lookup is best effort: a failing query leaves that entry at its
    defaults and a failing connection leaves all remaining entries at theirs.
    Failures are logged and never raised to the client.
    """
    import logging  # local import so this block needs no file-level change

    log = logging.getLogger(__name__)

    pool = await get_pool()

    stats = [
        {
            "bundesland": code,
            "name": info["name"],
            "training_allowed": get_training_allowed(code),
            "document_count": 0,
            "indexed_count": 0,
            "last_crawled": None,
        }
        for code, info in BUNDESLAENDER.items()
    ]
    if not pool:
        return stats

    per_bundesland_sql = """
        SELECT
            COUNT(d.id) as doc_count,
            COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) as indexed_count,
            MAX(u.last_crawled) as last_crawled
        FROM zeugnis_sources s
        LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
        LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
        WHERE s.bundesland = $1
        GROUP BY s.id
    """

    try:
        # Acquire a single connection for the whole loop instead of checking
        # one out of the pool for every Bundesland.
        async with pool.acquire() as conn:
            for stat in stats:
                try:
                    row = await conn.fetchrow(per_bundesland_sql, stat["bundesland"])
                    if row:
                        stat["document_count"] = row["doc_count"] or 0
                        stat["indexed_count"] = row["indexed_count"] or 0
                        stat["last_crawled"] = (
                            row["last_crawled"].isoformat()
                            if row["last_crawled"]
                            else None
                        )
                except Exception:
                    # Keep the defaults for this entry, but leave a trace
                    # instead of swallowing the error silently.
                    log.warning(
                        "Could not load zeugnis stats for %s",
                        stat["bundesland"],
                        exc_info=True,
                    )
    except Exception:
        log.warning("Could not load zeugnis stats from the database", exc_info=True)

    return stats
|
|
|
|
|
|
# =============================================================================
|
|
# Audit Endpoints
|
|
# =============================================================================
|
|
|
|
@router.get("/audit/events", response_model=List[dict])
async def get_audit_events(
    document_id: Optional[str] = None,
    event_type: Optional[str] = None,
    limit: int = Query(100, le=1000),
    days: int = Query(30, le=365),
):
    """List usage events from the last ``days`` days, newest first.

    ``document_id`` and ``event_type`` narrow the result when given, and
    ``limit`` caps the number of rows. Returns an empty list when no database
    pool is available.

    Raises:
        HTTPException: 500 when building or running the query fails; the
            error text is passed through as ``detail``.
    """
    db = await get_pool()
    if not db:
        return []

    try:
        cutoff = datetime.now() - timedelta(days=days)

        sql = """
            SELECT * FROM zeugnis_usage_events
            WHERE created_at >= $1
        """
        args = [cutoff]

        # Each supplied filter takes the next free positional placeholder.
        optional_filters = (("document_id", document_id), ("event_type", event_type))
        for column, value in optional_filters:
            if value:
                args.append(value)
                sql += f" AND {column} = ${len(args)}"

        args.append(limit)
        sql += f" ORDER BY created_at DESC LIMIT ${len(args)}"

        async with db.acquire() as connection:
            records = await connection.fetch(sql, *args)
            return list(map(dict, records))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
@router.get("/audit/export", response_model=dict)
async def export_audit(
    days: int = Query(30, le=365),
    requested_by: str = Query(..., description="User requesting the export"),
):
    """Export the usage events of the last ``days`` days for audit purposes.

    The result holds all matching events (newest first), the number of
    distinct documents they refer to, the requesting user and the covered
    date range.

    Raises:
        HTTPException: 503 if no database pool is available; 500 when a query
            fails, with the error text passed through as ``detail``.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    events_sql = """
        SELECT * FROM zeugnis_usage_events
        WHERE created_at >= $1
        ORDER BY created_at DESC
    """
    distinct_docs_sql = (
        "SELECT COUNT(DISTINCT document_id) FROM zeugnis_usage_events WHERE created_at >= $1"
    )
    try:
        cutoff = datetime.now() - timedelta(days=days)
        async with db.acquire() as connection:
            records = await connection.fetch(events_sql, cutoff)
            distinct_docs = await connection.fetchval(distinct_docs_sql, cutoff)

            return {
                "export_date": datetime.now().isoformat(),
                "requested_by": requested_by,
                "events": list(map(dict, records)),
                "document_count": distinct_docs or 0,
                "date_range_start": cutoff.isoformat(),
                "date_range_end": datetime.now().isoformat(),
            }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
# =============================================================================
|
|
# Initialization Endpoint
|
|
# =============================================================================
|
|
|
|
@router.post("/init", response_model=dict)
async def initialize_sources():
    """Upsert one default source for every entry of ``BUNDESLAENDER``.

    Each source gets a freshly generated id plus the license type and
    training flag derived from its Bundesland code.

    Returns:
        ``{"success": True, "sources_created": <n>}`` where ``n`` counts the
        upserts that reported success.

    Raises:
        HTTPException: 503 if no database pool is available; 500 when an
            upsert raises, with the error text passed through as ``detail``.
    """
    db = await get_pool()
    if not db:
        raise HTTPException(status_code=503, detail="Database not available")

    seeded = 0
    try:
        for code, info in BUNDESLAENDER.items():
            stored = await upsert_zeugnis_source(
                id=generate_id(),
                bundesland=code,
                name=info["name"],
                license_type=get_license_for_bundesland(code).value,
                training_allowed=get_training_allowed(code),
            )
            seeded += 1 if stored else 0

        return {"success": True, "sources_created": seeded}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|