""" Zeugnis API Docs — documents, crawler control, statistics, audit endpoints. Extracted from zeugnis_api.py for modularity. """ from datetime import datetime, timedelta from typing import Optional, List from fastapi import APIRouter, HTTPException, BackgroundTasks, Query from zeugnis_models import ( CrawlRequest, EventType, BUNDESLAENDER, generate_id, get_training_allowed, get_license_for_bundesland, ) from zeugnis_crawler import ( start_crawler, stop_crawler, get_crawler_status, ) from metrics_db import ( get_zeugnis_documents, get_zeugnis_stats, log_zeugnis_event, get_pool, ) router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"]) # ============================================================================= # Documents Endpoints # ============================================================================= @router.get("/documents", response_model=List[dict]) async def list_documents( bundesland: Optional[str] = None, limit: int = Query(100, le=500), offset: int = 0, ): """Get all zeugnis documents with optional filtering.""" documents = await get_zeugnis_documents(bundesland=bundesland, limit=limit, offset=offset) return documents @router.get("/documents/{document_id}", response_model=dict) async def get_document(document_id: str): """Get details for a specific document.""" pool = await get_pool() if not pool: raise HTTPException(status_code=503, detail="Database not available") try: async with pool.acquire() as conn: doc = await conn.fetchrow( """ SELECT d.*, s.bundesland, s.name as source_name FROM zeugnis_documents d JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id JOIN zeugnis_sources s ON u.source_id = s.id WHERE d.id = $1 """, document_id ) if not doc: raise HTTPException(status_code=404, detail="Document not found") # Log view event await log_zeugnis_event(document_id, EventType.VIEWED.value) return dict(doc) except HTTPException: raise except Exception as e: raise HTTPException(status_code=500, detail=str(e)) 
@router.get("/documents/{document_id}/versions", response_model=List[dict])
async def get_document_versions(document_id: str):
    """Return all stored versions of a document, highest version first.

    Raises:
        HTTPException: 503 when ``get_pool`` yields no pool, 500 (with the
            error text as detail) when the query fails.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")

    try:
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT * FROM zeugnis_document_versions
                WHERE document_id = $1
                ORDER BY version DESC
                """,
                document_id
            )
            return [dict(r) for r in rows]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# =============================================================================
# Crawler Control Endpoints
# =============================================================================

@router.get("/crawler/status", response_model=dict)
async def crawler_status():
    """Return the value reported by ``get_crawler_status``."""
    return get_crawler_status()


@router.post("/crawler/start", response_model=dict)
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    """Start the crawler for the requested bundesland / source.

    ``background_tasks`` is part of the endpoint signature but is not used
    in this function.

    Raises:
        HTTPException: 409 when ``start_crawler`` returns a falsy result.
    """
    success = await start_crawler(
        bundesland=request.bundesland,
        source_id=request.source_id,
    )
    if not success:
        raise HTTPException(status_code=409, detail="Crawler already running")
    return {"success": True, "message": "Crawler started"}


@router.post("/crawler/stop", response_model=dict)
async def stop_crawl():
    """Stop the crawler.

    Raises:
        HTTPException: 409 when ``stop_crawler`` returns a falsy result.
    """
    success = await stop_crawler()
    if not success:
        raise HTTPException(status_code=409, detail="Crawler not running")
    return {"success": True, "message": "Crawler stopped"}


@router.get("/crawler/queue", response_model=List[dict])
async def get_queue():
    """Return queue entries ordered by priority (descending), then creation time.

    Each entry is the queue row plus the source's ``bundesland`` and
    ``source_name``. Returns an empty list when no pool is available.

    Raises:
        HTTPException: 500 (with the error text as detail) when the query fails.
    """
    pool = await get_pool()
    if not pool:
        return []

    try:
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT q.*, s.bundesland, s.name as source_name
                FROM zeugnis_crawler_queue q
                JOIN zeugnis_sources s ON q.source_id = s.id
                ORDER BY q.priority DESC, q.created_at
                """
            )
            return [dict(r) for r in rows]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/crawler/queue", response_model=dict)
async def add_to_queue(request: CrawlRequest):
    """Insert a ``pending`` queue entry for a source.

    The source is taken from ``request.source_id``; when that is empty and
    ``request.bundesland`` is set, the source is looked up by bundesland.

    Returns:
        ``{"id": <new queue id>, "success": True}``.

    Raises:
        HTTPException: 503 when ``get_pool`` yields no pool, 400 when no
            source id could be determined, 500 (with the error text as
            detail) for any other failure.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")

    queue_id = generate_id()

    try:
        async with pool.acquire() as conn:
            # Get source ID if bundesland provided
            source_id = request.source_id
            if not source_id and request.bundesland:
                source = await conn.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    request.bundesland
                )
                if source:
                    source_id = source["id"]

            if not source_id:
                raise HTTPException(status_code=400, detail="Source not found")

            await conn.execute(
                """
                INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
                VALUES ($1, $2, $3, 'pending')
                """,
                queue_id, source_id, request.priority
            )
            return {"id": queue_id, "success": True}
    except HTTPException:
        # Keep the deliberate 400 from being rewrapped as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# =============================================================================
# Statistics Endpoints
# =============================================================================

@router.get("/stats", response_model=dict)
async def get_stats():
    """Return the value produced by ``get_zeugnis_stats``."""
    stats = await get_zeugnis_stats()
    return stats


@router.get("/stats/bundesland", response_model=List[dict])
async def get_bundesland_stats():
    """Return one statistics entry per key in ``BUNDESLAENDER``.

    Every entry starts with zero counts and ``last_crawled`` of ``None``.
    When a pool is available, the counts are filled from the database.
    A failing lookup for one Bundesland is logged and leaves that entry at
    its defaults; it never fails the whole request.
    """
    import logging

    logger = logging.getLogger(__name__)
    pool = await get_pool()

    # Build stats from BUNDESLAENDER with DB data if available
    stats = []
    for code, info in BUNDESLAENDER.items():
        stat = {
            "bundesland": code,
            "name": info["name"],
            "training_allowed": get_training_allowed(code),
            "document_count": 0,
            "indexed_count": 0,
            "last_crawled": None,
        }

        if pool:
            try:
                async with pool.acquire() as conn:
                    row = await conn.fetchrow(
                        """
                        SELECT
                            COUNT(d.id) as doc_count,
                            COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) as indexed_count,
                            MAX(u.last_crawled) as last_crawled
                        FROM zeugnis_sources s
                        LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
                        LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
                        WHERE s.bundesland = $1
                        GROUP BY s.id
                        """,
                        code
                    )
                    if row:
                        stat["document_count"] = row["doc_count"] or 0
                        stat["indexed_count"] = row["indexed_count"] or 0
                        stat["last_crawled"] = row["last_crawled"].isoformat() if row["last_crawled"] else None
            except Exception:
                # Best-effort: keep the default values for this Bundesland,
                # but leave a trace so database problems are not invisible.
                logger.warning(
                    "Could not load zeugnis stats for bundesland %s",
                    code,
                    exc_info=True,
                )

        stats.append(stat)

    return stats


# =============================================================================
# Audit Endpoints
# =============================================================================

@router.get("/audit/events", response_model=List[dict])
async def get_audit_events(
    document_id: Optional[str] = None,
    event_type: Optional[str] = None,
    limit: int = Query(100, le=1000),
    days: int = Query(30, le=365),
):
    """Return usage events from the last ``days`` days, newest first.

    ``document_id`` and ``event_type`` add equality filters when given;
    ``limit`` caps the number of rows. Returns an empty list when no pool
    is available.

    Raises:
        HTTPException: 500 (with the error text as detail) when the query fails.
    """
    pool = await get_pool()
    if not pool:
        return []

    try:
        since = datetime.now() - timedelta(days=days)

        async with pool.acquire() as conn:
            query = """
                SELECT * FROM zeugnis_usage_events
                WHERE created_at >= $1
            """
            params = [since]

            # Each placeholder index is derived from the current parameter
            # count, so the filters stay correct in any combination.
            if document_id:
                query += f" AND document_id = ${len(params) + 1}"
                params.append(document_id)
            if event_type:
                query += f" AND event_type = ${len(params) + 1}"
                params.append(event_type)

            query += f" ORDER BY created_at DESC LIMIT ${len(params) + 1}"
            params.append(limit)

            rows = await conn.fetch(query, *params)
            return [dict(r) for r in rows]
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.get("/audit/export", response_model=dict)
async def export_audit(
    days: int = Query(30, le=365),
    requested_by: str = Query(..., description="User requesting the export"),
):
    """Export audit data for GDPR compliance.

    Returns all usage events from the last ``days`` days (newest first),
    the number of distinct documents they reference, the requesting user
    and the covered date range. ``export_date`` and ``date_range_end`` are
    the same timestamp, and ``date_range_start`` is exactly ``days`` before it.

    Raises:
        HTTPException: 503 when ``get_pool`` yields no pool, 500 (with the
            error text as detail) when a query fails.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")

    try:
        # Capture the clock once so all reported timestamps agree.
        now = datetime.now()
        since = now - timedelta(days=days)

        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT * FROM zeugnis_usage_events
                WHERE created_at >= $1
                ORDER BY created_at DESC
                """,
                since
            )

            doc_count = await conn.fetchval(
                "SELECT COUNT(DISTINCT document_id) FROM zeugnis_usage_events WHERE created_at >= $1",
                since
            )

            return {
                "export_date": now.isoformat(),
                "requested_by": requested_by,
                "events": [dict(r) for r in rows],
                "document_count": doc_count or 0,
                "date_range_start": since.isoformat(),
                "date_range_end": now.isoformat(),
            }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))