# breakpilot-lehrer/klausur-service/backend/zeugnis_api_docs.py

"""
Zeugnis API Docs — documents, crawler control, statistics, audit endpoints.

Extracted from zeugnis_api.py for modularity.
"""

from datetime import datetime, timedelta
from typing import Optional, List
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query

from zeugnis_models import (
    CrawlRequest, EventType,
    BUNDESLAENDER,
    generate_id, get_training_allowed, get_license_for_bundesland,
)
from zeugnis_crawler import (
    start_crawler, stop_crawler, get_crawler_status,
)
from metrics_db import (
    get_zeugnis_documents, get_zeugnis_stats,
    log_zeugnis_event, get_pool,
)


router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])


# =============================================================================
# Documents Endpoints
# =============================================================================

@router.get("/documents", response_model=List[dict])
async def list_documents(
    bundesland: Optional[str] = None,
    limit: int = Query(100, ge=0, le=500),
    offset: int = 0,
):
    """Return zeugnis documents, optionally filtered by Bundesland.

    Args:
        bundesland: Optional filter forwarded to ``get_zeugnis_documents``.
        limit: Page size; accepted range is 0-500 (default 100).
        offset: Number of documents to skip; must not be negative.

    Returns:
        Whatever ``get_zeugnis_documents`` yields for the requested page.

    Raises:
        HTTPException: 422 if ``offset`` is negative.
    """
    # Reject a negative offset here instead of forwarding it to the data
    # layer. The check lives in the body (rather than in a ``Query``) so the
    # parameter keeps its plain ``0`` default.
    if offset < 0:
        raise HTTPException(status_code=422, detail="offset must be >= 0")

    return await get_zeugnis_documents(
        bundesland=bundesland, limit=limit, offset=offset
    )


@router.get("/documents/{document_id}", response_model=dict)
async def get_document(document_id: str):
    """Return one document joined with its source and record a view event.

    Args:
        document_id: Primary key of the row in ``zeugnis_documents``.

    Returns:
        The document row as a dict, extended with the source's ``bundesland``
        and ``source_name``.

    Raises:
        HTTPException: 503 if no database pool is available, 404 if the
            document does not exist, 500 for any other failure (including a
            failure while logging the view event).
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")

    try:
        async with pool.acquire() as conn:
            doc = await conn.fetchrow(
                """
                SELECT d.*, s.bundesland, s.name as source_name
                FROM zeugnis_documents d
                JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                JOIN zeugnis_sources s ON u.source_id = s.id
                WHERE d.id = $1
                """,
                document_id
            )

        # The connection is released before the audit write so this handler
        # never holds a pooled connection while awaiting the logger.
        # NOTE(review): log_zeugnis_event presumably draws from the same
        # pool - confirm.
        if not doc:
            raise HTTPException(status_code=404, detail="Document not found")

        # Log view event
        await log_zeugnis_event(document_id, EventType.VIEWED.value)

        return dict(doc)
    except HTTPException:
        # Deliberate HTTP errors (the 404 above) pass through unchanged.
        raise
    except Exception as e:
        # Chain the cause so the original traceback survives in server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.get("/documents/{document_id}/versions", response_model=List[dict])
async def get_document_versions(document_id: str):
    """Return all stored versions of a document, highest version first.

    Raises:
        HTTPException: 503 if no database pool is available, 500 if the
            query fails.
    """
    db_pool = await get_pool()
    if not db_pool:
        raise HTTPException(status_code=503, detail="Database not available")

    try:
        async with db_pool.acquire() as connection:
            version_rows = await connection.fetch(
                """
                SELECT * FROM zeugnis_document_versions
                WHERE document_id = $1
                ORDER BY version DESC
                """,
                document_id
            )
            # Convert each row into a plain dict for the response.
            return list(map(dict, version_rows))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))


# =============================================================================
# Crawler Control Endpoints
# =============================================================================

@router.get("/crawler/status", response_model=dict)
async def crawler_status():
    """Report the crawler state exactly as ``get_crawler_status`` returns it."""
    current_status = get_crawler_status()
    return current_status


@router.post("/crawler/start", response_model=dict)
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    """Ask the crawler to start for the requested Bundesland and/or source.

    ``background_tasks`` is accepted but not used by this handler.

    Raises:
        HTTPException: 409 when ``start_crawler`` reports that it did not
            start.
    """
    started = await start_crawler(
        bundesland=request.bundesland,
        source_id=request.source_id,
    )
    if started:
        return {"success": True, "message": "Crawler started"}
    raise HTTPException(status_code=409, detail="Crawler already running")


@router.post("/crawler/stop", response_model=dict)
async def stop_crawl():
    """Ask the crawler to stop.

    Raises:
        HTTPException: 409 when ``stop_crawler`` reports that nothing was
            stopped.
    """
    stopped = await stop_crawler()
    if stopped:
        return {"success": True, "message": "Crawler stopped"}
    raise HTTPException(status_code=409, detail="Crawler not running")


@router.get("/crawler/queue", response_model=List[dict])
async def get_queue():
    """List crawler queue entries, highest priority first, then oldest first.

    Returns an empty list when no database pool is available.

    Raises:
        HTTPException: 500 if the query fails.
    """
    db_pool = await get_pool()
    if not db_pool:
        # Without a database the queue is reported as empty, not as an error.
        return []

    try:
        async with db_pool.acquire() as connection:
            queue_rows = await connection.fetch(
                """
                SELECT q.*, s.bundesland, s.name as source_name
                FROM zeugnis_crawler_queue q
                JOIN zeugnis_sources s ON q.source_id = s.id
                ORDER BY q.priority DESC, q.created_at
                """
            )
            return list(map(dict, queue_rows))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))


@router.post("/crawler/queue", response_model=dict)
async def add_to_queue(request: CrawlRequest):
    """Insert a pending entry for a source into the crawler queue.

    The source is taken from ``request.source_id``; when that is missing and
    ``request.bundesland`` is set, a source for that Bundesland is looked up.

    Returns:
        ``{"id": <new queue id>, "success": True}``.

    Raises:
        HTTPException: 503 if no database pool is available, 400 if no
            source could be determined, 500 for any other failure.
    """
    db_pool = await get_pool()
    if not db_pool:
        raise HTTPException(status_code=503, detail="Database not available")

    new_entry_id = generate_id()
    try:
        async with db_pool.acquire() as connection:
            # Fall back to a lookup by Bundesland when no source ID was given.
            target_source_id = request.source_id
            if not target_source_id and request.bundesland:
                matching_source = await connection.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    request.bundesland
                )
                target_source_id = (
                    matching_source["id"] if matching_source else target_source_id
                )

            if not target_source_id:
                raise HTTPException(status_code=400, detail="Source not found")

            await connection.execute(
                """
                INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
                VALUES ($1, $2, $3, 'pending')
                """,
                new_entry_id, target_source_id, request.priority
            )
        return {"id": new_entry_id, "success": True}
    except HTTPException:
        # Keep the deliberate 400 from being rewrapped as a 500.
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))


# =============================================================================
# Statistics Endpoints
# =============================================================================

@router.get("/stats", response_model=dict)
async def get_stats():
    """Return the overall statistics produced by ``get_zeugnis_stats``."""
    return await get_zeugnis_stats()


@router.get("/stats/bundesland", response_model=List[dict])
async def get_bundesland_stats():
    """Return one statistics entry per Bundesland listed in ``BUNDESLAENDER``.

    Each entry contains the code, display name, training permission and,
    when the database is reachable, the document count, indexed-document
    count and latest crawl time aggregated over all sources of that
    Bundesland. Without a database (or if the query fails) the counts stay 0
    and ``last_crawled`` stays ``None``.

    Returns:
        A list of dicts with the keys ``bundesland``, ``name``,
        ``training_allowed``, ``document_count``, ``indexed_count`` and
        ``last_crawled`` (ISO-8601 string or ``None``).
    """
    import logging  # function-scope import keeps this block self-contained

    pool = await get_pool()

    # Database figures keyed by Bundesland code. One grouped query replaces
    # the former per-Bundesland round trips; grouping by Bundesland (not by
    # source ID) also sums the figures of every source in a Bundesland.
    db_stats = {}
    if pool:
        try:
            async with pool.acquire() as conn:
                rows = await conn.fetch(
                    """
                    SELECT
                        s.bundesland AS bundesland,
                        COUNT(d.id) AS doc_count,
                        COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) AS indexed_count,
                        MAX(u.last_crawled) AS last_crawled
                    FROM zeugnis_sources s
                    LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
                    LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
                    GROUP BY s.bundesland
                    """
                )
            db_stats = {
                row["bundesland"]: {
                    "document_count": row["doc_count"] or 0,
                    "indexed_count": row["indexed_count"] or 0,
                    "last_crawled": (
                        row["last_crawled"].isoformat()
                        if row["last_crawled"] else None
                    ),
                }
                for row in rows
            }
        except Exception:
            # Still best effort: the endpoint answers with default values,
            # but the failure is logged instead of being silently discarded.
            logging.getLogger(__name__).exception(
                "Failed to load per-Bundesland zeugnis statistics"
            )
            db_stats = {}

    stats = []
    for code, info in BUNDESLAENDER.items():
        stat = {
            "bundesland": code,
            "name": info["name"],
            "training_allowed": get_training_allowed(code),
            "document_count": 0,
            "indexed_count": 0,
            "last_crawled": None,
        }
        # Overlay database figures where a source exists for this code.
        stat.update(db_stats.get(code, {}))
        stats.append(stat)

    return stats


# =============================================================================
# Audit Endpoints
# =============================================================================

@router.get("/audit/events", response_model=List[dict])
async def get_audit_events(
    document_id: Optional[str] = None,
    event_type: Optional[str] = None,
    limit: int = Query(100, ge=0, le=1000),
    days: int = Query(30, ge=0, le=365),
):
    """Return recent usage events, newest first, with optional filters.

    Args:
        document_id: Only return events for this document.
        event_type: Only return events of this type.
        limit: Maximum number of events (0-1000, default 100).
        days: Size of the look-back window in days (0-365, default 30).
            Negative values are rejected because they would place the
            window start in the future.

    Returns:
        A list of event rows as dicts; an empty list when no database pool
        is available.

    Raises:
        HTTPException: 500 if the query fails.
    """
    pool = await get_pool()
    if not pool:
        return []

    try:
        since = datetime.now() - timedelta(days=days)

        # Build the WHERE clause and the positional parameters together so
        # each placeholder number always matches its position in ``params``.
        # Column names are fixed literals; only values come from the request.
        conditions = ["created_at >= $1"]
        params = [since]
        optional_filters = (
            ("document_id", document_id),
            ("event_type", event_type),
        )
        for column, value in optional_filters:
            if value:
                params.append(value)
                conditions.append(f"{column} = ${len(params)}")

        params.append(limit)
        where_clause = " AND ".join(conditions)
        query = (
            "SELECT * FROM zeugnis_usage_events"
            f" WHERE {where_clause}"
            f" ORDER BY created_at DESC LIMIT ${len(params)}"
        )

        async with pool.acquire() as conn:
            rows = await conn.fetch(query, *params)
        return [dict(r) for r in rows]
    except Exception as e:
        # Chain the cause so the original traceback survives in server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.get("/audit/export", response_model=dict)
async def export_audit(
    days: int = Query(30, ge=0, le=365),
    requested_by: str = Query(..., description="User requesting the export"),
):
    """Export usage events of the last ``days`` days (GDPR audit export).

    Args:
        days: Size of the look-back window in days (0-365, default 30).
        requested_by: Name of the user requesting the export; echoed back in
            the response.

    Returns:
        A dict with ``export_date``, ``requested_by``, the exported
        ``events``, the number of distinct documents among them
        (``document_count``) and the covered date range.

    Raises:
        HTTPException: 503 if no database pool is available, 500 if the
            query fails.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")

    try:
        # Take one timestamp so the window start, the window end and the
        # export date all describe the same instant.
        now = datetime.now()
        since = now - timedelta(days=days)

        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT * FROM zeugnis_usage_events
                WHERE created_at >= $1
                ORDER BY created_at DESC
                """,
                since
            )

        # Count distinct documents from the rows being exported, so the
        # figure always matches ``events`` and needs no second query.
        # NULL IDs are skipped, as COUNT(DISTINCT ...) would skip them.
        document_ids = {
            r["document_id"] for r in rows if r["document_id"] is not None
        }
        exported_at = now.isoformat()

        return {
            "export_date": exported_at,
            "requested_by": requested_by,
            "events": [dict(r) for r in rows],
            "document_count": len(document_ids),
            "date_range_start": since.isoformat(),
            "date_range_end": exported_at,
        }
    except Exception as e:
        # Chain the cause so the original traceback survives in server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e