Restructure: Move 52 files into 7 domain packages
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m22s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 23s

korrektur/ zeugnis/ admin/ compliance/ worksheet/ training/ metrics/
52 shims, relative imports, RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 22:10:48 +02:00
parent 0504d22b8e
commit 165c493d1e
111 changed files with 11859 additions and 11609 deletions

View File

@@ -0,0 +1,321 @@
"""
Zeugnis API Docs — documents, crawler control, statistics, audit endpoints.
Extracted from zeugnis_api.py for modularity.
"""
from datetime import datetime, timedelta
from typing import Optional, List
from fastapi import APIRouter, HTTPException, BackgroundTasks, Query
from .models import (
CrawlRequest, EventType,
BUNDESLAENDER,
generate_id, get_training_allowed, get_license_for_bundesland,
)
from .crawler import (
start_crawler, stop_crawler, get_crawler_status,
)
from metrics_db import (
get_zeugnis_documents, get_zeugnis_stats,
log_zeugnis_event, get_pool,
)
# All endpoints in this module are mounted under /api/v1/admin/zeugnis.
router = APIRouter(prefix="/api/v1/admin/zeugnis", tags=["Zeugnis Crawler"])
# =============================================================================
# Documents Endpoints
# =============================================================================
@router.get("/documents", response_model=List[dict])
async def list_documents(
    bundesland: Optional[str] = None,
    limit: int = Query(100, le=500),
    offset: int = 0,
):
    """Return zeugnis documents, optionally filtered by Bundesland, paginated."""
    return await get_zeugnis_documents(bundesland=bundesland, limit=limit, offset=offset)
@router.get("/documents/{document_id}", response_model=dict)
async def get_document(document_id: str):
    """Return full details for one document, joined with its source metadata.

    Side effect: logs a VIEWED audit event for the document.

    Raises:
        HTTPException: 503 when the DB pool is unavailable, 404 when the
            document does not exist, 500 on unexpected database errors.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            doc = await conn.fetchrow(
                """
                SELECT d.*, s.bundesland, s.name as source_name
                FROM zeugnis_documents d
                JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                JOIN zeugnis_sources s ON u.source_id = s.id
                WHERE d.id = $1
                """,
                document_id
            )
            if not doc:
                raise HTTPException(status_code=404, detail="Document not found")
            # Record the access in the audit trail before returning.
            await log_zeugnis_event(document_id, EventType.VIEWED.value)
            return dict(doc)
    except HTTPException:
        # Re-raise our own 404 (and any other HTTPException) untouched.
        raise
    except Exception as e:
        # Chain the original exception so server logs keep the real cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.get("/documents/{document_id}/versions", response_model=List[dict])
async def get_document_versions(document_id: str):
    """Return a document's version history, newest version first."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        async with pool.acquire() as conn:
            versions = await conn.fetch(
                """
                SELECT * FROM zeugnis_document_versions
                WHERE document_id = $1
                ORDER BY version DESC
                """,
                document_id
            )
        return [dict(version) for version in versions]
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
# =============================================================================
# Crawler Control Endpoints
# =============================================================================
@router.get("/crawler/status", response_model=dict)
async def crawler_status():
    """Return the crawler's current status payload."""
    status = get_crawler_status()
    return status
@router.post("/crawler/start", response_model=dict)
async def start_crawl(request: CrawlRequest, background_tasks: BackgroundTasks):
    """Start the crawler; 409 if it is already running."""
    # NOTE(review): background_tasks is currently unused — kept for interface
    # compatibility; confirm whether it can be dropped.
    started = await start_crawler(
        bundesland=request.bundesland,
        source_id=request.source_id,
    )
    if started:
        return {"success": True, "message": "Crawler started"}
    raise HTTPException(status_code=409, detail="Crawler already running")
@router.post("/crawler/stop", response_model=dict)
async def stop_crawl():
    """Stop the crawler; 409 if it is not running."""
    stopped = await stop_crawler()
    if stopped:
        return {"success": True, "message": "Crawler stopped"}
    raise HTTPException(status_code=409, detail="Crawler not running")
@router.get("/crawler/queue", response_model=List[dict])
async def get_queue():
    """Return queued crawl jobs, highest priority first (FIFO within priority)."""
    pool = await get_pool()
    if not pool:
        # No database configured: report an empty queue rather than erroring.
        return []
    try:
        async with pool.acquire() as conn:
            records = await conn.fetch(
                """
                SELECT q.*, s.bundesland, s.name as source_name
                FROM zeugnis_crawler_queue q
                JOIN zeugnis_sources s ON q.source_id = s.id
                ORDER BY q.priority DESC, q.created_at
                """
            )
        return [dict(record) for record in records]
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/crawler/queue", response_model=dict)
async def add_to_queue(request: CrawlRequest):
    """Queue a crawl job for a source (by id, or resolved from Bundesland)."""
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    queue_id = generate_id()
    try:
        async with pool.acquire() as conn:
            source_id = request.source_id
            # Resolve the source from the Bundesland when no explicit id given.
            if not source_id and request.bundesland:
                source = await conn.fetchrow(
                    "SELECT id FROM zeugnis_sources WHERE bundesland = $1",
                    request.bundesland
                )
                if source:
                    source_id = source["id"]
            if not source_id:
                raise HTTPException(status_code=400, detail="Source not found")
            await conn.execute(
                """
                INSERT INTO zeugnis_crawler_queue (id, source_id, priority, status)
                VALUES ($1, $2, $3, 'pending')
                """,
                queue_id, source_id, request.priority
            )
        return {"id": queue_id, "success": True}
    except HTTPException:
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
# =============================================================================
# Statistics Endpoints
# =============================================================================
@router.get("/stats", response_model=dict)
async def get_stats():
    """Return aggregate zeugnis crawler statistics."""
    return await get_zeugnis_stats()
@router.get("/stats/bundesland", response_model=List[dict])
async def get_bundesland_stats():
    """Return per-Bundesland document/index statistics.

    Always yields one entry per configured Bundesland. Counts default to
    zero when the database is unavailable or a query fails — deliberately
    best-effort so this dashboard endpoint never 500s.
    """
    pool = await get_pool()
    # Start with zeroed defaults so every Bundesland appears even without a DB.
    stats = [
        {
            "bundesland": code,
            "name": info["name"],
            "training_allowed": get_training_allowed(code),
            "document_count": 0,
            "indexed_count": 0,
            "last_crawled": None,
        }
        for code, info in BUNDESLAENDER.items()
    ]
    if not pool:
        return stats
    try:
        # One connection for all Bundeslaender instead of re-acquiring the
        # pool connection inside the loop (previously up to 16 acquires).
        async with pool.acquire() as conn:
            for stat in stats:
                try:
                    row = await conn.fetchrow(
                        """
                        SELECT
                            COUNT(d.id) as doc_count,
                            COUNT(CASE WHEN d.indexed_in_qdrant THEN 1 END) as indexed_count,
                            MAX(u.last_crawled) as last_crawled
                        FROM zeugnis_sources s
                        LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
                        LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
                        WHERE s.bundesland = $1
                        GROUP BY s.id
                        """,
                        stat["bundesland"]
                    )
                except Exception:
                    # Best-effort: keep the zeroed defaults for this row.
                    continue
                if row:
                    stat["document_count"] = row["doc_count"] or 0
                    stat["indexed_count"] = row["indexed_count"] or 0
                    stat["last_crawled"] = (
                        row["last_crawled"].isoformat() if row["last_crawled"] else None
                    )
    except Exception:
        # Acquiring the connection itself failed — return the defaults.
        pass
    return stats
# =============================================================================
# Audit Endpoints
# =============================================================================
@router.get("/audit/events", response_model=List[dict])
async def get_audit_events(
    document_id: Optional[str] = None,
    event_type: Optional[str] = None,
    limit: int = Query(100, le=1000),
    days: int = Query(30, le=365),
):
    """Return audit events from the last ``days`` days, newest first.

    Optional filters: document_id and event_type. Returns [] (not 503)
    when the database is unavailable.
    """
    pool = await get_pool()
    if not pool:
        return []
    try:
        # NOTE(review): naive local time — confirm created_at is stored in the
        # same timezone as the app server.
        since = datetime.now() - timedelta(days=days)
        query = """
            SELECT * FROM zeugnis_usage_events
            WHERE created_at >= $1
        """
        params = [since]
        if document_id:
            # Derive the placeholder index from params (was a hardcoded $2) so
            # all clauses stay in sync if filters are added or reordered.
            query += f" AND document_id = ${len(params) + 1}"
            params.append(document_id)
        if event_type:
            query += f" AND event_type = ${len(params) + 1}"
            params.append(event_type)
        query += f" ORDER BY created_at DESC LIMIT ${len(params) + 1}"
        params.append(limit)
        async with pool.acquire() as conn:
            rows = await conn.fetch(query, *params)
        return [dict(r) for r in rows]
    except Exception as e:
        # Chain the original exception so server logs keep the real cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.get("/audit/export", response_model=dict)
async def export_audit(
    days: int = Query(30, le=365),
    requested_by: str = Query(..., description="User requesting the export"),
):
    """Export audit data for GDPR compliance.

    Returns all usage events from the last ``days`` days plus the count of
    distinct documents involved, stamped with who requested the export.

    Raises:
        HTTPException: 503 when the DB pool is unavailable, 500 on
            unexpected database errors.
    """
    pool = await get_pool()
    if not pool:
        raise HTTPException(status_code=503, detail="Database not available")
    try:
        # Single timestamp so export_date, date_range_end and the query cutoff
        # all refer to the same instant (previously three separate now() calls).
        now = datetime.now()
        since = now - timedelta(days=days)
        async with pool.acquire() as conn:
            rows = await conn.fetch(
                """
                SELECT * FROM zeugnis_usage_events
                WHERE created_at >= $1
                ORDER BY created_at DESC
                """,
                since
            )
            doc_count = await conn.fetchval(
                "SELECT COUNT(DISTINCT document_id) FROM zeugnis_usage_events WHERE created_at >= $1",
                since
            )
        return {
            "export_date": now.isoformat(),
            "requested_by": requested_by,
            "events": [dict(r) for r in rows],
            "document_count": doc_count or 0,
            "date_range_start": since.isoformat(),
            "date_range_end": now.isoformat(),
        }
    except Exception as e:
        # Chain the original exception so server logs keep the real cause.
        raise HTTPException(status_code=500, detail=str(e)) from e