# breakpilot-lehrer/klausur-service/backend/metrics_db.py

"""
PostgreSQL Metrics Database Service
Stores search feedback, calculates quality metrics (Precision, Recall, MRR).
"""

import os
from typing import Optional, List, Dict
from datetime import datetime, timedelta
import asyncio

# Database configuration. When the DATABASE_URL environment variable is not
# set, a local test database URL is used as the default (for CI).
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://test:test@localhost:5432/test_metrics")

# Module-wide connection pool. Stays None until get_pool() creates it.
_pool = None


async def get_pool():
    """Return the shared asyncpg connection pool, creating it on first use.

    Returns:
        The module-wide pool, or None when asyncpg cannot be imported or the
        pool cannot be created. A warning is printed in both failure cases,
        and the next call will try again because nothing is cached on failure.
    """
    global _pool

    # Fast path: a pool was already created by an earlier call.
    if _pool is not None:
        return _pool

    try:
        # Imported lazily so the module still loads without asyncpg installed.
        import asyncpg
        _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10)
    except ImportError:
        print("Warning: asyncpg not installed. Metrics storage disabled.")
        return None
    except Exception as exc:
        print(f"Warning: Failed to connect to PostgreSQL: {exc}")
        return None

    return _pool


async def init_metrics_tables() -> bool:
    """Create the RAG metrics and zeugnis tables and their indexes.

    Every CREATE TABLE / CREATE INDEX statement in the script uses
    IF NOT EXISTS. The script covers the feedback, search-log, upload-history
    and relevance-judgment tables as well as the zeugnis source, seed-URL,
    document, document-version, usage-event and crawler-queue tables.

    Returns:
        True when the script was executed; False when no connection pool is
        available or executing the script raised (the error is printed).
    """
    pool = await get_pool()
    if pool is None:
        return False

    # The whole schema is kept in one multi-statement script and sent to the
    # server with a single execute() call below.
    create_tables_sql = """
    -- RAG Search Feedback Table
    CREATE TABLE IF NOT EXISTS rag_search_feedback (
        id SERIAL PRIMARY KEY,
        result_id VARCHAR(255) NOT NULL,
        query_text TEXT,
        collection_name VARCHAR(100),
        score FLOAT,
        rating INTEGER CHECK (rating >= 1 AND rating <= 5),
        notes TEXT,
        user_id VARCHAR(100),
        created_at TIMESTAMP DEFAULT NOW()
    );

    -- Index for efficient querying
    CREATE INDEX IF NOT EXISTS idx_feedback_created_at ON rag_search_feedback(created_at);
    CREATE INDEX IF NOT EXISTS idx_feedback_collection ON rag_search_feedback(collection_name);
    CREATE INDEX IF NOT EXISTS idx_feedback_rating ON rag_search_feedback(rating);

    -- RAG Search Logs Table (for latency tracking)
    CREATE TABLE IF NOT EXISTS rag_search_logs (
        id SERIAL PRIMARY KEY,
        query_text TEXT NOT NULL,
        collection_name VARCHAR(100),
        result_count INTEGER,
        latency_ms INTEGER,
        top_score FLOAT,
        filters JSONB,
        created_at TIMESTAMP DEFAULT NOW()
    );

    CREATE INDEX IF NOT EXISTS idx_search_logs_created_at ON rag_search_logs(created_at);

    -- RAG Upload History Table
    CREATE TABLE IF NOT EXISTS rag_upload_history (
        id SERIAL PRIMARY KEY,
        filename VARCHAR(500) NOT NULL,
        collection_name VARCHAR(100),
        year INTEGER,
        pdfs_extracted INTEGER,
        minio_path VARCHAR(1000),
        uploaded_by VARCHAR(100),
        created_at TIMESTAMP DEFAULT NOW()
    );

    CREATE INDEX IF NOT EXISTS idx_upload_history_created_at ON rag_upload_history(created_at);

    -- Binäre Relevanz-Judgments für echte Precision/Recall
    CREATE TABLE IF NOT EXISTS rag_relevance_judgments (
        id SERIAL PRIMARY KEY,
        query_id VARCHAR(255) NOT NULL,
        query_text TEXT NOT NULL,
        result_id VARCHAR(255) NOT NULL,
        result_rank INTEGER,
        is_relevant BOOLEAN NOT NULL,
        collection_name VARCHAR(100),
        user_id VARCHAR(100),
        created_at TIMESTAMP DEFAULT NOW()
    );

    CREATE INDEX IF NOT EXISTS idx_relevance_query ON rag_relevance_judgments(query_id);
    CREATE INDEX IF NOT EXISTS idx_relevance_created_at ON rag_relevance_judgments(created_at);

    -- Zeugnisse Source Tracking
    CREATE TABLE IF NOT EXISTS zeugnis_sources (
        id VARCHAR(36) PRIMARY KEY,
        bundesland VARCHAR(10) NOT NULL,
        name VARCHAR(255) NOT NULL,
        base_url TEXT,
        license_type VARCHAR(50) NOT NULL,
        training_allowed BOOLEAN DEFAULT FALSE,
        verified_by VARCHAR(100),
        verified_at TIMESTAMP,
        created_at TIMESTAMP DEFAULT NOW(),
        updated_at TIMESTAMP DEFAULT NOW()
    );

    CREATE INDEX IF NOT EXISTS idx_zeugnis_sources_bundesland ON zeugnis_sources(bundesland);

    -- Zeugnisse Seed URLs
    CREATE TABLE IF NOT EXISTS zeugnis_seed_urls (
        id VARCHAR(36) PRIMARY KEY,
        source_id VARCHAR(36) REFERENCES zeugnis_sources(id),
        url TEXT NOT NULL,
        doc_type VARCHAR(50),
        status VARCHAR(20) DEFAULT 'pending',
        last_crawled TIMESTAMP,
        error_message TEXT,
        created_at TIMESTAMP DEFAULT NOW()
    );

    CREATE INDEX IF NOT EXISTS idx_zeugnis_seed_urls_source ON zeugnis_seed_urls(source_id);
    CREATE INDEX IF NOT EXISTS idx_zeugnis_seed_urls_status ON zeugnis_seed_urls(status);

    -- Zeugnisse Documents
    CREATE TABLE IF NOT EXISTS zeugnis_documents (
        id VARCHAR(36) PRIMARY KEY,
        seed_url_id VARCHAR(36) REFERENCES zeugnis_seed_urls(id),
        title VARCHAR(500),
        url TEXT NOT NULL,
        content_hash VARCHAR(64),
        minio_path TEXT,
        training_allowed BOOLEAN DEFAULT FALSE,
        indexed_in_qdrant BOOLEAN DEFAULT FALSE,
        file_size INTEGER,
        content_type VARCHAR(100),
        created_at TIMESTAMP DEFAULT NOW(),
        updated_at TIMESTAMP DEFAULT NOW()
    );

    CREATE INDEX IF NOT EXISTS idx_zeugnis_documents_seed ON zeugnis_documents(seed_url_id);
    CREATE INDEX IF NOT EXISTS idx_zeugnis_documents_hash ON zeugnis_documents(content_hash);

    -- Zeugnisse Document Versions
    CREATE TABLE IF NOT EXISTS zeugnis_document_versions (
        id VARCHAR(36) PRIMARY KEY,
        document_id VARCHAR(36) REFERENCES zeugnis_documents(id),
        version INTEGER NOT NULL,
        content_hash VARCHAR(64),
        minio_path TEXT,
        change_summary TEXT,
        created_at TIMESTAMP DEFAULT NOW()
    );

    CREATE INDEX IF NOT EXISTS idx_zeugnis_versions_doc ON zeugnis_document_versions(document_id);

    -- Zeugnisse Usage Events (Audit Trail)
    CREATE TABLE IF NOT EXISTS zeugnis_usage_events (
        id VARCHAR(36) PRIMARY KEY,
        document_id VARCHAR(36) REFERENCES zeugnis_documents(id),
        event_type VARCHAR(50) NOT NULL,
        user_id VARCHAR(100),
        details JSONB,
        created_at TIMESTAMP DEFAULT NOW()
    );

    CREATE INDEX IF NOT EXISTS idx_zeugnis_events_doc ON zeugnis_usage_events(document_id);
    CREATE INDEX IF NOT EXISTS idx_zeugnis_events_type ON zeugnis_usage_events(event_type);
    CREATE INDEX IF NOT EXISTS idx_zeugnis_events_created ON zeugnis_usage_events(created_at);

    -- Crawler Queue
    CREATE TABLE IF NOT EXISTS zeugnis_crawler_queue (
        id VARCHAR(36) PRIMARY KEY,
        source_id VARCHAR(36) REFERENCES zeugnis_sources(id),
        priority INTEGER DEFAULT 5,
        status VARCHAR(20) DEFAULT 'pending',
        started_at TIMESTAMP,
        completed_at TIMESTAMP,
        documents_found INTEGER DEFAULT 0,
        documents_indexed INTEGER DEFAULT 0,
        error_count INTEGER DEFAULT 0,
        created_at TIMESTAMP DEFAULT NOW()
    );

    CREATE INDEX IF NOT EXISTS idx_crawler_queue_status ON zeugnis_crawler_queue(status);
    """

    try:
        # The connection is handed back to the pool when the block exits.
        async with pool.acquire() as conn:
            await conn.execute(create_tables_sql)
        print("RAG metrics tables initialized")
        return True
    except Exception as e:
        # Any failure is reported on stdout and signalled via the return value.
        print(f"Failed to initialize metrics tables: {e}")
        return False


# =============================================================================
# Feedback Storage
# =============================================================================

async def store_feedback(
    result_id: str,
    rating: int,
    query_text: Optional[str] = None,
    collection_name: Optional[str] = None,
    score: Optional[float] = None,
    notes: Optional[str] = None,
    user_id: Optional[str] = None,
) -> bool:
    """Insert one feedback row for a search result into rag_search_feedback.

    Args:
        result_id: Identifier of the rated search result.
        rating: Rating given to the result. The table definition in this
            file restricts it to 1..5 via a CHECK constraint.
        query_text: Query that produced the result, if known.
        collection_name: Collection the result came from, if known.
        score: Score of the result, if known.
        notes: Free-text remarks.
        user_id: Identifier of the user giving the feedback.

    Returns:
        True when the row was inserted; False when no connection pool is
        available or the insert raised (the error is printed).
    """
    db_pool = await get_pool()
    if db_pool is None:
        return False

    # Values in the same order as the column list of the INSERT below.
    row_values = (result_id, query_text, collection_name, score, rating, notes, user_id)

    try:
        async with db_pool.acquire() as connection:
            insert_sql = """
                INSERT INTO rag_search_feedback
                (result_id, query_text, collection_name, score, rating, notes, user_id)
                VALUES ($1, $2, $3, $4, $5, $6, $7)
                """
            await connection.execute(insert_sql, *row_values)
    except Exception as exc:
        print(f"Failed to store feedback: {exc}")
        return False
    else:
        return True


async def log_search(
    query_text: str,
    collection_name: str,
    result_count: int,
    latency_ms: int,
    top_score: Optional[float] = None,
    filters: Optional[Dict] = None,
) -> bool:
    """Insert one row into rag_search_logs describing an executed search.

    Args:
        query_text: The search query.
        collection_name: Collection that was searched.
        result_count: Number of results returned.
        latency_ms: Search latency in milliseconds.
        top_score: Score of the best result, if any.
        filters: Filters applied to the search. Serialized to a JSON string;
            a missing or empty dict is stored as NULL.

    Returns:
        True when the row was inserted; False when no connection pool is
        available or anything in the insert path raised (the error is printed).
    """
    db_pool = await get_pool()
    if db_pool is None:
        return False

    try:
        import json

        async with db_pool.acquire() as connection:
            # Falsy filters (None or {}) are written as NULL, not as "{}".
            if filters:
                filters_json = json.dumps(filters)
            else:
                filters_json = None

            insert_sql = """
                INSERT INTO rag_search_logs
                (query_text, collection_name, result_count, latency_ms, top_score, filters)
                VALUES ($1, $2, $3, $4, $5, $6)
                """
            await connection.execute(
                insert_sql,
                query_text,
                collection_name,
                result_count,
                latency_ms,
                top_score,
                filters_json,
            )
    except Exception as exc:
        print(f"Failed to log search: {exc}")
        return False
    else:
        return True


async def log_upload(
    filename: str,
    collection_name: str,
    year: int,
    pdfs_extracted: int,
    minio_path: Optional[str] = None,
    uploaded_by: Optional[str] = None,
) -> bool:
    """Insert one row into rag_upload_history describing an upload.

    Args:
        filename: Name of the uploaded file.
        collection_name: Collection the upload belongs to.
        year: Year associated with the upload.
        pdfs_extracted: Number of PDFs extracted from the upload.
        minio_path: Storage path of the upload, if any.
        uploaded_by: Identifier of the uploading user, if known.

    Returns:
        True when the row was inserted; False when no connection pool is
        available or the insert raised (the error is printed).
    """
    db_pool = await get_pool()
    if db_pool is None:
        return False

    # Values in the same order as the column list of the INSERT below.
    row_values = (filename, collection_name, year, pdfs_extracted, minio_path, uploaded_by)

    try:
        async with db_pool.acquire() as connection:
            insert_sql = """
                INSERT INTO rag_upload_history
                (filename, collection_name, year, pdfs_extracted, minio_path, uploaded_by)
                VALUES ($1, $2, $3, $4, $5, $6)
                """
            await connection.execute(insert_sql, *row_values)
    except Exception as exc:
        print(f"Failed to log upload: {exc}")
        return False
    else:
        return True


# =============================================================================
# Metrics Calculation
# =============================================================================

async def calculate_metrics(
    collection_name: Optional[str] = None,
    days: int = 7,
) -> Dict:
    """
    Calculate RAG quality metrics from stored feedback and search logs.

    The quality figures are proxies derived from user ratings, not true
    information-retrieval metrics:

    * ``precision_at_5`` is the share of feedback rows rated 4 or higher.
    * ``recall_at_10`` is estimated as ``precision_at_5 * 1.1``, capped at 1.0.
    * ``mrr`` is the average rating divided by 5.

    Args:
        collection_name: Restrict all figures to this collection. When empty
            or None, all collections are included.
        days: Size of the look-back window in days, counted from now.

    Returns:
        On success a dict with ``connected=True``, ``period_days``,
        ``precision_at_5``, ``recall_at_10``, ``mrr``, ``avg_latency_ms``,
        ``total_ratings``, ``total_searches``, ``error_rate`` (percentage of
        ratings <= 2), ``score_distribution`` (percent per score bucket) and
        ``rating_distribution`` (count per rating).
        On failure a dict with ``error`` and ``connected=False``.
    """
    pool = await get_pool()
    if pool is None:
        return {"error": "Database not available", "connected": False}

    try:
        async with pool.acquire() as conn:
            # Lower bound of the reporting window.
            since = datetime.now() - timedelta(days=days)

            # Optional collection filter; only fixed SQL text is interpolated,
            # the collection name itself is always bound as parameter $2.
            collection_filter = ""
            params = [since]
            if collection_name:
                collection_filter = "AND collection_name = $2"
                params.append(collection_name)

            # Total feedback count
            total_feedback = await conn.fetchval(
                f"""
                SELECT COUNT(*) FROM rag_search_feedback
                WHERE created_at >= $1 {collection_filter}
                """,
                *params
            ) or 0

            # Rating distribution
            rating_dist = await conn.fetch(
                f"""
                SELECT rating, COUNT(*) as count
                FROM rag_search_feedback
                WHERE created_at >= $1 {collection_filter}
                GROUP BY rating
                ORDER BY rating DESC
                """,
                *params
            )

            # Average rating (proxy for relevance)
            avg_rating = await conn.fetchval(
                f"""
                SELECT AVG(rating) FROM rag_search_feedback
                WHERE created_at >= $1 {collection_filter}
                """,
                *params
            )

            # Score distribution
            score_dist = await conn.fetch(
                f"""
                SELECT
                    CASE
                        WHEN score >= 0.9 THEN '0.9+'
                        WHEN score >= 0.7 THEN '0.7-0.9'
                        WHEN score >= 0.5 THEN '0.5-0.7'
                        ELSE '<0.5'
                    END as range,
                    COUNT(*) as count
                FROM rag_search_feedback
                WHERE created_at >= $1 AND score IS NOT NULL {collection_filter}
                GROUP BY range
                ORDER BY range DESC
                """,
                *params
            )

            # Search logs for latency; rag_search_logs uses the same
            # collection_name column, so the filter applies unchanged.
            latency_stats = await conn.fetchrow(
                f"""
                SELECT
                    AVG(latency_ms) as avg_latency,
                    COUNT(*) as total_searches,
                    AVG(result_count) as avg_results
                FROM rag_search_logs
                WHERE created_at >= $1 {collection_filter}
                """,
                *params
            )

            # Share of feedback rows rated 4+ (reported as precision@5).
            precision_at_5 = float(
                await conn.fetchval(
                    f"""
                    SELECT
                        CASE WHEN COUNT(*) > 0
                        THEN CAST(SUM(CASE WHEN rating >= 4 THEN 1 ELSE 0 END) AS FLOAT) / COUNT(*)
                        ELSE 0 END
                    FROM rag_search_feedback
                    WHERE created_at >= $1 {collection_filter}
                    """,
                    *params
                ) or 0
            )

            # AVG() over an INTEGER column is NUMERIC in PostgreSQL, which
            # asyncpg returns as decimal.Decimal. Decimal cannot be divided by
            # a float, so convert before doing float arithmetic.
            avg_rating_value = float(avg_rating) if avg_rating is not None else 0.0

            # Simplified MRR: average rating scaled to 0..1.
            mrr = avg_rating_value / 5.0

            # Estimated recall; capped because a recall above 1.0 is invalid.
            recall_at_10 = min(precision_at_5 * 1.1, 1.0)

            # Error rate (ratings of 1 or 2)
            error_count = sum(
                r['count'] for r in rating_dist if r['rating'] and r['rating'] <= 2
            )
            error_rate = (error_count / total_feedback * 100) if total_feedback > 0 else 0

            # Score distribution as percentages
            total_scored = sum(s['count'] for s in score_dist)
            score_distribution = {
                s['range']: round(s['count'] / total_scored * 100) if total_scored > 0 else 0
                for s in score_dist
            }

            return {
                "connected": True,
                "period_days": days,
                "precision_at_5": round(precision_at_5, 2),
                "recall_at_10": round(recall_at_10, 2),  # Estimated
                "mrr": round(mrr, 2),
                "avg_latency_ms": round(float(latency_stats['avg_latency'] or 0)),
                "total_ratings": total_feedback,
                "total_searches": latency_stats['total_searches'] or 0,
                "error_rate": round(error_rate, 1),
                "score_distribution": score_distribution,
                "rating_distribution": {
                    str(r['rating']): r['count'] for r in rating_dist if r['rating']
                },
            }

    except Exception as e:
        print(f"Failed to calculate metrics: {e}")
        return {"error": str(e), "connected": False}


async def get_recent_feedback(limit: int = 20) -> List[Dict]:
    """Return the newest rows of rag_search_feedback, newest first.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        A list of dicts with the keys result_id, rating, query_text,
        collection_name, score, notes and created_at (ISO-8601 string or
        None). An empty list is returned when no connection pool is available
        or the query raised (the error is printed).
    """
    db_pool = await get_pool()
    if db_pool is None:
        return []

    # Columns copied into the result unchanged; created_at is handled apart.
    plain_columns = ("result_id", "rating", "query_text", "collection_name", "score", "notes")

    try:
        async with db_pool.acquire() as connection:
            select_sql = """
                SELECT result_id, rating, query_text, collection_name, score, notes, created_at
                FROM rag_search_feedback
                ORDER BY created_at DESC
                LIMIT $1
                """
            records = await connection.fetch(select_sql, limit)

            feedback = []
            for record in records:
                entry = {column: record[column] for column in plain_columns}
                created = record['created_at']
                entry["created_at"] = created.isoformat() if created else None
                feedback.append(entry)
            return feedback
    except Exception as exc:
        print(f"Failed to get recent feedback: {exc}")
        return []


async def get_upload_history(limit: int = 20) -> List[Dict]:
    """Return the newest rows of rag_upload_history, newest first.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        A list of dicts with the keys filename, collection_name, year,
        pdfs_extracted, minio_path, uploaded_by and created_at (ISO-8601
        string or None). An empty list is returned when no connection pool is
        available or the query raised (the error is printed).
    """
    db_pool = await get_pool()
    if db_pool is None:
        return []

    # Columns copied into the result unchanged; created_at is handled apart.
    plain_columns = (
        "filename",
        "collection_name",
        "year",
        "pdfs_extracted",
        "minio_path",
        "uploaded_by",
    )

    try:
        async with db_pool.acquire() as connection:
            select_sql = """
                SELECT filename, collection_name, year, pdfs_extracted, minio_path, uploaded_by, created_at
                FROM rag_upload_history
                ORDER BY created_at DESC
                LIMIT $1
                """
            records = await connection.fetch(select_sql, limit)

            uploads = []
            for record in records:
                entry = {column: record[column] for column in plain_columns}
                created = record['created_at']
                entry["created_at"] = created.isoformat() if created else None
                uploads.append(entry)
            return uploads
    except Exception as exc:
        print(f"Failed to get upload history: {exc}")
        return []


# =============================================================================
# Relevance Judgments (Binary Precision/Recall)
# =============================================================================

async def store_relevance_judgment(
    query_id: str,
    query_text: str,
    result_id: str,
    is_relevant: bool,
    result_rank: Optional[int] = None,
    collection_name: Optional[str] = None,
    user_id: Optional[str] = None,
) -> bool:
    """Insert one binary relevance judgment into rag_relevance_judgments.

    Args:
        query_id: Identifier grouping all judgments of one query.
        query_text: The query text.
        result_id: Identifier of the judged result.
        is_relevant: Whether the result was judged relevant.
        result_rank: Rank of the result in the result list, if known.
        collection_name: Collection the result came from, if known.
        user_id: Identifier of the judging user, if known.

    Returns:
        True when the statement ran without raising; False when no connection
        pool is available or the statement raised (the error is printed).

    NOTE(review): the statement uses ON CONFLICT DO NOTHING, but the table as
    created in this file has no unique constraint besides its serial primary
    key, so repeated judgments for the same query/result pair appear to be
    inserted again -- confirm whether a unique index is defined elsewhere.
    """
    db_pool = await get_pool()
    if db_pool is None:
        return False

    # Values in the same order as the column list of the INSERT below.
    row_values = (
        query_id,
        query_text,
        result_id,
        result_rank,
        is_relevant,
        collection_name,
        user_id,
    )

    try:
        async with db_pool.acquire() as connection:
            insert_sql = """
                INSERT INTO rag_relevance_judgments
                (query_id, query_text, result_id, result_rank, is_relevant, collection_name, user_id)
                VALUES ($1, $2, $3, $4, $5, $6, $7)
                ON CONFLICT DO NOTHING
                """
            await connection.execute(insert_sql, *row_values)
    except Exception as exc:
        print(f"Failed to store relevance judgment: {exc}")
        return False
    else:
        return True


async def calculate_precision_recall(
    collection_name: Optional[str] = None,
    days: int = 7,
    k: int = 10,
) -> Dict:
    """
    Calculate Precision@k and Recall@k from binary relevance judgments.

    Both values are computed per query and then averaged over all queries:

    * Precision@k = relevant judgments within the top k /
      all judgments within the top k (the number of judged results, which
      may be smaller than k).
    * Recall@k = relevant judgments within the top k /
      all relevant judgments of the query.

    Judgments without a rank (``result_rank IS NULL``) count as being within
    the top k.

    Args:
        collection_name: Restrict the calculation to this collection. When
            empty or None, all collections are included.
        days: Size of the look-back window in days, counted from now.
        k: Rank cutoff.

    Returns:
        On success a dict with ``connected=True``, ``period_days``, ``k``,
        ``precision_at_k``, ``recall_at_k``, ``f1_score``,
        ``total_judgments`` and ``unique_queries``.
        On failure a dict with ``error`` and ``connected=False``.
    """
    pool = await get_pool()
    if pool is None:
        return {"error": "Database not available", "connected": False}

    try:
        async with pool.acquire() as conn:
            since = datetime.now() - timedelta(days=days)

            # Two query families with different bind lists:
            #   ranked queries bind (since, k[, collection_name]) -> $1, $2, $3
            #   count queries bind  (since[, collection_name])    -> $1, $2
            # Each needs its own placeholder number for the collection filter;
            # reusing "$3" with only two bound values makes the query fail.
            ranked_filter = ""
            ranked_params = [since, k]
            count_filter = ""
            count_params = [since]
            if collection_name:
                ranked_filter = "AND collection_name = $3"
                ranked_params.append(collection_name)
                count_filter = "AND collection_name = $2"
                count_params.append(collection_name)

            # Get precision@k per query, then average
            precision_result = float(
                await conn.fetchval(
                    f"""
                    WITH query_precision AS (
                        SELECT
                            query_id,
                            COUNT(CASE WHEN is_relevant THEN 1 END)::FLOAT /
                            GREATEST(COUNT(*), 1) as precision
                        FROM rag_relevance_judgments
                        WHERE created_at >= $1
                        AND (result_rank IS NULL OR result_rank <= $2)
                        {ranked_filter}
                        GROUP BY query_id
                    )
                    SELECT AVG(precision) FROM query_precision
                    """,
                    *ranked_params
                ) or 0
            )

            # Get recall@k per query, then average
            recall_result = float(
                await conn.fetchval(
                    f"""
                    WITH query_recall AS (
                        SELECT
                            query_id,
                            COUNT(CASE WHEN is_relevant AND (result_rank IS NULL OR result_rank <= $2) THEN 1 END)::FLOAT /
                            GREATEST(COUNT(CASE WHEN is_relevant THEN 1 END), 1) as recall
                        FROM rag_relevance_judgments
                        WHERE created_at >= $1
                        {ranked_filter}
                        GROUP BY query_id
                    )
                    SELECT AVG(recall) FROM query_recall
                    """,
                    *ranked_params
                ) or 0
            )

            # Total judgments
            total_judgments = await conn.fetchval(
                f"""
                SELECT COUNT(*) FROM rag_relevance_judgments
                WHERE created_at >= $1 {count_filter}
                """,
                *count_params
            )

            # Unique queries
            unique_queries = await conn.fetchval(
                f"""
                SELECT COUNT(DISTINCT query_id) FROM rag_relevance_judgments
                WHERE created_at >= $1 {count_filter}
                """,
                *count_params
            )

            # The max() guard avoids a division by zero when both are 0.
            f1_score = (
                2 * precision_result * recall_result
                / max(precision_result + recall_result, 0.001)
            )

            return {
                "connected": True,
                "period_days": days,
                "k": k,
                "precision_at_k": round(precision_result, 3),
                "recall_at_k": round(recall_result, 3),
                "f1_score": round(f1_score, 3),
                "total_judgments": total_judgments or 0,
                "unique_queries": unique_queries or 0,
            }

    except Exception as e:
        print(f"Failed to calculate precision/recall: {e}")
        return {"error": str(e), "connected": False}


# =============================================================================
# Zeugnis Database Operations
# =============================================================================

async def get_zeugnis_sources() -> List[Dict]:
    """Return every row of zeugnis_sources, ordered by bundesland.

    Returns:
        One dict per source with the selected columns as keys. An empty list
        is returned when no connection pool is available or the query raised
        (the error is printed).
    """
    db_pool = await get_pool()
    if db_pool is None:
        return []

    try:
        async with db_pool.acquire() as connection:
            select_sql = """
                SELECT id, bundesland, name, base_url, license_type, training_allowed,
                       verified_by, verified_at, created_at, updated_at
                FROM zeugnis_sources
                ORDER BY bundesland
                """
            records = await connection.fetch(select_sql)
            # Convert the driver's record objects into plain dicts.
            return list(map(dict, records))
    except Exception as exc:
        print(f"Failed to get zeugnis sources: {exc}")
        return []


async def upsert_zeugnis_source(
    id: str,
    bundesland: str,
    name: str,
    license_type: str,
    training_allowed: bool,
    base_url: Optional[str] = None,
    verified_by: Optional[str] = None,
) -> bool:
    """Insert a zeugnis source, or update the existing row with the same id.

    On insert, verified_at is set to the database's NOW(). When a row with
    the given id already exists, name, base_url, license_type,
    training_allowed and verified_by are overwritten and verified_at and
    updated_at are refreshed; bundesland is not part of the update.

    Args:
        id: Primary key of the source.
        bundesland: Bundesland code of the source.
        name: Display name of the source.
        license_type: License type of the source.
        training_allowed: Whether training use is allowed.
        base_url: Base URL of the source, if any.
        verified_by: Who verified the source, if known.

    Returns:
        True when the statement ran; False when no connection pool is
        available or the statement raised (the error is printed).
    """
    db_pool = await get_pool()
    if db_pool is None:
        return False

    # Values in the order of the $1..$7 placeholders below.
    row_values = (id, bundesland, name, base_url, license_type, training_allowed, verified_by)

    try:
        async with db_pool.acquire() as connection:
            upsert_sql = """
                INSERT INTO zeugnis_sources (id, bundesland, name, base_url, license_type, training_allowed, verified_by, verified_at)
                VALUES ($1, $2, $3, $4, $5, $6, $7, NOW())
                ON CONFLICT (id) DO UPDATE SET
                    name = EXCLUDED.name,
                    base_url = EXCLUDED.base_url,
                    license_type = EXCLUDED.license_type,
                    training_allowed = EXCLUDED.training_allowed,
                    verified_by = EXCLUDED.verified_by,
                    verified_at = NOW(),
                    updated_at = NOW()
                """
            await connection.execute(upsert_sql, *row_values)
    except Exception as exc:
        print(f"Failed to upsert zeugnis source: {exc}")
        return False
    else:
        return True


async def get_zeugnis_documents(
    bundesland: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> List[Dict]:
    """Return zeugnis documents joined with their source, newest first.

    Args:
        bundesland: Only return documents whose source has this bundesland.
            When empty or None, documents of all sources are returned.
        limit: Maximum number of rows to return.
        offset: Number of rows to skip.

    Returns:
        One dict per document containing all zeugnis_documents columns plus
        ``bundesland`` and ``source_name`` from the source. An empty list is
        returned when no connection pool is available or the query raised
        (the error is printed).
    """
    db_pool = await get_pool()
    if db_pool is None:
        return []

    try:
        async with db_pool.acquire() as connection:
            # Pick the query text and its bind values, then run it once.
            if bundesland:
                select_sql = """
                    SELECT d.*, s.bundesland, s.name as source_name
                    FROM zeugnis_documents d
                    JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                    JOIN zeugnis_sources s ON u.source_id = s.id
                    WHERE s.bundesland = $1
                    ORDER BY d.created_at DESC
                    LIMIT $2 OFFSET $3
                    """
                bind_values = (bundesland, limit, offset)
            else:
                select_sql = """
                    SELECT d.*, s.bundesland, s.name as source_name
                    FROM zeugnis_documents d
                    JOIN zeugnis_seed_urls u ON d.seed_url_id = u.id
                    JOIN zeugnis_sources s ON u.source_id = s.id
                    ORDER BY d.created_at DESC
                    LIMIT $1 OFFSET $2
                    """
                bind_values = (limit, offset)

            records = await connection.fetch(select_sql, *bind_values)
            return list(map(dict, records))
    except Exception as exc:
        print(f"Failed to get zeugnis documents: {exc}")
        return []


async def get_zeugnis_stats() -> Dict:
    """Collect summary counts for the zeugnis tables.

    Returns:
        On success a dict with total_sources, total_documents,
        indexed_documents, training_allowed_documents, active_crawls (queue
        entries with status 'running') and per_bundesland (one dict per
        source group with its document count).
        A dict with only an ``error`` key when no connection pool is
        available or a query raised (the error is also printed).
    """
    db_pool = await get_pool()
    if db_pool is None:
        return {"error": "Database not available"}

    # Single-value count queries, listed in the order they are executed and
    # in which their keys appear in the result.
    count_queries = (
        ("total_sources", "SELECT COUNT(*) FROM zeugnis_sources"),
        ("total_documents", "SELECT COUNT(*) FROM zeugnis_documents"),
        (
            "indexed_documents",
            "SELECT COUNT(*) FROM zeugnis_documents WHERE indexed_in_qdrant = true",
        ),
        (
            "training_allowed_documents",
            "SELECT COUNT(*) FROM zeugnis_documents WHERE training_allowed = true",
        ),
    )

    try:
        async with db_pool.acquire() as connection:
            stats = {}
            for key, count_sql in count_queries:
                stats[key] = (await connection.fetchval(count_sql)) or 0

            # Document count per source group; LEFT JOINs keep sources that
            # have no documents yet.
            breakdown_sql = """
                SELECT s.bundesland, s.name, s.training_allowed, COUNT(d.id) as doc_count
                FROM zeugnis_sources s
                LEFT JOIN zeugnis_seed_urls u ON s.id = u.source_id
                LEFT JOIN zeugnis_documents d ON u.id = d.seed_url_id
                GROUP BY s.bundesland, s.name, s.training_allowed
                ORDER BY s.bundesland
                """
            breakdown = await connection.fetch(breakdown_sql)

            running = await connection.fetchval(
                "SELECT COUNT(*) FROM zeugnis_crawler_queue WHERE status = 'running'"
            )
            stats["active_crawls"] = running or 0
            stats["per_bundesland"] = [dict(record) for record in breakdown]
            return stats
    except Exception as exc:
        print(f"Failed to get zeugnis stats: {exc}")
        return {"error": str(exc)}


async def log_zeugnis_event(
    document_id: str,
    event_type: str,
    user_id: Optional[str] = None,
    details: Optional[Dict] = None,
) -> bool:
    """Insert one row into zeugnis_usage_events (audit trail).

    A fresh UUID4 string is generated as the row id.

    Args:
        document_id: Identifier of the document the event refers to.
        event_type: Kind of event being recorded.
        user_id: Identifier of the acting user, if known.
        details: Extra event data. Serialized to a JSON string; a missing or
            empty dict is stored as NULL.

    Returns:
        True when the row was inserted; False when no connection pool is
        available or anything in the insert path raised (the error is printed).
    """
    db_pool = await get_pool()
    if db_pool is None:
        return False

    try:
        import json
        import uuid

        async with db_pool.acquire() as connection:
            event_id = str(uuid.uuid4())

            # Falsy details (None or {}) are written as NULL, not as "{}".
            if details:
                details_json = json.dumps(details)
            else:
                details_json = None

            insert_sql = """
                INSERT INTO zeugnis_usage_events (id, document_id, event_type, user_id, details)
                VALUES ($1, $2, $3, $4, $5)
                """
            await connection.execute(
                insert_sql, event_id, document_id, event_type, user_id, details_json
            )
    except Exception as exc:
        print(f"Failed to log zeugnis event: {exc}")
        return False
    else:
        return True