Files
breakpilot-lehrer/klausur-service/backend/routes/archiv.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

491 lines
16 KiB
Python

"""
Klausur-Service Abitur-Archiv Routes
Endpoints for accessing NiBiS Zentralabitur documents (public archive).
Provides filtered listing and presigned URLs for PDF access.
"""
from typing import Optional, List, Dict
from datetime import datetime
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from qdrant_service import get_qdrant_client, search_nibis_eh
from minio_storage import get_presigned_url, list_documents
from eh_pipeline import generate_single_embedding
router = APIRouter()
# =============================================
# MODELS
# =============================================
class AbiturDokument(BaseModel):
    """Abitur document from the archive.

    Represents one deduplicated Zentralabitur document (not a chunk)
    as listed from the bp_nibis_eh Qdrant collection.
    """
    id: str  # doc_id (or original_id / point-id fallback) from the Qdrant payload
    title: str  # human-readable title built from subject/year/niveau/task metadata
    subject: str  # e.g. "Deutsch", "Englisch"
    niveau: str # eA or gA
    year: int  # exam year, 0 when missing from the payload
    task_number: Optional[str] = None # Can be "1", "2A", "2C", etc.
    doc_type: str # EWH, Aufgabe, Material
    variant: Optional[str] = None  # optional task variant, appended to the title in parentheses
    bundesland: str = "NI"  # federal-state code; archive currently holds Niedersachsen only
    minio_path: Optional[str] = None  # object key of the PDF inside the MinIO bucket
    preview_url: Optional[str] = None  # presigned download URL, generated on demand
class ArchivSearchResponse(BaseModel):
    """Response for archive listing.

    Carries the total match count (before pagination), the current page
    of documents, and the filter options available for the UI.
    """
    total: int  # number of unique documents matching the filters
    documents: List[AbiturDokument]  # paginated slice (offset/limit applied)
    filters: Dict  # available filter values: subjects, years, niveaus, doc_types
class SemanticSearchResult(BaseModel):
    """Result from semantic search.

    One chunk-level hit from the embedding search, with its similarity
    score and a truncated text excerpt.
    """
    id: str  # Qdrant point id of the matching chunk
    score: float  # similarity score returned by the vector search
    text: str  # chunk text, truncated to 500 chars by the endpoint
    year: int  # exam year, 0 when missing from the payload
    subject: str  # e.g. "Deutsch"
    niveau: str  # eA or gA
    task_number: Optional[str] = None # Can be "1", "2A", "2C", etc.
    doc_type: str  # defaults to "EWH" when absent from the hit
# =============================================
# ARCHIVE LISTING & FILTERS
# =============================================
# IMPORTANT: Specific routes MUST come before parameterized routes!
# Otherwise /api/v1/archiv/stats would be caught by /api/v1/archiv/{doc_id}
# =============================================
# STATS (must be before {doc_id})
# =============================================
@router.get("/api/v1/archiv/stats")
async def get_archiv_stats():
    """
    Get archive statistics (document counts, available years, etc.).

    Scrolls the whole bp_nibis_eh collection (chunk level), deduplicates
    chunks into documents via doc_id/original_id, and aggregates unique
    document counts by year, subject, and niveau.

    On any backend error a zeroed payload with an "error" field is
    returned instead of raising, so the UI can degrade gracefully.
    """
    try:
        client = get_qdrant_client()
        collection = "bp_nibis_eh"
        # Collection info provides the raw chunk count and status.
        info = client.get_collection(collection)

        by_year: Dict[str, int] = {}
        by_subject: Dict[str, int] = {}
        by_niveau: Dict[str, int] = {}
        seen_docs = set()

        # Paginate the scroll: a single scroll(limit=1000) call silently
        # ignores everything past the first 1000 chunks, which skews the
        # stats once the archive grows.
        next_offset = None
        while True:
            points, next_offset = client.scroll(
                collection_name=collection,
                limit=1000,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )
            for point in points:
                payload = point.payload
                # Chunks of the same document share a doc_id; count each document once.
                doc_id = payload.get("doc_id") or payload.get("original_id", str(point.id))
                if doc_id in seen_docs:
                    continue
                seen_docs.add(doc_id)
                year = str(payload.get("year", "unknown"))
                subject = payload.get("subject", "unknown")
                niveau = payload.get("niveau", "unknown")
                by_year[year] = by_year.get(year, 0) + 1
                by_subject[subject] = by_subject.get(subject, 0) + 1
                by_niveau[niveau] = by_niveau.get(niveau, 0) + 1
            if next_offset is None:
                break  # scroll exhausted

        return {
            "total_documents": len(seen_docs),
            "total_chunks": info.points_count,
            # Years newest-first; subjects by descending document count.
            "by_year": dict(sorted(by_year.items(), reverse=True)),
            "by_subject": dict(sorted(by_subject.items(), key=lambda x: -x[1])),
            "by_niveau": by_niveau,
            "collection_status": info.status.value
        }
    except Exception as e:
        print(f"Stats error: {e}")
        # Graceful degradation: zeroed stats plus the error message.
        return {
            "total_documents": 0,
            "total_chunks": 0,
            "by_year": {},
            "by_subject": {},
            "by_niveau": {},
            "error": str(e)
        }
# =============================================
# THEME SUGGESTIONS (must be before {doc_id})
# =============================================
@router.get("/api/v1/archiv/suggest")
async def suggest_themes(
    query: str = Query(..., min_length=2, description="Partial search query")
) -> List[Dict]:
    """
    Get theme suggestions for autocomplete.

    Matches the (case-insensitive) query substring against a static
    catalogue of themes and returns at most ten matches.
    """
    # Static catalogue of suggestible themes, grouped by category.
    THEMES = [
        {"label": "Textanalyse", "type": "Analyse"},
        {"label": "Gedichtanalyse", "type": "Analyse"},
        {"label": "Dramenanalyse", "type": "Analyse"},
        {"label": "Prosaanalyse", "type": "Analyse"},
        {"label": "Eroerterung", "type": "Aufsatz"},
        {"label": "Textgebundene Eroerterung", "type": "Aufsatz"},
        {"label": "Materialgestuetzte Eroerterung", "type": "Aufsatz"},
        {"label": "Sprachreflexion", "type": "Analyse"},
        {"label": "Kafka", "type": "Autor"},
        {"label": "Goethe", "type": "Autor"},
        {"label": "Schiller", "type": "Autor"},
        {"label": "Romantik", "type": "Epoche"},
        {"label": "Expressionismus", "type": "Epoche"},
        {"label": "Sturm und Drang", "type": "Epoche"},
        {"label": "Aufklaerung", "type": "Epoche"},
        {"label": "Sprachvarietaeten", "type": "Thema"},
        {"label": "Sprachwandel", "type": "Thema"},
        {"label": "Kommunikation", "type": "Thema"},
        {"label": "Medien", "type": "Thema"},
    ]
    needle = query.lower()
    suggestions = []
    for theme in THEMES:
        if needle in theme["label"].lower():
            suggestions.append(theme)
            if len(suggestions) == 10:  # cap the result size for the UI
                break
    return suggestions
# =============================================
# SEMANTIC SEARCH (must be before {doc_id})
# =============================================
@router.get("/api/v1/archiv/search/semantic")
async def semantic_search(
    query: str = Query(..., min_length=3, description="Search query"),
    year: Optional[int] = Query(None),
    subject: Optional[str] = Query(None),
    niveau: Optional[str] = Query(None),
    limit: int = Query(10, ge=1, le=50)
) -> List[SemanticSearchResult]:
    """
    Perform semantic search across the archive using embeddings.

    The query text is embedded and matched against the bp_nibis_eh
    collection, so conceptually similar content is found — not just
    keyword matches. Optional year/subject/niveau filters narrow the
    search; errors yield an empty list (best-effort endpoint).
    """
    try:
        # Embed the query, then run the filtered vector search in Qdrant.
        embedding = await generate_single_embedding(query)
        hits = await search_nibis_eh(
            query_embedding=embedding,
            year=year,
            subject=subject,
            niveau=niveau,
            limit=limit
        )
        results: List[SemanticSearchResult] = []
        for hit in hits:
            results.append(SemanticSearchResult(
                id=hit["id"],
                score=hit["score"],
                text=hit.get("text", "")[:500],  # truncate excerpt for the response
                year=hit.get("year", 0),
                subject=hit.get("subject", ""),
                niveau=hit.get("niveau", ""),
                task_number=hit.get("task_number"),
                doc_type=hit.get("doc_type", "EWH")
            ))
        return results
    except Exception as e:
        # Best-effort: log and return an empty result so the UI stays usable.
        print(f"Semantic search error: {e}")
        return []
# =============================================
# ARCHIVE LISTING
# =============================================
def _ewh_minio_path(year, subject, niveau, task_number=None) -> str:
    """Build the MinIO object key for an EWH PDF from document metadata.

    Key pattern:
        landes-daten/ni/klausur/{year}/{year}_{subject}_{niveau}[_{task}]_EWH.pdf

    NOTE(review): the pattern is reconstructed from Qdrant payload fields;
    confirm it matches the actual object names in the bucket.
    """
    path = f"landes-daten/ni/klausur/{year}/{year}_{subject}_{niveau}"
    if task_number:
        path += f"_{task_number}"
    return path + "_EWH.pdf"


@router.get("/api/v1/archiv", response_model=ArchivSearchResponse)
async def list_archiv_documents(
    subject: Optional[str] = Query(None, description="Filter by subject (e.g., Deutsch, Englisch)"),
    year: Optional[int] = Query(None, description="Filter by year (e.g., 2024)"),
    bundesland: Optional[str] = Query(None, description="Filter by state (e.g., NI)"),
    niveau: Optional[str] = Query(None, description="Filter by level (eA or gA)"),
    doc_type: Optional[str] = Query(None, description="Filter by type (EWH, Aufgabe)"),
    search: Optional[str] = Query(None, description="Theme/keyword search"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0)
):
    """
    List all documents in the Abitur archive with optional filters.

    Scrolls the bp_nibis_eh Qdrant collection chunk by chunk, deduplicates
    chunks into documents (by doc_id/original_id), applies the optional
    keyword filter, then sorts and paginates. PDF URLs are generated
    on-demand via MinIO presigned URLs. On backend failure an empty result
    with default filter options is returned so the UI can still render.
    """
    try:
        client = get_qdrant_client()
        collection = "bp_nibis_eh"
        from qdrant_client.models import Filter, FieldCondition, MatchValue

        # Build the Qdrant payload filter from the provided query params.
        must_conditions = [
            FieldCondition(key=key, match=MatchValue(value=value))
            for key, value in (
                ("subject", subject),
                ("year", year),
                ("bundesland", bundesland),
                ("niveau", niveau),
                ("doc_type", doc_type),
            )
            if value
        ]
        query_filter = Filter(must=must_conditions) if must_conditions else None

        # Scroll through ALL matching chunks, paginated: a single capped
        # scroll(limit=1000) would silently drop documents (and report a
        # wrong `total`) once the archive exceeds 1000 chunks.
        seen_docs: Dict[str, AbiturDokument] = {}
        next_offset = None
        while True:
            points, next_offset = client.scroll(
                collection_name=collection,
                scroll_filter=query_filter,
                limit=1000,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )
            for point in points:
                payload = point.payload
                doc_id = payload.get("doc_id") or payload.get("original_id", str(point.id))
                if doc_id in seen_docs:
                    continue  # chunks of the same document — keep the first
                # Optional case-insensitive keyword filter on the chunk text.
                if search and search.lower() not in payload.get("text", "").lower():
                    continue
                subject_name = payload.get("subject", "Unbekannt")
                doc_year = payload.get("year", 0)
                doc_niveau = payload.get("niveau", "")
                task_num = payload.get("task_number")
                doc_type_val = payload.get("doc_type", "EWH")
                variant = payload.get("variant")
                # Human-readable title, e.g. "Deutsch 2024 eA Aufgabe 2A".
                title_parts = [subject_name, str(doc_year), doc_niveau]
                if task_num:
                    title_parts.append(f"Aufgabe {task_num}")
                if doc_type_val and doc_type_val != "EWH":
                    title_parts.append(doc_type_val)
                if variant:
                    title_parts.append(f"({variant})")
                seen_docs[doc_id] = AbiturDokument(
                    id=doc_id,
                    title=" ".join(title_parts),
                    subject=subject_name,
                    niveau=doc_niveau,
                    year=doc_year,
                    task_number=task_num,
                    doc_type=doc_type_val,
                    variant=variant,
                    bundesland=payload.get("bundesland", "NI"),
                    minio_path=_ewh_minio_path(doc_year, subject_name, doc_niveau, task_num)
                )
            if next_offset is None:
                break  # scroll exhausted

        # Sort by year descending, then subject, and apply pagination.
        documents = sorted(seen_docs.values(), key=lambda d: (-d.year, d.subject))
        total = len(documents)
        paginated = documents[offset:offset + limit]

        # Filter options for the UI, derived from the full (unpaginated) result.
        filters = {
            "subjects": sorted(set(d.subject for d in documents)),
            "years": sorted(set(d.year for d in documents), reverse=True),
            "niveaus": sorted(set(d.niveau for d in documents if d.niveau)),
            "doc_types": sorted(set(d.doc_type for d in documents if d.doc_type)),
        }
        return ArchivSearchResponse(
            total=total,
            documents=paginated,
            filters=filters
        )
    except Exception as e:
        print(f"Archiv list error: {e}")
        # Graceful degradation: empty listing with default filter options.
        return ArchivSearchResponse(
            total=0,
            documents=[],
            filters={
                "subjects": ["Deutsch", "Englisch", "Mathematik"],
                "years": [2025, 2024, 2023, 2022, 2021],
                "niveaus": ["eA", "gA"],
                "doc_types": ["EWH", "Aufgabe"]
            }
        )
@router.get("/api/v1/archiv/{doc_id}")
async def get_archiv_document(doc_id: str):
    """
    Get details for a specific document including presigned PDF URL.

    Looks the document up by its payload id (doc_id, falling back to
    original_id), reconstructs its MinIO object key, and attaches a
    one-hour presigned preview URL plus a 500-char text preview.
    Raises 404 when no chunk carries the id, 500 on backend errors.
    """
    try:
        client = get_qdrant_client()
        collection = "bp_nibis_eh"
        from qdrant_client.models import Filter, FieldCondition, MatchValue

        # Chunks may carry the identifier under "doc_id" or (legacy)
        # "original_id"; try both keys in order, stop at the first hit.
        hits = []
        for payload_key in ("doc_id", "original_id"):
            hits, _ = client.scroll(
                collection_name=collection,
                scroll_filter=Filter(must=[
                    FieldCondition(key=payload_key, match=MatchValue(value=doc_id))
                ]),
                limit=1,
                with_payload=True
            )
            if hits:
                break
        if not hits:
            raise HTTPException(status_code=404, detail="Document not found")

        payload = hits[0].payload
        subject_name = payload.get("subject", "Unbekannt")
        doc_year = payload.get("year", 0)
        doc_niveau = payload.get("niveau", "")
        task_num = payload.get("task_number")

        # Reconstruct the MinIO object key from the document metadata.
        key_parts = [f"landes-daten/ni/klausur/{doc_year}/{doc_year}_{subject_name}_{doc_niveau}"]
        if task_num:
            key_parts.append(f"_{task_num}")
        key_parts.append("_EWH.pdf")
        minio_path = "".join(key_parts)

        # Presigned URL with a 1-hour expiry.
        preview_url = await get_presigned_url(minio_path, expires=3600)

        full_text = payload.get("text")
        return {
            "id": doc_id,
            "title": f"{subject_name} {doc_year} {doc_niveau}",
            "subject": subject_name,
            "niveau": doc_niveau,
            "year": doc_year,
            "task_number": task_num,
            "doc_type": payload.get("doc_type", "EWH"),
            "variant": payload.get("variant"),
            "bundesland": payload.get("bundesland", "NI"),
            "minio_path": minio_path,
            "preview_url": preview_url,
            "text_preview": (full_text[:500] + "...") if full_text else None
        }
    except HTTPException:
        raise  # pass 404/500 through unchanged
    except Exception as e:
        print(f"Get document error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to get document: {str(e)}")
@router.get("/api/v1/archiv/{doc_id}/url")
async def get_document_url(doc_id: str, expires: int = Query(3600, ge=300, le=86400)):
    """
    Get a presigned URL for downloading the PDF.

    Args:
        doc_id: Document ID
        expires: URL expiration time in seconds (default 1 hour, max 24 hours)
    """
    try:
        # Reuse the detail endpoint to resolve the document's MinIO key.
        document = await get_archiv_document(doc_id)
        object_key = document.get("minio_path")
        if not object_key:
            raise HTTPException(status_code=404, detail="Document path not found")

        presigned = await get_presigned_url(object_key, expires=expires)
        if not presigned:
            raise HTTPException(status_code=500, detail="Failed to generate download URL")

        return {
            "url": presigned,
            "expires_in": expires,
            "filename": object_key.split("/")[-1]
        }
    except HTTPException:
        raise  # pass through 404/500 from above unchanged
    except Exception as e:
        print(f"Get URL error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to generate URL: {str(e)}")