"""
Klausur-Service Abitur-Archiv Routes

Endpoints for accessing NiBiS Zentralabitur documents (public archive).
Provides filtered listing and presigned URLs for PDF access.
"""
import logging
from typing import Optional, List, Dict
from datetime import datetime

from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel

from qdrant_service import get_qdrant_client, search_nibis_eh
from minio_storage import get_presigned_url, list_documents
from eh_pipeline import generate_single_embedding

logger = logging.getLogger(__name__)

router = APIRouter()

# Qdrant collection that stores the archive chunks.
COLLECTION_NAME = "bp_nibis_eh"

# Upper bound on points fetched while scrolling, so a very large
# collection cannot stall a single request indefinitely.
MAX_SCROLL_POINTS = 50_000


# =============================================
# MODELS
# =============================================

class AbiturDokument(BaseModel):
    """Abitur document from the archive."""
    id: str
    title: str
    subject: str
    niveau: str  # eA or gA
    year: int
    task_number: Optional[str] = None  # Can be "1", "2A", "2C", etc.
    doc_type: str  # EWH, Aufgabe, Material
    variant: Optional[str] = None
    bundesland: str = "NI"
    minio_path: Optional[str] = None
    preview_url: Optional[str] = None


class ArchivSearchResponse(BaseModel):
    """Response for archive listing."""
    total: int
    documents: List[AbiturDokument]
    filters: Dict


class SemanticSearchResult(BaseModel):
    """Result from semantic search."""
    id: str
    score: float
    text: str
    year: int
    subject: str
    niveau: str
    task_number: Optional[str] = None  # Can be "1", "2A", "2C", etc.
    doc_type: str


# =============================================
# INTERNAL HELPERS
# =============================================

def _extract_doc_id(point) -> str:
    """Stable document id for a chunk: doc_id, then original_id, then point id."""
    payload = point.payload
    return payload.get("doc_id") or payload.get("original_id", str(point.id))


def _build_minio_path(year, subject: str, niveau: str, task_number=None) -> str:
    """Build the MinIO object path for a document.

    Path pattern:
        landes-daten/ni/klausur/{year}/{year}_{subject}_{niveau}[_{task}]_EWH.pdf
    """
    path = f"landes-daten/ni/klausur/{year}/{year}_{subject}_{niveau}"
    if task_number:
        path += f"_{task_number}"
    return path + "_EWH.pdf"


def _scroll_all(client, collection: str, scroll_filter=None) -> list:
    """Scroll through all points of a collection (payloads only, no vectors).

    Follows Qdrant's next-page offset until exhausted, capped at
    MAX_SCROLL_POINTS to bound request time on huge collections.
    """
    points: list = []
    offset = None
    while len(points) < MAX_SCROLL_POINTS:
        batch, offset = client.scroll(
            collection_name=collection,
            scroll_filter=scroll_filter,
            limit=1000,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        points.extend(batch)
        if offset is None or not batch:
            break
    return points


# =============================================
# ARCHIVE LISTING & FILTERS
# =============================================
# IMPORTANT: Specific routes MUST come before parameterized routes!
# Otherwise /api/v1/archiv/stats would be caught by /api/v1/archiv/{doc_id}

# =============================================
# STATS (must be before {doc_id})
# =============================================

@router.get("/api/v1/archiv/stats")
async def get_archiv_stats():
    """
    Get archive statistics (document counts, available years, etc.).
    """
    try:
        client = get_qdrant_client()

        # Get collection info (total chunk count, status)
        info = client.get_collection(COLLECTION_NAME)

        # Scroll through all chunks to aggregate per-document stats
        all_points = _scroll_all(client, COLLECTION_NAME)

        # Aggregate stats, counting each document only once (chunks share doc_id)
        by_year: Dict[str, int] = {}
        by_subject: Dict[str, int] = {}
        by_niveau: Dict[str, int] = {}
        seen_docs = set()

        for point in all_points:
            doc_id = _extract_doc_id(point)
            if doc_id in seen_docs:
                continue
            seen_docs.add(doc_id)

            payload = point.payload
            year = str(payload.get("year", "unknown"))
            subject = payload.get("subject", "unknown")
            niveau = payload.get("niveau", "unknown")

            by_year[year] = by_year.get(year, 0) + 1
            by_subject[subject] = by_subject.get(subject, 0) + 1
            by_niveau[niveau] = by_niveau.get(niveau, 0) + 1

        return {
            "total_documents": len(seen_docs),
            "total_chunks": info.points_count,
            # Newest years first; subjects ordered by document count
            "by_year": dict(sorted(by_year.items(), reverse=True)),
            "by_subject": dict(sorted(by_subject.items(), key=lambda x: -x[1])),
            "by_niveau": by_niveau,
            "collection_status": info.status.value
        }

    except Exception as e:
        # Best-effort endpoint: report the failure instead of erroring out
        logger.exception("Stats error: %s", e)
        return {
            "total_documents": 0,
            "total_chunks": 0,
            "by_year": {},
            "by_subject": {},
            "by_niveau": {},
            "error": str(e)
        }


# =============================================
# THEME SUGGESTIONS (must be before {doc_id})
# =============================================

# Predefined themes for autocomplete (module-level: built once, not per request)
THEMES = [
    {"label": "Textanalyse", "type": "Analyse"},
    {"label": "Gedichtanalyse", "type": "Analyse"},
    {"label": "Dramenanalyse", "type": "Analyse"},
    {"label": "Prosaanalyse", "type": "Analyse"},
    {"label": "Eroerterung", "type": "Aufsatz"},
    {"label": "Textgebundene Eroerterung", "type": "Aufsatz"},
    {"label": "Materialgestuetzte Eroerterung", "type": "Aufsatz"},
    {"label": "Sprachreflexion", "type": "Analyse"},
    {"label": "Kafka", "type": "Autor"},
    {"label": "Goethe", "type": "Autor"},
    {"label": "Schiller", "type": "Autor"},
    {"label": "Romantik", "type": "Epoche"},
    {"label": "Expressionismus", "type": "Epoche"},
    {"label": "Sturm und Drang", "type": "Epoche"},
    {"label": "Aufklaerung", "type": "Epoche"},
    {"label": "Sprachvarietaeten", "type": "Thema"},
    {"label": "Sprachwandel", "type": "Thema"},
    {"label": "Kommunikation", "type": "Thema"},
    {"label": "Medien", "type": "Thema"},
]


@router.get("/api/v1/archiv/suggest")
async def suggest_themes(
    query: str = Query(..., min_length=2, description="Partial search query")
) -> List[Dict]:
    """
    Get theme suggestions for autocomplete.
    Returns popular themes/topics that match the query.
    """
    query_lower = query.lower()
    matches = [
        theme for theme in THEMES
        if query_lower in theme["label"].lower()
    ]
    return matches[:10]


# =============================================
# SEMANTIC SEARCH (must be before {doc_id})
# =============================================

@router.get("/api/v1/archiv/search/semantic")
async def semantic_search(
    query: str = Query(..., min_length=3, description="Search query"),
    year: Optional[int] = Query(None),
    subject: Optional[str] = Query(None),
    niveau: Optional[str] = Query(None),
    limit: int = Query(10, ge=1, le=50)
) -> List[SemanticSearchResult]:
    """
    Perform semantic search across the archive using embeddings.
    This searches for conceptually similar content, not just keyword matches.
    """
    try:
        # Generate query embedding
        query_embedding = await generate_single_embedding(query)

        # Search in Qdrant
        results = await search_nibis_eh(
            query_embedding=query_embedding,
            year=year,
            subject=subject,
            niveau=niveau,
            limit=limit
        )

        return [
            SemanticSearchResult(
                id=r["id"],
                score=r["score"],
                text=r.get("text", "")[:500],  # Truncate for response
                year=r.get("year", 0),
                subject=r.get("subject", ""),
                niveau=r.get("niveau", ""),
                task_number=r.get("task_number"),
                doc_type=r.get("doc_type", "EWH")
            )
            for r in results
        ]

    except Exception as e:
        # Best-effort endpoint: an empty result list instead of a 500
        logger.exception("Semantic search error: %s", e)
        return []


# =============================================
# ARCHIVE LISTING
# =============================================

@router.get("/api/v1/archiv", response_model=ArchivSearchResponse)
async def list_archiv_documents(
    subject: Optional[str] = Query(None, description="Filter by subject (e.g., Deutsch, Englisch)"),
    year: Optional[int] = Query(None, description="Filter by year (e.g., 2024)"),
    bundesland: Optional[str] = Query(None, description="Filter by state (e.g., NI)"),
    niveau: Optional[str] = Query(None, description="Filter by level (eA or gA)"),
    doc_type: Optional[str] = Query(None, description="Filter by type (EWH, Aufgabe)"),
    search: Optional[str] = Query(None, description="Theme/keyword search"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0)
):
    """
    List all documents in the Abitur archive with optional filters.

    Returns metadata for documents stored in the bp_nibis_eh Qdrant collection.
    PDF URLs are generated on-demand via MinIO presigned URLs.
    """
    try:
        client = get_qdrant_client()

        # Import lazily so module import does not hard-require qdrant_client
        from qdrant_client.models import Filter, FieldCondition, MatchValue

        # Build payload filter conditions from the provided query params
        must_conditions = []
        if subject:
            must_conditions.append(
                FieldCondition(key="subject", match=MatchValue(value=subject))
            )
        if year:
            must_conditions.append(
                FieldCondition(key="year", match=MatchValue(value=year))
            )
        if bundesland:
            must_conditions.append(
                FieldCondition(key="bundesland", match=MatchValue(value=bundesland))
            )
        if niveau:
            must_conditions.append(
                FieldCondition(key="niveau", match=MatchValue(value=niveau))
            )
        if doc_type:
            must_conditions.append(
                FieldCondition(key="doc_type", match=MatchValue(value=doc_type))
            )

        query_filter = Filter(must=must_conditions) if must_conditions else None

        # Scroll through all matching chunks (paginated internally)
        all_points = _scroll_all(client, COLLECTION_NAME, scroll_filter=query_filter)

        # Deduplicate by doc_id and collect unique documents
        seen_docs: Dict[str, AbiturDokument] = {}

        for point in all_points:
            payload = point.payload
            doc_id = _extract_doc_id(point)

            # Skip if already seen
            if doc_id in seen_docs:
                continue

            # Apply text search filter if provided (substring, case-insensitive)
            if search:
                text = payload.get("text", "")
                if search.lower() not in text.lower():
                    continue

            # Build document title from metadata
            subject_name = payload.get("subject", "Unbekannt")
            doc_year = payload.get("year", 0)
            doc_niveau = payload.get("niveau", "")
            task_num = payload.get("task_number")
            doc_type_val = payload.get("doc_type", "EWH")
            variant = payload.get("variant")

            title_parts = [subject_name, str(doc_year), doc_niveau]
            if task_num:
                title_parts.append(f"Aufgabe {task_num}")
            if doc_type_val and doc_type_val != "EWH":
                title_parts.append(doc_type_val)
            if variant:
                title_parts.append(f"({variant})")
            title = " ".join(title_parts)

            seen_docs[doc_id] = AbiturDokument(
                id=doc_id,
                title=title,
                subject=subject_name,
                niveau=doc_niveau,
                year=doc_year,
                task_number=task_num,
                doc_type=doc_type_val,
                variant=variant,
                bundesland=payload.get("bundesland", "NI"),
                minio_path=_build_minio_path(doc_year, subject_name, doc_niveau, task_num)
            )

        # Sort by year descending, then subject, and apply pagination
        documents = list(seen_docs.values())
        documents.sort(key=lambda d: (-d.year, d.subject))

        total = len(documents)
        paginated = documents[offset:offset + limit]

        # Available filter options for the UI, derived from the full result set
        filters = {
            "subjects": sorted(list(set(d.subject for d in documents))),
            "years": sorted(list(set(d.year for d in documents)), reverse=True),
            "niveaus": sorted(list(set(d.niveau for d in documents if d.niveau))),
            "doc_types": sorted(list(set(d.doc_type for d in documents if d.doc_type))),
        }

        return ArchivSearchResponse(
            total=total,
            documents=paginated,
            filters=filters
        )

    except Exception as e:
        logger.exception("Archiv list error: %s", e)
        # Return empty response with mock filter options if Qdrant fails,
        # so the frontend still renders its filter controls
        return ArchivSearchResponse(
            total=0,
            documents=[],
            filters={
                "subjects": ["Deutsch", "Englisch", "Mathematik"],
                "years": [2025, 2024, 2023, 2022, 2021],
                "niveaus": ["eA", "gA"],
                "doc_types": ["EWH", "Aufgabe"]
            }
        )


@router.get("/api/v1/archiv/{doc_id}")
async def get_archiv_document(doc_id: str):
    """
    Get details for a specific document including presigned PDF URL.

    Raises:
        HTTPException 404: if no chunk with the given doc_id/original_id exists.
        HTTPException 500: on any other backend failure.
    """
    try:
        client = get_qdrant_client()

        from qdrant_client.models import Filter, FieldCondition, MatchValue

        # Find any chunk belonging to this document by doc_id in payload
        results, _ = client.scroll(
            collection_name=COLLECTION_NAME,
            scroll_filter=Filter(must=[
                FieldCondition(key="doc_id", match=MatchValue(value=doc_id))
            ]),
            limit=1,
            with_payload=True
        )

        if not results:
            # Fall back to the legacy original_id field
            results, _ = client.scroll(
                collection_name=COLLECTION_NAME,
                scroll_filter=Filter(must=[
                    FieldCondition(key="original_id", match=MatchValue(value=doc_id))
                ]),
                limit=1,
                with_payload=True
            )

        if not results:
            raise HTTPException(status_code=404, detail="Document not found")

        payload = results[0].payload

        subject_name = payload.get("subject", "Unbekannt")
        doc_year = payload.get("year", 0)
        doc_niveau = payload.get("niveau", "")
        task_num = payload.get("task_number")

        minio_path = _build_minio_path(doc_year, subject_name, doc_niveau, task_num)

        # Get presigned URL (1 hour expiry)
        preview_url = await get_presigned_url(minio_path, expires=3600)

        return {
            "id": doc_id,
            "title": f"{subject_name} {doc_year} {doc_niveau}",
            "subject": subject_name,
            "niveau": doc_niveau,
            "year": doc_year,
            "task_number": task_num,
            "doc_type": payload.get("doc_type", "EWH"),
            "variant": payload.get("variant"),
            "bundesland": payload.get("bundesland", "NI"),
            "minio_path": minio_path,
            "preview_url": preview_url,
            "text_preview": payload.get("text", "")[:500] + "..." if payload.get("text") else None
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Get document error: %s", e)
        raise HTTPException(status_code=500, detail=f"Failed to get document: {str(e)}")


@router.get("/api/v1/archiv/{doc_id}/url")
async def get_document_url(doc_id: str, expires: int = Query(3600, ge=300, le=86400)):
    """
    Get a presigned URL for downloading the PDF.

    Args:
        doc_id: Document ID
        expires: URL expiration time in seconds (default 1 hour, max 24 hours)
    """
    try:
        # First, resolve the document to find its MinIO path
        doc = await get_archiv_document(doc_id)

        if not doc.get("minio_path"):
            raise HTTPException(status_code=404, detail="Document path not found")

        # Generate presigned URL with the caller-chosen expiry
        url = await get_presigned_url(doc["minio_path"], expires=expires)

        if not url:
            raise HTTPException(status_code=500, detail="Failed to generate download URL")

        return {
            "url": url,
            "expires_in": expires,
            "filename": doc["minio_path"].split("/")[-1]
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Get URL error: %s", e)
        raise HTTPException(status_code=500, detail=f"Failed to generate URL: {str(e)}")