Files
breakpilot-lehrer/klausur-service/backend/routes/archiv.py
Benjamin Boenisch 5a31f52310 Initial commit: breakpilot-lehrer - Lehrer KI Platform
Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website,
Klausur-Service, School-Service, Voice-Service, Geo-Service,
BreakPilot Drive, Agent-Core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-11 23:47:26 +01:00

491 lines
16 KiB
Python

"""
Klausur-Service Abitur-Archiv Routes
Endpoints for accessing NiBiS Zentralabitur documents (public archive).
Provides filtered listing and presigned URLs for PDF access.
"""
from typing import Optional, List, Dict
from datetime import datetime
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from qdrant_service import get_qdrant_client, search_nibis_eh
from minio_storage import get_presigned_url, list_documents
from eh_pipeline import generate_single_embedding
router = APIRouter()
# =============================================
# MODELS
# =============================================
class AbiturDokument(BaseModel):
    """Abitur document from the archive.

    Represents one deduplicated Zentralabitur document (not a chunk)
    as listed from the bp_nibis_eh Qdrant collection.
    """
    id: str  # doc_id (or original_id / point-id fallback) from the Qdrant payload
    title: str  # human-readable title built from subject/year/niveau/task metadata
    subject: str  # e.g. "Deutsch", "Englisch"
    niveau: str # eA or gA
    year: int  # exam year, 0 when missing from the payload
    task_number: Optional[str] = None # Can be "1", "2A", "2C", etc.
    doc_type: str # EWH, Aufgabe, Material
    variant: Optional[str] = None  # optional task variant, appended to the title in parentheses
    bundesland: str = "NI"  # federal-state code; archive currently holds Niedersachsen only
    minio_path: Optional[str] = None  # object key of the PDF inside the MinIO bucket
    preview_url: Optional[str] = None  # presigned download URL, generated on demand
class ArchivSearchResponse(BaseModel):
    """Response for archive listing.

    Carries the total match count (before pagination), the current page
    of documents, and the filter options available for the UI.
    """
    total: int  # number of unique documents matching the filters
    documents: List[AbiturDokument]  # paginated slice (offset/limit applied)
    filters: Dict  # available filter values: subjects, years, niveaus, doc_types
class SemanticSearchResult(BaseModel):
    """Result from semantic search.

    One chunk-level hit from the embedding search, with its similarity
    score and a truncated text excerpt.
    """
    id: str  # Qdrant point id of the matching chunk
    score: float  # similarity score returned by the vector search
    text: str  # chunk text, truncated to 500 chars by the endpoint
    year: int  # exam year, 0 when missing from the payload
    subject: str  # e.g. "Deutsch"
    niveau: str  # eA or gA
    task_number: Optional[str] = None # Can be "1", "2A", "2C", etc.
    doc_type: str  # defaults to "EWH" when absent from the hit
# =============================================
# ARCHIVE LISTING & FILTERS
# =============================================
# IMPORTANT: Specific routes MUST come before parameterized routes!
# Otherwise /api/v1/archiv/stats would be caught by /api/v1/archiv/{doc_id}
# =============================================
# STATS (must be before {doc_id})
# =============================================
@router.get("/api/v1/archiv/stats")
async def get_archiv_stats():
    """
    Get archive statistics (document counts, available years, etc.).

    Scrolls the whole bp_nibis_eh collection (chunk level), deduplicates
    chunks into documents via doc_id/original_id, and aggregates unique
    document counts by year, subject, and niveau.

    On any backend error a zeroed payload with an "error" field is
    returned instead of raising, so the UI can degrade gracefully.
    """
    try:
        client = get_qdrant_client()
        collection = "bp_nibis_eh"
        # Collection info provides the raw chunk count and status.
        info = client.get_collection(collection)

        by_year: Dict[str, int] = {}
        by_subject: Dict[str, int] = {}
        by_niveau: Dict[str, int] = {}
        seen_docs = set()

        # Paginate the scroll: a single scroll(limit=1000) call silently
        # ignores everything past the first 1000 chunks, which skews the
        # stats once the archive grows.
        next_offset = None
        while True:
            points, next_offset = client.scroll(
                collection_name=collection,
                limit=1000,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )
            for point in points:
                payload = point.payload
                # Chunks of the same document share a doc_id; count each document once.
                doc_id = payload.get("doc_id") or payload.get("original_id", str(point.id))
                if doc_id in seen_docs:
                    continue
                seen_docs.add(doc_id)
                year = str(payload.get("year", "unknown"))
                subject = payload.get("subject", "unknown")
                niveau = payload.get("niveau", "unknown")
                by_year[year] = by_year.get(year, 0) + 1
                by_subject[subject] = by_subject.get(subject, 0) + 1
                by_niveau[niveau] = by_niveau.get(niveau, 0) + 1
            if next_offset is None:
                break  # scroll exhausted

        return {
            "total_documents": len(seen_docs),
            "total_chunks": info.points_count,
            # Years newest-first; subjects by descending document count.
            "by_year": dict(sorted(by_year.items(), reverse=True)),
            "by_subject": dict(sorted(by_subject.items(), key=lambda x: -x[1])),
            "by_niveau": by_niveau,
            "collection_status": info.status.value
        }
    except Exception as e:
        print(f"Stats error: {e}")
        # Graceful degradation: zeroed stats plus the error message.
        return {
            "total_documents": 0,
            "total_chunks": 0,
            "by_year": {},
            "by_subject": {},
            "by_niveau": {},
            "error": str(e)
        }
# =============================================
# THEME SUGGESTIONS (must be before {doc_id})
# =============================================
@router.get("/api/v1/archiv/suggest")
async def suggest_themes(
    query: str = Query(..., min_length=2, description="Partial search query")
) -> List[Dict]:
    """
    Get theme suggestions for autocomplete.

    Matches the (case-insensitive) query substring against a static
    catalogue of themes and returns at most ten matches.
    """
    # Static catalogue of suggestible themes, grouped by category.
    THEMES = [
        {"label": "Textanalyse", "type": "Analyse"},
        {"label": "Gedichtanalyse", "type": "Analyse"},
        {"label": "Dramenanalyse", "type": "Analyse"},
        {"label": "Prosaanalyse", "type": "Analyse"},
        {"label": "Eroerterung", "type": "Aufsatz"},
        {"label": "Textgebundene Eroerterung", "type": "Aufsatz"},
        {"label": "Materialgestuetzte Eroerterung", "type": "Aufsatz"},
        {"label": "Sprachreflexion", "type": "Analyse"},
        {"label": "Kafka", "type": "Autor"},
        {"label": "Goethe", "type": "Autor"},
        {"label": "Schiller", "type": "Autor"},
        {"label": "Romantik", "type": "Epoche"},
        {"label": "Expressionismus", "type": "Epoche"},
        {"label": "Sturm und Drang", "type": "Epoche"},
        {"label": "Aufklaerung", "type": "Epoche"},
        {"label": "Sprachvarietaeten", "type": "Thema"},
        {"label": "Sprachwandel", "type": "Thema"},
        {"label": "Kommunikation", "type": "Thema"},
        {"label": "Medien", "type": "Thema"},
    ]
    needle = query.lower()
    suggestions = []
    for theme in THEMES:
        if needle in theme["label"].lower():
            suggestions.append(theme)
            if len(suggestions) == 10:  # cap the result size for the UI
                break
    return suggestions
# =============================================
# SEMANTIC SEARCH (must be before {doc_id})
# =============================================
@router.get("/api/v1/archiv/search/semantic")
async def semantic_search(
    query: str = Query(..., min_length=3, description="Search query"),
    year: Optional[int] = Query(None),
    subject: Optional[str] = Query(None),
    niveau: Optional[str] = Query(None),
    limit: int = Query(10, ge=1, le=50)
) -> List[SemanticSearchResult]:
    """
    Perform semantic search across the archive using embeddings.

    The query text is embedded and matched against the bp_nibis_eh
    collection, so conceptually similar content is found — not just
    keyword matches. Optional year/subject/niveau filters narrow the
    search; errors yield an empty list (best-effort endpoint).
    """
    try:
        # Embed the query, then run the filtered vector search in Qdrant.
        embedding = await generate_single_embedding(query)
        hits = await search_nibis_eh(
            query_embedding=embedding,
            year=year,
            subject=subject,
            niveau=niveau,
            limit=limit
        )
        results: List[SemanticSearchResult] = []
        for hit in hits:
            results.append(SemanticSearchResult(
                id=hit["id"],
                score=hit["score"],
                text=hit.get("text", "")[:500],  # truncate excerpt for the response
                year=hit.get("year", 0),
                subject=hit.get("subject", ""),
                niveau=hit.get("niveau", ""),
                task_number=hit.get("task_number"),
                doc_type=hit.get("doc_type", "EWH")
            ))
        return results
    except Exception as e:
        # Best-effort: log and return an empty result so the UI stays usable.
        print(f"Semantic search error: {e}")
        return []
# =============================================
# ARCHIVE LISTING
# =============================================
def _ewh_minio_path(year, subject, niveau, task_number=None) -> str:
    """Build the MinIO object key for an EWH PDF from document metadata.

    Key pattern:
        landes-daten/ni/klausur/{year}/{year}_{subject}_{niveau}[_{task}]_EWH.pdf

    NOTE(review): the pattern is reconstructed from Qdrant payload fields;
    confirm it matches the actual object names in the bucket.
    """
    path = f"landes-daten/ni/klausur/{year}/{year}_{subject}_{niveau}"
    if task_number:
        path += f"_{task_number}"
    return path + "_EWH.pdf"


@router.get("/api/v1/archiv", response_model=ArchivSearchResponse)
async def list_archiv_documents(
    subject: Optional[str] = Query(None, description="Filter by subject (e.g., Deutsch, Englisch)"),
    year: Optional[int] = Query(None, description="Filter by year (e.g., 2024)"),
    bundesland: Optional[str] = Query(None, description="Filter by state (e.g., NI)"),
    niveau: Optional[str] = Query(None, description="Filter by level (eA or gA)"),
    doc_type: Optional[str] = Query(None, description="Filter by type (EWH, Aufgabe)"),
    search: Optional[str] = Query(None, description="Theme/keyword search"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0)
):
    """
    List all documents in the Abitur archive with optional filters.

    Scrolls the bp_nibis_eh Qdrant collection chunk by chunk, deduplicates
    chunks into documents (by doc_id/original_id), applies the optional
    keyword filter, then sorts and paginates. PDF URLs are generated
    on-demand via MinIO presigned URLs. On backend failure an empty result
    with default filter options is returned so the UI can still render.
    """
    try:
        client = get_qdrant_client()
        collection = "bp_nibis_eh"
        from qdrant_client.models import Filter, FieldCondition, MatchValue

        # Build the Qdrant payload filter from the provided query params.
        must_conditions = [
            FieldCondition(key=key, match=MatchValue(value=value))
            for key, value in (
                ("subject", subject),
                ("year", year),
                ("bundesland", bundesland),
                ("niveau", niveau),
                ("doc_type", doc_type),
            )
            if value
        ]
        query_filter = Filter(must=must_conditions) if must_conditions else None

        # Scroll through ALL matching chunks, paginated: a single capped
        # scroll(limit=1000) would silently drop documents (and report a
        # wrong `total`) once the archive exceeds 1000 chunks.
        seen_docs: Dict[str, AbiturDokument] = {}
        next_offset = None
        while True:
            points, next_offset = client.scroll(
                collection_name=collection,
                scroll_filter=query_filter,
                limit=1000,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )
            for point in points:
                payload = point.payload
                doc_id = payload.get("doc_id") or payload.get("original_id", str(point.id))
                if doc_id in seen_docs:
                    continue  # chunks of the same document — keep the first
                # Optional case-insensitive keyword filter on the chunk text.
                if search and search.lower() not in payload.get("text", "").lower():
                    continue
                subject_name = payload.get("subject", "Unbekannt")
                doc_year = payload.get("year", 0)
                doc_niveau = payload.get("niveau", "")
                task_num = payload.get("task_number")
                doc_type_val = payload.get("doc_type", "EWH")
                variant = payload.get("variant")
                # Human-readable title, e.g. "Deutsch 2024 eA Aufgabe 2A".
                title_parts = [subject_name, str(doc_year), doc_niveau]
                if task_num:
                    title_parts.append(f"Aufgabe {task_num}")
                if doc_type_val and doc_type_val != "EWH":
                    title_parts.append(doc_type_val)
                if variant:
                    title_parts.append(f"({variant})")
                seen_docs[doc_id] = AbiturDokument(
                    id=doc_id,
                    title=" ".join(title_parts),
                    subject=subject_name,
                    niveau=doc_niveau,
                    year=doc_year,
                    task_number=task_num,
                    doc_type=doc_type_val,
                    variant=variant,
                    bundesland=payload.get("bundesland", "NI"),
                    minio_path=_ewh_minio_path(doc_year, subject_name, doc_niveau, task_num)
                )
            if next_offset is None:
                break  # scroll exhausted

        # Sort by year descending, then subject, and apply pagination.
        documents = sorted(seen_docs.values(), key=lambda d: (-d.year, d.subject))
        total = len(documents)
        paginated = documents[offset:offset + limit]

        # Filter options for the UI, derived from the full (unpaginated) result.
        filters = {
            "subjects": sorted(set(d.subject for d in documents)),
            "years": sorted(set(d.year for d in documents), reverse=True),
            "niveaus": sorted(set(d.niveau for d in documents if d.niveau)),
            "doc_types": sorted(set(d.doc_type for d in documents if d.doc_type)),
        }
        return ArchivSearchResponse(
            total=total,
            documents=paginated,
            filters=filters
        )
    except Exception as e:
        print(f"Archiv list error: {e}")
        # Graceful degradation: empty listing with default filter options.
        return ArchivSearchResponse(
            total=0,
            documents=[],
            filters={
                "subjects": ["Deutsch", "Englisch", "Mathematik"],
                "years": [2025, 2024, 2023, 2022, 2021],
                "niveaus": ["eA", "gA"],
                "doc_types": ["EWH", "Aufgabe"]
            }
        )
@router.get("/api/v1/archiv/{doc_id}")
async def get_archiv_document(doc_id: str):
    """
    Get details for a specific document including presigned PDF URL.

    Looks the document up by its payload id (doc_id, falling back to
    original_id), reconstructs its MinIO object key, and attaches a
    one-hour presigned preview URL plus a 500-char text preview.
    Raises 404 when no chunk carries the id, 500 on backend errors.
    """
    try:
        client = get_qdrant_client()
        collection = "bp_nibis_eh"
        from qdrant_client.models import Filter, FieldCondition, MatchValue

        # Chunks may carry the identifier under "doc_id" or (legacy)
        # "original_id"; try both keys in order, stop at the first hit.
        hits = []
        for payload_key in ("doc_id", "original_id"):
            hits, _ = client.scroll(
                collection_name=collection,
                scroll_filter=Filter(must=[
                    FieldCondition(key=payload_key, match=MatchValue(value=doc_id))
                ]),
                limit=1,
                with_payload=True
            )
            if hits:
                break
        if not hits:
            raise HTTPException(status_code=404, detail="Document not found")

        payload = hits[0].payload
        subject_name = payload.get("subject", "Unbekannt")
        doc_year = payload.get("year", 0)
        doc_niveau = payload.get("niveau", "")
        task_num = payload.get("task_number")

        # Reconstruct the MinIO object key from the document metadata.
        key_parts = [f"landes-daten/ni/klausur/{doc_year}/{doc_year}_{subject_name}_{doc_niveau}"]
        if task_num:
            key_parts.append(f"_{task_num}")
        key_parts.append("_EWH.pdf")
        minio_path = "".join(key_parts)

        # Presigned URL with a 1-hour expiry.
        preview_url = await get_presigned_url(minio_path, expires=3600)

        full_text = payload.get("text")
        return {
            "id": doc_id,
            "title": f"{subject_name} {doc_year} {doc_niveau}",
            "subject": subject_name,
            "niveau": doc_niveau,
            "year": doc_year,
            "task_number": task_num,
            "doc_type": payload.get("doc_type", "EWH"),
            "variant": payload.get("variant"),
            "bundesland": payload.get("bundesland", "NI"),
            "minio_path": minio_path,
            "preview_url": preview_url,
            "text_preview": (full_text[:500] + "...") if full_text else None
        }
    except HTTPException:
        raise  # pass 404/500 through unchanged
    except Exception as e:
        print(f"Get document error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to get document: {str(e)}")
@router.get("/api/v1/archiv/{doc_id}/url")
async def get_document_url(doc_id: str, expires: int = Query(3600, ge=300, le=86400)):
    """
    Get a presigned URL for downloading the PDF.

    Args:
        doc_id: Document ID
        expires: URL expiration time in seconds (default 1 hour, max 24 hours)
    """
    try:
        # Reuse the detail endpoint to resolve the document's MinIO key.
        document = await get_archiv_document(doc_id)
        object_key = document.get("minio_path")
        if not object_key:
            raise HTTPException(status_code=404, detail="Document path not found")

        presigned = await get_presigned_url(object_key, expires=expires)
        if not presigned:
            raise HTTPException(status_code=500, detail="Failed to generate download URL")

        return {
            "url": presigned,
            "expires_in": expires,
            "filename": object_key.split("/")[-1]
        }
    except HTTPException:
        raise  # pass through 404/500 from above unchanged
    except Exception as e:
        print(f"Get URL error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to generate URL: {str(e)}")