# Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service,
# School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core
"""
Klausur-Service Abitur-Archiv Routes

Endpoints for accessing NiBiS Zentralabitur documents (public archive).
Provides filtered listing and presigned URLs for PDF access.

Routes are registered on a plain :class:`fastapi.APIRouter` and mounted by
the service's application; all endpoints read from the ``bp_nibis_eh``
Qdrant collection and resolve PDFs via MinIO presigned URLs.
"""

from typing import Optional, List, Dict
from datetime import datetime

from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel

# Project-local services: Qdrant vector store, MinIO object storage and the
# embedding pipeline used for semantic search.
# NOTE(review): `datetime` and `list_documents` appear unused in this module —
# confirm before removing.
from qdrant_service import get_qdrant_client, search_nibis_eh
from minio_storage import get_presigned_url, list_documents
from eh_pipeline import generate_single_embedding

# Single router instance; mounted by the service's FastAPI app.
router = APIRouter()
# =============================================
# MODELS
# =============================================

class AbiturDokument(BaseModel):
    """Abitur document from the archive.

    One entry per unique source document, deduplicated from the chunked
    Qdrant payloads. PDF access is resolved lazily via presigned URLs.
    """
    id: str  # stable document id (from the "doc_id" / "original_id" payload field)
    title: str  # human-readable title assembled from the metadata below
    subject: str  # school subject, e.g. "Deutsch"
    niveau: str  # eA or gA
    year: int  # exam year, e.g. 2024
    task_number: Optional[str] = None  # Can be "1", "2A", "2C", etc.
    doc_type: str  # EWH, Aufgabe, Material
    variant: Optional[str] = None  # optional variant marker, rendered as "(variant)" in titles
    bundesland: str = "NI"  # federal state code; archive currently covers Niedersachsen
    minio_path: Optional[str] = None  # object path inside the MinIO bucket
    preview_url: Optional[str] = None  # presigned download URL, filled on demand
class ArchivSearchResponse(BaseModel):
    """Response for archive listing."""
    total: int  # number of matching documents BEFORE pagination
    documents: List[AbiturDokument]  # the current page of results
    filters: Dict  # available filter options (subjects / years / niveaus / doc_types)
class SemanticSearchResult(BaseModel):
    """Result from semantic search (one matching chunk)."""
    id: str  # chunk/point id as returned by Qdrant
    score: float  # vector similarity score
    text: str  # chunk text, truncated to 500 characters in responses
    year: int
    subject: str
    niveau: str
    task_number: Optional[str] = None  # Can be "1", "2A", "2C", etc.
    doc_type: str  # e.g. "EWH"
# =============================================
# ARCHIVE LISTING & FILTERS
# =============================================

# IMPORTANT: Specific routes MUST come before parameterized routes!
# Otherwise /api/v1/archiv/stats would be caught by /api/v1/archiv/{doc_id}

# =============================================
# STATS (must be before {doc_id})
# =============================================

@router.get("/api/v1/archiv/stats")
async def get_archiv_stats():
    """
    Get archive statistics (document counts, available years, etc.).

    Aggregates unique documents from the ``bp_nibis_eh`` Qdrant collection,
    grouped by year, subject and niveau. Documents are stored as multiple
    chunks/points, so counts are deduplicated by ``doc_id``.

    Best-effort endpoint: on any backend failure it returns an all-zero
    payload with an ``error`` field instead of raising, so the UI can still
    render.
    """
    try:
        client = get_qdrant_client()
        collection = "bp_nibis_eh"

        # Collection-level info (total chunk count, status).
        info = client.get_collection(collection)

        by_year = {}
        by_subject = {}
        by_niveau = {}
        seen_docs = set()

        # BUG FIX: paginate through the WHOLE collection. The previous
        # implementation scrolled a single page of 1000 points, silently
        # truncating the statistics for larger archives.
        next_offset = None
        while True:
            points, next_offset = client.scroll(
                collection_name=collection,
                limit=1000,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )

            for point in points:
                payload = point.payload
                doc_id = payload.get("doc_id") or payload.get("original_id", str(point.id))

                # Count each document once, even though it has many chunks.
                if doc_id in seen_docs:
                    continue
                seen_docs.add(doc_id)

                year = str(payload.get("year", "unknown"))
                subject = payload.get("subject", "unknown")
                niveau = payload.get("niveau", "unknown")

                by_year[year] = by_year.get(year, 0) + 1
                by_subject[subject] = by_subject.get(subject, 0) + 1
                by_niveau[niveau] = by_niveau.get(niveau, 0) + 1

            # Qdrant returns None when there are no more pages.
            if next_offset is None:
                break

        return {
            "total_documents": len(seen_docs),
            "total_chunks": info.points_count,
            "by_year": dict(sorted(by_year.items(), reverse=True)),
            "by_subject": dict(sorted(by_subject.items(), key=lambda x: -x[1])),
            "by_niveau": by_niveau,
            "collection_status": info.status.value
        }

    except Exception as e:
        # Deliberate degradation: report the failure in-band rather than 500.
        print(f"Stats error: {e}")
        return {
            "total_documents": 0,
            "total_chunks": 0,
            "by_year": {},
            "by_subject": {},
            "by_niveau": {},
            "error": str(e)
        }
# =============================================
# THEME SUGGESTIONS (must be before {doc_id})
# =============================================

@router.get("/api/v1/archiv/suggest")
async def suggest_themes(
    query: str = Query(..., min_length=2, description="Partial search query")
) -> List[Dict]:
    """
    Get theme suggestions for autocomplete.

    Returns popular themes/topics that match the query. Matching is a
    case-insensitive substring test against a static catalogue; at most
    ten suggestions are returned.
    """
    # Static catalogue of suggestible themes, tagged by category.
    THEMES = [
        {"label": "Textanalyse", "type": "Analyse"},
        {"label": "Gedichtanalyse", "type": "Analyse"},
        {"label": "Dramenanalyse", "type": "Analyse"},
        {"label": "Prosaanalyse", "type": "Analyse"},
        {"label": "Eroerterung", "type": "Aufsatz"},
        {"label": "Textgebundene Eroerterung", "type": "Aufsatz"},
        {"label": "Materialgestuetzte Eroerterung", "type": "Aufsatz"},
        {"label": "Sprachreflexion", "type": "Analyse"},
        {"label": "Kafka", "type": "Autor"},
        {"label": "Goethe", "type": "Autor"},
        {"label": "Schiller", "type": "Autor"},
        {"label": "Romantik", "type": "Epoche"},
        {"label": "Expressionismus", "type": "Epoche"},
        {"label": "Sturm und Drang", "type": "Epoche"},
        {"label": "Aufklaerung", "type": "Epoche"},
        {"label": "Sprachvarietaeten", "type": "Thema"},
        {"label": "Sprachwandel", "type": "Thema"},
        {"label": "Kommunikation", "type": "Thema"},
        {"label": "Medien", "type": "Thema"},
    ]

    needle = query.lower()
    # Case-insensitive substring filter, capped at 10 entries.
    hits = [entry for entry in THEMES if needle in entry["label"].lower()]
    return hits[:10]
# =============================================
# SEMANTIC SEARCH (must be before {doc_id})
# =============================================

@router.get("/api/v1/archiv/search/semantic")
async def semantic_search(
    query: str = Query(..., min_length=3, description="Search query"),
    year: Optional[int] = Query(None),
    subject: Optional[str] = Query(None),
    niveau: Optional[str] = Query(None),
    limit: int = Query(10, ge=1, le=50)
) -> List[SemanticSearchResult]:
    """
    Perform semantic search across the archive using embeddings.

    This searches for conceptually similar content, not just keyword matches.
    The query is embedded once and used for a filtered vector search in the
    Qdrant collection; failures degrade to an empty result list so the
    search UI never hard-fails.
    """
    try:
        # Embed the query text, then run the filtered vector search.
        embedding = await generate_single_embedding(query)
        hits = await search_nibis_eh(
            query_embedding=embedding,
            year=year,
            subject=subject,
            niveau=niveau,
            limit=limit
        )

        response = []
        for hit in hits:
            response.append(SemanticSearchResult(
                id=hit["id"],
                score=hit["score"],
                text=hit.get("text", "")[:500],  # cap chunk text for the response
                year=hit.get("year", 0),
                subject=hit.get("subject", ""),
                niveau=hit.get("niveau", ""),
                task_number=hit.get("task_number"),
                doc_type=hit.get("doc_type", "EWH")
            ))
        return response

    except Exception as e:
        # Deliberate best-effort: log and return nothing rather than 500.
        print(f"Semantic search error: {e}")
        return []
# =============================================
# ARCHIVE LISTING
# =============================================

def _ewh_minio_path(doc_year, subject_name, doc_niveau, task_num):
    """Build the MinIO object path for a document's EWH PDF from metadata.

    Path pattern: landes-daten/ni/klausur/{year}/{year}_{subject}_{niveau}[_{task}]_EWH.pdf
    """
    path = f"landes-daten/ni/klausur/{doc_year}/{doc_year}_{subject_name}_{doc_niveau}"
    if task_num:
        path += f"_{task_num}"
    return path + "_EWH.pdf"


@router.get("/api/v1/archiv", response_model=ArchivSearchResponse)
async def list_archiv_documents(
    subject: Optional[str] = Query(None, description="Filter by subject (e.g., Deutsch, Englisch)"),
    year: Optional[int] = Query(None, description="Filter by year (e.g., 2024)"),
    bundesland: Optional[str] = Query(None, description="Filter by state (e.g., NI)"),
    niveau: Optional[str] = Query(None, description="Filter by level (eA or gA)"),
    doc_type: Optional[str] = Query(None, description="Filter by type (EWH, Aufgabe)"),
    search: Optional[str] = Query(None, description="Theme/keyword search"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0)
):
    """
    List all documents in the Abitur archive with optional filters.

    Returns metadata for documents stored in the bp_nibis_eh Qdrant collection.
    PDF URLs are generated on-demand via MinIO presigned URLs.

    Filtering is pushed down to Qdrant where possible; the free-text
    ``search`` parameter is a case-insensitive substring match over chunk
    text applied client-side. Results are deduplicated by ``doc_id``,
    sorted (year desc, subject asc) and paginated with limit/offset.
    """
    try:
        client = get_qdrant_client()
        collection = "bp_nibis_eh"

        from qdrant_client.models import Filter, FieldCondition, MatchValue

        # Build exact-match conditions for every provided query parameter.
        must_conditions = []
        for field_key, field_value in (
            ("subject", subject),
            ("year", year),
            ("bundesland", bundesland),
            ("niveau", niveau),
            ("doc_type", doc_type),
        ):
            if field_value:
                must_conditions.append(
                    FieldCondition(key=field_key, match=MatchValue(value=field_value))
                )

        query_filter = Filter(must=must_conditions) if must_conditions else None

        # BUG FIX: scroll through ALL matching points. The previous version
        # read a single 1000-point page, so documents beyond it never
        # appeared in listings or filter options.
        seen_docs = {}
        next_offset = None
        while True:
            points, next_offset = client.scroll(
                collection_name=collection,
                scroll_filter=query_filter,
                limit=1000,
                offset=next_offset,
                with_payload=True,
                with_vectors=False
            )

            for point in points:
                payload = point.payload
                doc_id = payload.get("doc_id") or payload.get("original_id", str(point.id))

                # Already collected from an earlier chunk of the same doc.
                if doc_id in seen_docs:
                    continue

                # Client-side keyword filter: a doc qualifies when any of its
                # chunks contains the search text (case-insensitive).
                if search:
                    if search.lower() not in payload.get("text", "").lower():
                        continue

                subject_name = payload.get("subject", "Unbekannt")
                doc_year = payload.get("year", 0)
                doc_niveau = payload.get("niveau", "")
                task_num = payload.get("task_number")
                doc_type_val = payload.get("doc_type", "EWH")
                variant = payload.get("variant")

                # Human-readable title, e.g. "Deutsch 2024 eA Aufgabe 2A (B)".
                title_parts = [subject_name, str(doc_year), doc_niveau]
                if task_num:
                    title_parts.append(f"Aufgabe {task_num}")
                if doc_type_val and doc_type_val != "EWH":
                    title_parts.append(doc_type_val)
                if variant:
                    title_parts.append(f"({variant})")

                seen_docs[doc_id] = AbiturDokument(
                    id=doc_id,
                    title=" ".join(title_parts),
                    subject=subject_name,
                    niveau=doc_niveau,
                    year=doc_year,
                    task_number=task_num,
                    doc_type=doc_type_val,
                    variant=variant,
                    bundesland=payload.get("bundesland", "NI"),
                    minio_path=_ewh_minio_path(doc_year, subject_name, doc_niveau, task_num)
                )

            if next_offset is None:
                break

        documents = list(seen_docs.values())

        # Sort by year descending, then subject ascending.
        documents.sort(key=lambda d: (-d.year, d.subject))

        total = len(documents)
        paginated = documents[offset:offset + limit]

        # Filter options for the UI, derived from the FULL result set.
        filters = {
            "subjects": sorted(set(d.subject for d in documents)),
            "years": sorted(set(d.year for d in documents), reverse=True),
            "niveaus": sorted(set(d.niveau for d in documents if d.niveau)),
            "doc_types": sorted(set(d.doc_type for d in documents if d.doc_type)),
        }

        return ArchivSearchResponse(
            total=total,
            documents=paginated,
            filters=filters
        )

    except Exception as e:
        print(f"Archiv list error: {e}")
        # Deliberate degradation: empty listing with default filter options
        # so the frontend can still render its filter controls.
        return ArchivSearchResponse(
            total=0,
            documents=[],
            filters={
                "subjects": ["Deutsch", "Englisch", "Mathematik"],
                "years": [2025, 2024, 2023, 2022, 2021],
                "niveaus": ["eA", "gA"],
                "doc_types": ["EWH", "Aufgabe"]
            }
        )
@router.get("/api/v1/archiv/{doc_id}")
async def get_archiv_document(doc_id: str):
    """
    Get details for a specific document including presigned PDF URL.

    Looks the document up by its ``doc_id`` payload field, falling back to
    ``original_id``; rebuilds the MinIO object path from the metadata and
    attaches a presigned preview URL valid for one hour. Raises 404 when no
    chunk matches either id field.
    """
    try:
        client = get_qdrant_client()
        collection = "bp_nibis_eh"

        from qdrant_client.models import Filter, FieldCondition, MatchValue

        def _first_chunk_by(field_name):
            # Fetch one chunk whose payload field equals the requested id.
            points, _ = client.scroll(
                collection_name=collection,
                scroll_filter=Filter(must=[
                    FieldCondition(key=field_name, match=MatchValue(value=doc_id))
                ]),
                limit=1,
                with_payload=True
            )
            return points

        # Primary lookup by doc_id; legacy fallback to original_id.
        results = _first_chunk_by("doc_id") or _first_chunk_by("original_id")
        if not results:
            raise HTTPException(status_code=404, detail="Document not found")

        payload = results[0].payload

        subject_name = payload.get("subject", "Unbekannt")
        doc_year = payload.get("year", 0)
        doc_niveau = payload.get("niveau", "")
        task_num = payload.get("task_number")

        # Reconstruct the MinIO object path from the metadata.
        minio_path = f"landes-daten/ni/klausur/{doc_year}/{doc_year}_{subject_name}_{doc_niveau}"
        if task_num:
            minio_path = f"{minio_path}_{task_num}"
        minio_path = f"{minio_path}_EWH.pdf"

        # Presigned URL with a fixed 1-hour expiry for previewing.
        preview_url = await get_presigned_url(minio_path, expires=3600)

        chunk_text = payload.get("text", "")
        return {
            "id": doc_id,
            "title": f"{subject_name} {doc_year} {doc_niveau}",
            "subject": subject_name,
            "niveau": doc_niveau,
            "year": doc_year,
            "task_number": task_num,
            "doc_type": payload.get("doc_type", "EWH"),
            "variant": payload.get("variant"),
            "bundesland": payload.get("bundesland", "NI"),
            "minio_path": minio_path,
            "preview_url": preview_url,
            "text_preview": chunk_text[:500] + "..." if chunk_text else None
        }

    except HTTPException:
        raise
    except Exception as e:
        print(f"Get document error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to get document: {str(e)}")
@router.get("/api/v1/archiv/{doc_id}/url")
async def get_document_url(doc_id: str, expires: int = Query(3600, ge=300, le=86400)):
    """
    Get a presigned URL for downloading the PDF.

    Args:
        doc_id: Document ID
        expires: URL expiration time in seconds (default 1 hour, max 24 hours)
    """
    try:
        # Resolve the document first so we know its MinIO object path.
        document = await get_archiv_document(doc_id)

        minio_path = document.get("minio_path")
        if not minio_path:
            raise HTTPException(status_code=404, detail="Document path not found")

        # Mint a fresh presigned URL honouring the caller's expiry.
        url = await get_presigned_url(minio_path, expires=expires)
        if not url:
            raise HTTPException(status_code=500, detail="Failed to generate download URL")

        return {
            "url": url,
            "expires_in": expires,
            "filename": minio_path.rsplit("/", 1)[-1]
        }

    except HTTPException:
        raise
    except Exception as e:
        print(f"Get URL error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to generate URL: {str(e)}")