Files
breakpilot-lehrer/klausur-service/backend/vocab_worksheet_api.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

500 lines
16 KiB
Python

"""
Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
vocabulary editing, worksheet generation, and PDF downloads.
Sub-routers (included at bottom):
- vocab_worksheet_upload_api: PDF upload, thumbnails, page processing
- vocab_worksheet_analysis_api: OCR compare, grid analysis, ground truth
"""
from fastapi import APIRouter, HTTPException, UploadFile, File, Query
from fastapi.responses import StreamingResponse
from typing import List, Dict, Any
from datetime import datetime
import uuid
import os
import io
import logging
logger = logging.getLogger(__name__)
# --- Imports from extracted sub-modules ---
from vocab_worksheet_models import (
WorksheetType,
SessionStatus,
VocabularyEntry,
SessionCreate,
SessionResponse,
VocabularyResponse,
VocabularyUpdate,
WorksheetGenerateRequest,
WorksheetResponse,
)
from vocab_worksheet_extraction import extract_vocabulary_from_image
from vocab_worksheet_generation import (
generate_worksheet_html, generate_worksheet_pdf,
convert_pdf_page_to_image,
)
# --- Database integration (used by main.py lifespan) ---
# The session store is optional. When it cannot be imported, the store
# callables are bound to None so that callers can test them for truthiness
# before use (see _init_vocab_table and _load_all_sessions).
try:
    from vocab_session_store import (
        DATABASE_URL, get_pool, init_vocab_tables,
        list_sessions_db, get_session_db,
    )
except ImportError:
    # Fall back to the DATABASE_URL environment variable. The default value
    # embeds credentials.
    # NOTE(review): confirm this default is only meant for local development.
    DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
    get_pool = None
    init_vocab_tables = None
    list_sessions_db = None
    get_session_db = None
# Module-wide database connection pool; remains None until set_db_pool() runs.
_db_pool = None


def set_db_pool(pool):
    """Remember *pool* as this module's database connection pool.

    Intended to be invoked from the main.py lifespan handler. Passing a new
    value simply replaces whatever was stored before.
    """
    global _db_pool
    _db_pool = pool
async def _init_vocab_table():
    """Create the vocab tables through the session store when one is available.

    Errors raised by the store are logged as warnings and never propagate to
    the caller.
    """
    if not init_vocab_tables:
        # No session store was imported: there is nothing to initialise, but
        # the same readiness message is still emitted.
        logger.info("vocab_session_cache table ready")
        return

    try:
        await init_vocab_tables()
        logger.info("vocab_session_cache table ready")
    except Exception as exc:
        logger.warning(f"Failed to init vocab tables: {exc}")
async def _load_all_sessions():
    """Populate the in-memory ``_sessions`` cache from the database.

    At most 500 rows are requested from the session store. A row is skipped
    when it has neither an ``id`` nor a ``session_id``, or when its id is
    already cached. Missing columns fall back to fixed defaults, and
    ``created_at`` is stored in its ``str()`` form. Any failure is logged as
    a warning instead of being raised.
    """
    if not list_sessions_db:
        logger.info("Loaded 0 vocab sessions from database")
        return

    # (column, default) pairs copied from each row, in cache-key order.
    column_defaults = (
        ("name", ""),
        ("description", ""),
        ("status", "created"),
        ("vocabulary_count", 0),
        ("source_language", "en"),
        ("target_language", "de"),
    )

    try:
        rows = await list_sessions_db(limit=500)
        loaded = 0
        for row in rows:
            session_id = row.get("id") or row.get("session_id")
            if not session_id or session_id in _sessions:
                continue
            cached = {"id": session_id}
            for column, default in column_defaults:
                cached[column] = row.get(column, default)
            cached["created_at"] = str(row.get("created_at", ""))
            _sessions[session_id] = cached
            loaded += 1
        logger.info(f"Loaded {loaded} vocab sessions from database")
    except Exception as exc:
        logger.warning(f"Failed to load sessions from database: {exc}")
# --- Router & module-level state ---
# Every route declared on this router is prefixed with /api/v1/vocab.
router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
# Root directory for per-session files; overridable via VOCAB_STORAGE_PATH.
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
# In-memory session cache, keyed by session id.
_sessions: Dict[str, Dict[str, Any]] = {}
# In-memory registry of generated worksheets, keyed by worksheet id.
_worksheets: Dict[str, Dict[str, Any]] = {}
@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Register a new, empty vocabulary extraction session.

    The session is stored in the in-memory cache under a fresh UUID with
    status ``PENDING`` and its storage directory is created on disk. The
    returned model mirrors the stored record.
    """
    session_id = str(uuid.uuid4())
    record = {
        "id": session_id,
        "name": session.name,
        "description": session.description,
        "source_language": session.source_language,
        "target_language": session.target_language,
        "status": SessionStatus.PENDING.value,
        "vocabulary": [],
        "vocabulary_count": 0,
        "image_path": None,
        "extraction_confidence": None,
        "created_at": datetime.utcnow(),
    }
    _sessions[session_id] = record

    # Each session owns one directory below the storage root.
    os.makedirs(os.path.join(LOCAL_STORAGE_PATH, session_id), exist_ok=True)

    return SessionResponse(
        id=record["id"],
        name=record["name"],
        description=record["description"],
        source_language=record["source_language"],
        target_language=record["target_language"],
        status=record["status"],
        vocabulary_count=record["vocabulary_count"],
        image_path=record["image_path"],
        created_at=record["created_at"],
    )
def _created_at_sort_key(session: Dict[str, Any]) -> datetime:
    """Return a naive UTC datetime that orders *session* by creation time.

    Cached sessions do not share one ``created_at`` type: sessions created
    through the API hold a ``datetime``, while sessions loaded from the
    database hold the ``str()`` form of the timestamp. Comparing the two
    directly raises ``TypeError``, as does comparing naive with aware
    datetimes, so everything is normalised here. Missing or unparsable
    values sort as the oldest possible timestamp.
    """
    value = session.get("created_at")
    if isinstance(value, str):
        try:
            value = datetime.fromisoformat(value)
        except ValueError:
            value = None
    if not isinstance(value, datetime):
        return datetime.min
    offset = value.utcoffset()
    if offset is not None:
        # Aware timestamp: shift to UTC and drop tzinfo so that it compares
        # with the naive values produced by datetime.utcnow().
        value = (value - offset).replace(tzinfo=None)
    return value


@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List vocabulary sessions, newest first.

    Args:
        limit: Maximum number of sessions to return (1-100, default 50).

    Returns:
        Up to ``limit`` sessions from the in-memory cache, ordered by
        creation time in descending order.
    """
    sessions = sorted(
        _sessions.values(),
        key=_created_at_sort_key,
        reverse=True,
    )[:limit]
    return [
        SessionResponse(
            id=s["id"],
            name=s["name"],
            description=s.get("description"),
            source_language=s["source_language"],
            target_language=s["target_language"],
            status=s["status"],
            vocabulary_count=s.get("vocabulary_count", 0),
            image_path=s.get("image_path"),
            # Passed through unchanged (datetime or string, as cached).
            created_at=s["created_at"],
        )
        for s in sessions
    ]
@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Return the cached session identified by *session_id*.

    Raises:
        HTTPException: 404 when no such session is cached.
    """
    try:
        record = _sessions[session_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Session not found") from None

    # Optional keys are read with .get() because cached records may lack them.
    return SessionResponse(
        id=record["id"],
        name=record["name"],
        description=record.get("description"),
        source_language=record["source_language"],
        target_language=record["target_language"],
        status=record["status"],
        vocabulary_count=record.get("vocabulary_count", 0),
        image_path=record.get("image_path"),
        created_at=record["created_at"],
    )
@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a textbook page image or PDF and extract vocabulary.

    Supported formats: PNG, JPG, JPEG, PDF. A file is accepted when either
    its extension or its content type matches. For PDFs, page 0 is converted
    to an image and stored as PNG. The image is written to the session
    directory as ``source.<ext>`` and then passed to vocabulary extraction.

    Args:
        session_id: Id of an existing session.
        file: The uploaded image or PDF.

    Returns:
        A dict with the session id, original filename, stored image path,
        vocabulary count, extraction confidence and status, plus an
        ``error`` key when extraction reported one.

    Raises:
        HTTPException: 404 when the session is unknown; 400 when the file
            type is unsupported or the upload is empty.
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")
    if session_id not in _sessions:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]

    # Validate file type - check both extension and content type
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''
    valid_image_extensions = ('png', 'jpg', 'jpeg')
    valid_image_content_types = ('image/png', 'image/jpeg', 'image/jpg')
    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
    is_image = extension in valid_image_extensions or content_type in valid_image_content_types
    if not is_pdf and not is_image:
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Determine final extension for saving
    if is_pdf:
        save_extension = 'png'  # PDFs will be converted to PNG
    elif extension in valid_image_extensions:
        save_extension = extension
    elif content_type == 'image/png':
        save_extension = 'png'
    else:
        save_extension = 'jpg'

    # Read file content
    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")
    if not content:
        # A zero-byte upload can neither be converted nor analysed; reject it
        # before anything is written to disk or sent to extraction.
        logger.error(f"Empty upload for session {session_id}")
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    # Convert PDF to image if needed
    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Save image
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")
    with open(image_path, 'wb') as f:
        f.write(content)

    # Update session status
    previous_status = session.get("status")
    session["status"] = SessionStatus.PROCESSING.value
    session["image_path"] = image_path

    # Extract vocabulary using Vision LLM
    try:
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            content, file.filename or "image.png", page_number=0
        )
    except BaseException:
        # Includes task cancellation. Without this the session would stay in
        # "processing" forever; restore the prior status, then re-raise so
        # the caller sees the original failure unchanged.
        session["status"] = previous_status
        raise

    # Update session with extracted vocabulary
    session["vocabulary"] = [v.dict() for v in vocabulary]
    session["vocabulary_count"] = len(vocabulary)
    session["extraction_confidence"] = confidence
    session["status"] = SessionStatus.EXTRACTED.value

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": len(vocabulary),
        "extraction_confidence": confidence,
        "status": SessionStatus.EXTRACTED.value,
    }
    if error:
        result["error"] = error
    return result
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return the vocabulary entries stored for a session.

    Raises:
        HTTPException: 404 when the session is not cached.
    """
    try:
        record = _sessions[session_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Session not found") from None

    # Stored entries are plain dicts; rebuild the models for the response.
    entries = []
    for raw_entry in record.get("vocabulary", []):
        entries.append(VocabularyEntry(**raw_entry))

    return VocabularyResponse(
        session_id=session_id,
        vocabulary=entries,
        extraction_confidence=record.get("extraction_confidence"),
    )
@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace a session's vocabulary with manually corrected entries.

    Returns:
        A dict with the session id, the new entry count and a confirmation
        message.

    Raises:
        HTTPException: 404 when the session is not cached.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    serialized = [entry.dict() for entry in update.vocabulary]
    entry_count = len(update.vocabulary)
    # Swap the list and its cached count together.
    _sessions[session_id].update(
        vocabulary=serialized,
        vocabulary_count=entry_count,
    )

    return {
        "session_id": session_id,
        "vocabulary_count": entry_count,
        "message": "Vocabulary updated successfully",
    }
def _render_worksheet_set_html(vocabulary, request, title, *, show_solutions):
    """Render one HTML page per requested worksheet type, each followed by a page break.

    Solution pages carry the " (Loesung)" title suffix.
    """
    suffix = " (Loesung)" if show_solutions else ""
    return "".join(
        generate_worksheet_html(
            vocabulary=vocabulary,
            worksheet_type=wtype,
            title=f"{title} - {wtype.value}{suffix}",
            show_solutions=show_solutions,
            repetitions=request.repetitions,
            line_height=request.line_height,
        )
        + '<div style="page-break-after: always;"></div>'
        for wtype in request.worksheet_types
    )


@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from extracted vocabulary.

    One PDF containing every requested worksheet type is produced, plus a
    matching solution PDF when ``request.include_solutions`` is set. Both
    are rendered before anything is written, so a rendering failure leaves
    no partial output on disk.

    Args:
        session_id: Id of a session that already has vocabulary.
        request: Worksheet types, optional title and layout options.

    Returns:
        A ``WorksheetResponse`` describing the generated files.

    Raises:
        HTTPException: 404 when the session is unknown, 400 when it has no
            vocabulary, 500 when PDF rendering fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]
    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session["name"]

    # Generate worksheet PDF
    combined_html = _render_worksheet_set_html(
        vocabulary, request, title, show_solutions=False
    )
    try:
        pdf_bytes = await generate_worksheet_pdf(combined_html)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}") from e

    # Generate solution PDF if requested (same error handling as above)
    solution_bytes = None
    if request.include_solutions:
        solution_html = _render_worksheet_set_html(
            vocabulary, request, title, show_solutions=True
        )
        try:
            solution_bytes = await generate_worksheet_pdf(solution_html)
        except Exception as e:
            raise HTTPException(
                status_code=500, detail=f"Solution PDF generation failed: {e}"
            ) from e

    # Save PDFs. Sessions restored from the database never had their
    # directory created by create_session, so make sure it exists.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    solution_path = None
    if solution_bytes is not None:
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Store worksheet info
    worksheet_data = {
        "id": worksheet_id,
        "session_id": session_id,
        "worksheet_types": [wt.value for wt in request.worksheet_types],
        "pdf_path": pdf_path,
        "solution_path": solution_path,
        "generated_at": datetime.utcnow(),
    }
    _worksheets[worksheet_id] = worksheet_data

    # Update session status
    session["status"] = SessionStatus.COMPLETED.value

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet_data["worksheet_types"],
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet_data["generated_at"],
    )
@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Send the generated worksheet PDF as an attachment.

    Raises:
        HTTPException: 404 when the worksheet is unknown or its PDF file is
            no longer on disk.
    """
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    pdf_path = _worksheets[worksheet_id]["pdf_path"]
    if not os.path.exists(pdf_path):
        raise HTTPException(status_code=404, detail="PDF file not found")

    # The file is read fully into memory and served from a buffer.
    with open(pdf_path, 'rb') as pdf_file:
        payload = io.BytesIO(pdf_file.read())

    response_headers = {
        "Content-Disposition": f"attachment; filename=worksheet_{worksheet_id}.pdf"
    }
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers=response_headers,
    )
@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Send the solution PDF of a worksheet as an attachment.

    Raises:
        HTTPException: 404 when the worksheet is unknown, when no solution
            path was recorded for it, or when the file is missing on disk.
    """
    try:
        worksheet = _worksheets[worksheet_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Worksheet not found") from None

    solution_path = worksheet.get("solution_path")
    solution_available = bool(solution_path) and os.path.exists(solution_path)
    if not solution_available:
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    with open(solution_path, 'rb') as solution_file:
        payload = io.BytesIO(solution_file.read())

    response_headers = {
        "Content-Disposition": f"attachment; filename=solution_{worksheet_id}.pdf"
    }
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers=response_headers,
    )
@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Return the source image that was uploaded for a session.

    The media type is derived from the stored file's extension; unknown
    extensions are served as ``application/octet-stream``.

    Raises:
        HTTPException: 404 when the session is unknown, has no image path,
            or the image file is missing on disk.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    image_path = _sessions[session_id].get("image_path")
    image_available = bool(image_path) and os.path.exists(image_path)
    if not image_available:
        raise HTTPException(status_code=404, detail="Image not found")

    # Determine content type from the text after the last dot in the path.
    media_types = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
    }
    suffix = image_path.rpartition('.')[2].lower()
    media_type = media_types.get(suffix, 'application/octet-stream')

    with open(image_path, 'rb') as image_file:
        payload = io.BytesIO(image_file.read())

    return StreamingResponse(
        payload,
        media_type=media_type,
    )
@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Remove a session together with its files and generated worksheets.

    The session directory is deleted first, then the cache entry, and
    finally every worksheet record that points at the session.

    Raises:
        HTTPException: 404 when the session is not cached.
    """
    import shutil

    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Delete session directory
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(session_dir):
        shutil.rmtree(session_dir)

    # Remove from storage
    _sessions.pop(session_id)

    # Collect the ids first so the registry is not mutated while iterating.
    orphaned_ids = [
        worksheet_id
        for worksheet_id, worksheet in _worksheets.items()
        if worksheet["session_id"] == session_id
    ]
    for worksheet_id in orphaned_ids:
        del _worksheets[worksheet_id]

    return {"message": "Session deleted successfully", "session_id": session_id}
# --- Include sub-routers ---
# NOTE(review): these imports sit at the bottom of the module, presumably so
# that the sub-modules can import names defined above without a circular
# import failure — confirm before moving them to the top.
from vocab_worksheet_upload_api import upload_router
from vocab_worksheet_analysis_api import analysis_router
# Attach the upload and analysis routes to this module's router.
router.include_router(upload_router)
router.include_router(analysis_router)