Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
500 lines
16 KiB
Python
500 lines
16 KiB
Python
"""
Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
vocabulary editing, worksheet generation, and PDF downloads.

Sub-routers (included at bottom):
- vocab_worksheet_upload_api: PDF upload, thumbnails, page processing
- vocab_worksheet_analysis_api: OCR compare, grid analysis, ground truth
"""
|
|
|
|
from fastapi import APIRouter, HTTPException, UploadFile, File, Query
|
|
from fastapi.responses import StreamingResponse
|
|
from typing import List, Dict, Any
|
|
from datetime import datetime
|
|
import uuid
|
|
import os
|
|
import io
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# --- Imports from extracted sub-modules ---
|
|
from vocab_worksheet_models import (
|
|
WorksheetType,
|
|
SessionStatus,
|
|
VocabularyEntry,
|
|
SessionCreate,
|
|
SessionResponse,
|
|
VocabularyResponse,
|
|
VocabularyUpdate,
|
|
WorksheetGenerateRequest,
|
|
WorksheetResponse,
|
|
)
|
|
from vocab_worksheet_extraction import extract_vocabulary_from_image
|
|
from vocab_worksheet_generation import (
|
|
generate_worksheet_html, generate_worksheet_pdf,
|
|
convert_pdf_page_to_image,
|
|
)
|
|
|
|
# --- Database integration (used by main.py lifespan) ---
|
|
# Prefer the real session store; when the module cannot be imported, fall
# back to an environment-provided DATABASE_URL and disable every DB helper
# by binding it to None (callers test the helpers for truthiness).
try:
    from vocab_session_store import (
        DATABASE_URL,
        get_pool,
        init_vocab_tables,
        list_sessions_db,
        get_session_db,
    )
except ImportError:
    DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
    get_pool = init_vocab_tables = list_sessions_db = get_session_db = None

# Connection pool handle; populated later through set_db_pool().
_db_pool = None
|
|
|
|
|
|
def set_db_pool(pool):
    """Store *pool* as the module-wide database connection pool.

    Intended to be called from the main.py lifespan hook. The value is kept
    in the module-level ``_db_pool`` variable; nothing is returned.
    """
    global _db_pool
    _db_pool = pool
|
|
|
|
|
|
async def _init_vocab_table():
    """Initialize the vocab tables in the database, if a store is available.

    When ``init_vocab_tables`` is falsy (session store not importable) the
    function only logs the "ready" message. Any exception raised while
    initializing is logged as a warning and not propagated.
    """
    if not init_vocab_tables:
        # No database helper available: nothing to create.
        logger.info("vocab_session_cache table ready")
        return

    try:
        await init_vocab_tables()
        logger.info("vocab_session_cache table ready")
    except Exception as exc:
        logger.warning(f"Failed to init vocab tables: {exc}")
|
|
|
|
|
|
async def _load_all_sessions():
    """Load vocab sessions from the database into the in-memory cache.

    Fetches up to 500 rows through ``list_sessions_db`` and copies each one
    into ``_sessions`` unless an entry with the same id already exists.
    Failures are logged as warnings and never raised to the caller.
    """
    if not list_sessions_db:
        # Session store unavailable: report an empty load and stop.
        logger.info("Loaded 0 vocab sessions from database")
        return

    try:
        rows = await list_sessions_db(limit=500)
        loaded = 0
        for row in rows:
            # Rows may expose their identifier under either key.
            session_id = row.get("id") or row.get("session_id")
            if not session_id or session_id in _sessions:
                continue
            _sessions[session_id] = {
                "id": session_id,
                "name": row.get("name", ""),
                "description": row.get("description", ""),
                "status": row.get("status", "created"),
                "vocabulary_count": row.get("vocabulary_count", 0),
                "source_language": row.get("source_language", "en"),
                "target_language": row.get("target_language", "de"),
                "created_at": str(row.get("created_at", "")),
            }
            loaded += 1
        logger.info(f"Loaded {loaded} vocab sessions from database")
    except Exception as exc:
        logger.warning(f"Failed to load sessions from database: {exc}")
|
|
|
|
|
|
# --- Router & module-level state ---
|
|
# In-memory caches keyed by session id / worksheet id.
_sessions: Dict[str, Dict[str, Any]] = {}
_worksheets: Dict[str, Dict[str, Any]] = {}

# Root directory for per-session files; overridable via VOCAB_STORAGE_PATH.
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")

# All routes in this module are mounted below /api/v1/vocab.
router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
|
|
|
|
|
|
@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Create a new vocabulary extraction session.

    A fresh UUID4 identifies the session. The session is registered in the
    in-memory ``_sessions`` cache with status PENDING and an empty
    vocabulary, its storage directory is created below
    ``LOCAL_STORAGE_PATH``, and the new session is returned.
    """
    new_id = str(uuid.uuid4())

    record = {
        "id": new_id,
        "name": session.name,
        "description": session.description,
        "source_language": session.source_language,
        "target_language": session.target_language,
        "status": SessionStatus.PENDING.value,
        "vocabulary": [],
        "vocabulary_count": 0,
        "image_path": None,
        "extraction_confidence": None,
        "created_at": datetime.utcnow(),
    }
    _sessions[new_id] = record

    # Make sure the per-session storage directory exists.
    os.makedirs(os.path.join(LOCAL_STORAGE_PATH, new_id), exist_ok=True)

    response_fields = dict(
        id=new_id,
        name=session.name,
        description=session.description,
        source_language=session.source_language,
        target_language=session.target_language,
        status=SessionStatus.PENDING.value,
        vocabulary_count=0,
        image_path=None,
        created_at=record["created_at"],
    )
    return SessionResponse(**response_fields)
|
|
|
|
|
|
@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List vocabulary sessions, newest first.

    Args:
        limit: Maximum number of sessions to return (1-100, default 50).

    Returns:
        Up to ``limit`` sessions from the in-memory cache, ordered by
        ``created_at`` descending.
    """
    # ``created_at`` is a datetime for sessions made by create_session but a
    # string for sessions restored by _load_all_sessions. Comparing the two
    # directly raises TypeError, so order on the string form instead.
    newest_first = sorted(
        _sessions.values(),
        key=lambda entry: str(entry["created_at"]),
        reverse=True,
    )[:limit]

    return [
        SessionResponse(
            id=s["id"],
            name=s["name"],
            description=s.get("description"),
            source_language=s["source_language"],
            target_language=s["target_language"],
            status=s["status"],
            vocabulary_count=s.get("vocabulary_count", 0),
            image_path=s.get("image_path"),
            created_at=s["created_at"],
        )
        for s in newest_first
    ]
|
|
|
|
|
|
@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Return a single session from the in-memory cache.

    Raises:
        HTTPException: 404 when ``session_id`` is not in the cache.
    """
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    response_fields = {
        "id": record["id"],
        "name": record["name"],
        "description": record.get("description"),
        "source_language": record["source_language"],
        "target_language": record["target_language"],
        "status": record["status"],
        "vocabulary_count": record.get("vocabulary_count", 0),
        "image_path": record.get("image_path"),
        "created_at": record["created_at"],
    }
    return SessionResponse(**response_fields)
|
|
|
|
|
|
@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a textbook page image or PDF and extract vocabulary.

    Supported formats: PNG, JPG, JPEG, PDF

    The upload is accepted when either its file extension or its content
    type matches a supported format. For PDFs, page 0 is converted to an
    image first. The image is stored as ``source.<ext>`` in the session
    directory, then passed to the extractor; the session entry is updated
    with the extracted vocabulary and confidence.

    Raises:
        HTTPException: 404 for an unknown session, 400 for an unsupported
            file type.
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    session = _sessions.get(session_id)
    if session is None:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")

    # Validate file type - check both extension and content type
    extension = (file.filename or '').rsplit('.', 1)[-1].lower()
    content_type = file.content_type or ''

    image_extensions = ('png', 'jpg', 'jpeg')
    image_content_types = ('image/png', 'image/jpeg', 'image/jpg')
    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
    is_image = extension in image_extensions or content_type in image_content_types

    if not (is_pdf or is_image):
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Determine final extension for saving
    if is_pdf:
        save_extension = 'png'  # PDFs will be converted to PNG
    elif extension in image_extensions:
        save_extension = extension
    else:
        # Recognised by content type only: PNG keeps its type, else JPG.
        save_extension = 'png' if content_type == 'image/png' else 'jpg'

    # Read file content
    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")

    # Convert PDF to image if needed
    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Save image
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")
    with open(image_path, 'wb') as out:
        out.write(content)

    # Update session status
    session["status"] = SessionStatus.PROCESSING.value
    session["image_path"] = image_path

    # Extract vocabulary using Vision LLM
    vocabulary, confidence, error = await extract_vocabulary_from_image(
        content, file.filename or "image.png", page_number=0
    )

    # Update session with extracted vocabulary
    session.update({
        "vocabulary": [entry.dict() for entry in vocabulary],
        "vocabulary_count": len(vocabulary),
        "extraction_confidence": confidence,
        "status": SessionStatus.EXTRACTED.value,
    })

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": len(vocabulary),
        "extraction_confidence": confidence,
        "status": SessionStatus.EXTRACTED.value,
    }
    # The extractor's error (if any) is reported alongside the result.
    if error:
        result["error"] = error
    return result
|
|
|
|
|
|
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return the vocabulary stored for a session.

    Raises:
        HTTPException: 404 when ``session_id`` is not in the cache.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    record = _sessions[session_id]
    # Stored entries are plain dicts; rebuild the model objects for output.
    entries = []
    for raw in record.get("vocabulary", []):
        entries.append(VocabularyEntry(**raw))

    return VocabularyResponse(
        session_id=session_id,
        vocabulary=entries,
        extraction_confidence=record.get("extraction_confidence"),
    )
|
|
|
|
|
|
@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace a session's vocabulary (for manual corrections).

    Raises:
        HTTPException: 404 when ``session_id`` is not in the cache.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    record = _sessions[session_id]
    # Entries are stored as plain dicts; the count is kept alongside them.
    record["vocabulary"] = [entry.dict() for entry in update.vocabulary]
    entry_count = len(update.vocabulary)
    record["vocabulary_count"] = entry_count

    return {
        "session_id": session_id,
        "vocabulary_count": entry_count,
        "message": "Vocabulary updated successfully",
    }
|
|
|
|
|
|
def _render_worksheets_html(vocabulary, request, title, show_solutions):
    """Render one HTML document covering every requested worksheet type.

    Each worksheet is followed by a page-break div. Solution sheets get the
    " (Loesung)" title suffix.
    """
    title_suffix = " (Loesung)" if show_solutions else ""
    page_break = '<div style="page-break-after: always;"></div>'
    return "".join(
        generate_worksheet_html(
            vocabulary=vocabulary,
            worksheet_type=wtype,
            title=f"{title} - {wtype.value}{title_suffix}",
            show_solutions=show_solutions,
            repetitions=request.repetitions,
            line_height=request.line_height,
        ) + page_break
        for wtype in request.worksheet_types
    )


@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from extracted vocabulary.

    Renders every requested worksheet type into one PDF, optionally renders
    a matching solution PDF, writes both into the session directory,
    records the worksheet in ``_worksheets`` and marks the session COMPLETED.

    Raises:
        HTTPException: 404 for an unknown session, 400 when the session has
            no vocabulary, 500 when PDF generation fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session["name"]

    # Generate PDF
    combined_html = _render_worksheets_html(vocabulary, request, title, show_solutions=False)
    try:
        pdf_bytes = await generate_worksheet_pdf(combined_html)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}") from e

    # Save PDF. The directory is (re)created here because sessions restored
    # from the database never passed through create_session in this process,
    # so their storage directory may not exist yet.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    # Generate solution PDF if requested
    solution_path = None
    if request.include_solutions:
        solution_html = _render_worksheets_html(vocabulary, request, title, show_solutions=True)
        # Guarded the same way as the main PDF so a renderer failure yields a
        # descriptive 500 instead of an unhandled exception.
        try:
            solution_bytes = await generate_worksheet_pdf(solution_html)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Solution PDF generation failed: {e}") from e
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Store worksheet info
    worksheet_data = {
        "id": worksheet_id,
        "session_id": session_id,
        "worksheet_types": [wt.value for wt in request.worksheet_types],
        "pdf_path": pdf_path,
        "solution_path": solution_path,
        "generated_at": datetime.utcnow(),
    }
    _worksheets[worksheet_id] = worksheet_data

    # Update session status
    session["status"] = SessionStatus.COMPLETED.value

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet_data["worksheet_types"],
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet_data["generated_at"],
    )
|
|
|
|
|
|
@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Download the generated worksheet PDF as an attachment.

    Raises:
        HTTPException: 404 when the worksheet is unknown or its PDF file is
            no longer on disk.
    """
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    pdf_path = _worksheets[worksheet_id]["pdf_path"]
    if not os.path.exists(pdf_path):
        raise HTTPException(status_code=404, detail="PDF file not found")

    # The file is read fully into memory and served from a buffer.
    with open(pdf_path, 'rb') as source:
        payload = io.BytesIO(source.read())

    disposition = f"attachment; filename=worksheet_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
|
|
|
|
|
@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Download the solution PDF as an attachment.

    Raises:
        HTTPException: 404 when the worksheet is unknown, has no solution
            path recorded, or the solution file is no longer on disk.
    """
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    solution_path = _worksheets[worksheet_id].get("solution_path")
    solution_available = bool(solution_path) and os.path.exists(solution_path)
    if not solution_available:
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    # The file is read fully into memory and served from a buffer.
    with open(solution_path, 'rb') as source:
        payload = io.BytesIO(source.read())

    disposition = f"attachment; filename=solution_{worksheet_id}.pdf"
    return StreamingResponse(
        payload,
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
|
|
|
|
|
@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Return the uploaded source image of a session.

    The response media type is derived from the stored file's extension;
    anything other than png/jpg/jpeg is served as
    ``application/octet-stream``.

    Raises:
        HTTPException: 404 when the session is unknown, has no image path
            recorded, or the image file is no longer on disk.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    image_path = _sessions[session_id].get("image_path")
    image_available = bool(image_path) and os.path.exists(image_path)
    if not image_available:
        raise HTTPException(status_code=404, detail="Image not found")

    # Determine content type from the text after the last dot.
    media_types = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
    }
    suffix = image_path.rsplit('.', 1)[-1].lower()
    content_type = media_types.get(suffix, 'application/octet-stream')

    with open(image_path, 'rb') as source:
        payload = io.BytesIO(source.read())

    return StreamingResponse(payload, media_type=content_type)
|
|
|
|
|
|
@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Delete a vocabulary session and all associated files.

    Removes the session's storage directory (if present), the session's
    entry in ``_sessions`` and every ``_worksheets`` entry that belongs to it.

    Raises:
        HTTPException: 404 when ``session_id`` is not in the cache.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Delete session directory
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(session_dir):
        import shutil
        shutil.rmtree(session_dir)

    # Remove from storage
    del _sessions[session_id]

    # Remove associated worksheets; collect the ids first so the dict is
    # not mutated while it is being iterated.
    stale_ids = [
        worksheet_id
        for worksheet_id, worksheet in _worksheets.items()
        if worksheet["session_id"] == session_id
    ]
    for worksheet_id in stale_ids:
        del _worksheets[worksheet_id]

    return {"message": "Session deleted successfully", "session_id": session_id}
|
|
|
|
|
|
# --- Include sub-routers ---
|
|
# NOTE(review): these imports sit at the bottom of the module, presumably so
# the sub-modules can import from this file without a circular import —
# confirm before moving them to the top.
from vocab_worksheet_upload_api import upload_router
from vocab_worksheet_analysis_api import analysis_router

# Mount the sub-routers under this module's router, upload routes first.
for _sub_router in (upload_router, analysis_router):
    router.include_router(_sub_router)
|