Restructure: Move grid_* + vocab_* into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
grid/ package (16 files):
  grid/build/ — core, zones, cleanup, text_ops, cell_ops, finalize
  grid/editor/ — api, helpers, columns, filters, headers, zones

vocab/ package (10 files):
  vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare
  vocab/ — session_store, learn_bridge

26 backward-compat shims. Internal imports relative. RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,499 +1,4 @@
|
||||
"""
|
||||
Vocabulary Worksheet API — core CRUD routes for sessions, uploads,
|
||||
vocabulary editing, worksheet generation, and PDF downloads.
|
||||
|
||||
Sub-routers (included at bottom):
|
||||
- vocab_worksheet_upload_api: PDF upload, thumbnails, page processing
|
||||
- vocab_worksheet_analysis_api: OCR compare, grid analysis, ground truth
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
import os
|
||||
import io
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Imports from extracted sub-modules ---
|
||||
from vocab_worksheet_models import (
|
||||
WorksheetType,
|
||||
SessionStatus,
|
||||
VocabularyEntry,
|
||||
SessionCreate,
|
||||
SessionResponse,
|
||||
VocabularyResponse,
|
||||
VocabularyUpdate,
|
||||
WorksheetGenerateRequest,
|
||||
WorksheetResponse,
|
||||
)
|
||||
from vocab_worksheet_extraction import extract_vocabulary_from_image
|
||||
from vocab_worksheet_generation import (
|
||||
generate_worksheet_html, generate_worksheet_pdf,
|
||||
convert_pdf_page_to_image,
|
||||
)
|
||||
|
||||
# --- Database integration (used by main.py lifespan) ---
|
||||
# The session store is optional. When `vocab_session_store` cannot be
# imported, DATABASE_URL falls back to the environment (or a built-in
# default) and every store helper is set to None; the startup helpers in
# this module test for None before calling them.
try:
    from vocab_session_store import (
        DATABASE_URL,
        get_pool,
        init_vocab_tables,
        list_sessions_db,
        get_session_db,
    )
except ImportError:
    # NOTE(review): the default URL embeds credentials; confirm this is only
    # intended for local/dev setups.
    DATABASE_URL = os.getenv(
        "DATABASE_URL",
        "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db",
    )
    get_pool = init_vocab_tables = list_sessions_db = get_session_db = None

# Connection pool installed through set_db_pool(); None until that is called.
_db_pool = None
|
||||
|
||||
|
||||
def set_db_pool(pool):
    """Install *pool* as this module's database connection pool.

    Intended to be called from the main.py lifespan handler. The value is
    stored in the module-level ``_db_pool`` without any validation.
    """
    global _db_pool
    _db_pool = pool
|
||||
|
||||
|
||||
async def _init_vocab_table():
    """Initialize the vocab tables through the session store, if present.

    When the optional session store could not be imported
    (``init_vocab_tables`` is None) nothing is initialized and that fact is
    logged, instead of reporting the table as ready. Initialization errors
    are logged as warnings and not re-raised.
    """
    if not init_vocab_tables:
        # Previously this branch logged "table ready" although no
        # initialization had run, which was misleading in startup logs.
        logger.info("vocab_session_store unavailable; skipping vocab table init")
        return

    try:
        await init_vocab_tables()
    except Exception as e:
        logger.warning(f"Failed to init vocab tables: {e}")
    else:
        logger.info("vocab_session_cache table ready")
|
||||
|
||||
|
||||
async def _load_all_sessions():
    """Copy sessions from the database into the in-memory ``_sessions`` cache.

    At most 500 rows are requested. Rows whose id is missing or already
    cached are skipped. Any error is logged as a warning and swallowed.
    """
    if not list_sessions_db:
        # Session store not available: nothing to load.
        logger.info("Loaded 0 vocab sessions from database")
        return

    try:
        rows = await list_sessions_db(limit=500)
        loaded = 0
        for row in rows:
            # Rows may carry the identifier under either key.
            session_id = row.get("id") or row.get("session_id")
            if not session_id or session_id in _sessions:
                continue
            _sessions[session_id] = {
                "id": session_id,
                "name": row.get("name", ""),
                "description": row.get("description", ""),
                "status": row.get("status", "created"),
                "vocabulary_count": row.get("vocabulary_count", 0),
                "source_language": row.get("source_language", "en"),
                "target_language": row.get("target_language", "de"),
                # Stored as text here, unlike sessions made by create_session.
                "created_at": str(row.get("created_at", "")),
            }
            loaded += 1
        logger.info(f"Loaded {loaded} vocab sessions from database")
    except Exception as exc:
        logger.warning(f"Failed to load sessions from database: {exc}")
|
||||
|
||||
|
||||
# --- Router & module-level state ---
|
||||
router = APIRouter(
    prefix="/api/v1/vocab",
    tags=["Vocabulary Worksheets"],
)

# Root directory for per-session files; overridable via VOCAB_STORAGE_PATH.
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")

# In-memory stores: session id -> session data, worksheet id -> worksheet data.
_sessions: Dict[str, Dict[str, Any]] = {}
_worksheets: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
|
||||
@router.post("/sessions", response_model=SessionResponse)
async def create_session(session: SessionCreate):
    """Create a new, empty vocabulary extraction session in PENDING state."""
    new_id = str(uuid.uuid4())
    pending = SessionStatus.PENDING.value
    created = datetime.utcnow()

    # Register the session in the in-memory store.
    _sessions[new_id] = {
        "id": new_id,
        "name": session.name,
        "description": session.description,
        "source_language": session.source_language,
        "target_language": session.target_language,
        "status": pending,
        "vocabulary": [],
        "vocabulary_count": 0,
        "image_path": None,
        "extraction_confidence": None,
        "created_at": created,
    }

    # Make sure the session's storage directory exists.
    os.makedirs(os.path.join(LOCAL_STORAGE_PATH, new_id), exist_ok=True)

    return SessionResponse(
        id=new_id,
        name=session.name,
        description=session.description,
        source_language=session.source_language,
        target_language=session.target_language,
        status=pending,
        vocabulary_count=0,
        image_path=None,
        created_at=created,
    )
|
||||
|
||||
|
||||
@router.get("/sessions", response_model=List[SessionResponse])
async def list_sessions(limit: int = Query(50, ge=1, le=100)):
    """List vocabulary sessions, newest first, up to ``limit`` entries."""
    # created_at is a datetime for sessions made by create_session but a
    # string for sessions restored by _load_all_sessions. Comparing the two
    # directly raises TypeError, so order by the string form, which keeps
    # chronological order for the "YYYY-MM-DD HH:MM:SS" style values.
    newest_first = sorted(
        _sessions.values(),
        key=lambda s: str(s["created_at"]),
        reverse=True,
    )[:limit]

    return [
        SessionResponse(
            id=s["id"],
            name=s["name"],
            description=s.get("description"),
            source_language=s["source_language"],
            target_language=s["target_language"],
            status=s["status"],
            vocabulary_count=s.get("vocabulary_count", 0),
            image_path=s.get("image_path"),
            created_at=s["created_at"],
        )
        for s in newest_first
    ]
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}", response_model=SessionResponse)
async def get_session(session_id: str):
    """Return a single session; responds with 404 if the id is unknown."""
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    return SessionResponse(
        id=record["id"],
        name=record["name"],
        description=record.get("description"),
        source_language=record["source_language"],
        target_language=record["target_language"],
        status=record["status"],
        vocabulary_count=record.get("vocabulary_count", 0),
        image_path=record.get("image_path"),
        created_at=record["created_at"],
    )
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/upload")
async def upload_image(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a textbook page image or PDF and extract vocabulary from it.

    Supported formats: PNG, JPG, JPEG, PDF
    """
    logger.info(f"Upload request for session {session_id}")
    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")

    if session_id not in _sessions:
        logger.error(f"Session {session_id} not found")
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]

    # A file is accepted if EITHER its extension or its content type matches.
    extension = file.filename.rsplit('.', 1)[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    image_extensions = ('png', 'jpg', 'jpeg')
    image_content_types = ('image/png', 'image/jpeg', 'image/jpg')
    has_image_extension = extension in image_extensions
    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
    is_image = has_image_extension or content_type in image_content_types

    if not (is_pdf or is_image):
        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
        raise HTTPException(
            status_code=400,
            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
        )

    # Extension of the file written to disk.
    if is_pdf:
        save_extension = 'png'  # PDFs are converted to PNG below
    elif has_image_extension:
        save_extension = extension
    else:
        # Recognised only by content type: PNG stays PNG, the rest is JPEG.
        save_extension = 'png' if content_type == 'image/png' else 'jpg'

    content = await file.read()
    logger.info(f"Read {len(content)} bytes from uploaded file")

    if is_pdf:
        logger.info("Converting PDF to image...")
        content = await convert_pdf_page_to_image(content, page_number=0)
        logger.info(f"PDF converted, image size: {len(content)} bytes")

    # Persist the (possibly converted) image in the session directory.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    image_path = os.path.join(session_dir, f"source.{save_extension}")
    with open(image_path, 'wb') as out:
        out.write(content)

    session["status"] = SessionStatus.PROCESSING.value
    session["image_path"] = image_path

    # Extract vocabulary using Vision LLM
    vocabulary, confidence, error = await extract_vocabulary_from_image(content, file.filename or "image.png", page_number=0)

    extracted = SessionStatus.EXTRACTED.value
    entry_count = len(vocabulary)
    session["vocabulary"] = [entry.dict() for entry in vocabulary]
    session["vocabulary_count"] = entry_count
    session["extraction_confidence"] = confidence
    session["status"] = extracted

    result = {
        "session_id": session_id,
        "filename": file.filename,
        "image_path": image_path,
        "vocabulary_count": entry_count,
        "extraction_confidence": confidence,
        "status": extracted,
    }
    # An extraction error is reported alongside the (possibly empty) result.
    if error:
        result["error"] = error
    return result
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
async def get_vocabulary(session_id: str):
    """Return the vocabulary stored for a session; 404 if the id is unknown."""
    record = _sessions.get(session_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Session not found")

    # Stored entries are plain dicts; rebuild the model objects.
    entries = []
    for raw in record.get("vocabulary", []):
        entries.append(VocabularyEntry(**raw))

    return VocabularyResponse(
        session_id=session_id,
        vocabulary=entries,
        extraction_confidence=record.get("extraction_confidence"),
    )
|
||||
|
||||
|
||||
@router.put("/sessions/{session_id}/vocabulary")
async def update_vocabulary(session_id: str, update: VocabularyUpdate):
    """Replace a session's vocabulary entries (for manual corrections)."""
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    entries = update.vocabulary
    entry_count = len(entries)

    # The whole list is replaced, not merged with the previous entries.
    record = _sessions[session_id]
    record["vocabulary"] = [entry.dict() for entry in entries]
    record["vocabulary_count"] = entry_count

    return {
        "session_id": session_id,
        "vocabulary_count": entry_count,
        "message": "Vocabulary updated successfully",
    }
|
||||
|
||||
|
||||
def _render_worksheets_html(vocabulary, request, title, show_solutions):
    """Render every requested worksheet type to HTML, one page-broken section each."""
    title_suffix = " (Loesung)" if show_solutions else ""
    page_break = '<div style="page-break-after: always;"></div>'
    return "".join(
        generate_worksheet_html(
            vocabulary=vocabulary,
            worksheet_type=wtype,
            title=f"{title} - {wtype.value}{title_suffix}",
            show_solutions=show_solutions,
            repetitions=request.repetitions,
            line_height=request.line_height,
        ) + page_break
        for wtype in request.worksheet_types
    )


@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
    """Generate worksheet PDF(s) from extracted vocabulary.

    Responds with 404 for an unknown session, 400 when the session has no
    vocabulary, and 500 when PDF rendering fails.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")

    worksheet_id = str(uuid.uuid4())
    title = request.title or session["name"]

    # Worksheet without solutions.
    combined_html = _render_worksheets_html(vocabulary, request, title, show_solutions=False)
    try:
        pdf_bytes = await generate_worksheet_pdf(combined_html)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}") from e

    # Sessions restored from the database never had their directory created,
    # so make sure it exists before writing into it.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)

    # Optional solution sheet, rendered with the same settings.
    solution_path = None
    if request.include_solutions:
        solution_html = _render_worksheets_html(vocabulary, request, title, show_solutions=True)
        try:
            solution_bytes = await generate_worksheet_pdf(solution_html)
        except Exception as e:
            # Report solution rendering failures the same way as the main PDF.
            raise HTTPException(
                status_code=500,
                detail=f"Solution PDF generation failed: {e}",
            ) from e
        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
        with open(solution_path, 'wb') as f:
            f.write(solution_bytes)

    # Register the worksheet so the download endpoints can find it.
    worksheet_data = {
        "id": worksheet_id,
        "session_id": session_id,
        "worksheet_types": [wt.value for wt in request.worksheet_types],
        "pdf_path": pdf_path,
        "solution_path": solution_path,
        "generated_at": datetime.utcnow(),
    }
    _worksheets[worksheet_id] = worksheet_data

    session["status"] = SessionStatus.COMPLETED.value

    return WorksheetResponse(
        id=worksheet_id,
        session_id=session_id,
        worksheet_types=worksheet_data["worksheet_types"],
        pdf_path=pdf_path,
        solution_path=solution_path,
        generated_at=worksheet_data["generated_at"],
    )
|
||||
|
||||
|
||||
@router.get("/worksheets/{worksheet_id}/pdf")
async def download_worksheet_pdf(worksheet_id: str):
    """Download the generated worksheet PDF as an attachment."""
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    path = _worksheets[worksheet_id]["pdf_path"]
    if not os.path.exists(path):
        # The worksheet is registered but its file is no longer on disk.
        raise HTTPException(status_code=404, detail="PDF file not found")

    with open(path, 'rb') as fh:
        payload = fh.read()

    disposition = f"attachment; filename=worksheet_{worksheet_id}.pdf"
    return StreamingResponse(
        io.BytesIO(payload),
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
||||
|
||||
|
||||
@router.get("/worksheets/{worksheet_id}/solution")
async def download_solution_pdf(worksheet_id: str):
    """Download the solution PDF of a worksheet as an attachment."""
    if worksheet_id not in _worksheets:
        raise HTTPException(status_code=404, detail="Worksheet not found")

    # solution_path is None when no solution sheet was requested.
    path = _worksheets[worksheet_id].get("solution_path")
    if not (path and os.path.exists(path)):
        raise HTTPException(status_code=404, detail="Solution PDF not found")

    with open(path, 'rb') as fh:
        payload = fh.read()

    disposition = f"attachment; filename=solution_{worksheet_id}.pdf"
    return StreamingResponse(
        io.BytesIO(payload),
        media_type="application/pdf",
        headers={"Content-Disposition": disposition},
    )
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/image")
async def get_session_image(session_id: str):
    """Return the uploaded source image of a session."""
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    image_path = _sessions[session_id].get("image_path")
    if not (image_path and os.path.exists(image_path)):
        raise HTTPException(status_code=404, detail="Image not found")

    # Media type follows the stored file's extension; unknown ones are
    # served as a generic byte stream.
    media_types = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
    }
    suffix = image_path.rsplit('.', 1)[-1].lower()
    media_type = media_types.get(suffix, 'application/octet-stream')

    with open(image_path, 'rb') as fh:
        payload = fh.read()

    return StreamingResponse(
        io.BytesIO(payload),
        media_type=media_type,
    )
|
||||
|
||||
|
||||
@router.delete("/sessions/{session_id}")
async def delete_session(session_id: str):
    """Delete a vocabulary session together with its files and worksheets."""
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Remove everything stored on disk for this session.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    if os.path.exists(session_dir):
        import shutil
        shutil.rmtree(session_dir)

    del _sessions[session_id]

    # Collect the ids first so the dict is not mutated while being iterated.
    stale_ids = [
        worksheet_id
        for worksheet_id, worksheet in _worksheets.items()
        if worksheet["session_id"] == session_id
    ]
    for worksheet_id in stale_ids:
        del _worksheets[worksheet_id]

    return {"message": "Session deleted successfully", "session_id": session_id}
|
||||
|
||||
|
||||
# --- Include sub-routers ---
|
||||
from vocab_worksheet_upload_api import upload_router
|
||||
from vocab_worksheet_analysis_api import analysis_router
|
||||
|
||||
router.include_router(upload_router)
|
||||
router.include_router(analysis_router)
|
||||
# Backward-compat shim -- module moved to vocab/worksheet/api.py
|
||||
import importlib as _importlib
|
||||
import sys as _sys
|
||||
_sys.modules[__name__] = _importlib.import_module("vocab.worksheet.api")
|
||||
|
||||
Reference in New Issue
Block a user