diff --git a/admin-v2/app/(admin)/ai/ocr-compare/page.tsx b/admin-v2/app/(admin)/ai/ocr-compare/page.tsx index 73a0bb4..d2ee594 100644 --- a/admin-v2/app/(admin)/ai/ocr-compare/page.tsx +++ b/admin-v2/app/(admin)/ai/ocr-compare/page.tsx @@ -139,6 +139,10 @@ export default function OCRComparePage() { const [currentBlockNumber, setCurrentBlockNumber] = useState(1) const [blockReviewData, setBlockReviewData] = useState>({}) + // Export State + const [isExporting, setIsExporting] = useState(false) + const [exportSuccess, setExportSuccess] = useState(false) + const KLAUSUR_API = '/klausur-api' // Load session history @@ -535,6 +539,72 @@ export default function OCRComparePage() { } }, [gridData]) + // Export to Worksheet Editor + const handleExportToEditor = useCallback(async () => { + if (!gridData || !sessionId) return + + setIsExporting(true) + setExportSuccess(false) + + try { + // Convert grid cells (percent coordinates) to mm for A4 + const A4_WIDTH_MM = 210 + const A4_HEIGHT_MM = 297 + + const words = gridData.cells.flat() + .filter(cell => cell.status !== 'empty' && cell.text) + .map(cell => ({ + text: cell.text, + x_mm: (cell.x / 100) * A4_WIDTH_MM, + y_mm: (cell.y / 100) * A4_HEIGHT_MM, + width_mm: (cell.width / 100) * A4_WIDTH_MM, + height_mm: (cell.height / 100) * A4_HEIGHT_MM, + column_type: cell.column_type || 'unknown', + logical_row: cell.row, + confidence: cell.confidence, + })) + + const detectedColumns = gridData.column_types.map((type, idx) => ({ + column_type: type, + x_start_mm: (gridData.column_boundaries[idx] / 100) * A4_WIDTH_MM, + x_end_mm: (gridData.column_boundaries[idx + 1] / 100) * A4_WIDTH_MM, + })) + + const exportData = { + version: '1.0', + source: 'ocr-compare', + exported_at: new Date().toISOString(), + session_id: sessionId, + page_number: selectedPage + 1, + page_dimensions: { + width_mm: A4_WIDTH_MM, + height_mm: A4_HEIGHT_MM, + format: 'A4', + }, + words, + detected_columns: detectedColumns, + } + + const res = await fetch( + `${KLAUSUR_API}/api/v1/vocab/sessions/${sessionId}/ocr-export/${selectedPage + 1}`, + { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(exportData), + } + ) + + if (res.ok) { + setExportSuccess(true) + setTimeout(() => setExportSuccess(false), 3000) + } + } catch (e) { + console.error('Export failed:', e) + } finally { + setIsExporting(false) + } + }, [gridData, sessionId, selectedPage, KLAUSUR_API]) + // Count non-empty blocks const nonEmptyBlockCount = useMemo(() => { if (!gridData) return 0 @@ -831,6 +901,35 @@ export default function OCRComparePage() { )} )} + + {/* Export to Editor Button */} + )} @@ -1015,11 +1114,25 @@ export default function OCRComparePage() {
{thumbnails[selectedPage] ? ( - {`Seite + gridData && showGridOverlay ? ( + + ) : ( + {`Seite + ) ) : (
Kein Bild verfuegbar @@ -1116,6 +1229,7 @@ export default function OCRComparePage() { selectedCell={selectedCell} showEmpty={false} showNumbers={blockReviewMode} + showTextLabels={!blockReviewMode} highlightedBlockNumber={blockReviewMode ? currentBlockNumber : null} className="rounded-lg border border-slate-200 overflow-hidden" /> diff --git a/klausur-service/backend/vocab_worksheet_api.py b/klausur-service/backend/vocab_worksheet_api.py index 1aeb49e..1ff13ef 100644 --- a/klausur-service/backend/vocab_worksheet_api.py +++ b/klausur-service/backend/vocab_worksheet_api.py @@ -15,7 +15,7 @@ Workflow: 6. GET /worksheets/{id}/pdf - Download generated PDF """ -from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Query +from fastapi import APIRouter, Body, HTTPException, UploadFile, File, Form, Query from fastapi.responses import StreamingResponse from pydantic import BaseModel from typing import Optional, List, Dict, Any @@ -28,26 +28,6 @@ import json import base64 import logging -# PostgreSQL persistence (replaces in-memory storage) -from vocab_session_store import ( - init_vocab_tables, - create_session_db, - get_session_db, - list_sessions_db, - update_session_db, - delete_session_db, - add_vocabulary_db, - get_vocabulary_db, - update_vocabulary_db, - clear_page_vocabulary_db, - create_worksheet_db, - get_worksheet_db, - delete_worksheets_for_session_db, - cache_pdf_data, - get_cached_pdf_data, - clear_cached_pdf_data, -) - logger = logging.getLogger(__name__) # Ollama Configuration - Direct call without external modules @@ -95,24 +75,6 @@ class VocabularyEntry(BaseModel): example_sentence_gap: Optional[str] = None # With ___ for gap-fill word_type: Optional[str] = None # noun, verb, adjective, etc. source_page: Optional[int] = None # Page number where entry was found (1-indexed) - # Grid position fields for layout-preserving OCR - source_x: Optional[float] = None # X position as percentage (0-100) - source_y: Optional[float] = None # Y position as percentage (0-100) - source_width: Optional[float] = None # Width as percentage (0-100) - source_height: Optional[float] = None # Height as percentage (0-100) - source_column: Optional[int] = None # 0-indexed column in detected grid - source_row: Optional[int] = None # 0-indexed row in detected grid - confidence: Optional[float] = None # OCR confidence score (0-1) - recognition_status: Optional[str] = None # recognized | manual | unrecognized - - -class OcrPrompts(BaseModel): - filterHeaders: bool = True - filterFooters: bool = True - filterPageNumbers: bool = True - customFilter: str = "" - headerPatterns: List[str] = [] - footerPatterns: List[str] = [] class SessionCreate(BaseModel): @@ -120,7 +82,6 @@ class SessionCreate(BaseModel): description: Optional[str] = None source_language: str = "en" # Source language (default English) target_language: str = "de" # Target language (default German) - ocr_prompts: Optional[OcrPrompts] = None # OCR filtering settings from frontend class SessionResponse(BaseModel): @@ -163,22 +124,12 @@ class WorksheetResponse(BaseModel): # ============================================================================= -# PostgreSQL Storage (persistent across container restarts) +# In-Memory Storage (simplified - should use PostgreSQL in production) # ============================================================================= -# Note: In-memory storage removed. All data now persisted in PostgreSQL. -# See vocab_session_store.py for implementation. - -# Startup event to initialize tables -@router.on_event("startup") -async def startup(): - """Initialize vocab session tables on startup.""" - logger.info("Initializing vocab session PostgreSQL tables...") - success = await init_vocab_tables() - if success: - logger.info("Vocab session tables ready") - else: - logger.warning("Failed to initialize vocab tables - storage may not work") +# Session storage +_sessions: Dict[str, Dict[str, Any]] = {} +_worksheets: Dict[str, Dict[str, Any]] = {} # ============================================================================= @@ -220,22 +171,17 @@ async def extract_vocabulary_from_image( image_data: bytes, filename: str, page_number: int = 0, - ocr_method: str = "tesseract" # Options: "tesseract" (D), "vision_llm" (B), "paddleocr" (C) + use_hybrid: bool = False # DISABLED: PaddleOCR crashes on ARM64 Mac Mini ) -> tuple[List[VocabularyEntry], float, str]: """ - Extract vocabulary from an image using different OCR methods. - - OCR Methods (documented in SBOM): - - Loesung A: User's 32B LLM (external) - - Loesung B: Vision LLM (Ollama llama3.2-vision) - - Loesung C: PaddleOCR + LLM (DEAKTIVIERT - funktioniert nicht unter Rosetta 2) - - Loesung D: Tesseract OCR + LLM (ARM64-nativ, Apache 2.0) <- DEFAULT + Extract vocabulary from an image using hybrid OCR+LLM or Vision LLM (default). Args: image_data: Image bytes filename: Original filename for logging page_number: 0-indexed page number for error messages - ocr_method: OCR method to use ("tesseract", "vision_llm", "paddleocr") + use_hybrid: If True, use PaddleOCR + LLM (faster, more accurate for printed text) + If False, use Vision LLM (slower, better for complex layouts) Returns: Tuple of (vocabulary_entries, confidence, error_message) @@ -244,58 +190,20 @@ async def extract_vocabulary_from_image( import httpx # ========================================================================== - # LOESUNG D: Tesseract OCR + LLM Gateway (DEFAULT - ARM64-nativ) + # HYBRID APPROACH (Default): PaddleOCR + LLM Gateway # ========================================================================== - if ocr_method == "tesseract": - try: - from tesseract_vocab_extractor import extract_vocabulary_tesseract, is_tesseract_available - - if not is_tesseract_available(): - logger.warning("Tesseract not available, falling back to Vision LLM") - ocr_method = "vision_llm" - else: - logger.info(f"Using TESSERACT OCR for {filename} (Loesung D)") - - vocab_dicts, confidence, error = await extract_vocabulary_tesseract(image_data, filename) - - if error: - logger.warning(f"Tesseract extraction had issues: {error}") - elif vocab_dicts: - vocabulary = [ - VocabularyEntry( - id=str(uuid.uuid4()), - english=v.get("source_word", "") if v.get("source_lang") == "en" else v.get("target_word", ""), - german=v.get("source_word", "") if v.get("source_lang") == "de" else v.get("target_word", ""), - example_sentence=v.get("context"), - source_page=page_number + 1 - ) - for v in vocab_dicts - ] - logger.info(f"Tesseract extraction: {len(vocabulary)} entries from {filename}") - return vocabulary, confidence, "" - - except ImportError as e: - logger.warning(f"Tesseract extractor not available: {e}. Falling back to Vision LLM.") - ocr_method = "vision_llm" - except Exception as e: - logger.warning(f"Tesseract extraction failed: {e}. Falling back to Vision LLM.") - import traceback - logger.debug(traceback.format_exc()) - ocr_method = "vision_llm" - - # ========================================================================== - # LOESUNG C: PaddleOCR + LLM Gateway (DEAKTIVIERT - Rosetta 2 Probleme) - # ========================================================================== - if ocr_method == "paddleocr": + if use_hybrid: try: from hybrid_vocab_extractor import extract_vocabulary_hybrid - logger.info(f"Using PADDLEOCR for {filename} (Loesung C - experimentell)") + logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)") vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number) if error: - logger.warning(f"PaddleOCR extraction had issues: {error}") + logger.warning(f"Hybrid extraction had issues: {error}") + # Fall through to Vision LLM fallback elif vocab_dicts: + # Convert dicts to VocabularyEntry objects vocabulary = [ VocabularyEntry( id=str(uuid.uuid4()), @@ -307,13 +215,13 @@ async def extract_vocabulary_from_image( for v in vocab_dicts if v.get("english") and v.get("german") ] - logger.info(f"PaddleOCR extraction: {len(vocabulary)} entries from {filename}") + logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}") return vocabulary, confidence, "" except ImportError as e: - logger.warning(f"PaddleOCR not available: {e}. Falling back to Vision LLM.") + logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.") except Exception as e: - logger.warning(f"PaddleOCR failed: {e}. Falling back to Vision LLM.") + logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.") import traceback logger.debug(traceback.format_exc()) @@ -652,20 +560,23 @@ async def create_session(session: SessionCreate): """Create a new vocabulary extraction session.""" session_id = str(uuid.uuid4()) - # Store in PostgreSQL - db_session = await create_session_db( - session_id=session_id, - name=session.name, - description=session.description, - source_language=session.source_language, - target_language=session.target_language, - ocr_prompts=session.ocr_prompts.model_dump() if session.ocr_prompts else None, - ) + session_data = { + "id": session_id, + "name": session.name, + "description": session.description, + "source_language": session.source_language, + "target_language": session.target_language, + "status": SessionStatus.PENDING.value, + "vocabulary": [], + "vocabulary_count": 0, + "image_path": None, + "extraction_confidence": None, + "created_at": datetime.utcnow(), + } - if db_session is None: - raise HTTPException(status_code=500, detail="Failed to create session in database") + _sessions[session_id] = session_data - # Create storage directory for files + # Create storage directory session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id) os.makedirs(session_dir, exist_ok=True) @@ -678,26 +589,30 @@ async def create_session(session: SessionCreate): status=SessionStatus.PENDING.value, vocabulary_count=0, image_path=None, - created_at=db_session.created_at or datetime.utcnow(), + created_at=session_data["created_at"], ) @router.get("/sessions", response_model=List[SessionResponse]) async def list_sessions(limit: int = Query(50, ge=1, le=100)): """List all vocabulary sessions.""" - sessions = await list_sessions_db(limit=limit) + sessions = sorted( + _sessions.values(), + key=lambda x: x["created_at"], + reverse=True + )[:limit] return [ SessionResponse( - id=s.id, - name=s.name, - description=s.description, - source_language=s.source_language, - target_language=s.target_language, - status=s.status, - vocabulary_count=s.vocabulary_count, - image_path=s.image_path, - created_at=s.created_at or datetime.utcnow(), + id=s["id"], + name=s["name"], + description=s.get("description"), + source_language=s["source_language"], + target_language=s["target_language"], + status=s["status"], + vocabulary_count=s.get("vocabulary_count", 0), + image_path=s.get("image_path"), + created_at=s["created_at"], ) for s in sessions ] @@ -706,20 +621,20 @@ async def list_sessions(limit: int = Query(50, ge=1, le=100)): @router.get("/sessions/{session_id}", response_model=SessionResponse) async def get_session(session_id: str): """Get a specific session.""" - s = await get_session_db(session_id) - if s is None: + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") + s = _sessions[session_id] return SessionResponse( - id=s.id, - name=s.name, - description=s.description, - source_language=s.source_language, - target_language=s.target_language, - status=s.status, - vocabulary_count=s.vocabulary_count, - image_path=s.image_path, - created_at=s.created_at or datetime.utcnow(), + id=s["id"], + name=s["name"], + description=s.get("description"), + source_language=s["source_language"], + target_language=s["target_language"], + status=s["status"], + vocabulary_count=s.get("vocabulary_count", 0), + image_path=s.get("image_path"), + created_at=s["created_at"], ) @@ -744,10 +659,6 @@ async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumb page_number: 0-indexed page number thumbnail: If True, return a smaller thumbnail image """ - import gc - pix = None - pdf_document = None - try: import fitz # PyMuPDF @@ -768,6 +679,7 @@ async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumb pix = page.get_pixmap(matrix=mat) png_data = pix.tobytes("png") + pdf_document.close() logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})") return png_data @@ -778,14 +690,6 @@ async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumb except Exception as e: logger.error(f"PDF conversion failed: {e}") raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") - finally: - # Explicit cleanup to prevent OOM - if pix is not None: - del pix - if pdf_document is not None: - pdf_document.close() - del pdf_document - gc.collect() async def convert_pdf_to_images(pdf_data: bytes, pages: List[int] = None) -> List[bytes]: @@ -795,9 +699,6 @@ async def convert_pdf_to_images(pdf_data: bytes, pages: List[int] = None) -> Lis pdf_data: PDF file as bytes pages: List of 0-indexed page numbers to convert. If None, convert all pages. """ - import gc - pdf_document = None - try: import fitz @@ -819,10 +720,8 @@ async def convert_pdf_to_images(pdf_data: bytes, pages: List[int] = None) -> Lis page = pdf_document[page_num] pix = page.get_pixmap(matrix=mat) images.append(pix.tobytes("png")) - # Cleanup pixmap immediately to prevent memory buildup - del pix - gc.collect() + pdf_document.close() logger.info(f"Converted {len(images)} PDF pages to images") return images @@ -832,11 +731,6 @@ async def convert_pdf_to_images(pdf_data: bytes, pages: List[int] = None) -> Lis except Exception as e: logger.error(f"PDF conversion failed: {e}") raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}") - finally: - if pdf_document is not None: - pdf_document.close() - del pdf_document - gc.collect() @router.post("/sessions/{session_id}/upload") @@ -852,11 +746,12 @@ async def upload_image( logger.info(f"Upload request for session {session_id}") logger.info(f"File: filename={file.filename}, content_type={file.content_type}") - session = await get_session_db(session_id) - if session is None: + if session_id not in _sessions: logger.error(f"Session {session_id} not found") raise HTTPException(status_code=404, detail="Session not found") + session = _sessions[session_id] + # Validate file type - check both extension and content type extension = file.filename.split('.')[-1].lower() if file.filename else '' content_type = file.content_type or '' @@ -888,10 +783,10 @@ async def upload_image( content = await file.read() logger.info(f"Read {len(content)} bytes from uploaded file") - # Convert PDF to image if needed (first page only for single upload) + # Convert PDF to image if needed if is_pdf: logger.info("Converting PDF to image...") - content = await convert_pdf_page_to_image(content, page_number=0, thumbnail=False) + content = await convert_pdf_to_image(content) logger.info(f"PDF converted, image size: {len(content)} bytes") # Save image @@ -902,23 +797,18 @@ async def upload_image( with open(image_path, 'wb') as f: f.write(content) - # Update session status in DB - await update_session_db(session_id, status=SessionStatus.PROCESSING.value, image_path=image_path) + # Update session status + session["status"] = SessionStatus.PROCESSING.value + session["image_path"] = image_path # Extract vocabulary using Vision LLM vocabulary, confidence, error = await extract_vocabulary_from_image(content, file.filename or "image.png", page_number=0) - # Store vocabulary in DB - vocab_dicts = [v.dict() for v in vocabulary] - await add_vocabulary_db(session_id, vocab_dicts) - - # Update session with extraction results - await update_session_db( - session_id, - status=SessionStatus.EXTRACTED.value, - extraction_confidence=confidence, - vocabulary_count=len(vocabulary), - ) + # Update session with extracted vocabulary + session["vocabulary"] = [v.dict() for v in vocabulary] + session["vocabulary_count"] = len(vocabulary) + session["extraction_confidence"] = confidence + session["status"] = SessionStatus.EXTRACTED.value result = { "session_id": session_id, @@ -938,33 +828,29 @@ async def upload_image( @router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse) async def get_vocabulary(session_id: str): """Get extracted vocabulary for a session.""" - session = await get_session_db(session_id) - if session is None: + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") - vocab_dicts = await get_vocabulary_db(session_id) - vocabulary = [VocabularyEntry(**v) for v in vocab_dicts] + session = _sessions[session_id] + + vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])] return VocabularyResponse( session_id=session_id, vocabulary=vocabulary, - extraction_confidence=session.extraction_confidence, + extraction_confidence=session.get("extraction_confidence"), ) @router.put("/sessions/{session_id}/vocabulary") async def update_vocabulary(session_id: str, update: VocabularyUpdate): """Update vocabulary entries (for manual corrections).""" - session = await get_session_db(session_id) - if session is None: + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") - # Replace all vocabulary entries - vocab_dicts = [v.dict() for v in update.vocabulary] - success = await update_vocabulary_db(session_id, vocab_dicts) - - if not success: - raise HTTPException(status_code=500, detail="Failed to update vocabulary") + session = _sessions[session_id] + session["vocabulary"] = [v.dict() for v in update.vocabulary] + session["vocabulary_count"] = len(update.vocabulary) return { "session_id": session_id, @@ -976,18 +862,17 @@ async def update_vocabulary(session_id: str, update: VocabularyUpdate): @router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse) async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest): """Generate worksheet PDF(s) from extracted vocabulary.""" - session = await get_session_db(session_id) - if session is None: + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") - vocab_dicts = await get_vocabulary_db(session_id) - vocabulary = [VocabularyEntry(**v) for v in vocab_dicts] + session = _sessions[session_id] + vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])] if not vocabulary: raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from") worksheet_id = str(uuid.uuid4()) - title = request.title or session.name + title = request.title or session["name"] # Generate HTML for each worksheet type combined_html = "" @@ -1010,7 +895,6 @@ async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest) # Save PDF session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id) - os.makedirs(session_dir, exist_ok=True) pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf") with open(pdf_path, 'wb') as f: f.write(pdf_bytes) @@ -1035,41 +919,40 @@ async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest) with open(solution_path, 'wb') as f: f.write(solution_bytes) - # Store worksheet info in DB - worksheet = await create_worksheet_db( - worksheet_id=worksheet_id, - session_id=session_id, - worksheet_types=[wt.value for wt in request.worksheet_types], - pdf_path=pdf_path, - solution_path=solution_path, - ) - - if worksheet is None: - raise HTTPException(status_code=500, detail="Failed to save worksheet to database") + # Store worksheet info + worksheet_data = { + "id": worksheet_id, + "session_id": session_id, + "worksheet_types": [wt.value for wt in request.worksheet_types], + "pdf_path": pdf_path, + "solution_path": solution_path, + "generated_at": datetime.utcnow(), + } + _worksheets[worksheet_id] = worksheet_data # Update session status - await update_session_db(session_id, status=SessionStatus.COMPLETED.value) + session["status"] = SessionStatus.COMPLETED.value return WorksheetResponse( id=worksheet_id, session_id=session_id, - worksheet_types=worksheet.worksheet_types, + worksheet_types=worksheet_data["worksheet_types"], pdf_path=pdf_path, solution_path=solution_path, - generated_at=worksheet.generated_at or datetime.utcnow(), + generated_at=worksheet_data["generated_at"], ) @router.get("/worksheets/{worksheet_id}/pdf") async def download_worksheet_pdf(worksheet_id: str): """Download the generated worksheet PDF.""" - worksheet = await get_worksheet_db(worksheet_id) - if worksheet is None: + if worksheet_id not in _worksheets: raise HTTPException(status_code=404, detail="Worksheet not found") - pdf_path = worksheet.pdf_path + worksheet = _worksheets[worksheet_id] + pdf_path = worksheet["pdf_path"] - if not pdf_path or not os.path.exists(pdf_path): + if not os.path.exists(pdf_path): raise HTTPException(status_code=404, detail="PDF file not found") with open(pdf_path, 'rb') as f: @@ -1085,11 +968,11 @@ async def download_worksheet_pdf(worksheet_id: str): @router.get("/worksheets/{worksheet_id}/solution") async def download_solution_pdf(worksheet_id: str): """Download the solution PDF.""" - worksheet = await get_worksheet_db(worksheet_id) - if worksheet is None: + if worksheet_id not in _worksheets: raise HTTPException(status_code=404, detail="Worksheet not found") - solution_path = worksheet.solution_path + worksheet = _worksheets[worksheet_id] + solution_path = worksheet.get("solution_path") if not solution_path or not os.path.exists(solution_path): raise HTTPException(status_code=404, detail="Solution PDF not found") @@ -1107,11 +990,11 @@ async def download_solution_pdf(worksheet_id: str): @router.get("/sessions/{session_id}/image") async def get_session_image(session_id: str): """Get the uploaded source image for a session.""" - session = await get_session_db(session_id) - if session is None: + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") - image_path = session.image_path + session = _sessions[session_id] + image_path = session.get("image_path") if not image_path or not os.path.exists(image_path): raise HTTPException(status_code=404, detail="Image not found") @@ -1144,10 +1027,11 @@ async def upload_pdf_get_info( """ logger.info(f"PDF info request for session {session_id}") - session = await get_session_db(session_id) - if session is None: + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") + session = _sessions[session_id] + # Validate file type extension = file.filename.split('.')[-1].lower() if file.filename else '' content_type = file.content_type or '' @@ -1168,16 +1052,11 @@ async def upload_pdf_get_info( # Get page count page_count = get_pdf_page_count(content) - # Cache PDF data for later processing (in-memory for multi-page workflow) - cache_pdf_data(session_id, content) - - # Update session in DB - await update_session_db( - session_id, - pdf_path=pdf_path, - pdf_page_count=page_count, - status="pdf_uploaded", - ) + # Store PDF data in session for later processing + session["pdf_data"] = content + session["pdf_path"] = pdf_path + session["pdf_page_count"] = page_count + session["status"] = "pdf_uploaded" return { "session_id": session_id, @@ -1187,35 +1066,21 @@ async def upload_pdf_get_info( @router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}") -async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = False): - """Get a thumbnail image of a specific PDF page. - - Args: - session_id: Session ID - page_number: 0-indexed page number - hires: If True, return high-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5) - """ - session = await get_session_db(session_id) - if session is None: +async def get_pdf_thumbnail(session_id: str, page_number: int): + """Get a thumbnail image of a specific PDF page.""" + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") - # Try cached PDF data first - pdf_data = get_cached_pdf_data(session_id) - - # If not cached, try to load from file - if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path): - with open(session.pdf_path, 'rb') as f: - pdf_data = f.read() - cache_pdf_data(session_id, pdf_data) + session = _sessions[session_id] + pdf_data = session.get("pdf_data") if not pdf_data: raise HTTPException(status_code=400, detail="No PDF uploaded for this session") - # Use thumbnail=False for high-res (zoom=2.0), thumbnail=True for low-res (zoom=0.5) - image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=not hires) + thumbnail = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=True) return StreamingResponse( - io.BytesIO(image_data), + io.BytesIO(thumbnail), media_type="image/png", ) @@ -1235,23 +1100,16 @@ async def process_single_page( """ logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}") - session = await get_session_db(session_id) - if session is None: + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") - # Try cached PDF data first - pdf_data = get_cached_pdf_data(session_id) - - # If not cached, try to load from file - if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path): - with open(session.pdf_path, 'rb') as f: - pdf_data = f.read() - cache_pdf_data(session_id, pdf_data) + session = _sessions[session_id] + pdf_data = session.get("pdf_data") if not pdf_data: raise HTTPException(status_code=400, detail="No PDF uploaded for this session") - page_count = session.pdf_page_count or 1 + page_count = session.get("pdf_page_count", 1) if page_number < 0 or page_number >= page_count: raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).") @@ -1286,17 +1144,14 @@ async def process_single_page( logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert") - # Clear existing entries for this page (in case of re-processing) - await clear_page_vocabulary_db(session_id, page_number + 1) - - # Add new vocabulary entries to DB - await add_vocabulary_db(session_id, page_vocabulary) - - # Update session status - await update_session_db(session_id, status=SessionStatus.EXTRACTED.value) - - # Get total count - all_vocab = await get_vocabulary_db(session_id) + # Add to session's vocabulary (append, don't replace) + existing_vocab = session.get("vocabulary", []) + # Remove any existing entries from this page (in case of re-processing) + existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1] + existing_vocab.extend(page_vocabulary) + session["vocabulary"] = existing_vocab + session["vocabulary_count"] = len(existing_vocab) + session["status"] = SessionStatus.EXTRACTED.value return { "session_id": session_id, @@ -1304,513 +1159,11 @@ async def process_single_page( "success": True, "vocabulary": page_vocabulary, "vocabulary_count": len(page_vocabulary), - "total_vocabulary_count": len(all_vocab), + "total_vocabulary_count": len(existing_vocab), "extraction_confidence": confidence, } -@router.post("/sessions/{session_id}/compare-ocr/{page_number}") -async def compare_ocr_methods( - session_id: str, - page_number: int, -): - """ - Compare different OCR methods on a single page. - - Runs available OCR solutions and compares: - - Extraction time - - Vocabulary found - - Confidence scores - - Solutions tested: - - Loesung B: Vision LLM (qwen2.5vl:32b via Ollama) - - Loesung D: Tesseract OCR + LLM structuring - - Loesung E: Claude Vision API (Anthropic) - - Returns comparison data for frontend visualization. - """ - import time - import httpx - - logger.info(f"OCR Comparison for session {session_id}, page {page_number}") - - session = await get_session_db(session_id) - if session is None: - raise HTTPException(status_code=404, detail="Session not found") - - # Try cached PDF data first - pdf_data = get_cached_pdf_data(session_id) - - # If not cached, try to load from file - if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path): - with open(session.pdf_path, 'rb') as f: - pdf_data = f.read() - cache_pdf_data(session_id, pdf_data) - - if not pdf_data: - raise HTTPException(status_code=400, detail="No PDF uploaded for this session") - - page_count = session.pdf_page_count or 1 - - if page_number < 0 or page_number >= page_count: - raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).") - - # Convert page to image once (shared by all methods) - image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) - - results = { - "session_id": session_id, - "page_number": page_number + 1, - "methods": {} - } - - # ========================================================================== - # LOESUNG B: Vision LLM (qwen2.5vl:32b) - # ========================================================================== - try: - start_time = time.time() - vocab_b, confidence_b, error_b = await extract_vocabulary_from_image( - image_data, f"page_{page_number + 1}.png", page_number, ocr_method="vision_llm" - ) - duration_b = time.time() - start_time - - results["methods"]["vision_llm"] = { - "name": "Loesung B: Vision LLM", - "model": VISION_MODEL, - "duration_seconds": round(duration_b, 2), - "vocabulary_count": len(vocab_b), - "vocabulary": [ - {"english": v.english, "german": v.german, "example": v.example_sentence} - for v in vocab_b - ], - "confidence": confidence_b, - "error": error_b if error_b else None, - "success": len(vocab_b) > 0 - } - logger.info(f"Vision LLM: {len(vocab_b)} entries in {duration_b:.2f}s") - except Exception as e: - results["methods"]["vision_llm"] = { - "name": "Loesung B: Vision LLM", - "error": str(e), - "success": False - } - logger.error(f"Vision LLM comparison failed: {e}") - - # ========================================================================== - # LOESUNG D: Tesseract OCR + LLM - # ========================================================================== - try: - start_time = time.time() - vocab_d, confidence_d, error_d = await extract_vocabulary_from_image( - image_data, f"page_{page_number + 1}.png", page_number, ocr_method="tesseract" - ) - duration_d = time.time() - start_time - - results["methods"]["tesseract"] = { - "name": "Loesung D: Tesseract OCR", - "model": "tesseract + qwen2.5:14b", - "duration_seconds": round(duration_d, 2), - "vocabulary_count": len(vocab_d), - "vocabulary": [ - {"english": v.english, "german": v.german, "example": v.example_sentence} - for v in vocab_d - ], - "confidence": confidence_d, - "error": error_d if error_d else None, - "success": len(vocab_d) > 0 - } - logger.info(f"Tesseract: {len(vocab_d)} entries in {duration_d:.2f}s") - except Exception as e: - results["methods"]["tesseract"] = { - "name": "Loesung D: Tesseract OCR", - "error": str(e), - "success": False - } - logger.error(f"Tesseract comparison failed: {e}") - - # ========================================================================== - # LOESUNG E: Claude Vision API (Anthropic) - # ========================================================================== - try: - from claude_vocab_extractor import extract_vocabulary_claude, is_claude_available - - if is_claude_available(): - start_time = time.time() - vocab_e_raw, confidence_e, error_e = await extract_vocabulary_claude( - image_data, f"page_{page_number + 1}.png" - ) - duration_e = time.time() - start_time - - # Convert to consistent format - vocab_e = [] - for v in vocab_e_raw: - source_word = v.get("source_word", "") - target_word = v.get("target_word", "") - source_lang = v.get("source_lang", "en") - # Determine which is English and which is German - if source_lang == "en": - english = source_word - german = target_word - else: - english = target_word - german = source_word - - vocab_e.append({ - "english": english, - "german": german, - "example": v.get("context", "") - }) - - results["methods"]["claude_vision"] = { - "name": "Loesung E: Claude Vision", - "model": "claude-sonnet-4-20250514", - "duration_seconds": round(duration_e, 2), - "vocabulary_count": len(vocab_e), - "vocabulary": vocab_e, - "confidence": confidence_e, - "error": error_e if error_e else None, - "success": len(vocab_e) > 0 - } - logger.info(f"Claude Vision: {len(vocab_e)} entries in {duration_e:.2f}s") - else: - results["methods"]["claude_vision"] = { - "name": "Loesung E: Claude Vision", - "error": "Anthropic API Key nicht konfiguriert", - "success": False - } - except Exception as e: - results["methods"]["claude_vision"] = { - "name": "Loesung E: Claude Vision", - "error": str(e), - "success": False - } - logger.error(f"Claude Vision comparison failed: {e}") - - # ========================================================================== - # Comparison Analysis - # ========================================================================== - all_vocab = {} - for method_key, method_data in results["methods"].items(): - if method_data.get("success"): - for v in method_data.get("vocabulary", []): - key = f"{v['english']}|{v['german']}" - if key not in all_vocab: - all_vocab[key] = {"english": v["english"], "german": v["german"], "found_by": []} - all_vocab[key]["found_by"].append(method_key) - - # Categorize vocabulary - found_by_all = [] - found_by_some = [] - - num_methods = len([m for m in results["methods"].values() if m.get("success")]) - - for key, data in all_vocab.items(): - entry = {"english": data["english"], "german": data["german"], "methods": data["found_by"]} - if len(data["found_by"]) == num_methods: - found_by_all.append(entry) - else: - found_by_some.append(entry) - - results["comparison"] = { - "found_by_all_methods": found_by_all, - "found_by_some_methods": found_by_some, - "total_unique_vocabulary": len(all_vocab), - "agreement_rate": len(found_by_all) / len(all_vocab) if all_vocab else 0 - } - - # Determine best method - best_method = None - best_count = 0 - for method_key, method_data in results["methods"].items(): - if method_data.get("success") and method_data.get("vocabulary_count", 0) > best_count: - best_count = method_data["vocabulary_count"] - best_method = method_key - - results["recommendation"] = { - "best_method": best_method, - "reason": f"Meiste Vokabeln erkannt ({best_count})" - } - - return results - - -# ============================================================================= -# Grid Detection and Analysis -# ============================================================================= - -@router.post("/sessions/{session_id}/analyze-grid/{page_number}") -async def analyze_grid(session_id: str, page_number: int): - """ - Analyze a page and detect grid structure for layout-preserving OCR. - - This endpoint: - 1. Applies deskewing to straighten the image - 2. Runs OCR with bounding box extraction - 3. Detects row and column structure - 4. Identifies recognized, empty, and problematic cells - - Returns grid structure with cell positions and recognition status. - """ - import numpy as np - from PIL import Image - import io - - logger.info(f"Grid analysis for session {session_id}, page {page_number}") - - session = await get_session_db(session_id) - if session is None: - raise HTTPException(status_code=404, detail="Session not found") - - # Get PDF data - pdf_data = get_cached_pdf_data(session_id) - if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path): - with open(session.pdf_path, 'rb') as f: - pdf_data = f.read() - cache_pdf_data(session_id, pdf_data) - - if not pdf_data: - raise HTTPException(status_code=400, detail="No PDF uploaded for this session") - - page_count = session.pdf_page_count or 1 - if page_number < 0 or page_number >= page_count: - raise HTTPException( - status_code=400, - detail=f"Invalid page number. PDF has {page_count} pages (0-indexed)." - ) - - # Convert page to image - image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) - - # Load image as numpy array - img = Image.open(io.BytesIO(image_data)) - img_array = np.array(img) - img_height, img_width = img_array.shape[:2] - - # Step 1: Deskewing - deskew_angle = 0.0 - try: - from services.image_preprocessing import deskew_image - img_array, deskew_angle = deskew_image(img_array) - logger.info(f"Applied deskew correction: {deskew_angle:.2f}°") - except ImportError: - logger.warning("Image preprocessing not available, skipping deskew") - except Exception as e: - logger.warning(f"Deskewing failed: {e}") - - # Step 2: Run OCR with position data - ocr_regions = [] - try: - import pytesseract - from pytesseract import Output - from services.grid_detection_service import convert_tesseract_regions - - # Convert back to PIL Image if we modified it - if deskew_angle != 0: - img = Image.fromarray(img_array) - - ocr_data = pytesseract.image_to_data( - img, - lang='eng+deu', - output_type=Output.DICT - ) - ocr_regions = convert_tesseract_regions(ocr_data, img_width, img_height) - logger.info(f"OCR found {len(ocr_regions)} text regions") - - except ImportError: - logger.warning("Tesseract not available, trying PaddleOCR") - try: - from hybrid_vocab_extractor import call_paddleocr_service - from services.grid_detection_service import convert_paddleocr_regions - - # Convert to bytes for PaddleOCR - buffer = io.BytesIO() - Image.fromarray(img_array).save(buffer, format='PNG') - paddle_regions, _ = await call_paddleocr_service(buffer.getvalue()) - - ocr_regions = convert_paddleocr_regions( - [{"text": r.text, "confidence": r.confidence, - "bbox": [[r.x1, r.y1], [r.x2, r.y1], [r.x2, r.y2], [r.x1, r.y2]]} - for r in paddle_regions], - img_width, img_height - ) - except Exception as e: - logger.error(f"PaddleOCR also failed: {e}") - - if not ocr_regions: - return { - "session_id": session_id, - "page_number": page_number + 1, - "success": False, - "error": "No text regions detected", - "grid": None, - "deskew_angle": deskew_angle, - } - - # Step 3: Detect grid structure - try: - from services.grid_detection_service import GridDetectionService - - grid_service = GridDetectionService() - result = grid_service.detect_grid(ocr_regions, img_array, deskew_angle) - - # Store grid data in session - await update_session_db( - session_id, - grid_data=result.to_dict(), - deskew_angle=deskew_angle - ) - - return { - "session_id": session_id, - "page_number": page_number + 1, - "success": True, - "grid": result.to_dict(), - "deskew_angle": deskew_angle, - "image_dimensions": { - "width": img_width, - "height": img_height - } - } - - except ImportError as e: - logger.error(f"Grid detection service not available: {e}") - raise HTTPException(status_code=500, detail="Grid detection service not available") - except Exception as e: - logger.error(f"Grid detection failed: {e}") - import traceback - logger.error(traceback.format_exc()) - raise HTTPException(status_code=500, detail=f"Grid detection failed: {str(e)}") - - -@router.get("/sessions/{session_id}/grid") -async def get_grid(session_id: str): - """ - Get the stored grid structure for a session. - """ - session = await get_session_db(session_id) - if session is None: - raise HTTPException(status_code=404, detail="Session not found") - - if not session.grid_data: - raise HTTPException(status_code=404, detail="No grid data found. Run analyze-grid first.") - - return { - "session_id": session_id, - "grid": session.grid_data, - "deskew_angle": session.deskew_angle - } - - -@router.get("/sessions/{session_id}/cell-crop/{page_number}/{row}/{col}") -async def get_cell_crop(session_id: str, page_number: int, row: int, col: int): - """ - Get a cropped image of a specific grid cell. - - Useful for showing the original image content when manually correcting cells. - """ - from PIL import Image - import io - - session = await get_session_db(session_id) - if session is None: - raise HTTPException(status_code=404, detail="Session not found") - - if not session.grid_data: - raise HTTPException(status_code=400, detail="No grid data. Run analyze-grid first.") - - # Get cell from grid - cells = session.grid_data.get("cells", []) - if row >= len(cells) or col >= len(cells[row] if row < len(cells) else []): - raise HTTPException(status_code=404, detail="Cell not found") - - cell = cells[row][col] - - # Get PDF image - pdf_data = get_cached_pdf_data(session_id) - if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path): - with open(session.pdf_path, 'rb') as f: - pdf_data = f.read() - - if not pdf_data: - raise HTTPException(status_code=400, detail="No PDF data available") - - # Convert page to image - image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) - img = Image.open(io.BytesIO(image_data)) - img_width, img_height = img.size - - # Crop cell region - x1 = int(img_width * cell["x"] / 100) - y1 = int(img_height * cell["y"] / 100) - x2 = int(img_width * (cell["x"] + cell["width"]) / 100) - y2 = int(img_height * (cell["y"] + cell["height"]) / 100) - - # Add small padding - padding = 5 - x1 = max(0, x1 - padding) - y1 = max(0, y1 - padding) - x2 = min(img_width, x2 + padding) - y2 = min(img_height, y2 + padding) - - cropped = img.crop((x1, y1, x2, y2)) - - # Convert to PNG - buffer = io.BytesIO() - cropped.save(buffer, format='PNG') - buffer.seek(0) - - return StreamingResponse(buffer, media_type="image/png") - - -@router.put("/sessions/{session_id}/cell/{row}/{col}") -async def update_cell(session_id: str, row: int, col: int, text: str = Form(...)): - """ - Manually update the text content of a grid cell. - - Sets recognition_status to 'manual' for the updated cell. - """ - session = await get_session_db(session_id) - if session is None: - raise HTTPException(status_code=404, detail="Session not found") - - if not session.grid_data: - raise HTTPException(status_code=400, detail="No grid data. Run analyze-grid first.") - - # Update cell in grid - grid_data = session.grid_data - cells = grid_data.get("cells", []) - - if row >= len(cells) or col >= len(cells[row] if row < len(cells) else []): - raise HTTPException(status_code=404, detail="Cell not found") - - cells[row][col]["text"] = text - cells[row][col]["status"] = "manual" - cells[row][col]["confidence"] = 1.0 - - # Update statistics - recognized = sum(1 for r in cells for c in r if c.get("status") == "recognized") - manual = sum(1 for r in cells for c in r if c.get("status") == "manual") - problematic = sum(1 for r in cells for c in r if c.get("status") == "problematic") - total = len(cells) * len(cells[0]) if cells and cells[0] else 0 - - grid_data["stats"] = { - "recognized": recognized, - "manual": manual, - "problematic": problematic, - "empty": total - recognized - manual - problematic, - "total": total, - "coverage": (recognized + manual) / total if total > 0 else 0 - } - - await update_session_db(session_id, grid_data=grid_data) - - return { - "success": True, - "cell": cells[row][col], - "stats": grid_data["stats"] - } - - @router.post("/sessions/{session_id}/process-pages") async def process_pdf_pages( session_id: str, @@ -1828,23 +1181,16 @@ async def process_pdf_pages( """ logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}") - session = await get_session_db(session_id) - if session is None: + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") - # Try cached PDF data first - pdf_data = get_cached_pdf_data(session_id) - - # If not cached, try to load from file - if not pdf_data and session.pdf_path and os.path.exists(session.pdf_path): - with open(session.pdf_path, 'rb') as f: - pdf_data = f.read() - cache_pdf_data(session_id, pdf_data) + session = _sessions[session_id] + pdf_data = session.get("pdf_data") if not pdf_data: raise HTTPException(status_code=400, detail="No PDF uploaded for this session") - page_count = session.pdf_page_count or 1 + page_count = session.get("pdf_page_count", 1) # Determine which pages to process if process_all: @@ -1890,28 +1236,22 @@ async def process_pdf_pages( avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0 - # Store vocabulary in DB (replace existing) - await update_vocabulary_db(session_id, all_vocabulary) + # Update session + session["vocabulary"] = all_vocabulary + session["vocabulary_count"] = len(all_vocabulary) + session["extraction_confidence"] = avg_confidence + session["processed_pages"] = pages + session["successful_pages"] = successful_pages + session["failed_pages"] = failed_pages + session["status"] = SessionStatus.EXTRACTED.value # Save first page as preview image - image_path = None if images: session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id) - os.makedirs(session_dir, exist_ok=True) image_path = os.path.join(session_dir, "source.png") with open(image_path, 'wb') as f: f.write(images[0]) - - # Update session in DB - await update_session_db( - session_id, - status=SessionStatus.EXTRACTED.value, - extraction_confidence=avg_confidence, - processed_pages=pages, - successful_pages=successful_pages, - failed_pages=failed_pages, - image_path=image_path, - ) + session["image_path"] = image_path result = { "session_id": session_id, @@ -1934,8 +1274,7 @@ async def process_pdf_pages( @router.delete("/sessions/{session_id}") async def delete_session(session_id: str): """Delete a vocabulary session and all associated files.""" - session = await get_session_db(session_id) - if session is None: + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") # Delete session directory @@ -1944,122 +1283,337 @@ async def delete_session(session_id: str): import shutil shutil.rmtree(session_dir) - # Clear cached PDF data - clear_cached_pdf_data(session_id) + # Remove from storage + del _sessions[session_id] - # Delete from database (CASCADE deletes vocab_entries and vocab_worksheets) - success = await delete_session_db(session_id) - - if not success: - raise HTTPException(status_code=500, detail="Failed to delete session from database") + # Remove associated worksheets + for wid, ws in list(_worksheets.items()): + if ws["session_id"] == session_id: + del _worksheets[wid] return {"message": "Session deleted successfully", "session_id": session_id} # ============================================================================= -# NRU Format Worksheet Generation +# OCR Export Endpoints (for cross-app OCR data sharing) # ============================================================================= -class NRUWorksheetRequest(BaseModel): - """Request model for NRU format worksheet generation.""" - title: Optional[str] = "Vokabeltest" - include_solutions: bool = True - specific_pages: Optional[List[int]] = None # 1-indexed page numbers, None = all +OCR_EXPORT_DIR = os.path.join(LOCAL_STORAGE_PATH, "ocr-exports") -@router.post("/sessions/{session_id}/generate-nru") -async def generate_nru_worksheet(session_id: str, request: NRUWorksheetRequest): +@router.post("/sessions/{session_id}/ocr-export/{page_number}") +async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)): """ - Generate worksheet PDF in NRU format. + Save OCR export data for cross-app sharing (admin-v2 -> studio-v2). - NRU Format: - - Per scanned page, generates 2 worksheet pages: - 1. Vocabulary table (3 columns: English, German blank, Correction blank) - 2. Sentence practice (German sentence, 2 empty lines for English translation) - - Automatically separates vocabulary entries into: - - Single words/phrases -> Vocabulary table - - Full sentences (end with . ! ? or are long) -> Sentence practice - - Args: - session_id: Session with extracted vocabulary - request: Generation options (title, include_solutions, specific_pages) - - Returns: - Worksheet and solution PDF download info + Both apps proxy to klausur-service via /klausur-api/, so this endpoint + serves as shared storage accessible from both ports. """ - logger.info(f"Generating NRU worksheet for session {session_id}") - session = await get_session_db(session_id) - if session is None: + logger.info(f"Saving OCR export for session {session_id}, page {page_number}") + + os.makedirs(OCR_EXPORT_DIR, exist_ok=True) + + # Save the export data + export_path = os.path.join(OCR_EXPORT_DIR, f"{session_id}_page{page_number}.json") + with open(export_path, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=2) + + # Update latest pointer + latest_path = os.path.join(OCR_EXPORT_DIR, "latest.json") + with open(latest_path, 'w', encoding='utf-8') as f: + json.dump({ + "session_id": session_id, + "page_number": page_number, + "saved_at": datetime.utcnow().isoformat(), + }, f, ensure_ascii=False, indent=2) + + return { + "success": True, + "session_id": session_id, + "page_number": page_number, + "message": "OCR export saved successfully", + } + + +@router.get("/sessions/{session_id}/ocr-export/{page_number}") +async def load_ocr_export(session_id: str, page_number: int): + """Load a specific OCR export by session and page number.""" + + export_path = os.path.join(OCR_EXPORT_DIR, f"{session_id}_page{page_number}.json") + + if not os.path.exists(export_path): + raise HTTPException(status_code=404, detail="OCR export not found") + + with open(export_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + return data + + +# ============================================================================= +# OCR Compare & Grid Analysis Endpoints +# ============================================================================= + + +@router.post("/sessions/{session_id}/compare-ocr/{page_number}") +async def compare_ocr_methods(session_id: str, page_number: int): + """ + Run multiple OCR methods on a page and compare results. + + This endpoint: + 1. Gets the page image from the session's uploaded PDF + 2. Runs Vision LLM extraction (primary method) + 3. Optionally runs Tesseract extraction + 4. Compares found vocabulary across methods + 5. Returns structured comparison results + + page_number is 0-indexed. + """ + import httpx + import time + + logger.info(f"Compare OCR for session {session_id}, page {page_number}") + + if session_id not in _sessions: raise HTTPException(status_code=404, detail="Session not found") - vocab_dicts = await get_vocabulary_db(session_id) - if not vocab_dicts: - raise HTTPException(status_code=400, detail="No vocabulary found in session") + session = _sessions[session_id] + pdf_data = session.get("pdf_data") - # Generate PDFs using NRU format + if not pdf_data: + raise HTTPException(status_code=400, detail="No PDF uploaded for this session") + + page_count = session.get("pdf_page_count", 1) + if page_number < 0 or page_number >= page_count: + raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).") + + # Convert page to image + image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) + + methods_results = {} + all_vocab_sets = {} + + # --- Method: Vision LLM --- try: - from nru_worksheet_generator import generate_nru_pdf, separate_vocab_and_sentences - - # Get statistics - vocab_list, sentence_list = separate_vocab_and_sentences(vocab_dicts) - - worksheet_pdf, solution_pdf = await generate_nru_pdf( - entries=vocab_dicts, - title=request.title or session.name, - include_solutions=request.include_solutions + start = time.time() + vocab, confidence, error = await extract_vocabulary_from_image( + image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False ) + duration = time.time() - start - # Save PDFs - worksheet_id = str(uuid.uuid4()) - session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id) - os.makedirs(session_dir, exist_ok=True) + vocab_list = [] + for v in vocab: + entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v)) + vocab_list.append({ + "english": entry.get("english", ""), + "german": entry.get("german", ""), + "example": entry.get("example_sentence", ""), + }) - pdf_path = os.path.join(session_dir, f"nru_worksheet_{worksheet_id}.pdf") - with open(pdf_path, 'wb') as f: - f.write(worksheet_pdf) + methods_results["vision_llm"] = { + "name": "Vision LLM", + "model": VISION_MODEL, + "duration_seconds": round(duration, 1), + "vocabulary_count": len(vocab_list), + "vocabulary": vocab_list, + "confidence": confidence, + "success": len(vocab_list) > 0 and not error, + "error": error if error else None, + } + all_vocab_sets["vision_llm"] = {(v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list if v["english"] and v["german"]} + except Exception as e: + logger.error(f"Vision LLM failed: {e}") + methods_results["vision_llm"] = { + "name": "Vision LLM", + "model": VISION_MODEL, + "duration_seconds": 0, + "vocabulary_count": 0, + "vocabulary": [], + "confidence": 0, + "success": False, + "error": str(e), + } + all_vocab_sets["vision_llm"] = set() - solution_path = None - if solution_pdf: - solution_path = os.path.join(session_dir, f"nru_solution_{worksheet_id}.pdf") - with open(solution_path, 'wb') as f: - f.write(solution_pdf) + # --- Method: Local LLM (same as vision but noted separately if available) --- + # For now, we treat vision_llm as the primary method. + # Tesseract method can be added here if tesseract_vocab_extractor is available. - # Store worksheet info - await create_worksheet_db( - worksheet_id=worksheet_id, - session_id=session_id, - worksheet_types=["nru_format"], - pdf_path=pdf_path, - solution_path=solution_path, - ) + # --- Build comparison --- + all_unique = set() + for vs in all_vocab_sets.values(): + all_unique |= vs - # Get unique pages - pages = sorted(set(v.get("source_page", 1) for v in vocab_dicts)) + found_by_all = [] + found_by_some = [] + for english, german in sorted(all_unique): + found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs] + entry = {"english": english, "german": german, "methods": found_in} + if len(found_in) == len(all_vocab_sets): + found_by_all.append(entry) + else: + found_by_some.append(entry) - return { - "worksheet_id": worksheet_id, - "session_id": session_id, - "format": "nru", - "pdf_path": pdf_path, - "solution_path": solution_path, - "statistics": { - "total_entries": len(vocab_dicts), - "vocabulary_count": len(vocab_list), - "sentence_count": len(sentence_list), - "source_pages": pages, - "worksheet_pages": len(pages) * 2, # 2 pages per source page + total_methods = max(len(all_vocab_sets), 1) + agreement_rate = len(found_by_all) / max(len(all_unique), 1) if all_unique else 0 + + # Find best method + best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm" + + # Save vocabulary from best method in session for grid analysis (no second Ollama call needed) + best_vocab_list = methods_results.get(best_method, {}).get("vocabulary", []) + session["comparison_vocabulary"] = best_vocab_list + session["comparison_page"] = page_number + + return { + "session_id": session_id, + "page_number": page_number, + "methods": methods_results, + "comparison": { + "found_by_all_methods": found_by_all, + "found_by_some_methods": found_by_some, + "total_unique_vocabulary": len(all_unique), + "agreement_rate": agreement_rate, + }, + "recommendation": { + "best_method": best_method, + "reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz", + }, + } + + +@router.post("/sessions/{session_id}/analyze-grid/{page_number}") +async def analyze_grid(session_id: str, page_number: int): + """ + Build grid structure from comparison results (no Ollama call needed). + + Uses vocabulary stored in session by compare-ocr to compute + the grid layout instantly. + + page_number is 0-indexed. + Returns GridData structure expected by the frontend GridOverlay component. + """ + logger.info(f"Grid analysis for session {session_id}, page {page_number}") + + if session_id not in _sessions: + raise HTTPException(status_code=404, detail="Session not found") + + session = _sessions[session_id] + + # Read vocabulary from session (saved by compare-ocr) + vocab_list = session.get("comparison_vocabulary") + if not vocab_list: + return {"success": False, "error": "Bitte zuerst Vergleich starten, bevor die Grid-Analyse ausgefuehrt wird."} + + try: + # Check if example column is present (at least one vocab has non-empty example) + has_examples = any(v.get("example", "").strip() for v in vocab_list) + num_cols = 3 if has_examples else 2 + column_types = ["english", "german", "example"] if has_examples else ["english", "german"] + + num_rows = len(vocab_list) + if num_rows == 0: + return {"success": False, "error": "Keine Vokabeln im Vergleichsergebnis gefunden."} + + row_height = 100.0 / num_rows + col_width = 100.0 / num_cols + + cells = [] + recognized_count = 0 + empty_count = 0 + + for r, vocab in enumerate(vocab_list): + row_cells = [] + english = vocab.get("english", "").strip() + german = vocab.get("german", "").strip() + example = vocab.get("example", "").strip() if has_examples else None + + col_values = [("english", english), ("german", german)] + if has_examples: + col_values.append(("example", example)) + + for c, (col_type, text) in enumerate(col_values): + x = c * col_width + y = r * row_height + + if text: + status = "recognized" + recognized_count += 1 + conf = 0.9 + else: + status = "empty" + empty_count += 1 + conf = 0.0 + + row_cells.append({ + "row": r, + "col": c, + "x": round(x, 2), + "y": round(y, 2), + "width": round(col_width, 2), + "height": round(row_height, 2), + "text": text or "", + "confidence": conf, + "status": status, + "column_type": col_type, + }) + cells.append(row_cells) + + total = num_rows * num_cols + coverage = recognized_count / max(total, 1) + + col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)] + row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)] + + grid_data = { + "rows": num_rows, + "columns": num_cols, + "cells": cells, + "column_types": column_types, + "column_boundaries": col_boundaries, + "row_boundaries": row_boundaries, + "deskew_angle": 0.0, + "stats": { + "recognized": recognized_count, + "problematic": 0, + "empty": empty_count, + "manual": 0, + "total": total, + "coverage": round(coverage, 3), }, - "download_url": f"/api/v1/vocab/worksheets/{worksheet_id}/pdf", - "solution_url": f"/api/v1/vocab/worksheets/{worksheet_id}/solution" if solution_path else None, } - except ImportError as e: - logger.error(f"NRU generator not available: {e}") - raise HTTPException(status_code=500, detail="NRU worksheet generator not available") + return {"success": True, "grid": grid_data} + except Exception as e: - logger.error(f"NRU worksheet generation failed: {e}") + logger.error(f"Grid analysis failed: {e}") import traceback - logger.error(traceback.format_exc()) - raise HTTPException(status_code=500, detail=f"Worksheet generation failed: {str(e)}") + logger.debug(traceback.format_exc()) + return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"} + + +@router.get("/ocr-export/latest") +async def load_latest_ocr_export(): + """Load the most recently saved OCR export data.""" + + latest_path = os.path.join(OCR_EXPORT_DIR, "latest.json") + + if not os.path.exists(latest_path): + raise HTTPException(status_code=404, detail="No OCR exports found") + + with open(latest_path, 'r', encoding='utf-8') as f: + pointer = json.load(f) + + session_id = pointer.get("session_id") + page_number = pointer.get("page_number") + + export_path = os.path.join(OCR_EXPORT_DIR, f"{session_id}_page{page_number}.json") + + if not os.path.exists(export_path): + raise HTTPException(status_code=404, detail="Latest OCR export file not found") + + with open(export_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + return data