""" Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing. Extracted from vocab_worksheet_api.py to keep modules under 500 LOC. Routes (no prefix — included into the main /api/v1/vocab router): POST /sessions/{session_id}/upload-pdf-info GET /sessions/{session_id}/pdf-thumbnail/{page_number} GET /sessions/{session_id}/pdf-page-image/{page_number} POST /sessions/{session_id}/process-single-page/{page_number} POST /sessions/{session_id}/process-pages """ import io import logging import os import uuid from typing import List from fastapi import APIRouter, HTTPException, Query, UploadFile, File from fastapi.responses import StreamingResponse from .models import SessionStatus logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Local storage path # --------------------------------------------------------------------------- LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets") # --------------------------------------------------------------------------- # Optional heavy dependencies # --------------------------------------------------------------------------- try: import numpy as np from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation OCR_PIPELINE_AVAILABLE = True except ImportError: np = None # type: ignore[assignment] OCR_PIPELINE_AVAILABLE = False logger.warning("OCR pipeline imports not available in upload module") # Sub-module imports (already split out) from .generation import ( convert_pdf_page_to_image, convert_pdf_to_images, get_pdf_page_count, ) from .extraction import extract_vocabulary_from_image try: from .ocr import _run_ocr_pipeline_for_page except ImportError: _run_ocr_pipeline_for_page = None # type: ignore[assignment] logger.warning("vocab_worksheet_ocr not available — process-single-page disabled") # --------------------------------------------------------------------------- # In-memory session store (shared with main 
# ---------------------------------------------------------------------------
# In-memory session store (shared with main module)
# ---------------------------------------------------------------------------

# Local import so this block is self-contained; used by process_pdf_pages.
from typing import Optional


def _get_sessions():
    """Return the shared in-memory session dict owned by the main API module.

    Imported lazily inside the function to avoid a circular import between
    this upload module and .api at load time.
    """
    from .api import _sessions
    return _sessions


def _vocab_entry_to_dict(entry) -> dict:
    """Convert a vocabulary entry to a plain dict.

    Handles pydantic v1 models (``.dict()``), plain objects (``__dict__``),
    and mappings, in that order of preference.
    """
    if hasattr(entry, 'dict'):
        return entry.dict()
    if hasattr(entry, '__dict__'):
        return entry.__dict__.copy()
    return dict(entry)


def _render_page_png(pdf_data: bytes, page_number: int, zoom: float, rotation: int) -> bytes:
    """Render a single PDF page to PNG bytes via fitz (PyMuPDF).

    Args:
        pdf_data: Raw PDF bytes.
        page_number: 0-indexed page to render.
        zoom: Zoom factor applied on both axes of the render matrix.
        rotation: Absolute rotation in degrees (0 = leave page as-is).

    Raises:
        Whatever fitz raises (bad PDF, page index out of range); callers
        translate exceptions into HTTP errors.
    """
    import fitz

    pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
    try:
        page = pdf_document[page_number]
        # Apply orientation correction detected during OCR processing.
        if rotation:
            page.set_rotation(rotation)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return pix.tobytes("png")
    finally:
        # Always release the document, even when rendering fails.
        pdf_document.close()


# ---------------------------------------------------------------------------
# Router (no prefix — will be included into the main vocab router)
# ---------------------------------------------------------------------------
upload_router = APIRouter()


# =============================================================================
# POST /sessions/{session_id}/upload-pdf-info
# =============================================================================
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a PDF and get page count and thumbnails for preview.
    Use this before processing to let user select pages.
    """
    logger.info(f"PDF info request for session {session_id}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    # Validate file type: accept if either the extension or the declared
    # content type says PDF.
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''
    if extension != 'pdf' and content_type != 'application/pdf':
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    content = await file.read()

    # Save PDF to local storage so other processes can re-read it.
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, "source.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(content)

    page_count = get_pdf_page_count(content)

    # Store PDF data in session for later processing.
    session["pdf_data"] = content
    session["pdf_path"] = pdf_path
    session["pdf_page_count"] = page_count
    session["status"] = "pdf_uploaded"

    # Detect orientation for each page so thumbnails are shown correctly.
    # Best-effort: a failure on one page must not fail the upload.
    page_rotations: dict = {}
    if OCR_PIPELINE_AVAILABLE:
        for pg in range(page_count):
            try:
                img_bgr = render_pdf_high_res(content, pg, zoom=2.0)
                _, rotation = detect_and_fix_orientation(img_bgr)
                if rotation:
                    page_rotations[pg] = rotation
                    logger.info(f"Page {pg + 1}: orientation {rotation}°")
            except Exception as e:
                logger.warning(f"Orientation detection failed for page {pg + 1}: {e}")
    session["page_rotations"] = page_rotations

    return {
        "session_id": session_id,
        "page_count": page_count,
        "filename": file.filename,
        "page_rotations": page_rotations,
    }


# =============================================================================
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        hires: If True, return full-resolution image (zoom=2.0) instead of
            thumbnail (zoom=0.5).
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    # Validate the page index up front (consistent with pdf-page-image) so an
    # out-of-range page yields 400 rather than a 500 from the renderer.
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        rot = session.get("page_rotations", {}).get(page_number, 0)
        png_data = _render_page_png(pdf_data, page_number, zoom=2.0 if hires else 0.5, rotation=rot)
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )


# =============================================================================
# GET /sessions/{session_id}/pdf-page-image/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).
    """
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        rot = session.get("page_rotations", {}).get(page_number, 0)
        png_data = _render_page_png(pdf_data, page_number, zoom=zoom, rotation=rot)
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )


# =============================================================================
# POST /sessions/{session_id}/process-single-page/{page_number}
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
    max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
):
    """
    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.

    Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
    dual-engine OCR -> grid-build with autocorrect/merge) for best quality.

    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
        enhance: true (default) -- apply CLAHE/denoise for degraded scans
        max_cols: 3 (default) -- max column count (0=unlimited)
        min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)

    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(
            status_code=404,
            detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
        )
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Derive pipeline-level variable names for the quality report.
    enable_enhance = enhance
    max_columns = max_cols if max_cols > 0 else None
    override_min_conf = min_conf if min_conf > 0 else None

    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
    rotation_deg = 0
    quality_report = None
    min_ocr_conf = 40  # default; overridden by pipeline when quality report is available

    if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr,
                page_number,
                session_id,
                ipa_mode=ipa_mode,
                syllable_mode=syllable_mode,
                enable_enhance=enable_enhance,
                max_columns=max_columns,
                override_min_conf=override_min_conf,
            )
            # Update min_ocr_conf from quality report if available.
            if quality_report and hasattr(quality_report, 'recommended_min_conf'):
                min_ocr_conf = quality_report.recommended_min_conf
        except Exception as e:
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": f"OCR pipeline error: {e}",
                "vocabulary": [],
                "vocabulary_count": 0,
            }
    else:
        # Fallback to LLM vision extraction.
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": error,
                "vocabulary": [],
                "vocabulary_count": 0,
            }
        page_vocabulary = []
        for entry in vocabulary:
            entry_dict = _vocab_entry_to_dict(entry)
            entry_dict['source_page'] = page_number + 1
            # Guarantee a stable entry id for frontend editing.
            if 'id' not in entry_dict or not entry_dict['id']:
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)

    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")

    # Store rotation for this page (used by image/thumbnail endpoints).
    session.setdefault("page_rotations", {})[page_number] = rotation_deg

    # Add to session's vocabulary (append, don't replace).
    existing_vocab = session.get("vocabulary", [])
    # Remove any existing entries from this page (in case of re-processing).
    existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
    existing_vocab.extend(page_vocabulary)

    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value

    result = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        "extraction_confidence": 0.9,
        "rotation": rotation_deg,
    }

    # Add scan quality report + active steps info.
    if quality_report:
        sq = quality_report.to_dict()
        sq["active_steps"] = {
            "step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
            "step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
            "step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
        }
        result["scan_quality"] = sq

    return result


# =============================================================================
# POST /sessions/{session_id}/process-pages (DEPRECATED)
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: Optional[List[int]] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        pages: List of 0-indexed page numbers to process
        process_all: If True, process all pages
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = sessions[session_id]

    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    # Determine which pages to process.
    if process_all:
        pages = list(range(page_count))
    elif pages is None or len(pages) == 0:
        pages = [0]  # Default to first page

    # Convert selected pages to images.
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY.
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    for i, image_data in enumerate(images):
        page_num = pages[i]
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_num + 1}.png", page_number=page_num
        )
        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence
            # Add page info to each entry and convert to dict.
            for entry in vocabulary:
                entry_dict = _vocab_entry_to_dict(entry)
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)
            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Update session.
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value

    # Save first page as preview image.
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }
    if error_messages:
        result["errors"] = error_messages
    return result