""" Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing. Extracted from vocab_worksheet_api.py to keep modules under 500 LOC. Routes (no prefix — included into the main /api/v1/vocab router): POST /sessions/{session_id}/upload-pdf-info GET /sessions/{session_id}/pdf-thumbnail/{page_number} GET /sessions/{session_id}/pdf-page-image/{page_number} POST /sessions/{session_id}/process-single-page/{page_number} POST /sessions/{session_id}/process-pages """ import io import logging import os import uuid from typing import List, Optional from fastapi import APIRouter, HTTPException, Query, UploadFile, File from fastapi.responses import StreamingResponse from vocab_worksheet_models import SessionStatus, VocabularyEntry logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Local storage path # --------------------------------------------------------------------------- LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets") # --------------------------------------------------------------------------- # Optional heavy dependencies # --------------------------------------------------------------------------- try: import numpy as np from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation OCR_PIPELINE_AVAILABLE = True except ImportError: np = None # type: ignore[assignment] OCR_PIPELINE_AVAILABLE = False logger.warning("OCR pipeline imports not available in upload module") # Sub-module imports (already split out) from vocab_worksheet_generation import ( convert_pdf_page_to_image, convert_pdf_to_images, get_pdf_page_count, ) from vocab_worksheet_extraction import extract_vocabulary_from_image try: from vocab_worksheet_ocr import _run_ocr_pipeline_for_page except ImportError: _run_ocr_pipeline_for_page = None # type: ignore[assignment] logger.warning("vocab_worksheet_ocr not available — process-single-page disabled") # 
# ---------------------------------------------------------------------------
# In-memory session store (shared with main module)
# ---------------------------------------------------------------------------
def _get_sessions():
    """Return the in-memory session dict owned by vocab_worksheet_api.

    Imported lazily inside the function to avoid a circular import at
    module load time (vocab_worksheet_api includes this router).
    """
    from vocab_worksheet_api import _sessions
    return _sessions


# ---------------------------------------------------------------------------
# Router (no prefix — will be included into the main vocab router)
# ---------------------------------------------------------------------------
upload_router = APIRouter()


# ---------------------------------------------------------------------------
# Shared helpers (deduplicated from the individual endpoints)
# ---------------------------------------------------------------------------
def _require_session(session_id: str) -> dict:
    """Look up a session by id, raising 404 if it does not exist."""
    sessions = _get_sessions()
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    return sessions[session_id]


def _require_pdf_data(session: dict) -> bytes:
    """Return the uploaded PDF bytes stored on the session, or raise 400."""
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    return pdf_data


def _entry_to_dict(entry) -> dict:
    """Convert a vocabulary entry (pydantic model, object, or mapping) to a plain dict."""
    if hasattr(entry, 'dict'):
        return entry.dict()
    if hasattr(entry, '__dict__'):
        return entry.__dict__.copy()
    return dict(entry)


def _render_page_png(session: dict, pdf_data: bytes, page_number: int, zoom: float):
    """Render one PDF page to PNG bytes with fitz, applying any stored rotation.

    Args:
        session: Session dict; ``page_rotations`` (if present) maps 0-indexed
            page numbers to the rotation detected by the OCR pipeline.
        pdf_data: Raw PDF bytes.
        page_number: 0-indexed page to render.
        zoom: Zoom factor passed to ``fitz.Matrix``.

    Returns:
        Tuple ``(png_bytes, rotation_degrees)``.

    The document is closed in a ``finally`` block so an error inside
    ``get_pixmap``/``tobytes`` no longer leaks the open document.
    """
    import fitz

    pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
    try:
        page = pdf_document[page_number]
        # Apply orientation correction detected during OCR processing
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            page.set_rotation(rot)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        return pix.tobytes("png"), rot
    finally:
        pdf_document.close()


# =============================================================================
# POST /sessions/{session_id}/upload-pdf-info
# =============================================================================
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a PDF and get page count and thumbnails for preview.
    Use this before processing to let user select pages.
    """
    logger.info(f"PDF info request for session {session_id}")

    session = _require_session(session_id)

    # Validate file type: accept when either the extension or the MIME type
    # identifies a PDF (lenient on purpose — browsers vary in what they send).
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''
    if extension != 'pdf' and content_type != 'application/pdf':
        raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")

    content = await file.read()

    # Save PDF temporarily
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, "source.pdf")
    with open(pdf_path, 'wb') as f:
        f.write(content)

    # Get page count
    page_count = get_pdf_page_count(content)

    # Store PDF data in session for later processing
    session["pdf_data"] = content
    session["pdf_path"] = pdf_path
    session["pdf_page_count"] = page_count
    session["status"] = "pdf_uploaded"

    # Detect orientation for each page so thumbnails are shown correctly.
    # Best-effort: a failure on one page only logs a warning.
    page_rotations: dict = {}
    if OCR_PIPELINE_AVAILABLE:
        for pg in range(page_count):
            try:
                img_bgr = render_pdf_high_res(content, pg, zoom=2.0)
                _, rotation = detect_and_fix_orientation(img_bgr)
                if rotation:
                    page_rotations[pg] = rotation
                    logger.info(f"Page {pg + 1}: orientation {rotation}°")
            except Exception as e:
                logger.warning(f"Orientation detection failed for page {pg + 1}: {e}")
    session["page_rotations"] = page_rotations

    return {
        "session_id": session_id,
        "page_count": page_count,
        "filename": file.filename,
        "page_rotations": page_rotations,
    }


# =============================================================================
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        hires: If True, return full-resolution image (zoom=2.0) instead of
            thumbnail (zoom=0.5).
    """
    session = _require_session(session_id)
    pdf_data = _require_pdf_data(session)

    # Bounds check (same as pdf-page-image): clean 400 instead of a fitz
    # IndexError turning into a 500.
    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        zoom = 2.0 if hires else 0.5
        png_data, _ = _render_page_png(session, pdf_data, page_number, zoom)
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )


# =============================================================================
# GET /sessions/{session_id}/pdf-page-image/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for editor view).

    Args:
        zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).
    """
    session = _require_session(session_id)
    pdf_data = _require_pdf_data(session)

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        png_data, rot = _render_page_png(session, pdf_data, page_number, zoom)
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )


# =============================================================================
# POST /sessions/{session_id}/process-single-page/{page_number}
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
    max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
):
    """
    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.

    Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
    dual-engine OCR -> grid-build with autocorrect/merge) for best quality.

    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
        enhance: true (default) -- apply CLAHE/denoise for degraded scans
        max_cols: 3 (default) -- max column count (0=unlimited)
        min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)

    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")

    # Dedicated (German, user-facing) 404 message for this endpoint — kept
    # verbatim, so the generic _require_session helper is not used here.
    if session_id not in _get_sessions():
        raise HTTPException(
            status_code=404,
            detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
        )
    session = _get_sessions()[session_id]

    pdf_data = _require_pdf_data(session)

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Derive pipeline-level variable names for the quality report
    enable_enhance = enhance
    max_columns = max_cols if max_cols > 0 else None
    override_min_conf = min_conf if min_conf > 0 else None

    # --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
    rotation_deg = 0
    quality_report = None
    min_ocr_conf = 40  # default; overridden by pipeline when quality report is available

    if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr,
                page_number,
                session_id,
                ipa_mode=ipa_mode,
                syllable_mode=syllable_mode,
                enable_enhance=enable_enhance,
                max_columns=max_columns,
                override_min_conf=override_min_conf,
            )
            # Update min_ocr_conf from quality report if available
            if quality_report and hasattr(quality_report, 'recommended_min_conf'):
                min_ocr_conf = quality_report.recommended_min_conf
        except Exception as e:
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": f"OCR pipeline error: {e}",
                "vocabulary": [],
                "vocabulary_count": 0,
            }
    else:
        # Fallback to LLM vision extraction
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_number + 1}.png", page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": error,
                "vocabulary": [],
                "vocabulary_count": 0,
            }
        page_vocabulary = []
        for entry in vocabulary:
            entry_dict = _entry_to_dict(entry)
            entry_dict['source_page'] = page_number + 1
            if 'id' not in entry_dict or not entry_dict['id']:
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)

    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")

    # Store rotation for this page (used by image/thumbnail endpoints)
    # NOTE(review): on the LLM-vision fallback path rotation_deg is always 0,
    # which overwrites any rotation detected at upload time — confirm intended.
    session.setdefault("page_rotations", {})[page_number] = rotation_deg

    # Add to session's vocabulary (append, don't replace)
    existing_vocab = session.get("vocabulary", [])
    # Remove any existing entries from this page (in case of re-processing)
    existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
    existing_vocab.extend(page_vocabulary)
    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value

    result = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        "extraction_confidence": 0.9,
        "rotation": rotation_deg,
    }

    # Add scan quality report + active steps info
    if quality_report:
        sq = quality_report.to_dict()
        sq["active_steps"] = {
            "step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
            "step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
            "step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
        }
        result["scan_quality"] = sq

    return result


# =============================================================================
# POST /sessions/{session_id}/process-pages (DEPRECATED)
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: Optional[List[int]] = None,  # was `List[int] = None` — annotation now matches the default
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        pages: List of 0-indexed page numbers to process
        process_all: If True, process all pages
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    session = _require_session(session_id)
    pdf_data = _require_pdf_data(session)

    page_count = session.get("pdf_page_count", 1)

    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif pages is None or len(pages) == 0:
        pages = [0]  # Default to first page

    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    for i, image_data in enumerate(images):
        page_num = pages[i]
        logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data, f"page_{page_num + 1}.png", page_number=page_num
        )
        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence
            # Add page info to each entry and convert to dict
            for entry in vocabulary:
                entry_dict = _entry_to_dict(entry)
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)
            logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Update session
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value

    # Save first page as preview image (dir normally exists from upload;
    # create it defensively so a missing dir cannot fail the whole request)
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        os.makedirs(session_dir, exist_ok=True)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }
    if error_messages:
        result["errors"] = error_messages

    return result