""" Vocabulary Worksheet Compare & Grid Analysis API. Split from vocab_worksheet_analysis_api.py — contains the two largest route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC). """ from fastapi import APIRouter, HTTPException, Query from typing import Dict, Any import base64 import json import logging import os from vocab_worksheet_extraction import extract_vocabulary_from_image OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b") def _get_sessions(): from vocab_worksheet_api import _sessions return _sessions from vocab_worksheet_generation import convert_pdf_page_to_image # Try to import Tesseract extractor try: from tesseract_vocab_extractor import ( run_tesseract_pipeline, match_positions_to_vocab, TESSERACT_AVAILABLE, ) except ImportError: TESSERACT_AVAILABLE = False # Try to import CV Pipeline try: from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE except ImportError: CV_PIPELINE_AVAILABLE = False # Try to import Grid Detection Service try: from services.grid_detection_service import GridDetectionService GRID_SERVICE_AVAILABLE = True except ImportError: GRID_SERVICE_AVAILABLE = False logger = logging.getLogger(__name__) compare_router = APIRouter() # ============================================================================= # OCR Compare & Grid Analysis Endpoints # ============================================================================= @compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}") async def compare_ocr_methods(session_id: str, page_number: int): """ Run multiple OCR methods on a page and compare results. This endpoint: 1. Gets the page image from the session's uploaded PDF 2. Runs Vision LLM extraction (primary method) 3. Optionally runs Tesseract extraction 4. Compares found vocabulary across methods 5. Returns structured comparison results page_number is 0-indexed. """ import httpx import time logger.info(f"Compare OCR for session {session_id}, page {page_number}") if session_id not in _get_sessions(): raise HTTPException(status_code=404, detail="Session not found") session = _get_sessions()[session_id] pdf_data = session.get("pdf_data") if not pdf_data: raise HTTPException(status_code=400, detail="No PDF uploaded for this session") page_count = session.get("pdf_page_count", 1) if page_number < 0 or page_number >= page_count: raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).") # Convert page to image image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) methods_results = {} all_vocab_sets = {} # --- Method: Vision LLM --- try: start = time.time() vocab, confidence, error = await extract_vocabulary_from_image( image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False ) duration = time.time() - start vocab_list = [] for v in vocab: entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v)) vocab_list.append({ "english": entry.get("english", ""), "german": entry.get("german", ""), "example": entry.get("example_sentence", ""), }) methods_results["vision_llm"] = { "name": "Vision LLM", "model": VISION_MODEL, "duration_seconds": round(duration, 1), "vocabulary_count": len(vocab_list), "vocabulary": vocab_list, "confidence": confidence, "success": len(vocab_list) > 0 and not error, "error": error if error else None, } all_vocab_sets["vision_llm"] = {(v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list if v["english"] and v["german"]} except Exception as e: logger.error(f"Vision LLM failed: {e}") methods_results["vision_llm"] = { "name": "Vision LLM", "model": VISION_MODEL, "duration_seconds": 0, "vocabulary_count": 0, "vocabulary": [], "confidence": 0, "success": False, "error": str(e), } all_vocab_sets["vision_llm"] = set() # --- Method: Tesseract OCR (bounding boxes + vocab extraction) --- if TESSERACT_AVAILABLE: try: start = time.time() tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu") duration = time.time() - start tess_vocab = tess_result.get("vocabulary", []) tess_words = tess_result.get("words", []) # Store Tesseract words in session for later use (grid analysis, position matching) session["tesseract_words"] = tess_words session["tesseract_image_width"] = tess_result.get("image_width", 0) session["tesseract_image_height"] = tess_result.get("image_height", 0) session[f"tesseract_page_{page_number}"] = tess_result vocab_list_tess = [] for v in tess_vocab: vocab_list_tess.append({ "english": v.get("english", ""), "german": v.get("german", ""), "example": v.get("example", ""), }) methods_results["tesseract"] = { "name": "Tesseract OCR", "model": "tesseract-ocr (eng+deu)", "duration_seconds": round(duration, 1), "vocabulary_count": len(vocab_list_tess), "vocabulary": vocab_list_tess, "confidence": 0.7 if tess_vocab else 0, "success": len(vocab_list_tess) > 0, "error": tess_result.get("error"), "word_count": tess_result.get("word_count", 0), "columns_detected": len(tess_result.get("columns", [])), } all_vocab_sets["tesseract"] = { (v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list_tess if v["english"] and v["german"] } # Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]: llm_vocab_with_bbox = match_positions_to_vocab( tess_words, methods_results["vision_llm"]["vocabulary"], tess_result.get("image_width", 1), tess_result.get("image_height", 1), ) methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox except Exception as e: logger.error(f"Tesseract failed: {e}") import traceback logger.debug(traceback.format_exc()) methods_results["tesseract"] = { "name": "Tesseract OCR", "model": "tesseract-ocr", "duration_seconds": 0, "vocabulary_count": 0, "vocabulary": [], "confidence": 0, "success": False, "error": str(e), } all_vocab_sets["tesseract"] = set() # --- Method: CV Pipeline (Document Reconstruction) --- if CV_PIPELINE_AVAILABLE: try: start = time.time() cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number) duration = time.time() - start cv_vocab = cv_result.vocabulary if not cv_result.error else [] vocab_list_cv = [] for v in cv_vocab: vocab_list_cv.append({ "english": v.get("english", ""), "german": v.get("german", ""), "example": v.get("example", ""), }) methods_results["cv_pipeline"] = { "name": "CV Pipeline (Document Reconstruction)", "model": "opencv + tesseract (multi-pass)", "duration_seconds": round(duration, 1), "vocabulary_count": len(vocab_list_cv), "vocabulary": vocab_list_cv, "confidence": 0.8 if cv_vocab else 0, "success": len(vocab_list_cv) > 0, "error": cv_result.error, "word_count": cv_result.word_count, "columns_detected": cv_result.columns_detected, "stages": cv_result.stages, } all_vocab_sets["cv_pipeline"] = { (v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list_cv if v["english"] and v["german"] } except Exception as e: logger.error(f"CV Pipeline failed: {e}") import traceback logger.debug(traceback.format_exc()) methods_results["cv_pipeline"] = { "name": "CV Pipeline (Document Reconstruction)", "model": "opencv + tesseract (multi-pass)", "duration_seconds": 0, "vocabulary_count": 0, "vocabulary": [], "confidence": 0, "success": False, "error": str(e), } all_vocab_sets["cv_pipeline"] = set() # --- Build comparison --- all_unique = set() for vs in all_vocab_sets.values(): all_unique |= vs found_by_all = [] found_by_some = [] for english, german in sorted(all_unique): found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs] entry = {"english": english, "german": german, "methods": found_in} if len(found_in) == len(all_vocab_sets): found_by_all.append(entry) else: found_by_some.append(entry) total_methods = max(len(all_vocab_sets), 1) agreement_rate = len(found_by_all) / max(len(all_unique), 1) if all_unique else 0 # Find best method best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm" return { "session_id": session_id, "page_number": page_number, "methods": methods_results, "comparison": { "found_by_all_methods": found_by_all, "found_by_some_methods": found_by_some, "total_unique_vocabulary": len(all_unique), "agreement_rate": agreement_rate, }, "recommendation": { "best_method": best_method, "reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz", }, } @compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}") async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)): """ Analyze the grid/table structure of a vocabulary page. Hybrid approach: 1. If Tesseract bounding boxes are available (from compare-ocr), use them for real spatial positions via GridDetectionService. 2. Otherwise fall back to Vision LLM for grid structure detection. page_number is 0-indexed. Returns GridData structure expected by the frontend GridOverlay component. """ import httpx import time logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})") if session_id not in _get_sessions(): raise HTTPException(status_code=404, detail="Session not found") session = _get_sessions()[session_id] pdf_data = session.get("pdf_data") if not pdf_data: raise HTTPException(status_code=400, detail="No PDF uploaded for this session") page_count = session.get("pdf_page_count", 1) if page_number < 0 or page_number >= page_count: raise HTTPException(status_code=400, detail=f"Invalid page number.") # Convert page to image image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) # --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService --- tess_page_data = session.get(f"tesseract_page_{page_number}") if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE: try: # Run Tesseract if not already cached if not tess_page_data: logger.info("Running Tesseract for grid analysis (not cached)") from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess tess_page_data = await _run_tess(image_data, lang="eng+deu") session[f"tesseract_page_{page_number}"] = tess_page_data session["tesseract_words"] = tess_page_data.get("words", []) session["tesseract_image_width"] = tess_page_data.get("image_width", 0) session["tesseract_image_height"] = tess_page_data.get("image_height", 0) tess_words = tess_page_data.get("words", []) img_w = tess_page_data.get("image_width", 0) img_h = tess_page_data.get("image_height", 0) if tess_words and img_w > 0 and img_h > 0: service = GridDetectionService() regions = service.convert_tesseract_regions(tess_words, img_w, img_h) if regions: grid_result = service.detect_grid(regions) grid_dict = grid_result.to_dict() # Merge LLM text if available (better quality than Tesseract text) # The LLM vocab was stored during compare-ocr grid_dict["source"] = "tesseract+grid_service" grid_dict["word_count"] = len(tess_words) logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, " f"{grid_result.stats.get('recognized', 0)} recognized") return {"success": True, "grid": grid_dict} logger.info("Tesseract data insufficient, falling back to LLM") except Exception as e: logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}") import traceback logger.debug(traceback.format_exc()) # --- Strategy 2: Fall back to Vision LLM --- image_base64 = base64.b64encode(image_data).decode("utf-8") grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid. Your task: Identify the TABLE STRUCTURE and extract each cell's content. Return a JSON object with this EXACT structure: { "rows": , "columns": , "column_types": ["english", "german", "example"], "entries": [ { "row": 0, "col": 0, "text": "the word or phrase in this cell", "column_type": "english", "confidence": 0.95 } ] } Rules: - row and col are 0-indexed - column_type is one of: "english", "german", "example", "unknown" - Detect whether each column contains English words, German translations, or example sentences - Include ALL non-empty cells - confidence is 0.0-1.0 based on how clear the text is - If a cell is empty, don't include it - Return ONLY the JSON, no other text""" try: import asyncio raw_text = "" max_retries = 3 for attempt in range(max_retries): async with httpx.AsyncClient(timeout=300.0) as client: response = await client.post( f"{OLLAMA_URL}/api/chat", json={ "model": VISION_MODEL, "messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}], "stream": False, "options": {"temperature": 0.1, "num_predict": 8192}, }, timeout=300.0, ) if response.status_code == 500 and attempt < max_retries - 1: wait_time = 10 * (attempt + 1) logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})") await asyncio.sleep(wait_time) continue elif response.status_code != 200: error_detail = response.text[:200] if response.text else "Unknown error" return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."} raw_text = response.json().get("message", {}).get("content", "") break # Parse JSON from response import re json_match = re.search(r'\{[\s\S]*\}', raw_text) if not json_match: return {"success": False, "error": "Could not parse grid structure from LLM response"} grid_raw = json.loads(json_match.group()) num_rows = grid_raw.get("rows", 0) num_cols = grid_raw.get("columns", 0) column_types = grid_raw.get("column_types", []) entries = grid_raw.get("entries", []) if num_rows == 0 or num_cols == 0: return {"success": False, "error": "No grid structure detected"} # Ensure column_types has the right length while len(column_types) < num_cols: column_types.append("unknown") # Build cell grid with percentage-based coordinates row_height = 100.0 / num_rows col_width = 100.0 / num_cols # Track which cells have content cell_map = {} for entry in entries: r = entry.get("row", 0) c = entry.get("col", 0) cell_map[(r, c)] = entry cells = [] recognized_count = 0 empty_count = 0 problematic_count = 0 for r in range(num_rows): row_cells = [] for c in range(num_cols): x = c * col_width y = r * row_height if (r, c) in cell_map: entry = cell_map[(r, c)] text = entry.get("text", "").strip() conf = entry.get("confidence", 0.8) col_type = entry.get("column_type", column_types[c] if c < len(column_types) else "unknown") if text: status = "recognized" if conf >= 0.5 else "problematic" if status == "recognized": recognized_count += 1 else: problematic_count += 1 else: status = "empty" empty_count += 1 else: text = "" conf = 0.0 col_type = column_types[c] if c < len(column_types) else "unknown" status = "empty" empty_count += 1 row_cells.append({ "row": r, "col": c, "x": round(x, 2), "y": round(y, 2), "width": round(col_width, 2), "height": round(row_height, 2), "text": text, "confidence": conf, "status": status, "column_type": col_type, }) cells.append(row_cells) total = num_rows * num_cols coverage = (recognized_count + problematic_count) / max(total, 1) # Column and row boundaries as percentages col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)] row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)] grid_data = { "rows": num_rows, "columns": num_cols, "cells": cells, "column_types": column_types, "column_boundaries": col_boundaries, "row_boundaries": row_boundaries, "deskew_angle": 0.0, "source": "vision_llm", "stats": { "recognized": recognized_count, "problematic": problematic_count, "empty": empty_count, "manual": 0, "total": total, "coverage": round(coverage, 3), }, } return {"success": True, "grid": grid_data} except httpx.TimeoutException: logger.error("Grid analysis timed out") return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"} except Exception as e: logger.error(f"Grid analysis failed: {e}") import traceback logger.debug(traceback.format_exc()) return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}