""" Vocabulary Worksheet Analysis API - OCR export, ground truth labeling, extract-with-boxes, deskewed images, and learning unit generation. The two large handlers (compare_ocr_methods, analyze_grid) live in vocab_worksheet_compare_api.py and are included via compare_router. """ from fastapi import APIRouter, Body, HTTPException from fastapi.responses import StreamingResponse from pydantic import BaseModel from typing import Optional, Dict, Any from datetime import datetime import os import io import json import logging def _get_sessions(): from vocab_worksheet_api import _sessions return _sessions def _get_local_storage_path(): from vocab_worksheet_api import LOCAL_STORAGE_PATH return LOCAL_STORAGE_PATH from vocab_worksheet_generation import convert_pdf_page_to_image # Try to import Tesseract extractor try: from tesseract_vocab_extractor import ( extract_bounding_boxes, TESSERACT_AVAILABLE, ) except ImportError: TESSERACT_AVAILABLE = False # Try to import Grid Detection Service try: from services.grid_detection_service import GridDetectionService GRID_SERVICE_AVAILABLE = True except ImportError: GRID_SERVICE_AVAILABLE = False logger = logging.getLogger(__name__) analysis_router = APIRouter() def _ocr_export_dir(): return os.path.join(_get_local_storage_path(), "ocr-exports") def _ground_truth_dir(): return os.path.join(_get_local_storage_path(), "ground-truth") # ============================================================================= # OCR Export Endpoints (for cross-app OCR data sharing) # ============================================================================= @analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}") async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)): """ Save OCR export data for cross-app sharing (admin-v2 -> studio-v2). Both apps proxy to klausur-service via /klausur-api/, so this endpoint serves as shared storage accessible from both ports. 
""" logger.info(f"Saving OCR export for session {session_id}, page {page_number}") os.makedirs(_ocr_export_dir(), exist_ok=True) # Save the export data export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json") with open(export_path, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) # Update latest pointer latest_path = os.path.join(_ocr_export_dir(), "latest.json") with open(latest_path, 'w', encoding='utf-8') as f: json.dump({ "session_id": session_id, "page_number": page_number, "saved_at": datetime.utcnow().isoformat(), }, f, ensure_ascii=False, indent=2) return { "success": True, "session_id": session_id, "page_number": page_number, "message": "OCR export saved successfully", } @analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}") async def load_ocr_export(session_id: str, page_number: int): """Load a specific OCR export by session and page number.""" export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json") if not os.path.exists(export_path): raise HTTPException(status_code=404, detail="OCR export not found") with open(export_path, 'r', encoding='utf-8') as f: data = json.load(f) return data @analysis_router.get("/ocr-export/latest") async def load_latest_ocr_export(): """Load the most recently saved OCR export data.""" latest_path = os.path.join(_ocr_export_dir(), "latest.json") if not os.path.exists(latest_path): raise HTTPException(status_code=404, detail="No OCR exports found") with open(latest_path, 'r', encoding='utf-8') as f: pointer = json.load(f) session_id = pointer.get("session_id") page_number = pointer.get("page_number") export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json") if not os.path.exists(export_path): raise HTTPException(status_code=404, detail="Latest OCR export file not found") with open(export_path, 'r', encoding='utf-8') as f: data = json.load(f) return data # 
# =============================================================================
# Extract with Boxes & Deskewed Image
# =============================================================================

async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Extract vocabulary entries with bounding boxes using Tesseract + GridDetectionService.

    Returns dict with 'entries' list and 'image_width'/'image_height'.
    Each entry has row_index, english, german, example, confidence, bbox,
    bbox_en, bbox_de, bbox_ex. All bbox coordinates are in percent (0-100).

    Raises HTTPException(500) when Tesseract or the grid service is unavailable.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Step 1: Tesseract word-level bounding boxes
    tess_result = await extract_bounding_boxes(image_bytes, lang=lang)
    words = tess_result.get("words", [])
    img_w = tess_result.get("image_width", 0)
    img_h = tess_result.get("image_height", 0)
    if not words or img_w == 0 or img_h == 0:
        return {"entries": [], "image_width": img_w, "image_height": img_h}

    # Step 2: Convert to OCR regions (percentage-based)
    service = GridDetectionService()
    regions = service.convert_tesseract_regions(words, img_w, img_h)
    if not regions:
        return {"entries": [], "image_width": img_w, "image_height": img_h}

    # Step 3: Detect grid
    grid_result = service.detect_grid(regions)
    if not grid_result.cells:
        return {"entries": [], "image_width": img_w, "image_height": img_h}

    # Step 4: Group cells by logical_row and column_type
    from services.grid_detection_service import ColumnType

    entries = []
    for row_idx, row_cells in enumerate(grid_result.cells):
        en_text = ""
        de_text = ""
        ex_text = ""
        en_bbox = None
        de_bbox = None
        ex_bbox = None
        row_conf_sum = 0.0
        row_conf_count = 0

        for cell in row_cells:
            cell_bbox = {"x": round(cell.x, 2), "y": round(cell.y, 2),
                         "w": round(cell.width, 2), "h": round(cell.height, 2)}
            if cell.column_type == ColumnType.ENGLISH:
                en_text = cell.text.strip()
                en_bbox = cell_bbox
            elif cell.column_type == ColumnType.GERMAN:
                de_text = cell.text.strip()
                de_bbox = cell_bbox
            elif cell.column_type == ColumnType.EXAMPLE:
                ex_text = cell.text.strip()
                ex_bbox = cell_bbox
            # Confidence is averaged over non-empty cells only.
            if cell.text.strip():
                row_conf_sum += cell.confidence
                row_conf_count += 1

        # Skip completely empty rows (note: row_index keeps the original grid row
        # number, so the sequence may have gaps).
        if not en_text and not de_text and not ex_text:
            continue

        # Calculate whole-row bounding box as the union of the per-column boxes.
        all_bboxes = [b for b in [en_bbox, de_bbox, ex_bbox] if b is not None]
        if all_bboxes:
            row_x = min(b["x"] for b in all_bboxes)
            row_y = min(b["y"] for b in all_bboxes)
            row_right = max(b["x"] + b["w"] for b in all_bboxes)
            row_bottom = max(b["y"] + b["h"] for b in all_bboxes)
            row_bbox = {"x": round(row_x, 2), "y": round(row_y, 2),
                        "w": round(row_right - row_x, 2), "h": round(row_bottom - row_y, 2)}
        else:
            # Fallback: full-width thin strip when no column box exists.
            row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3}

        avg_conf = round((row_conf_sum / row_conf_count * 100) if row_conf_count > 0 else 0, 1)

        entries.append({
            "row_index": row_idx,
            "english": en_text,
            "german": de_text,
            "example": ex_text,
            "confidence": avg_conf,
            "bbox": row_bbox,
            "bbox_en": en_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_de": de_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_ex": ex_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
        })

    return {"entries": entries, "image_width": img_w, "image_height": img_h}


@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int, lang: str = "eng+deu"):
    """Extract vocabulary entries with bounding boxes for ground truth labeling.

    Uses Tesseract + GridDetectionService for spatial positioning.
    page_number is 0-indexed. `lang` (query parameter) selects the Tesseract
    language pack(s); defaults to the previously hard-coded "eng+deu".
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to hires image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Deskew image before OCR; best-effort — OCR proceeds on the original on failure.
    deskew_angle = 0.0
    try:
        from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
        if CV2_AVAILABLE:
            image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
            logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
    except Exception as e:
        logger.warning(f"Deskew failed for page {page_number}: {e}")

    # Cache deskewed image in session for later serving
    if "deskewed_images" not in session:
        session["deskewed_images"] = {}
    session["deskewed_images"][str(page_number)] = image_data

    # Extract entries with boxes (now on deskewed image)
    result = await extract_entries_with_boxes(image_data, lang=lang)

    # Cache in session
    if "gt_entries" not in session:
        session["gt_entries"] = {}
    session["gt_entries"][str(page_number)] = result["entries"]

    return {
        "success": True,
        "entries": result["entries"],
        "entry_count": len(result["entries"]),
        "image_width": result["image_width"],
        "image_height": result["image_height"],
        "deskew_angle": round(deskew_angle, 2),
        "deskewed": abs(deskew_angle) > 0.05,
    }
""" logger.info(f"Extract with boxes for session {session_id}, page {page_number}") if session_id not in _get_sessions(): raise HTTPException(status_code=404, detail="Session not found") session = _get_sessions()[session_id] pdf_data = session.get("pdf_data") if not pdf_data: raise HTTPException(status_code=400, detail="No PDF uploaded for this session") page_count = session.get("pdf_page_count", 1) if page_number < 0 or page_number >= page_count: raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).") # Convert page to hires image image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) # Deskew image before OCR deskew_angle = 0.0 try: from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE if CV2_AVAILABLE: image_data, deskew_angle = deskew_image_by_word_alignment(image_data) logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}") except Exception as e: logger.warning(f"Deskew failed for page {page_number}: {e}") # Cache deskewed image in session for later serving if "deskewed_images" not in session: session["deskewed_images"] = {} session["deskewed_images"][str(page_number)] = image_data # Extract entries with boxes (now on deskewed image) result = await extract_entries_with_boxes(image_data) # Cache in session if "gt_entries" not in session: session["gt_entries"] = {} session["gt_entries"][str(page_number)] = result["entries"] return { "success": True, "entries": result["entries"], "entry_count": len(result["entries"]), "image_width": result["image_width"], "image_height": result["image_height"], "deskew_angle": round(deskew_angle, 2), "deskewed": abs(deskew_angle) > 0.05, } @analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}") async def get_deskewed_image(session_id: str, page_number: int): """Return the deskewed page image as PNG. Falls back to the original hires image if no deskewed version is cached. 
""" if session_id not in _get_sessions(): raise HTTPException(status_code=404, detail="Session not found") session = _get_sessions()[session_id] deskewed = session.get("deskewed_images", {}).get(str(page_number)) if deskewed: return StreamingResponse(io.BytesIO(deskewed), media_type="image/png") # Fallback: render original hires image pdf_data = session.get("pdf_data") if not pdf_data: raise HTTPException(status_code=400, detail="No PDF uploaded for this session") image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) return StreamingResponse(io.BytesIO(image_data), media_type="image/png") # ============================================================================= # Ground Truth Labeling # ============================================================================= @analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}") async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)): """Save ground truth labels for a page. Expects body with 'entries' list - each entry has english, german, example, status ('confirmed' | 'edited' | 'skipped'), and bbox fields. 
""" logger.info(f"Save ground truth for session {session_id}, page {page_number}") if session_id not in _get_sessions(): raise HTTPException(status_code=404, detail="Session not found") entries = data.get("entries", []) if not entries: raise HTTPException(status_code=400, detail="No entries provided") # Save in session session = _get_sessions()[session_id] if "ground_truth" not in session: session["ground_truth"] = {} session["ground_truth"][str(page_number)] = entries # Also save to disk os.makedirs(_ground_truth_dir(), exist_ok=True) gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json") gt_data = { "session_id": session_id, "page_number": page_number, "saved_at": datetime.now().isoformat(), "entry_count": len(entries), "entries": entries, } with open(gt_path, 'w', encoding='utf-8') as f: json.dump(gt_data, f, ensure_ascii=False, indent=2) logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}") confirmed = sum(1 for e in entries if e.get("status") == "confirmed") edited = sum(1 for e in entries if e.get("status") == "edited") skipped = sum(1 for e in entries if e.get("status") == "skipped") return { "success": True, "saved_count": len(entries), "confirmed": confirmed, "edited": edited, "skipped": skipped, "file_path": gt_path, } @analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}") async def load_ground_truth(session_id: str, page_number: int): """Load saved ground truth for a page.""" logger.info(f"Load ground truth for session {session_id}, page {page_number}") if session_id not in _get_sessions(): raise HTTPException(status_code=404, detail="Session not found") # Try session cache first session = _get_sessions()[session_id] cached = session.get("ground_truth", {}).get(str(page_number)) if cached: return {"success": True, "entries": cached, "source": "cache"} # Try disk gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json") if not os.path.exists(gt_path): raise 
HTTPException(status_code=404, detail="No ground truth found for this page") with open(gt_path, 'r', encoding='utf-8') as f: gt_data = json.load(f) return {"success": True, "entries": gt_data.get("entries", []), "source": "disk"} # ─── Learning Module Generation ───────────────────────────────────────────── class GenerateLearningUnitRequest(BaseModel): grade: Optional[str] = None generate_modules: bool = True @analysis_router.post("/sessions/{session_id}/generate-learning-unit") async def generate_learning_unit_endpoint(session_id: str, request: GenerateLearningUnitRequest = None): """ Create a Learning Unit from the vocabulary in this session. 1. Takes vocabulary from the session 2. Creates a Learning Unit in backend-lehrer 3. Optionally triggers MC/Cloze/QA generation Returns the created unit info and generation status. """ if request is None: request = GenerateLearningUnitRequest() if session_id not in _get_sessions(): raise HTTPException(status_code=404, detail="Session not found") session = _get_sessions()[session_id] vocabulary = session.get("vocabulary", []) if not vocabulary: raise HTTPException(status_code=400, detail="No vocabulary in this session") try: from vocab_learn_bridge import create_learning_unit, generate_learning_modules # Step 1: Create Learning Unit result = await create_learning_unit( session_name=session["name"], vocabulary=vocabulary, grade=request.grade, ) # Step 2: Generate modules if requested if request.generate_modules: try: gen_result = await generate_learning_modules( unit_id=result["unit_id"], analysis_path=result["analysis_path"], ) result["generation"] = gen_result except Exception as e: logger.warning(f"Module generation failed (unit created): {e}") result["generation"] = {"status": "error", "reason": str(e)} return result except ImportError: raise HTTPException(status_code=501, detail="vocab_learn_bridge module not available") except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) except RuntimeError as e: 
raise HTTPException(status_code=502, detail=str(e)) # ============================================================================= # Include compare_ocr_methods & analyze_grid from companion module # ============================================================================= from vocab_worksheet_compare_api import compare_router # noqa: E402 analysis_router.include_router(compare_router)