feat(ocr): Add Ground Truth labeling UI for OCR comparison

Adds a step-through tool for creating 100% correct reference data (ground truth)
with position information. Users scan a page, review each vocabulary entry with
image crops, confirm or correct the OCR text, and save the result as JSON.

Backend: extract_entries_with_boxes() helper + 3 endpoints (extract-with-boxes,
ground-truth save/load). Frontend: GroundTruthPanel component with SVG overlay,
ImageCrop, keyboard shortcuts (Enter/Tab/arrows), and tab navigation in page.tsx.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
BreakPilot Dev
2026-02-10 09:04:36 +01:00
parent d4a23e8d99
commit 8c77df494b
4 changed files with 872 additions and 3 deletions

View File

@@ -2001,3 +2001,227 @@ async def load_latest_ocr_export():
data = json.load(f)
return data
# =============================================================================
# Ground Truth Labeling
# =============================================================================
# Directory where per-page ground-truth JSON files are persisted (created lazily on first save).
GROUND_TRUTH_DIR = os.path.join(LOCAL_STORAGE_PATH, "ground-truth")
async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Build vocabulary entries with spatial positions from a page image.

    Pipeline: Tesseract word boxes -> percentage-based OCR regions ->
    grid detection -> per-row grouping by detected column type.

    Args:
        image_bytes: Raw image data of the scanned page.
        lang: Tesseract language spec (default English + German).

    Returns:
        Dict with 'entries' (each carrying row_index, english, german, example,
        confidence, bbox, bbox_en, bbox_de, bbox_ex; all bbox coordinates in
        percent 0-100) plus 'image_width' / 'image_height' in pixels.

    Raises:
        HTTPException: 500 when Tesseract or GridDetectionService is unavailable.
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Word-level bounding boxes from Tesseract.
    tess = await extract_bounding_boxes(image_bytes, lang=lang)
    word_boxes = tess.get("words", [])
    width = tess.get("image_width", 0)
    height = tess.get("image_height", 0)

    def _empty_result() -> dict:
        # Shared shape for all early exits.
        return {"entries": [], "image_width": width, "image_height": height}

    if not word_boxes or width == 0 or height == 0:
        return _empty_result()

    # Convert to percentage-based regions, then detect the table grid.
    grid_service = GridDetectionService()
    ocr_regions = grid_service.convert_tesseract_regions(word_boxes, width, height)
    if not ocr_regions:
        return _empty_result()
    grid = grid_service.detect_grid(ocr_regions)
    if not grid.cells:
        return _empty_result()

    from services.grid_detection_service import ColumnType

    # Map each recognized column type to its slot key for per-row grouping.
    column_slots = {
        ColumnType.ENGLISH: "en",
        ColumnType.GERMAN: "de",
        ColumnType.EXAMPLE: "ex",
    }

    results = []
    for row_number, cells in enumerate(grid.cells):
        texts = {"en": "", "de": "", "ex": ""}
        boxes = {"en": None, "de": None, "ex": None}
        conf_total = 0.0
        conf_n = 0
        for cell in cells:
            cell_box = {"x": round(cell.x, 2), "y": round(cell.y, 2),
                        "w": round(cell.width, 2), "h": round(cell.height, 2)}
            slot = column_slots.get(cell.column_type)
            if slot is not None:
                texts[slot] = cell.text.strip()
                boxes[slot] = cell_box
            # Confidence averages over every non-blank cell, known column or not.
            if cell.text.strip():
                conf_total += cell.confidence
                conf_n += 1

        # Rows with no usable text in any known column are dropped entirely.
        if not any(texts.values()):
            continue

        # Whole-row bbox is the union of the positioned column boxes.
        positioned = [b for b in boxes.values() if b is not None]
        if positioned:
            left = min(b["x"] for b in positioned)
            top = min(b["y"] for b in positioned)
            right = max(b["x"] + b["w"] for b in positioned)
            bottom = max(b["y"] + b["h"] for b in positioned)
            whole_row = {"x": round(left, 2), "y": round(top, 2),
                         "w": round(right - left, 2), "h": round(bottom - top, 2)}
        else:
            # Fallback: thin full-width strip when no column had a position.
            whole_row = {"x": 0, "y": 0, "w": 100, "h": 3}

        # NOTE(review): assumes cell.confidence is on a 0-1 scale (scaled to
        # percent here) — confirm against GridDetectionService.
        avg_confidence = round((conf_total / conf_n * 100) if conf_n > 0 else 0, 1)
        results.append({
            "row_index": row_number,
            "english": texts["en"],
            "german": texts["de"],
            "example": texts["ex"],
            "confidence": avg_confidence,
            "bbox": whole_row,
            "bbox_en": boxes["en"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_de": boxes["de"] or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_ex": boxes["ex"] or {"x": 0, "y": 0, "w": 0, "h": 0},
        })
    return {"entries": results, "image_width": width, "image_height": height}
@router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Run box-aware OCR extraction on one page of a session's PDF.

    Uses Tesseract + GridDetectionService for spatial positioning.
    page_number is 0-indexed. Extracted entries are cached on the session
    under 'gt_entries', keyed by the page number as a string.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")
    session = _sessions.get(session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found")
    pdf_bytes = session.get("pdf_data")
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
    total_pages = session.get("pdf_page_count", 1)
    if not (0 <= page_number < total_pages):
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {total_pages} pages (0-indexed).")

    # Full-resolution render (thumbnail=False) so crops stay legible.
    page_image = await convert_pdf_page_to_image(pdf_bytes, page_number, thumbnail=False)
    extraction = await extract_entries_with_boxes(page_image)

    # Cache the entries on the session for the labeling workflow.
    session.setdefault("gt_entries", {})[str(page_number)] = extraction["entries"]

    return {
        "success": True,
        "entries": extraction["entries"],
        "entry_count": len(extraction["entries"]),
        "image_width": extraction["image_width"],
        "image_height": extraction["image_height"],
    }
@router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Persist ground truth labels for a page (session cache + JSON on disk).

    Expects body with 'entries' list - each entry has english, german, example,
    status ('confirmed' | 'edited' | 'skipped'), and bbox fields.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    labeled = data.get("entries", [])
    if not labeled:
        raise HTTPException(status_code=400, detail="No entries provided")

    # In-memory copy on the session for fast reloads within this run.
    _sessions[session_id].setdefault("ground_truth", {})[str(page_number)] = labeled

    # On-disk copy so labels survive server restarts.
    os.makedirs(GROUND_TRUTH_DIR, exist_ok=True)
    out_path = os.path.join(GROUND_TRUTH_DIR, f"{session_id}_page{page_number}.json")
    payload = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now().isoformat(),
        "entry_count": len(labeled),
        "entries": labeled,
    }
    with open(out_path, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)
    logger.info(f"Ground truth saved: {len(labeled)} entries to {out_path}")

    # Tally review statuses in a single pass.
    tally = {"confirmed": 0, "edited": 0, "skipped": 0}
    for entry in labeled:
        status = entry.get("status")
        if status in tally:
            tally[status] += 1

    return {
        "success": True,
        "saved_count": len(labeled),
        "confirmed": tally["confirmed"],
        "edited": tally["edited"],
        "skipped": tally["skipped"],
        "file_path": out_path,
    }
@router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Return saved ground truth for a page, preferring the session cache."""
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Cheapest path: the in-memory copy kept on the session.
    in_memory = _sessions[session_id].get("ground_truth", {}).get(str(page_number))
    if in_memory:
        return {"success": True, "entries": in_memory, "source": "cache"}

    # Fall back to the JSON file written at save time.
    disk_path = os.path.join(GROUND_TRUTH_DIR, f"{session_id}_page{page_number}.json")
    if not os.path.exists(disk_path):
        raise HTTPException(status_code=404, detail="No ground truth found for this page")
    with open(disk_path, 'r', encoding='utf-8') as fh:
        stored = json.load(fh)
    return {"success": True, "entries": stored.get("entries", []), "source": "disk"}