feat(ocr-pipeline): generic cell-grid with optional vocab mapping

Extract build_cell_grid() from build_word_grid() as a layout-agnostic foundation. Step 5 now produces a generic cell grid (columns × rows) and auto-detects whether a vocab layout is present. The frontend dynamically switches between the vocab table (EN/DE/Example) and a generic cell table based on the layout type.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 17:22:56 +01:00
parent 3bcb7aa638
commit 27b895a848
4 changed files with 802 additions and 301 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -31,8 +31,14 @@ from pydantic import BaseModel
 from cv_vocab_pipeline import (
    PageRegion,
    RowGeometry,
+    _cells_to_vocab_entries,
+    _fix_character_confusion,
+    _fix_phonetic_brackets,
+    _split_comma_entries,
+    _attach_example_sentences,
    analyze_layout,
    analyze_layout_by_words,
+    build_cell_grid,
    build_word_grid,
    classify_column_types,
    create_layout_image,
@@ -1075,35 +1081,60 @@ async def detect_words(session_id: str, engine: str = "auto", pronunciation: str
        for r in row_result["rows"]
    ]

-    # Build word grid — pass both binarized (for Tesseract) and BGR (for RapidOCR)
-    entries = build_word_grid(
+    # Build generic cell grid
+    cells, columns_meta = build_cell_grid(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
-        pronunciation=pronunciation,
    )
    duration = time.time() - t0

-    # Build summary
-    summary = {
-        "total_entries": len(entries),
-        "with_english": sum(1 for e in entries if e.get("english")),
-        "with_german": sum(1 for e in entries if e.get("german")),
-        "low_confidence": sum(1 for e in entries if e.get("confidence", 0) < 50),
-    }
+    # Layout detection
+    col_types = {c['type'] for c in columns_meta}
+    is_vocab = bool(col_types & {'column_en', 'column_de'})
+
+    # Count content rows and columns for grid_shape
+    n_content_rows = len([r for r in row_geoms if r.row_type == 'content'])
+    n_cols = len(columns_meta)

    # Determine which engine was actually used
-    used_engine = entries[0].get("ocr_engine", "tesseract") if entries else engine
+    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

+    # Grid result (always generic)
    word_result = {
-        "entries": entries,
-        "entry_count": len(entries),
+        "cells": cells,
+        "grid_shape": {
+            "rows": n_content_rows,
+            "cols": n_cols,
+            "total_cells": len(cells),
+        },
+        "columns_used": columns_meta,
+        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
-        "summary": summary,
        "ocr_engine": used_engine,
+        "summary": {
+            "total_cells": len(cells),
+            "non_empty_cells": sum(1 for c in cells if c.get("text")),
+            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
+        },
    }

+    # For vocab layout: add post-processed vocab_entries (backwards compat)
+    if is_vocab:
+        entries = _cells_to_vocab_entries(cells, columns_meta)
+        entries = _fix_character_confusion(entries)
+        entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
+        entries = _split_comma_entries(entries)
+        entries = _attach_example_sentences(entries)
+        word_result["vocab_entries"] = entries
+        # Also keep "entries" key for backwards compatibility
+        word_result["entries"] = entries
+        word_result["entry_count"] = len(entries)
+        word_result["summary"]["total_entries"] = len(entries)
+        word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
+        word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
+
    # Persist to DB
    await update_session_db(
        session_id,
@@ -1114,7 +1145,8 @@ async def detect_words(session_id: str, engine: str = "auto", pronunciation: str
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words session {session_id}: "
-                f"{len(entries)} entries ({duration:.2f}s), summary: {summary}")
+                f"layout={word_result['layout']}, "
+                f"{len(cells)} cells ({duration:.2f}s), summary: {word_result['summary']}")

    return {
        "session_id": session_id,
@@ -1232,17 +1264,19 @@ async def _get_rows_overlay(session_id: str) -> Response:


 async def _get_words_overlay(session_id: str) -> Response:
-    """Generate dewarped image with word grid cells drawn on it."""
+    """Generate dewarped image with cell grid drawn on it."""
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    word_result = session.get("word_result")
-    if not word_result or not word_result.get("entries"):
+    if not word_result:
        raise HTTPException(status_code=404, detail="No word data available")

-    column_result = session.get("column_result")
-    row_result = session.get("row_result")
+    # Support both new cell-based and legacy entry-based formats
+    cells = word_result.get("cells")
+    if not cells and not word_result.get("entries"):
+        raise HTTPException(status_code=404, detail="No word data available")

    # Load dewarped image
    dewarped_png = await get_session_image(session_id, "dewarped")
@@ -1256,80 +1290,105 @@ async def _get_words_overlay(session_id: str) -> Response:

    img_h, img_w = img.shape[:2]

-    # Color map for column types (BGR)
-    col_colors = {
-        "column_en": (255, 180, 0),      # Blue
-        "column_de": (0, 200, 0),         # Green
-        "column_example": (0, 140, 255),  # Orange
-    }
-
    overlay = img.copy()

-    # Build grid from column_result × row_result (the actual cells)
-    columns = []
-    if column_result and column_result.get("columns"):
-        columns = [c for c in column_result["columns"]
-                   if c.get("type", "").startswith("column_")]
+    if cells:
+        # New cell-based overlay: color by column index
+        col_palette = [
+            (255, 180, 0),      # Blue (BGR)
+            (0, 200, 0),        # Green
+            (0, 140, 255),      # Orange
+            (200, 100, 200),    # Purple
+            (200, 200, 0),      # Cyan
+            (100, 200, 200),    # Yellow-ish
+        ]

-    content_rows_data = []
-    if row_result and row_result.get("rows"):
-        content_rows_data = [r for r in row_result["rows"]
-                             if r.get("row_type") == "content"]
+        for cell in cells:
+            bbox = cell.get("bbox_px", {})
+            cx = bbox.get("x", 0)
+            cy = bbox.get("y", 0)
+            cw = bbox.get("w", 0)
+            ch = bbox.get("h", 0)
+            if cw <= 0 or ch <= 0:
+                continue

-    # Draw grid: column × row cells
-    for col in columns:
-        col_type = col.get("type", "")
-        color = col_colors.get(col_type, (200, 200, 200))
-        cx, cw = col["x"], col["width"]
+            col_idx = cell.get("col_index", 0)
+            color = col_palette[col_idx % len(col_palette)]

-        for row in content_rows_data:
-            ry, rh = row["y"], row["height"]
-            # Cell rectangle (exact grid intersection, no padding)
-            cv2.rectangle(img, (cx, ry), (cx + cw, ry + rh), color, 1)
+            # Cell rectangle border
+            cv2.rectangle(img, (cx, cy), (cx + cw, cy + ch), color, 1)
            # Semi-transparent fill
-            cv2.rectangle(overlay, (cx, ry), (cx + cw, ry + rh), color, -1)
+            cv2.rectangle(overlay, (cx, cy), (cx + cw, cy + ch), color, -1)

-    # Place OCR text labels inside grid cells
-    # Build lookup: row_index → entry for fast access
-    entries = word_result["entries"]
-    entry_by_row: Dict[int, Dict] = {}
-    for entry in entries:
-        entry_by_row[entry.get("row_index", -1)] = entry
+            # Cell-ID label (top-left corner)
+            cell_id = cell.get("cell_id", "")
+            cv2.putText(img, cell_id, (cx + 2, cy + 10),
+                        cv2.FONT_HERSHEY_SIMPLEX, 0.28, color, 1)

-    for row_idx, row in enumerate(content_rows_data):
-        entry = entry_by_row.get(row_idx)
-        if not entry:
-            continue
+            # Text label (bottom of cell)
+            text = cell.get("text", "")
+            if text:
+                conf = cell.get("confidence", 0)
+                if conf >= 70:
+                    text_color = (0, 180, 0)
+                elif conf >= 50:
+                    text_color = (0, 180, 220)
+                else:
+                    text_color = (0, 0, 220)

-        conf = entry.get("confidence", 0)
-        if conf >= 70:
-            text_color = (0, 180, 0)
-        elif conf >= 50:
-            text_color = (0, 180, 220)
-        else:
-            text_color = (0, 0, 220)
+                label = text.replace('\n', ' ')[:30]
+                cv2.putText(img, label, (cx + 3, cy + ch - 4),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.35, text_color, 1)
+    else:
+        # Legacy fallback: entry-based overlay (for old sessions)
+        column_result = session.get("column_result")
+        row_result = session.get("row_result")
+        col_colors = {
+            "column_en": (255, 180, 0),
+            "column_de": (0, 200, 0),
+            "column_example": (0, 140, 255),
+        }

-        ry, rh = row["y"], row["height"]
+        columns = []
+        if column_result and column_result.get("columns"):
+            columns = [c for c in column_result["columns"]
+                       if c.get("type", "").startswith("column_")]
+
+        content_rows_data = []
+        if row_result and row_result.get("rows"):
+            content_rows_data = [r for r in row_result["rows"]
+                                 if r.get("row_type") == "content"]

        for col in columns:
            col_type = col.get("type", "")
+            color = col_colors.get(col_type, (200, 200, 200))
            cx, cw = col["x"], col["width"]
+            for row in content_rows_data:
+                ry, rh = row["y"], row["height"]
+                cv2.rectangle(img, (cx, ry), (cx + cw, ry + rh), color, 1)
+                cv2.rectangle(overlay, (cx, ry), (cx + cw, ry + rh), color, -1)

-            # Pick the right text field for this column
-            if col_type == "column_en":
-                text = entry.get("english", "")
-            elif col_type == "column_de":
-                text = entry.get("german", "")
-            elif col_type == "column_example":
-                text = entry.get("example", "")
-            else:
-                text = ""
+        entries = word_result["entries"]
+        entry_by_row: Dict[int, Dict] = {}
+        for entry in entries:
+            entry_by_row[entry.get("row_index", -1)] = entry

-            if text:
-                label = text.replace('\n', ' ')[:30]
-                font_scale = 0.35
-                cv2.putText(img, label, (cx + 3, ry + rh - 4),
-                            cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, 1)
+        for row_idx, row in enumerate(content_rows_data):
+            entry = entry_by_row.get(row_idx)
+            if not entry:
+                continue
+            conf = entry.get("confidence", 0)
+            text_color = (0, 180, 0) if conf >= 70 else (0, 180, 220) if conf >= 50 else (0, 0, 220)
+            ry, rh = row["y"], row["height"]
+            for col in columns:
+                col_type = col.get("type", "")
+                cx, cw = col["x"], col["width"]
+                field = {"column_en": "english", "column_de": "german", "column_example": "example"}.get(col_type, "")
+                text = entry.get(field, "") if field else ""
+                if text:
+                    label = text.replace('\n', ' ')[:30]
+                    cv2.putText(img, label, (cx + 3, ry + rh - 4),
+                                cv2.FONT_HERSHEY_SIMPLEX, 0.35, text_color, 1)

    # Blend overlay at 10% opacity
    cv2.addWeighted(overlay, 0.1, img, 0.9, 0, img)