Add Box-Grid-Review step (Step 11) to OCR pipeline

New pipeline step between Gutter Repair and Ground Truth that processes embedded boxes (grammar tips, exercises) independently from the main grid.

Backend:
- cv_box_layout.py: classify_box_layout() detects flowing/columnar/bullet_list/header_only layout types per box
- build_box_zone_grid(): layout-aware grid building (single-column for flowing text, independent columns for tabular content)
- POST /sessions/{id}/build-box-grids endpoint with SmartSpellChecker
- Layout type overridable per box via request body

Frontend:
- StepBoxGridReview.tsx: shows each box with cropped image + editable GridTable. Layout type dropdown per box. Auto-builds on first load.
- Auto-skip when no boxes detected on page
- Pipeline steps updated: 13 steps (0-12), Ground Truth moved to 12

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 17:26:06 +02:00
parent 52637778b9
commit 5da9a550bf
6 changed files with 661 additions and 2 deletions
@@ -0,0 +1,256 @@
+"""
+Box layout classifier — detects internal layout type of embedded boxes.
+
+Classifies each box as: flowing | columnar | bullet_list | header_only
+and provides layout-appropriate grid building.
+
+Used by the Box-Grid-Review step to rebuild box zones with correct structure.
+"""
+
+import logging
+import re
+import statistics
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Bullet / list-item patterns anchored at the start of a string.
+# Three alternatives: a dash/bullet glyph, a 1-2 digit number followed by
+# "." or ")", or a single lowercase letter followed by "." or ")".
+# Every alternative requires one whitespace character right after the marker,
+# so a string that consists of the marker alone (e.g. "1.") does not match.
+_BULLET_RE = re.compile(
+    r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s'  # dash, bullet chars
+    r'|^\d{1,2}[.)]\s'     # numbered: "1) " or "1. "
+    r'|^[a-z][.)]\s'       # lettered: "a) " or "a. "
+)
+
+
+def classify_box_layout(
+    words: List[Dict],
+    box_w: int,
+    box_h: int,
+) -> str:
+    """Classify the internal layout of a detected box.
+
+    The checks run in a fixed order and the first one that matches wins:
+    header_only -> bullet_list -> columnar -> flowing (default).
+
+    Args:
+        words: OCR word dicts within the box (with top, left, width, height, text)
+        box_w: Box width in pixels
+        box_h: Box height in pixels (not used by any of the current checks)
+
+    Returns:
+        'header_only' | 'bullet_list' | 'columnar' | 'flowing'
+    """
+    if not words:
+        return "header_only"
+
+    # Group words into lines by y-proximity
+    lines = _group_into_lines(words)
+
+    # Header only: very few words or a single line
+    total_words = sum(len(line) for line in lines)
+    if total_words <= 5 or len(lines) <= 1:
+        return "header_only"
+
+    # Bullet list: count the lines whose first token is a list marker
+    bullet_count = 0
+    for line in lines:
+        first_text = (line[0].get("text") or "").strip() if line else ""
+        # _BULLET_RE demands whitespace after the marker, but a single OCR
+        # token such as "1." or "a)" carries none, so the bare token could
+        # never match. Appending one space lets marker-only tokens match
+        # while "1. text" style strings keep matching as before.
+        if _BULLET_RE.match(first_text + " "):
+            bullet_count += 1
+        # Also accept bullet glyphs that the regex does not list
+        elif first_text in ("-", "–", "—", "•", "·", "▪", "▸"):
+            bullet_count += 1
+    # At least 40% of the lines (and no fewer than two) must carry a marker
+    if bullet_count >= len(lines) * 0.4 and bullet_count >= 2:
+        return "bullet_list"
+
+    # Columnar: check for multiple distinct x-clusters
+    if len(lines) >= 3 and _has_column_structure(words, box_w):
+        return "columnar"
+
+    # Default: flowing text
+    return "flowing"
+
+
+def _group_into_lines(words: List[Dict]) -> List[List[Dict]]:
+    """Bucket words into text lines based on how close their top edges are.
+
+    Words are visited in (top, left) order. A word joins the current line
+    when its top is within the tolerance of the top of the word that opened
+    that line; otherwise it opens a new line. The tolerance is half the
+    median positive word height (20 is assumed when no word has one), with
+    a floor of 5. Each returned line is ordered left to right.
+    """
+    if not words:
+        return []
+
+    ordered = sorted(words, key=lambda w: (w["top"], w["left"]))
+
+    positive_heights = [w["height"] for w in ordered if w.get("height", 0) > 0]
+    typical_height = statistics.median(positive_heights) if positive_heights else 20
+    tolerance = max(typical_height * 0.5, 5)
+
+    buckets: List[List[Dict]] = []
+    anchor_top = None  # top of the word that opened the newest bucket
+    for word in ordered:
+        if buckets and abs(word["top"] - anchor_top) <= tolerance:
+            buckets[-1].append(word)
+        else:
+            buckets.append([word])
+            anchor_top = word["top"]
+
+    return [sorted(bucket, key=lambda w: w["left"]) for bucket in buckets]
+
+
+def _has_column_structure(words: List[Dict], box_w: int) -> bool:
+    """Check if words have multiple distinct left-edge clusters (columns).
+
+    Args:
+        words: OCR word dicts; each needs 'top' and 'left'.
+        box_w: Box width in pixels; a non-positive width yields False.
+
+    Returns:
+        True when, among the sorted left edges of all non-first words, at
+        least one pair of neighbours is more than 15% of the box width apart.
+        False when there are fewer than 3 lines or fewer than 4 such edges.
+    """
+    if box_w <= 0:
+        return False
+
+    lines = _group_into_lines(words)
+    if len(lines) < 3:
+        return False
+
+    # Collect left-edges of non-first words in each line
+    # (first word of each line often aligns regardless of columns)
+    left_edges = sorted(w["left"] for line in lines for w in line[1:])
+    if len(left_edges) < 4:
+        return False
+
+    # A column gap is typically > 15% of box width
+    column_gap_threshold = box_w * 0.15
+    return any(
+        right - left > column_gap_threshold
+        for left, right in zip(left_edges, left_edges[1:])
+    )
+
+
+def build_box_zone_grid(
+    zone_words: List[Dict],
+    box_x: int,
+    box_y: int,
+    box_w: int,
+    box_h: int,
+    zone_index: int,
+    img_w: int,
+    img_h: int,
+    layout_type: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Build a grid for a box zone with layout-aware processing.
+
+    If layout_type is None, empty, or not one of the four known layout
+    names, the layout is auto-detected with classify_box_layout().
+    For 'flowing' and 'bullet_list', forces single-column layout
+    (one row and one cell per text line).
+    For 'columnar', uses the standard multi-column detection.
+    For 'header_only', creates a single cell holding all text.
+
+    Args:
+        zone_words: OCR word dicts that lie inside the box.
+        box_x, box_y, box_w, box_h: Box rectangle in pixels.
+        zone_index: Index of the zone; used in cell ids and log output.
+        img_w, img_h: Image size, forwarded to the columnar grid builder.
+        layout_type: Optional forced layout type.
+
+    Returns:
+        Dict with columns, rows, cells and header_rows, plus
+        'box_layout_type' and 'box_grid_reviewed' (always False here).
+    """
+    known_layouts = ("header_only", "bullet_list", "columnar", "flowing")
+
+    # A forced layout may originate from a request body. An unknown value
+    # used to fall through to the columnar builder and was stored verbatim
+    # as box_layout_type; treat it as "not specified" instead.
+    if layout_type and layout_type not in known_layouts:
+        logger.warning(
+            "Box zone %d: unknown layout_type %r, falling back to auto-detection",
+            zone_index, layout_type,
+        )
+        layout_type = None
+
+    if not zone_words:
+        return {
+            "columns": [],
+            "rows": [],
+            "cells": [],
+            "header_rows": [],
+            "box_layout_type": layout_type or "header_only",
+            "box_grid_reviewed": False,
+        }
+
+    # Auto-detect layout if not specified
+    if not layout_type:
+        layout_type = classify_box_layout(zone_words, box_w, box_h)
+
+    logger.info(
+        "Box zone %d: layout_type=%s, %d words, %dx%d",
+        zone_index, layout_type, len(zone_words), box_w, box_h,
+    )
+
+    if layout_type == "header_only":
+        # Single cell with all text concatenated in reading order
+        reading_order = sorted(zone_words, key=lambda ww: (ww["top"], ww["left"]))
+        all_text = " ".join(w.get("text", "") for w in reading_order).strip()
+        return {
+            "columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}],
+            "rows": [{"index": 0, "row_index": 0, "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2}],
+            "cells": [{
+                "cell_id": f"Z{zone_index}_R0C0",
+                "row_index": 0,
+                "col_index": 0,
+                "col_type": "column_1",
+                "text": all_text,
+                "word_boxes": zone_words,
+            }],
+            "header_rows": [0],
+            "box_layout_type": layout_type,
+            "box_grid_reviewed": False,
+        }
+
+    if layout_type in ("flowing", "bullet_list"):
+        # Force single column — each line becomes one row with one cell
+        lines = _group_into_lines(zone_words)
+        column = {"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}
+        rows = []
+        cells = []
+
+        for row_idx, line_words in enumerate(lines):
+            if not line_words:
+                continue
+            y_min = min(w["top"] for w in line_words)
+            # _group_into_lines tolerates words without a height, so do the
+            # same here instead of raising KeyError on such a word.
+            y_max = max(w["top"] + w.get("height", 0) for w in line_words)
+
+            rows.append({
+                "index": row_idx,
+                "row_index": row_idx,
+                "y_min": y_min,
+                "y_max": y_max,
+                "y_center": (y_min + y_max) / 2,
+            })
+            cells.append({
+                "cell_id": f"Z{zone_index}_R{row_idx}C0",
+                "row_index": row_idx,
+                "col_index": 0,
+                "col_type": "column_1",
+                "text": " ".join(w.get("text", "") for w in line_words).strip(),
+                "word_boxes": line_words,
+            })
+
+        # Header heuristic (text only): the first line counts as a header
+        # when it is short, all-caps, or ends with ':'
+        header_rows = []
+        if len(lines) >= 2:
+            first_text = " ".join(w.get("text", "") for w in lines[0]).strip()
+            if (len(first_text) < 40
+                    or first_text.isupper()
+                    or first_text.endswith(':')):
+                header_rows = [0]
+
+        return {
+            "columns": [column],
+            "rows": rows,
+            "cells": cells,
+            "header_rows": header_rows,
+            "box_layout_type": layout_type,
+            "box_grid_reviewed": False,
+        }
+
+    # Columnar: use standard grid builder with independent column detection.
+    # Imported here because only this branch needs the helper.
+    from grid_editor_helpers import _build_zone_grid
+
+    result = _build_zone_grid(
+        zone_words, box_x, box_y, box_w, box_h,
+        zone_index, img_w, img_h,
+        global_columns=None,  # detect columns independently
+    )
+    result["box_layout_type"] = layout_type
+    result["box_grid_reviewed"] = False
+    return result
@@ -2181,3 +2181,117 @@ async def gutter_repair_apply(session_id: str, request: Request):
    )

    return result
+
+
+# ---------------------------------------------------------------------------
+# Box-Grid-Review endpoints
+# ---------------------------------------------------------------------------
+
+@router.post("/sessions/{session_id}/build-box-grids")
+async def build_box_grids(session_id: str, request: Request):
+    """Rebuild grid structure for all box zones with layout-aware detection.
+
+    For each zone with zone_type='box' that has a non-empty bbox and words:
+    1. Auto-detect layout type (flowing / columnar / bullet_list / header_only)
+       unless the request forces one for that zone
+    2. Build grid with layout-appropriate parameters
+    3. Apply SmartSpellChecker corrections (skipped if smart_spell is not importable)
+    4. Store results back in grid_editor_result.zones[]
+
+    Optional body: { "overrides": { "2": "bullet_list" } }
+    Maps zone_index → forced layout_type. A missing, unparsable or
+    non-object body is treated as "no overrides".
+
+    Raises:
+        HTTPException 404: session not found.
+        HTTPException 400: session has no grid_editor_result.
+
+    Returns:
+        Dict with session_id, box_zones_rebuilt, spell_fixes and zones.
+    """
+    session = await get_session_db(session_id)
+    if not session:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
+
+    grid_data = session.get("grid_editor_result")
+    if not grid_data:
+        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")
+
+    word_result = session.get("word_result") or {}
+    all_words = word_result.get("cells") or word_result.get("words") or []
+
+    # The body is optional, so a parse failure is not an error. Valid JSON
+    # need not be an object though (e.g. `[]`), and "overrides" may be null,
+    # so check the shapes before calling .get() on them.
+    try:
+        body = await request.json()
+    except Exception:
+        logger.debug(
+            "build-box-grids session %s: no usable JSON body, using no overrides",
+            session_id,
+        )
+        body = {}
+    layout_overrides = body.get("overrides") if isinstance(body, dict) else None
+    if not isinstance(layout_overrides, dict):
+        layout_overrides = {}
+
+    from cv_box_layout import build_box_zone_grid
+    from grid_editor_helpers import _words_in_zone
+
+    img_w = grid_data.get("image_width", 0)
+    img_h = grid_data.get("image_height", 0)
+
+    zones = grid_data.get("zones", [])
+    box_count = 0
+    spell_fixes = 0
+
+    # The spell checker is created on first use and shared by all boxes,
+    # instead of being imported and constructed again for every box.
+    spell_checker = None
+    spell_checker_missing = False
+
+    for z in zones:
+        if z.get("zone_type") != "box":
+            continue
+
+        bbox = z.get("bbox_px", {})
+        bx, by = bbox.get("x", 0), bbox.get("y", 0)
+        bw, bh = bbox.get("w", 0), bbox.get("h", 0)
+
+        if bw <= 0 or bh <= 0:
+            continue
+
+        zone_idx = z.get("zone_index", 0)
+
+        # Filter words inside this box
+        zone_words = _words_in_zone(all_words, by, bh, bx, bw)
+        if not zone_words:
+            logger.info("Box zone %d: no words found in bbox", zone_idx)
+            continue
+
+        # JSON object keys are strings, hence the str() lookup
+        forced_layout = layout_overrides.get(str(zone_idx))
+
+        # Build box grid
+        box_grid = build_box_zone_grid(
+            zone_words, bx, by, bw, bh,
+            zone_idx, img_w, img_h,
+            layout_type=forced_layout,
+        )
+
+        # Apply SmartSpellChecker to all box cells (best effort)
+        if spell_checker is None and not spell_checker_missing:
+            try:
+                from smart_spell import SmartSpellChecker
+                spell_checker = SmartSpellChecker()
+            except ImportError:
+                spell_checker_missing = True
+                logger.info(
+                    "build-box-grids session %s: smart_spell not available, "
+                    "skipping spell correction",
+                    session_id,
+                )
+        if spell_checker is not None:
+            for cell in box_grid.get("cells", []):
+                text = cell.get("text", "")
+                if not text:
+                    continue
+                result = spell_checker.correct_text(text, lang="auto")
+                if result.changed:
+                    cell["text"] = result.corrected
+                    spell_fixes += 1
+
+        # Update zone data with new grid
+        z["columns"] = box_grid["columns"]
+        z["rows"] = box_grid["rows"]
+        z["cells"] = box_grid["cells"]
+        z["header_rows"] = box_grid.get("header_rows", [])
+        z["box_layout_type"] = box_grid.get("box_layout_type", "flowing")
+        z["box_grid_reviewed"] = False
+        box_count += 1
+
+    # Save updated grid back
+    await update_session_db(session_id, grid_editor_result=grid_data)
+
+    logger.info(
+        "build-box-grids session %s: %d box zones rebuilt, %d spell fixes",
+        session_id, box_count, spell_fixes,
+    )
+
+    return {
+        "session_id": session_id,
+        "box_zones_rebuilt": box_count,
+        "spell_fixes": spell_fixes,
+        "zones": zones,
+    }