Add zone merging across images + heading detection by color/height

Zone merging: content zones separated by box zones (images) are merged into a single content zone that carries the boxes as image_overlays, so tables split by an image reconnect.

Heading detection: after color annotation, rows in which every word is non-black and the mean word height exceeds 1.2x the zone's median word height are merged into a single spanning heading cell.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 12:22:11 +01:00
parent 2e6ab3a646
commit df30d4eae3
3 changed files with 586 additions and 0 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -21,6 +21,7 @@ import numpy as np
 from fastapi import APIRouter, HTTPException, Request

 from cv_box_detect import detect_boxes, split_page_into_zones
+from cv_vocab_types import PageZone
 from cv_color_detect import detect_word_colors, recover_colored_text
 from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa
 from cv_words_first import _cluster_rows, _build_cells
@@ -439,6 +440,217 @@ def _words_in_zone(
    return result


+def _merge_content_zones_across_boxes(
+    zones: List,
+    content_x: int,
+    content_w: int,
+) -> List:
+    """Merge content zones separated by box zones into single zones.
+
+    Pattern: [content, box+, content] → [merged_content with overlays].
+    Absorbed boxes become ``image_overlays`` of the merged zone; a run of
+    boxes NOT followed by a content zone stays as standalone zones.
+    Lists with fewer than 3 zones are returned unchanged (not re-indexed).
+    """
+    if len(zones) < 3:
+        return zones
+
+    result: List = []
+    i = 0
+    while i < len(zones):
+        z = zones[i]
+        if z.zone_type != "content":
+            result.append(z)
+            i += 1
+            continue
+
+        # Start of a potential merge group: content zone
+        group_contents = [z]
+        group_boxes = []
+        j = i + 1
+        while j < len(zones):
+            # Look ahead over a run of one or more consecutive boxes.
+            k = j
+            while k < len(zones) and zones[k].zone_type == "box":
+                k += 1
+            # Absorb the run only when a content zone closes it.
+            if k == j or k >= len(zones) or zones[k].zone_type != "content":
+                break
+            group_boxes.extend(zones[j:k])
+            group_contents.append(zones[k])
+            j = k + 1
+
+        if not group_boxes:
+            # No merge possible — emit just the content zone
+            result.append(z)
+            i += 1
+            continue
+
+        # Merge: one content zone spanning every absorbed content zone
+        y_min = min(c.y for c in group_contents)
+        y_max = max(c.y + c.height for c in group_contents)
+        overlays = []
+        for bz in group_boxes:
+            overlay = {
+                "y": bz.y,
+                "height": bz.height,
+                "x": bz.x,
+                "width": bz.width,
+            }
+            if bz.box:
+                overlay["box"] = {
+                    "x": bz.box.x,
+                    "y": bz.box.y,
+                    "width": bz.box.width,
+                    "height": bz.box.height,
+                    "confidence": bz.box.confidence,
+                    "border_thickness": bz.box.border_thickness,
+                }
+            overlays.append(overlay)
+
+        result.append(PageZone(
+            index=0,  # re-indexed below
+            zone_type="content",
+            y=y_min,
+            height=y_max - y_min,
+            x=content_x,
+            width=content_w,
+            image_overlays=overlays,
+        ))
+        i = j
+
+    # Re-index zones
+    for idx, z in enumerate(result):
+        z.index = idx
+
+    logger.info(
+        "zone-merge: %d zones → %d zones after merging across boxes",
+        len(zones), len(result),
+    )
+    return result
+
+
+def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int:
+    """Detect heading rows by color + height after color annotation.
+
+    A row is a heading if:
+    1. ALL word_boxes have color_name != 'black' (typically 'blue')
+    2. Mean word height > 1.2x median height of all words in the zone
+
+    Zones with fewer than 2 columns are skipped, as are rows already
+    flagged ``is_header``.  Each detected row is marked ``is_header`` and
+    its cells are replaced in ``z["cells"]`` by one spanning cell with
+    ``col_type == "heading"`` (a lone cell is only re-typed in place).
+    ``img_w``/``img_h`` scale the spanning cell's ``bbox_pct``.
+    Returns count of headings detected.
+    """
+    heading_count = 0
+
+    for z in zones_data:
+        cells = z.get("cells", [])
+        rows = z.get("rows", [])
+        columns = z.get("columns", [])
+        if not cells or not rows or len(columns) < 2:
+            continue
+
+        # Index cells by row once instead of rescanning them per row.
+        cells_by_row: Dict = {}
+        for cell in cells:
+            cells_by_row.setdefault(cell.get("row_index"), []).append(cell)
+
+        # Median word height across the zone (upper median if even count)
+        all_heights = sorted(
+            wb.get("height", 0)
+            for cell in cells
+            for wb in cell.get("word_boxes") or []
+            if wb.get("height", 0) > 0
+        )
+        if not all_heights:
+            continue
+        median_h = all_heights[len(all_heights) // 2]
+
+        heading_row_indices = []
+        for row in rows:
+            if row.get("is_header"):
+                continue  # already detected as header
+            ri = row["index"]
+            row_wbs = [
+                wb for cell in cells_by_row.get(ri, [])
+                for wb in cell.get("word_boxes") or []
+            ]
+            if not row_wbs:
+                continue
+
+            # Condition 1: ALL words are non-black
+            if any(wb.get("color_name", "black") == "black" for wb in row_wbs):
+                continue
+
+            # Condition 2: mean height > 1.2x median
+            mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs)
+            if mean_h <= median_h * 1.2:
+                continue
+
+            heading_row_indices.append(ri)
+
+        # Merge heading cells into spanning cells
+        for hri in heading_row_indices:
+            header_cells = cells_by_row.get(hri, [])
+            if not header_cells:
+                continue
+
+            if len(header_cells) == 1:
+                # Single cell — just mark it as heading
+                header_cells[0]["col_type"] = "heading"
+            else:
+                # Collect word_boxes and text from all columns (None-safe)
+                all_wb = []
+                all_text_parts = []
+                for hc in sorted(header_cells, key=lambda c: c["col_index"]):
+                    all_wb.extend(hc.get("word_boxes") or [])
+                    text = (hc.get("text") or "").strip()
+                    if text:
+                        all_text_parts.append(text)
+
+                # Remove all cells for this row, replace with one spanning cell
+                z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
+
+                if all_wb:
+                    x_min = min(wb["left"] for wb in all_wb)
+                    y_min = min(wb["top"] for wb in all_wb)
+                    x_max = max(wb["left"] + wb["width"] for wb in all_wb)
+                    y_max = max(wb["top"] + wb["height"] for wb in all_wb)
+
+                    zone_idx = z.get("zone_index", 0)
+                    z["cells"].append({
+                        "cell_id": f"Z{zone_idx}_R{hri:02d}_C0",
+                        "zone_index": zone_idx,
+                        "row_index": hri,
+                        "col_index": 0,
+                        "col_type": "heading",
+                        "text": " ".join(all_text_parts),
+                        "confidence": 0.0,
+                        "bbox_px": {"x": x_min, "y": y_min,
+                                    "w": x_max - x_min, "h": y_max - y_min},
+                        "bbox_pct": {
+                            "x": round(x_min / img_w * 100, 2) if img_w else 0,
+                            "y": round(y_min / img_h * 100, 2) if img_h else 0,
+                            "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
+                            "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
+                        },
+                        "word_boxes": all_wb,
+                        "ocr_engine": "words_first",
+                        "is_bold": True,
+                    })
+
+            # Mark row as header
+            for row in rows:
+                if row["index"] == hri:
+                    row["is_header"] = True
+            heading_count += 1
+
+    return heading_count
+
+
 def _detect_header_rows(
    rows: List[Dict],
    zone_words: List[Dict],
@@ -1023,6 +1235,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                    content_x, content_y, content_w, content_h, boxes
                )

+                # Merge content zones separated by box zones
+                page_zones = _merge_content_zones_across_boxes(
+                    page_zones, content_x, content_w
+                )
+
                # --- Union columns from all content zones ---
                # Each content zone detects columns independently.  Narrow
                # columns (page refs, markers) may appear in only one zone.
@@ -1161,6 +1378,9 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                            "confidence": pz.box.confidence,
                        }

+                    if pz.image_overlays:
+                        zone_entry["image_overlays"] = pz.image_overlays
+
                    zones_data.append(zone_entry)

    # 4. Fallback: no boxes detected → single zone with all words
@@ -1282,6 +1502,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                all_wb.extend(cell.get("word_boxes", []))
        detect_word_colors(img_bgr, all_wb)

+    # 5a. Heading detection by color + height (after color is available)
+    heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
+    if heading_count:
+        logger.info("Detected %d heading rows by color+height", heading_count)
+
    # 5b. Fix unmatched parentheses in cell text
    # OCR often misses opening "(" while detecting closing ")".
    # If a cell's text has ")" without a matching "(", prepend "(".