Fix overlay word leak, ghost filter false positive, merged zone header

1. Filter out words located inside image_overlays (removes OCR text picked up from images).
2. Ghost filter: only remove single-character border artifacts, not multi-character tokens such as "(=", which are real content.
3. Skip first-row header detection for zones with image_overlays (the merged geometry creates artificial gaps).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 13:56:04 +01:00
parent df30d4eae3
commit e3395ae8cf
2 changed files with 129 additions and 22 deletions
@@ -320,7 +320,7 @@ def _filter_border_ghosts(
        )
        if not on_border:
            return False
-        if all(c in _GRID_GHOST_CHARS for c in text):
+        if len(text) == 1 and text in _GRID_GHOST_CHARS:
            return True
        return False

@@ -656,6 +656,7 @@ def _detect_header_rows(
    zone_words: List[Dict],
    zone_y: int,
    columns: Optional[List[Dict]] = None,
+    skip_first_row_header: bool = False,
 ) -> List[int]:
    """Detect header rows: first-row heuristic + spanning header detection.

@@ -666,27 +667,29 @@ def _detect_header_rows(
        return []

    headers = []
-    first_row = rows[0]
-    second_row = rows[1]

-    # Gap between first and second row > 0.5x average row height
-    avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
-    gap = second_row["y_min"] - first_row["y_max"]
-    if gap > avg_h * 0.5:
-        headers.append(0)
+    if not skip_first_row_header:
+        first_row = rows[0]
+        second_row = rows[1]

-    # Also check if first row words are taller than average (bold/header text)
-    all_heights = [w["height"] for w in zone_words]
-    median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
-    first_row_words = [
-        w for w in zone_words
-        if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
-    ]
-    if first_row_words:
-        first_h = max(w["height"] for w in first_row_words)
-        if first_h > median_h * 1.3:
-            if 0 not in headers:
-                headers.append(0)
+        # Gap between first and second row > 0.5x average row height
+        avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
+        gap = second_row["y_min"] - first_row["y_max"]
+        if gap > avg_h * 0.5:
+            headers.append(0)
+
+        # Also check if first row words are taller than average (bold/header text)
+        all_heights = [w["height"] for w in zone_words]
+        median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
+        first_row_words = [
+            w for w in zone_words
+            if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
+        ]
+        if first_row_words:
+            first_h = max(w["height"] for w in first_row_words)
+            if first_h > median_h * 1.3:
+                if 0 not in headers:
+                    headers.append(0)

    # Note: Spanning-header detection (rows spanning all columns) has been
    # disabled because it produces too many false positives on vocabulary
@@ -707,6 +710,7 @@ def _build_zone_grid(
    img_w: int,
    img_h: int,
    global_columns: Optional[List[Dict]] = None,
+    skip_first_row_header: bool = False,
 ) -> Dict[str, Any]:
    """Build columns, rows, cells for a single zone from its words.

@@ -773,7 +777,8 @@ def _build_zone_grid(
        cell["zone_index"] = zone_index

    # Detect header rows (pass columns for spanning header detection)
-    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns)
+    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
+                                      skip_first_row_header=skip_first_row_header)

    # Merge cells in spanning header rows into a single col-0 cell
    if header_rows and len(columns) >= 2:
@@ -1270,9 +1275,27 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                            "build-grid: filtered %d recovered artifacts from %s zone %d",
                            removed, pz.zone_type, pz.index,
                        )
+                    # Filter words inside image overlay regions (merged box zones)
+                    if pz.image_overlays:
+                        before_ov = len(zone_words)
+                        zone_words = [
+                            w for w in zone_words
+                            if not any(
+                                ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
+                                and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
+                                for ov in pz.image_overlays
+                            )
+                        ]
+                        ov_removed = before_ov - len(zone_words)
+                        if ov_removed:
+                            logger.info(
+                                "build-grid: filtered %d words inside image overlays from zone %d",
+                                ov_removed, pz.index,
+                            )
                    grid = _build_zone_grid(
                        zone_words, pz.x, pz.y, pz.width, pz.height,
                        pz.index, img_w, img_h,
+                        skip_first_row_header=bool(pz.image_overlays),
                    )
                    zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})

@@ -1339,6 +1362,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                                        pz.width, pz.height,
                                        pz.index, img_w, img_h,
                                        global_columns=merged_columns,
+                                        skip_first_row_header=bool(pz.image_overlays),
                                    )
                                    zg["grid"] = grid
                            logger.info(
@@ -1,9 +1,11 @@
 """
-Tests for grid_editor_api zone merging and heading detection.
+Tests for grid_editor_api zone merging, heading detection, and ghost filtering.

 Covers:
 - _merge_content_zones_across_boxes: zone merging logic
 - _detect_heading_rows_by_color: heading detection by color + height
+- _filter_border_ghosts: single-char ghost detection
+- _detect_header_rows: skip_first_row_header flag
 """

 import sys
@@ -13,6 +15,8 @@ import pytest
 from cv_vocab_types import PageZone, DetectedBox
 from grid_editor_api import (
    _merge_content_zones_across_boxes,
+    _filter_border_ghosts,
+    _detect_header_rows,
    _detect_heading_rows_by_color,
 )

@@ -358,3 +362,82 @@ class TestDetectHeadingRowsByColor:
        zones_data = [self._make_zone(cells, rows, columns)]
        count = _detect_heading_rows_by_color(zones_data, 800, 1000)
        assert count == 0
+
+
+# ---------------------------------------------------------------------------
+# _filter_border_ghosts (Fix 2: single-char only)
+# ---------------------------------------------------------------------------
+
+class TestFilterBorderGhosts:
+    """Test that ghost filtering only removes single-char words."""
+
+    def test_single_char_ghost_removed(self):
+        """Single '|' on a box border → filtered as ghost."""
+        box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=3)
+        words = [
+            {"text": "|", "left": 98, "top": 200, "width": 5, "height": 20},
+            {"text": "hello", "left": 150, "top": 150, "width": 80, "height": 20},
+        ]
+        filtered, count = _filter_border_ghosts(words, [box])
+        assert count == 1
+        assert len(filtered) == 1
+        assert filtered[0]["text"] == "hello"
+
+    def test_multi_char_ghost_kept(self):
+        """Multi-char '(=' on a box border → NOT filtered (real content)."""
+        box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0)
+        words = [
+            {"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17},
+            {"text": "I", "left": 665, "top": 294, "width": 9, "height": 18},
+        ]
+        filtered, count = _filter_border_ghosts(words, [box])
+        assert count == 0
+        assert len(filtered) == 2
+
+    def test_single_paren_on_border_removed(self):
+        """Single ')' on border → filtered."""
+        box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=2)
+        words = [
+            {"text": ")", "left": 299, "top": 200, "width": 4, "height": 7},
+        ]
+        filtered, count = _filter_border_ghosts(words, [box])
+        assert count == 1
+        assert len(filtered) == 0
+
+
+# ---------------------------------------------------------------------------
+# _detect_header_rows (Fix 3: skip_first_row_header)
+# ---------------------------------------------------------------------------
+
+class TestDetectHeaderRowsSkipFlag:
+    """Test skip_first_row_header flag."""
+
+    def test_first_row_detected_without_flag(self):
+        """Without flag, first row with big gap → header."""
+        rows = [
+            {"y_min": 100, "y_max": 120, "index": 0},
+            {"y_min": 160, "y_max": 180, "index": 1},
+            {"y_min": 185, "y_max": 205, "index": 2},
+        ]
+        words = [
+            {"height": 20, "top": 105, "left": 10, "width": 80},
+            {"height": 20, "top": 165, "left": 10, "width": 80},
+            {"height": 20, "top": 190, "left": 10, "width": 80},
+        ]
+        headers = _detect_header_rows(rows, words, 0)
+        assert 0 in headers
+
+    def test_first_row_skipped_with_flag(self):
+        """With skip flag, first row NOT detected even with big gap."""
+        rows = [
+            {"y_min": 100, "y_max": 120, "index": 0},
+            {"y_min": 160, "y_max": 180, "index": 1},
+            {"y_min": 185, "y_max": 205, "index": 2},
+        ]
+        words = [
+            {"height": 20, "top": 105, "left": 10, "width": 80},
+            {"height": 20, "top": 165, "left": 10, "width": 80},
+            {"height": 20, "top": 190, "left": 10, "width": 80},
+        ]
+        headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
+        assert 0 not in headers