Fix overlay word leak, ghost filter false positive, merged zone header

1. Filter words inside image_overlays (removes OCR text picked up from images).
2. Ghost filter: only remove single-char border artifacts, not multi-char tokens like "(=", which are real content.
3. Skip first-row header detection for zones with image_overlays (merged geometry creates artificial gaps).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 13:56:04 +01:00
parent df30d4eae3
commit e3395ae8cf
2 changed files with 129 additions and 22 deletions
@@ -320,7 +320,7 @@ def _filter_border_ghosts(
        )
        if not on_border:
            return False
-        if all(c in _GRID_GHOST_CHARS for c in text):
+        if len(text) == 1 and text in _GRID_GHOST_CHARS:
            return True
        return False
@@ -656,6 +656,7 @@ def _detect_header_rows(
    zone_words: List[Dict],
    zone_y: int,
    columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
 ) -> List[int]:
    """Detect header rows: first-row heuristic + spanning header detection.
@@ -666,27 +667,29 @@ def _detect_header_rows(
        return []
    headers = []
    first_row = rows[0]
    second_row = rows[1]
-    # Gap between first and second row > 0.5x average row height
+    if not skip_first_row_header:
-    avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
+        first_row = rows[0]
-    gap = second_row["y_min"] - first_row["y_max"]
+        second_row = rows[1]
    if gap > avg_h * 0.5:
        headers.append(0)
-    # Also check if first row words are taller than average (bold/header text)
+        # Gap between first and second row > 0.5x average row height
-    all_heights = [w["height"] for w in zone_words]
+        avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
-    median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
+        gap = second_row["y_min"] - first_row["y_max"]
-    first_row_words = [
+        if gap > avg_h * 0.5:
-        w for w in zone_words
+            headers.append(0)
-        if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
+
-    ]
+        # Also check if first row words are taller than average (bold/header text)
-    if first_row_words:
+        all_heights = [w["height"] for w in zone_words]
-        first_h = max(w["height"] for w in first_row_words)
+        median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
-        if first_h > median_h * 1.3:
+        first_row_words = [
-            if 0 not in headers:
+            w for w in zone_words
-                headers.append(0)
+            if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
        ]
        if first_row_words:
            first_h = max(w["height"] for w in first_row_words)
            if first_h > median_h * 1.3:
                if 0 not in headers:
                    headers.append(0)
    # Note: Spanning-header detection (rows spanning all columns) has been
    # disabled because it produces too many false positives on vocabulary
@@ -707,6 +710,7 @@ def _build_zone_grid(
    img_w: int,
    img_h: int,
    global_columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
 ) -> Dict[str, Any]:
    """Build columns, rows, cells for a single zone from its words.
@@ -773,7 +777,8 @@ def _build_zone_grid(
        cell["zone_index"] = zone_index
    # Detect header rows (pass columns for spanning header detection)
-    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns)
+    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
                                      skip_first_row_header=skip_first_row_header)
    # Merge cells in spanning header rows into a single col-0 cell
    if header_rows and len(columns) >= 2:
@@ -1270,9 +1275,27 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                            "build-grid: filtered %d recovered artifacts from %s zone %d",
                            removed, pz.zone_type, pz.index,
                        )
                    # Filter words inside image overlay regions (merged box zones)
                    if pz.image_overlays:
                        before_ov = len(zone_words)
                        zone_words = [
                            w for w in zone_words
                            if not any(
                                ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
                                and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
                                for ov in pz.image_overlays
                            )
                        ]
                        ov_removed = before_ov - len(zone_words)
                        if ov_removed:
                            logger.info(
                                "build-grid: filtered %d words inside image overlays from zone %d",
                                ov_removed, pz.index,
                            )
                    grid = _build_zone_grid(
                        zone_words, pz.x, pz.y, pz.width, pz.height,
                        pz.index, img_w, img_h,
                        skip_first_row_header=bool(pz.image_overlays),
                    )
                    zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
@@ -1339,6 +1362,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                                        pz.width, pz.height,
                                        pz.index, img_w, img_h,
                                        global_columns=merged_columns,
                                        skip_first_row_header=bool(pz.image_overlays),
                                    )
                                    zg["grid"] = grid
                            logger.info(
@@ -1,9 +1,11 @@
 """
-Tests for grid_editor_api zone merging and heading detection.
+Tests for grid_editor_api zone merging, heading detection, and ghost filtering.
 Covers:
 - _merge_content_zones_across_boxes: zone merging logic
 - _detect_heading_rows_by_color: heading detection by color + height
 - _filter_border_ghosts: single-char ghost detection
 - _detect_header_rows: skip_first_row_header flag
 """
 import sys
@@ -13,6 +15,8 @@ import pytest
 from cv_vocab_types import PageZone, DetectedBox
 from grid_editor_api import (
    _merge_content_zones_across_boxes,
    _filter_border_ghosts,
    _detect_header_rows,
    _detect_heading_rows_by_color,
 )
@@ -358,3 +362,82 @@ class TestDetectHeadingRowsByColor:
        zones_data = [self._make_zone(cells, rows, columns)]
        count = _detect_heading_rows_by_color(zones_data, 800, 1000)
        assert count == 0
 # ---------------------------------------------------------------------------
 # _filter_border_ghosts (Fix 2: single-char only)
 # ---------------------------------------------------------------------------
 class TestFilterBorderGhosts:
    """Border-ghost filtering should drop one-character artifacts only."""

    def test_single_char_ghost_removed(self):
        """A lone '|' straddling the box's left edge is dropped; 'hello' stays."""
        border_box = DetectedBox(
            x=100, y=100, width=200, height=150,
            confidence=0.9, border_thickness=3,
        )
        pipe = {"text": "|", "left": 98, "top": 200, "width": 5, "height": 20}
        hello = {"text": "hello", "left": 150, "top": 150, "width": 80, "height": 20}
        kept, removed = _filter_border_ghosts([pipe, hello], [border_box])
        assert removed == 1
        # Exactly one survivor, and it is the regular word.
        assert [w["text"] for w in kept] == ["hello"]

    def test_multi_char_ghost_kept(self):
        """A two-character '(=' overlapping the box's left edge in x is kept."""
        border_box = DetectedBox(
            x=648, y=129, width=338, height=125,
            confidence=0.7, border_thickness=0,
        )
        candidates = [
            {"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17},
            {"text": "I", "left": 665, "top": 294, "width": 9, "height": 18},
        ]
        kept, removed = _filter_border_ghosts(candidates, [border_box])
        # Nothing removed, both words returned.
        assert (removed, len(kept)) == (0, 2)

    def test_single_paren_on_border_removed(self):
        """A lone ')' at the box's right edge (x = 300) is dropped."""
        border_box = DetectedBox(
            x=100, y=100, width=200, height=150,
            confidence=0.9, border_thickness=2,
        )
        paren = {"text": ")", "left": 299, "top": 200, "width": 4, "height": 7}
        kept, removed = _filter_border_ghosts([paren], [border_box])
        assert removed == 1
        assert not kept
 # ---------------------------------------------------------------------------
 # _detect_header_rows (Fix 3: skip_first_row_header)
 # ---------------------------------------------------------------------------
 class TestDetectHeaderRowsSkipFlag:
    """Exercise the skip_first_row_header switch of _detect_header_rows."""

    @staticmethod
    def _gapped_layout():
        """Build fresh (rows, words) for three 20px rows.

        The first row ends at y=120 and the second starts at y=160, so the
        gap after the first row (40px) is twice the row height. Each row
        holds one 20px-high word whose top is 5px below the row's y_min.
        """
        bounds = [(100, 120), (160, 180), (185, 205)]
        rows = [
            {"y_min": lo, "y_max": hi, "index": idx}
            for idx, (lo, hi) in enumerate(bounds)
        ]
        words = [
            {"height": 20, "top": lo + 5, "left": 10, "width": 80}
            for lo, _hi in bounds
        ]
        return rows, words

    def test_first_row_detected_without_flag(self):
        """Default call: the large gap marks row 0 as a header."""
        rows, words = self._gapped_layout()
        assert 0 in _detect_header_rows(rows, words, 0)

    def test_first_row_skipped_with_flag(self):
        """With skip_first_row_header=True the same layout yields no row-0 header."""
        rows, words = self._gapped_layout()
        detected = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
        assert 0 not in detected