Fix overlay word leak, ghost filter false positive, merged zone header
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 41s

1. Filter words inside image_overlays (removes OCR from images)
2. Ghost filter: only remove single-char border artifacts, not multi-char
   like (= which is real content
3. Skip first-row header detection for zones with image_overlays
   (merged geometry creates artificial gaps)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-19 13:56:04 +01:00
parent df30d4eae3
commit e3395ae8cf
2 changed files with 129 additions and 22 deletions

View File

@@ -320,7 +320,7 @@ def _filter_border_ghosts(
) )
if not on_border: if not on_border:
return False return False
if all(c in _GRID_GHOST_CHARS for c in text): if len(text) == 1 and text in _GRID_GHOST_CHARS:
return True return True
return False return False
@@ -656,6 +656,7 @@ def _detect_header_rows(
zone_words: List[Dict], zone_words: List[Dict],
zone_y: int, zone_y: int,
columns: Optional[List[Dict]] = None, columns: Optional[List[Dict]] = None,
skip_first_row_header: bool = False,
) -> List[int]: ) -> List[int]:
"""Detect header rows: first-row heuristic + spanning header detection. """Detect header rows: first-row heuristic + spanning header detection.
@@ -666,27 +667,29 @@ def _detect_header_rows(
return [] return []
headers = [] headers = []
first_row = rows[0]
second_row = rows[1]
# Gap between first and second row > 0.5x average row height if not skip_first_row_header:
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows) first_row = rows[0]
gap = second_row["y_min"] - first_row["y_max"] second_row = rows[1]
if gap > avg_h * 0.5:
headers.append(0)
# Also check if first row words are taller than average (bold/header text) # Gap between first and second row > 0.5x average row height
all_heights = [w["height"] for w in zone_words] avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20 gap = second_row["y_min"] - first_row["y_max"]
first_row_words = [ if gap > avg_h * 0.5:
w for w in zone_words headers.append(0)
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
] # Also check if first row words are taller than average (bold/header text)
if first_row_words: all_heights = [w["height"] for w in zone_words]
first_h = max(w["height"] for w in first_row_words) median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
if first_h > median_h * 1.3: first_row_words = [
if 0 not in headers: w for w in zone_words
headers.append(0) if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
]
if first_row_words:
first_h = max(w["height"] for w in first_row_words)
if first_h > median_h * 1.3:
if 0 not in headers:
headers.append(0)
# Note: Spanning-header detection (rows spanning all columns) has been # Note: Spanning-header detection (rows spanning all columns) has been
# disabled because it produces too many false positives on vocabulary # disabled because it produces too many false positives on vocabulary
@@ -707,6 +710,7 @@ def _build_zone_grid(
img_w: int, img_w: int,
img_h: int, img_h: int,
global_columns: Optional[List[Dict]] = None, global_columns: Optional[List[Dict]] = None,
skip_first_row_header: bool = False,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Build columns, rows, cells for a single zone from its words. """Build columns, rows, cells for a single zone from its words.
@@ -773,7 +777,8 @@ def _build_zone_grid(
cell["zone_index"] = zone_index cell["zone_index"] = zone_index
# Detect header rows (pass columns for spanning header detection) # Detect header rows (pass columns for spanning header detection)
header_rows = _detect_header_rows(rows, zone_words, zone_y, columns) header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
skip_first_row_header=skip_first_row_header)
# Merge cells in spanning header rows into a single col-0 cell # Merge cells in spanning header rows into a single col-0 cell
if header_rows and len(columns) >= 2: if header_rows and len(columns) >= 2:
@@ -1270,9 +1275,27 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"build-grid: filtered %d recovered artifacts from %s zone %d", "build-grid: filtered %d recovered artifacts from %s zone %d",
removed, pz.zone_type, pz.index, removed, pz.zone_type, pz.index,
) )
# Filter words inside image overlay regions (merged box zones)
if pz.image_overlays:
before_ov = len(zone_words)
zone_words = [
w for w in zone_words
if not any(
ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
for ov in pz.image_overlays
)
]
ov_removed = before_ov - len(zone_words)
if ov_removed:
logger.info(
"build-grid: filtered %d words inside image overlays from zone %d",
ov_removed, pz.index,
)
grid = _build_zone_grid( grid = _build_zone_grid(
zone_words, pz.x, pz.y, pz.width, pz.height, zone_words, pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h, pz.index, img_w, img_h,
skip_first_row_header=bool(pz.image_overlays),
) )
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid}) zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
@@ -1339,6 +1362,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
pz.width, pz.height, pz.width, pz.height,
pz.index, img_w, img_h, pz.index, img_w, img_h,
global_columns=merged_columns, global_columns=merged_columns,
skip_first_row_header=bool(pz.image_overlays),
) )
zg["grid"] = grid zg["grid"] = grid
logger.info( logger.info(

View File

@@ -1,9 +1,11 @@
""" """
Tests for grid_editor_api zone merging and heading detection. Tests for grid_editor_api zone merging, heading detection, and ghost filtering.
Covers: Covers:
- _merge_content_zones_across_boxes: zone merging logic - _merge_content_zones_across_boxes: zone merging logic
- _detect_heading_rows_by_color: heading detection by color + height - _detect_heading_rows_by_color: heading detection by color + height
- _filter_border_ghosts: single-char ghost detection
- _detect_header_rows: skip_first_row_header flag
""" """
import sys import sys
@@ -13,6 +15,8 @@ import pytest
from cv_vocab_types import PageZone, DetectedBox from cv_vocab_types import PageZone, DetectedBox
from grid_editor_api import ( from grid_editor_api import (
_merge_content_zones_across_boxes, _merge_content_zones_across_boxes,
_filter_border_ghosts,
_detect_header_rows,
_detect_heading_rows_by_color, _detect_heading_rows_by_color,
) )
@@ -358,3 +362,82 @@ class TestDetectHeadingRowsByColor:
zones_data = [self._make_zone(cells, rows, columns)] zones_data = [self._make_zone(cells, rows, columns)]
count = _detect_heading_rows_by_color(zones_data, 800, 1000) count = _detect_heading_rows_by_color(zones_data, 800, 1000)
assert count == 0 assert count == 0
# ---------------------------------------------------------------------------
# _filter_border_ghosts (Fix 2: single-char only)
# ---------------------------------------------------------------------------
class TestFilterBorderGhosts:
"""Test that ghost filtering only removes single-char words."""
def test_single_char_ghost_removed(self):
"""Single '|' on a box border → filtered as ghost."""
box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=3)
words = [
{"text": "|", "left": 98, "top": 200, "width": 5, "height": 20},
{"text": "hello", "left": 150, "top": 150, "width": 80, "height": 20},
]
filtered, count = _filter_border_ghosts(words, [box])
assert count == 1
assert len(filtered) == 1
assert filtered[0]["text"] == "hello"
def test_multi_char_ghost_kept(self):
"""Multi-char '(=' on a box border → NOT filtered (real content)."""
box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0)
words = [
{"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17},
{"text": "I", "left": 665, "top": 294, "width": 9, "height": 18},
]
filtered, count = _filter_border_ghosts(words, [box])
assert count == 0
assert len(filtered) == 2
def test_single_paren_on_border_removed(self):
"""Single ')' on border → filtered."""
box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=2)
words = [
{"text": ")", "left": 299, "top": 200, "width": 4, "height": 7},
]
filtered, count = _filter_border_ghosts(words, [box])
assert count == 1
assert len(filtered) == 0
# ---------------------------------------------------------------------------
# _detect_header_rows (Fix 3: skip_first_row_header)
# ---------------------------------------------------------------------------
class TestDetectHeaderRowsSkipFlag:
"""Test skip_first_row_header flag."""
def test_first_row_detected_without_flag(self):
"""Without flag, first row with big gap → header."""
rows = [
{"y_min": 100, "y_max": 120, "index": 0},
{"y_min": 160, "y_max": 180, "index": 1},
{"y_min": 185, "y_max": 205, "index": 2},
]
words = [
{"height": 20, "top": 105, "left": 10, "width": 80},
{"height": 20, "top": 165, "left": 10, "width": 80},
{"height": 20, "top": 190, "left": 10, "width": 80},
]
headers = _detect_header_rows(rows, words, 0)
assert 0 in headers
def test_first_row_skipped_with_flag(self):
"""With skip flag, first row NOT detected even with big gap."""
rows = [
{"y_min": 100, "y_max": 120, "index": 0},
{"y_min": 160, "y_max": 180, "index": 1},
{"y_min": 185, "y_max": 205, "index": 2},
]
words = [
{"height": 20, "top": 105, "left": 10, "width": 80},
{"height": 20, "top": 165, "left": 10, "width": 80},
{"height": 20, "top": 190, "left": 10, "width": 80},
]
headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
assert 0 not in headers