Fix overlay word leak, ghost filter false positive, merged zone header
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 41s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 41s
1. Filter words inside image_overlays (removes OCR text picked up from images).
2. Ghost filter: only remove single-character border artifacts, not multi-character tokens such as "(=", which are real content.
3. Skip first-row header detection for zones with image_overlays (the merged geometry creates artificial gaps).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -320,7 +320,7 @@ def _filter_border_ghosts(
|
||||
)
|
||||
if not on_border:
|
||||
return False
|
||||
if all(c in _GRID_GHOST_CHARS for c in text):
|
||||
if len(text) == 1 and text in _GRID_GHOST_CHARS:
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -656,6 +656,7 @@ def _detect_header_rows(
|
||||
zone_words: List[Dict],
|
||||
zone_y: int,
|
||||
columns: Optional[List[Dict]] = None,
|
||||
skip_first_row_header: bool = False,
|
||||
) -> List[int]:
|
||||
"""Detect header rows: first-row heuristic + spanning header detection.
|
||||
|
||||
@@ -666,27 +667,29 @@ def _detect_header_rows(
|
||||
return []
|
||||
|
||||
headers = []
|
||||
first_row = rows[0]
|
||||
second_row = rows[1]
|
||||
|
||||
# Gap between first and second row > 0.5x average row height
|
||||
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
|
||||
gap = second_row["y_min"] - first_row["y_max"]
|
||||
if gap > avg_h * 0.5:
|
||||
headers.append(0)
|
||||
if not skip_first_row_header:
|
||||
first_row = rows[0]
|
||||
second_row = rows[1]
|
||||
|
||||
# Also check if first row words are taller than average (bold/header text)
|
||||
all_heights = [w["height"] for w in zone_words]
|
||||
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
|
||||
first_row_words = [
|
||||
w for w in zone_words
|
||||
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
|
||||
]
|
||||
if first_row_words:
|
||||
first_h = max(w["height"] for w in first_row_words)
|
||||
if first_h > median_h * 1.3:
|
||||
if 0 not in headers:
|
||||
headers.append(0)
|
||||
# Gap between first and second row > 0.5x average row height
|
||||
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
|
||||
gap = second_row["y_min"] - first_row["y_max"]
|
||||
if gap > avg_h * 0.5:
|
||||
headers.append(0)
|
||||
|
||||
# Also check if first row words are taller than the median word height (bold/header text)
|
||||
all_heights = [w["height"] for w in zone_words]
|
||||
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
|
||||
first_row_words = [
|
||||
w for w in zone_words
|
||||
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
|
||||
]
|
||||
if first_row_words:
|
||||
first_h = max(w["height"] for w in first_row_words)
|
||||
if first_h > median_h * 1.3:
|
||||
if 0 not in headers:
|
||||
headers.append(0)
|
||||
|
||||
# Note: Spanning-header detection (rows spanning all columns) has been
|
||||
# disabled because it produces too many false positives on vocabulary
|
||||
@@ -707,6 +710,7 @@ def _build_zone_grid(
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
global_columns: Optional[List[Dict]] = None,
|
||||
skip_first_row_header: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
"""Build columns, rows, cells for a single zone from its words.
|
||||
|
||||
@@ -773,7 +777,8 @@ def _build_zone_grid(
|
||||
cell["zone_index"] = zone_index
|
||||
|
||||
# Detect header rows (pass columns for spanning header detection)
|
||||
header_rows = _detect_header_rows(rows, zone_words, zone_y, columns)
|
||||
header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
|
||||
skip_first_row_header=skip_first_row_header)
|
||||
|
||||
# Merge cells in spanning header rows into a single col-0 cell
|
||||
if header_rows and len(columns) >= 2:
|
||||
@@ -1270,9 +1275,27 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
"build-grid: filtered %d recovered artifacts from %s zone %d",
|
||||
removed, pz.zone_type, pz.index,
|
||||
)
|
||||
# Filter words inside image overlay regions (merged box zones)
|
||||
if pz.image_overlays:
|
||||
before_ov = len(zone_words)
|
||||
zone_words = [
|
||||
w for w in zone_words
|
||||
if not any(
|
||||
ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
|
||||
and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
|
||||
for ov in pz.image_overlays
|
||||
)
|
||||
]
|
||||
ov_removed = before_ov - len(zone_words)
|
||||
if ov_removed:
|
||||
logger.info(
|
||||
"build-grid: filtered %d words inside image overlays from zone %d",
|
||||
ov_removed, pz.index,
|
||||
)
|
||||
grid = _build_zone_grid(
|
||||
zone_words, pz.x, pz.y, pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
skip_first_row_header=bool(pz.image_overlays),
|
||||
)
|
||||
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
|
||||
|
||||
@@ -1339,6 +1362,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
global_columns=merged_columns,
|
||||
skip_first_row_header=bool(pz.image_overlays),
|
||||
)
|
||||
zg["grid"] = grid
|
||||
logger.info(
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
"""
|
||||
Tests for grid_editor_api zone merging and heading detection.
|
||||
Tests for grid_editor_api zone merging, heading detection, and ghost filtering.
|
||||
|
||||
Covers:
|
||||
- _merge_content_zones_across_boxes: zone merging logic
|
||||
- _detect_heading_rows_by_color: heading detection by color + height
|
||||
- _filter_border_ghosts: single-char ghost detection
|
||||
- _detect_header_rows: skip_first_row_header flag
|
||||
"""
|
||||
|
||||
import sys
|
||||
@@ -13,6 +15,8 @@ import pytest
|
||||
from cv_vocab_types import PageZone, DetectedBox
|
||||
from grid_editor_api import (
|
||||
_merge_content_zones_across_boxes,
|
||||
_filter_border_ghosts,
|
||||
_detect_header_rows,
|
||||
_detect_heading_rows_by_color,
|
||||
)
|
||||
|
||||
@@ -358,3 +362,82 @@ class TestDetectHeadingRowsByColor:
|
||||
zones_data = [self._make_zone(cells, rows, columns)]
|
||||
count = _detect_heading_rows_by_color(zones_data, 800, 1000)
|
||||
assert count == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _filter_border_ghosts (Fix 2: single-char only)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFilterBorderGhosts:
|
||||
"""Test that ghost filtering only removes single-char words."""
|
||||
|
||||
def test_single_char_ghost_removed(self):
|
||||
"""Single '|' on a box border → filtered as ghost."""
|
||||
box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=3)
|
||||
words = [
|
||||
{"text": "|", "left": 98, "top": 200, "width": 5, "height": 20},
|
||||
{"text": "hello", "left": 150, "top": 150, "width": 80, "height": 20},
|
||||
]
|
||||
filtered, count = _filter_border_ghosts(words, [box])
|
||||
assert count == 1
|
||||
assert len(filtered) == 1
|
||||
assert filtered[0]["text"] == "hello"
|
||||
|
||||
def test_multi_char_ghost_kept(self):
|
||||
"""Multi-char '(=' on a box border → NOT filtered (real content)."""
|
||||
box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0)
|
||||
words = [
|
||||
{"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17},
|
||||
{"text": "I", "left": 665, "top": 294, "width": 9, "height": 18},
|
||||
]
|
||||
filtered, count = _filter_border_ghosts(words, [box])
|
||||
assert count == 0
|
||||
assert len(filtered) == 2
|
||||
|
||||
def test_single_paren_on_border_removed(self):
|
||||
"""Single ')' on border → filtered."""
|
||||
box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=2)
|
||||
words = [
|
||||
{"text": ")", "left": 299, "top": 200, "width": 4, "height": 7},
|
||||
]
|
||||
filtered, count = _filter_border_ghosts(words, [box])
|
||||
assert count == 1
|
||||
assert len(filtered) == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _detect_header_rows (Fix 3: skip_first_row_header)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDetectHeaderRowsSkipFlag:
|
||||
"""Test skip_first_row_header flag."""
|
||||
|
||||
def test_first_row_detected_without_flag(self):
|
||||
"""Without flag, first row with big gap → header."""
|
||||
rows = [
|
||||
{"y_min": 100, "y_max": 120, "index": 0},
|
||||
{"y_min": 160, "y_max": 180, "index": 1},
|
||||
{"y_min": 185, "y_max": 205, "index": 2},
|
||||
]
|
||||
words = [
|
||||
{"height": 20, "top": 105, "left": 10, "width": 80},
|
||||
{"height": 20, "top": 165, "left": 10, "width": 80},
|
||||
{"height": 20, "top": 190, "left": 10, "width": 80},
|
||||
]
|
||||
headers = _detect_header_rows(rows, words, 0)
|
||||
assert 0 in headers
|
||||
|
||||
def test_first_row_skipped_with_flag(self):
|
||||
"""With skip flag, first row NOT detected even with big gap."""
|
||||
rows = [
|
||||
{"y_min": 100, "y_max": 120, "index": 0},
|
||||
{"y_min": 160, "y_max": 180, "index": 1},
|
||||
{"y_min": 185, "y_max": 205, "index": 2},
|
||||
]
|
||||
words = [
|
||||
{"height": 20, "top": 105, "left": 10, "width": 80},
|
||||
{"height": 20, "top": 165, "left": 10, "width": 80},
|
||||
{"height": 20, "top": 190, "left": 10, "width": 80},
|
||||
]
|
||||
headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
|
||||
assert 0 not in headers
|
||||
|
||||
Reference in New Issue
Block a user