Fix overlay word leak, ghost filter false positive, merged zone header
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 41s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 41s
1. Filter out words located inside image_overlays (removes OCR text picked up from images). 2. Ghost filter: only remove single-character border artifacts, not multi-character tokens such as "(=", which are real content. 3. Skip first-row header detection for zones with image_overlays (merged geometry creates artificial gaps). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -320,7 +320,7 @@ def _filter_border_ghosts(
|
|||||||
)
|
)
|
||||||
if not on_border:
|
if not on_border:
|
||||||
return False
|
return False
|
||||||
if all(c in _GRID_GHOST_CHARS for c in text):
|
if len(text) == 1 and text in _GRID_GHOST_CHARS:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@@ -656,6 +656,7 @@ def _detect_header_rows(
|
|||||||
zone_words: List[Dict],
|
zone_words: List[Dict],
|
||||||
zone_y: int,
|
zone_y: int,
|
||||||
columns: Optional[List[Dict]] = None,
|
columns: Optional[List[Dict]] = None,
|
||||||
|
skip_first_row_header: bool = False,
|
||||||
) -> List[int]:
|
) -> List[int]:
|
||||||
"""Detect header rows: first-row heuristic + spanning header detection.
|
"""Detect header rows: first-row heuristic + spanning header detection.
|
||||||
|
|
||||||
@@ -666,27 +667,29 @@ def _detect_header_rows(
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
headers = []
|
headers = []
|
||||||
first_row = rows[0]
|
|
||||||
second_row = rows[1]
|
|
||||||
|
|
||||||
# Gap between first and second row > 0.5x average row height
|
if not skip_first_row_header:
|
||||||
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
|
first_row = rows[0]
|
||||||
gap = second_row["y_min"] - first_row["y_max"]
|
second_row = rows[1]
|
||||||
if gap > avg_h * 0.5:
|
|
||||||
headers.append(0)
|
|
||||||
|
|
||||||
# Also check if first row words are taller than average (bold/header text)
|
# Gap between first and second row > 0.5x average row height
|
||||||
all_heights = [w["height"] for w in zone_words]
|
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
|
||||||
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
|
gap = second_row["y_min"] - first_row["y_max"]
|
||||||
first_row_words = [
|
if gap > avg_h * 0.5:
|
||||||
w for w in zone_words
|
headers.append(0)
|
||||||
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
|
|
||||||
]
|
# Also check if first row words are taller than average (bold/header text)
|
||||||
if first_row_words:
|
all_heights = [w["height"] for w in zone_words]
|
||||||
first_h = max(w["height"] for w in first_row_words)
|
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
|
||||||
if first_h > median_h * 1.3:
|
first_row_words = [
|
||||||
if 0 not in headers:
|
w for w in zone_words
|
||||||
headers.append(0)
|
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
|
||||||
|
]
|
||||||
|
if first_row_words:
|
||||||
|
first_h = max(w["height"] for w in first_row_words)
|
||||||
|
if first_h > median_h * 1.3:
|
||||||
|
if 0 not in headers:
|
||||||
|
headers.append(0)
|
||||||
|
|
||||||
# Note: Spanning-header detection (rows spanning all columns) has been
|
# Note: Spanning-header detection (rows spanning all columns) has been
|
||||||
# disabled because it produces too many false positives on vocabulary
|
# disabled because it produces too many false positives on vocabulary
|
||||||
@@ -707,6 +710,7 @@ def _build_zone_grid(
|
|||||||
img_w: int,
|
img_w: int,
|
||||||
img_h: int,
|
img_h: int,
|
||||||
global_columns: Optional[List[Dict]] = None,
|
global_columns: Optional[List[Dict]] = None,
|
||||||
|
skip_first_row_header: bool = False,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Build columns, rows, cells for a single zone from its words.
|
"""Build columns, rows, cells for a single zone from its words.
|
||||||
|
|
||||||
@@ -773,7 +777,8 @@ def _build_zone_grid(
|
|||||||
cell["zone_index"] = zone_index
|
cell["zone_index"] = zone_index
|
||||||
|
|
||||||
# Detect header rows (pass columns for spanning header detection)
|
# Detect header rows (pass columns for spanning header detection)
|
||||||
header_rows = _detect_header_rows(rows, zone_words, zone_y, columns)
|
header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
|
||||||
|
skip_first_row_header=skip_first_row_header)
|
||||||
|
|
||||||
# Merge cells in spanning header rows into a single col-0 cell
|
# Merge cells in spanning header rows into a single col-0 cell
|
||||||
if header_rows and len(columns) >= 2:
|
if header_rows and len(columns) >= 2:
|
||||||
@@ -1270,9 +1275,27 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
"build-grid: filtered %d recovered artifacts from %s zone %d",
|
"build-grid: filtered %d recovered artifacts from %s zone %d",
|
||||||
removed, pz.zone_type, pz.index,
|
removed, pz.zone_type, pz.index,
|
||||||
)
|
)
|
||||||
|
# Filter words inside image overlay regions (merged box zones)
|
||||||
|
if pz.image_overlays:
|
||||||
|
before_ov = len(zone_words)
|
||||||
|
zone_words = [
|
||||||
|
w for w in zone_words
|
||||||
|
if not any(
|
||||||
|
ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
|
||||||
|
and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
|
||||||
|
for ov in pz.image_overlays
|
||||||
|
)
|
||||||
|
]
|
||||||
|
ov_removed = before_ov - len(zone_words)
|
||||||
|
if ov_removed:
|
||||||
|
logger.info(
|
||||||
|
"build-grid: filtered %d words inside image overlays from zone %d",
|
||||||
|
ov_removed, pz.index,
|
||||||
|
)
|
||||||
grid = _build_zone_grid(
|
grid = _build_zone_grid(
|
||||||
zone_words, pz.x, pz.y, pz.width, pz.height,
|
zone_words, pz.x, pz.y, pz.width, pz.height,
|
||||||
pz.index, img_w, img_h,
|
pz.index, img_w, img_h,
|
||||||
|
skip_first_row_header=bool(pz.image_overlays),
|
||||||
)
|
)
|
||||||
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
|
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
|
||||||
|
|
||||||
@@ -1339,6 +1362,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
pz.width, pz.height,
|
pz.width, pz.height,
|
||||||
pz.index, img_w, img_h,
|
pz.index, img_w, img_h,
|
||||||
global_columns=merged_columns,
|
global_columns=merged_columns,
|
||||||
|
skip_first_row_header=bool(pz.image_overlays),
|
||||||
)
|
)
|
||||||
zg["grid"] = grid
|
zg["grid"] = grid
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
"""
|
"""
|
||||||
Tests for grid_editor_api zone merging and heading detection.
|
Tests for grid_editor_api zone merging, heading detection, and ghost filtering.
|
||||||
|
|
||||||
Covers:
|
Covers:
|
||||||
- _merge_content_zones_across_boxes: zone merging logic
|
- _merge_content_zones_across_boxes: zone merging logic
|
||||||
- _detect_heading_rows_by_color: heading detection by color + height
|
- _detect_heading_rows_by_color: heading detection by color + height
|
||||||
|
- _filter_border_ghosts: single-char ghost detection
|
||||||
|
- _detect_header_rows: skip_first_row_header flag
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
@@ -13,6 +15,8 @@ import pytest
|
|||||||
from cv_vocab_types import PageZone, DetectedBox
|
from cv_vocab_types import PageZone, DetectedBox
|
||||||
from grid_editor_api import (
|
from grid_editor_api import (
|
||||||
_merge_content_zones_across_boxes,
|
_merge_content_zones_across_boxes,
|
||||||
|
_filter_border_ghosts,
|
||||||
|
_detect_header_rows,
|
||||||
_detect_heading_rows_by_color,
|
_detect_heading_rows_by_color,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -358,3 +362,82 @@ class TestDetectHeadingRowsByColor:
|
|||||||
zones_data = [self._make_zone(cells, rows, columns)]
|
zones_data = [self._make_zone(cells, rows, columns)]
|
||||||
count = _detect_heading_rows_by_color(zones_data, 800, 1000)
|
count = _detect_heading_rows_by_color(zones_data, 800, 1000)
|
||||||
assert count == 0
|
assert count == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _filter_border_ghosts (Fix 2: single-char only)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestFilterBorderGhosts:
|
||||||
|
"""Test that ghost filtering only removes single-char words."""
|
||||||
|
|
||||||
|
def test_single_char_ghost_removed(self):
|
||||||
|
"""Single '|' on a box border → filtered as ghost."""
|
||||||
|
box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=3)
|
||||||
|
words = [
|
||||||
|
{"text": "|", "left": 98, "top": 200, "width": 5, "height": 20},
|
||||||
|
{"text": "hello", "left": 150, "top": 150, "width": 80, "height": 20},
|
||||||
|
]
|
||||||
|
filtered, count = _filter_border_ghosts(words, [box])
|
||||||
|
assert count == 1
|
||||||
|
assert len(filtered) == 1
|
||||||
|
assert filtered[0]["text"] == "hello"
|
||||||
|
|
||||||
|
def test_multi_char_ghost_kept(self):
|
||||||
|
"""Multi-char '(=' on a box border → NOT filtered (real content)."""
|
||||||
|
box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0)
|
||||||
|
words = [
|
||||||
|
{"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17},
|
||||||
|
{"text": "I", "left": 665, "top": 294, "width": 9, "height": 18},
|
||||||
|
]
|
||||||
|
filtered, count = _filter_border_ghosts(words, [box])
|
||||||
|
assert count == 0
|
||||||
|
assert len(filtered) == 2
|
||||||
|
|
||||||
|
def test_single_paren_on_border_removed(self):
|
||||||
|
"""Single ')' on border → filtered."""
|
||||||
|
box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=2)
|
||||||
|
words = [
|
||||||
|
{"text": ")", "left": 299, "top": 200, "width": 4, "height": 7},
|
||||||
|
]
|
||||||
|
filtered, count = _filter_border_ghosts(words, [box])
|
||||||
|
assert count == 1
|
||||||
|
assert len(filtered) == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _detect_header_rows (Fix 3: skip_first_row_header)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDetectHeaderRowsSkipFlag:
|
||||||
|
"""Test skip_first_row_header flag."""
|
||||||
|
|
||||||
|
def test_first_row_detected_without_flag(self):
|
||||||
|
"""Without flag, first row with big gap → header."""
|
||||||
|
rows = [
|
||||||
|
{"y_min": 100, "y_max": 120, "index": 0},
|
||||||
|
{"y_min": 160, "y_max": 180, "index": 1},
|
||||||
|
{"y_min": 185, "y_max": 205, "index": 2},
|
||||||
|
]
|
||||||
|
words = [
|
||||||
|
{"height": 20, "top": 105, "left": 10, "width": 80},
|
||||||
|
{"height": 20, "top": 165, "left": 10, "width": 80},
|
||||||
|
{"height": 20, "top": 190, "left": 10, "width": 80},
|
||||||
|
]
|
||||||
|
headers = _detect_header_rows(rows, words, 0)
|
||||||
|
assert 0 in headers
|
||||||
|
|
||||||
|
def test_first_row_skipped_with_flag(self):
|
||||||
|
"""With skip flag, first row NOT detected even with big gap."""
|
||||||
|
rows = [
|
||||||
|
{"y_min": 100, "y_max": 120, "index": 0},
|
||||||
|
{"y_min": 160, "y_max": 180, "index": 1},
|
||||||
|
{"y_min": 185, "y_max": 205, "index": 2},
|
||||||
|
]
|
||||||
|
words = [
|
||||||
|
{"height": 20, "top": 105, "left": 10, "width": 80},
|
||||||
|
{"height": 20, "top": 165, "left": 10, "width": 80},
|
||||||
|
{"height": 20, "top": 190, "left": 10, "width": 80},
|
||||||
|
]
|
||||||
|
headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True)
|
||||||
|
assert 0 not in headers
|
||||||
|
|||||||
Reference in New Issue
Block a user