Fix overlay word leak, ghost filter false positive, merged zone header
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 41s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 41s
1. Filter out words that fall inside image_overlays (removes OCR text picked up from images).
2. Ghost filter: only remove single-character border artifacts, not multi-character tokens such as "(=", which are real content.
3. Skip first-row header detection for zones with image_overlays (merged geometry creates artificial gaps).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -320,7 +320,7 @@ def _filter_border_ghosts(
|
||||
)
|
||||
if not on_border:
|
||||
return False
|
||||
if all(c in _GRID_GHOST_CHARS for c in text):
|
||||
if len(text) == 1 and text in _GRID_GHOST_CHARS:
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -656,6 +656,7 @@ def _detect_header_rows(
|
||||
zone_words: List[Dict],
|
||||
zone_y: int,
|
||||
columns: Optional[List[Dict]] = None,
|
||||
skip_first_row_header: bool = False,
|
||||
) -> List[int]:
|
||||
"""Detect header rows: first-row heuristic + spanning header detection.
|
||||
|
||||
@@ -666,27 +667,29 @@ def _detect_header_rows(
|
||||
return []
|
||||
|
||||
headers = []
|
||||
first_row = rows[0]
|
||||
second_row = rows[1]
|
||||
|
||||
# Gap between first and second row > 0.5x average row height
|
||||
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
|
||||
gap = second_row["y_min"] - first_row["y_max"]
|
||||
if gap > avg_h * 0.5:
|
||||
headers.append(0)
|
||||
if not skip_first_row_header:
|
||||
first_row = rows[0]
|
||||
second_row = rows[1]
|
||||
|
||||
# Also check if first row words are taller than average (bold/header text)
|
||||
all_heights = [w["height"] for w in zone_words]
|
||||
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
|
||||
first_row_words = [
|
||||
w for w in zone_words
|
||||
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
|
||||
]
|
||||
if first_row_words:
|
||||
first_h = max(w["height"] for w in first_row_words)
|
||||
if first_h > median_h * 1.3:
|
||||
if 0 not in headers:
|
||||
headers.append(0)
|
||||
# Gap between first and second row > 0.5x average row height
|
||||
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
|
||||
gap = second_row["y_min"] - first_row["y_max"]
|
||||
if gap > avg_h * 0.5:
|
||||
headers.append(0)
|
||||
|
||||
# Also check if first row words are taller than average (bold/header text)
|
||||
all_heights = [w["height"] for w in zone_words]
|
||||
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
|
||||
first_row_words = [
|
||||
w for w in zone_words
|
||||
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
|
||||
]
|
||||
if first_row_words:
|
||||
first_h = max(w["height"] for w in first_row_words)
|
||||
if first_h > median_h * 1.3:
|
||||
if 0 not in headers:
|
||||
headers.append(0)
|
||||
|
||||
# Note: Spanning-header detection (rows spanning all columns) has been
|
||||
# disabled because it produces too many false positives on vocabulary
|
||||
@@ -707,6 +710,7 @@ def _build_zone_grid(
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
global_columns: Optional[List[Dict]] = None,
|
||||
skip_first_row_header: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
"""Build columns, rows, cells for a single zone from its words.
|
||||
|
||||
@@ -773,7 +777,8 @@ def _build_zone_grid(
|
||||
cell["zone_index"] = zone_index
|
||||
|
||||
# Detect header rows (pass columns for spanning header detection)
|
||||
header_rows = _detect_header_rows(rows, zone_words, zone_y, columns)
|
||||
header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
|
||||
skip_first_row_header=skip_first_row_header)
|
||||
|
||||
# Merge cells in spanning header rows into a single col-0 cell
|
||||
if header_rows and len(columns) >= 2:
|
||||
@@ -1270,9 +1275,27 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
"build-grid: filtered %d recovered artifacts from %s zone %d",
|
||||
removed, pz.zone_type, pz.index,
|
||||
)
|
||||
# Filter words inside image overlay regions (merged box zones)
|
||||
if pz.image_overlays:
|
||||
before_ov = len(zone_words)
|
||||
zone_words = [
|
||||
w for w in zone_words
|
||||
if not any(
|
||||
ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
|
||||
and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
|
||||
for ov in pz.image_overlays
|
||||
)
|
||||
]
|
||||
ov_removed = before_ov - len(zone_words)
|
||||
if ov_removed:
|
||||
logger.info(
|
||||
"build-grid: filtered %d words inside image overlays from zone %d",
|
||||
ov_removed, pz.index,
|
||||
)
|
||||
grid = _build_zone_grid(
|
||||
zone_words, pz.x, pz.y, pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
skip_first_row_header=bool(pz.image_overlays),
|
||||
)
|
||||
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
|
||||
|
||||
@@ -1339,6 +1362,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
global_columns=merged_columns,
|
||||
skip_first_row_header=bool(pz.image_overlays),
|
||||
)
|
||||
zg["grid"] = grid
|
||||
logger.info(
|
||||
|
||||
Reference in New Issue
Block a user