fix: border ghost filter + row overlap fix for box zones
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s
1. Add _filter_border_ghosts() to the grid editor - removes OCR artefacts such as a lone "|" character sitting on box borders before row/column clustering. The tall "|" (h=55) was inflating row 0's y_max, causing row overlap. 2. Fix _assign_word_to_row() to prefer the row with the closest y_center when rows overlap, instead of always returning the first matching row. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -134,12 +134,16 @@ def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
|
|||||||
|
|
||||||
|
|
||||||
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
|
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
|
||||||
"""Return row index for a word based on its Y-center."""
|
"""Return row index for a word based on its Y-center.
|
||||||
|
|
||||||
|
When rows overlap (e.g. due to tall border-ghost characters inflating
|
||||||
|
a row's y_max), prefer the row whose y_center is closest.
|
||||||
|
"""
|
||||||
y_center = word['top'] + word['height'] / 2
|
y_center = word['top'] + word['height'] / 2
|
||||||
# Find the row whose y_range contains this word's center
|
# Find all rows whose y_range contains this word's center
|
||||||
for row in rows:
|
matching = [r for r in rows if r['y_min'] <= y_center <= r['y_max']]
|
||||||
if row['y_min'] <= y_center <= row['y_max']:
|
if matching:
|
||||||
return row['index']
|
return min(matching, key=lambda r: abs(r['y_center'] - y_center))['index']
|
||||||
# Fallback: nearest row by Y-center
|
# Fallback: nearest row by Y-center
|
||||||
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
|
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
|
||||||
|
|
||||||
|
|||||||
@@ -264,6 +264,60 @@ def _cluster_columns_by_alignment(
|
|||||||
return columns
|
return columns
|
||||||
|
|
||||||
|
|
||||||
|
# Characters that are typically OCR artefacts from box border lines.
|
||||||
|
# Intentionally excludes ! (red markers) and . , ; (real punctuation).
|
||||||
|
_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~")
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_border_ghosts(
|
||||||
|
words: List[Dict],
|
||||||
|
boxes: List,
|
||||||
|
) -> tuple:
|
||||||
|
"""Remove words sitting on box borders that are OCR artefacts.
|
||||||
|
|
||||||
|
Returns (filtered_words, removed_count).
|
||||||
|
"""
|
||||||
|
if not boxes or not words:
|
||||||
|
return words, 0
|
||||||
|
|
||||||
|
# Build border bands from detected boxes
|
||||||
|
x_bands: List[tuple] = []
|
||||||
|
y_bands: List[tuple] = []
|
||||||
|
for b in boxes:
|
||||||
|
bx = b.x if hasattr(b, "x") else b.get("x", 0)
|
||||||
|
by = b.y if hasattr(b, "y") else b.get("y", 0)
|
||||||
|
bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0))
|
||||||
|
bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0))
|
||||||
|
bt = (
|
||||||
|
b.border_thickness
|
||||||
|
if hasattr(b, "border_thickness")
|
||||||
|
else b.get("border_thickness", 3)
|
||||||
|
)
|
||||||
|
margin = max(bt * 2, 10) + 6
|
||||||
|
x_bands.append((bx - margin, bx + margin))
|
||||||
|
x_bands.append((bx + bw - margin, bx + bw + margin))
|
||||||
|
y_bands.append((by - margin, by + margin))
|
||||||
|
y_bands.append((by + bh - margin, by + bh + margin))
|
||||||
|
|
||||||
|
def _is_ghost(w: Dict) -> bool:
|
||||||
|
text = (w.get("text") or "").strip()
|
||||||
|
if not text:
|
||||||
|
return False
|
||||||
|
cx = w["left"] + w["width"] / 2
|
||||||
|
cy = w["top"] + w["height"] / 2
|
||||||
|
on_border = any(lo <= cx <= hi for lo, hi in x_bands) or any(
|
||||||
|
lo <= cy <= hi for lo, hi in y_bands
|
||||||
|
)
|
||||||
|
if not on_border:
|
||||||
|
return False
|
||||||
|
if all(c in _GRID_GHOST_CHARS for c in text):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
filtered = [w for w in words if not _is_ghost(w)]
|
||||||
|
return filtered, len(words) - len(filtered)
|
||||||
|
|
||||||
|
|
||||||
def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
|
def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
|
||||||
"""Extract all word_boxes from cells into a flat list of word dicts."""
|
"""Extract all word_boxes from cells into a flat list of word dicts."""
|
||||||
words: List[Dict] = []
|
words: List[Dict] = []
|
||||||
@@ -539,6 +593,14 @@ async def build_grid(session_id: str):
|
|||||||
boxes_detected = len(boxes)
|
boxes_detected = len(boxes)
|
||||||
|
|
||||||
if boxes:
|
if boxes:
|
||||||
|
# Filter border ghost words before grid building
|
||||||
|
all_words, ghost_count = _filter_border_ghosts(all_words, boxes)
|
||||||
|
if ghost_count:
|
||||||
|
logger.info(
|
||||||
|
"build-grid session %s: removed %d border ghost words",
|
||||||
|
session_id, ghost_count,
|
||||||
|
)
|
||||||
|
|
||||||
# Split page into zones
|
# Split page into zones
|
||||||
page_zones = split_page_into_zones(
|
page_zones = split_page_into_zones(
|
||||||
content_x, content_y, content_w, content_h, boxes
|
content_x, content_y, content_w, content_h, boxes
|
||||||
|
|||||||
Reference in New Issue
Block a user