fix: border ghost filter + row overlap fix for box zones

1. Add _filter_border_ghosts() to the grid editor. It removes OCR artefacts such as "|" that sit on box borders, before row/column clustering. The tall "|" (h=55) was inflating row 0's y_max, causing row overlap.
2. Fix _assign_word_to_row() to prefer the row with the closest y_center when rows overlap, instead of always returning the first matching row.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 09:54:50 +01:00
parent 43b1f8be58
commit febd0a2f84
2 changed files with 71 additions and 5 deletions
@@ -134,12 +134,16 @@ def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:


 def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
-    """Return row index for a word based on its Y-center."""
+    """Return row index for a word based on its Y-center.
+
+    The center is ``word['top'] + word['height'] / 2``. Rows whose
+    ``[y_min, y_max]`` range contains it are candidates; when rows overlap
+    (e.g. due to tall border-ghost characters inflating a row's y_max),
+    the candidate whose ``y_center`` is closest wins. If no row contains
+    the center, the nearest row by ``y_center`` is returned instead.
+
+    Raises ``ValueError`` (from ``min``) when ``rows`` is empty.
+    """
    y_center = word['top'] + word['height'] / 2
-    # Find the row whose y_range contains this word's center
-    for row in rows:
-        if row['y_min'] <= y_center <= row['y_max']:
-            return row['index']
+    # Find all rows whose y_range contains this word's center
+    matching = [r for r in rows if r['y_min'] <= y_center <= r['y_max']]
+    if matching:
+        # Several rows may match when their ranges overlap: pick by center distance
+        return min(matching, key=lambda r: abs(r['y_center'] - y_center))['index']
    # Fallback: nearest row by Y-center
    return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']

@@ -264,6 +264,60 @@ def _cluster_columns_by_alignment(
    return columns


+# Characters that are typically OCR artefacts from box border lines.
+# Intentionally excludes ! (red markers) and . , ; (real punctuation).
+# Note: 1, l and I are included, so a word made up only of those glyphs is
+# dropped when it sits on a box border.
+_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~")
+
+
+def _filter_border_ghosts(
+    words: List[Dict],
+    boxes: List,
+) -> tuple:
+    """Remove words sitting on box borders that are OCR artefacts.
+
+    A word is dropped when its stripped text consists solely of characters
+    from ``_GRID_GHOST_CHARS`` and its center lies within ``margin`` of one
+    of a box's four edges, where ``margin = max(2 * border_thickness, 10) + 6``.
+    The test is confined to the margin-expanded box, so a word that merely
+    shares an x or y coordinate with an edge somewhere else on the page is
+    kept.
+
+    Args:
+        words: Word dicts with ``left``, ``top``, ``width``, ``height`` and
+            optionally ``text``.
+        boxes: Detected boxes, either objects exposing ``x``, ``y``,
+            ``width``, ``height`` (and optionally ``border_thickness``) or
+            dicts with ``x``, ``y``, ``w``/``width``, ``h``/``height``
+            (and optionally ``border_thickness``, default 3).
+
+    Returns:
+        ``(filtered_words, removed_count)``. The input list itself is
+        returned when either argument is empty.
+    """
+    if not boxes or not words:
+        return words, 0
+
+    def _box_value(box, attr, dict_keys, default):
+        """Read a value from an object-style or dict-style box.
+
+        A missing or ``None`` value yields ``default``.
+        """
+        value = getattr(box, attr, None)
+        if value is None:
+            getter = getattr(box, "get", None)
+            if getter is not None:
+                for key in dict_keys:
+                    value = getter(key)
+                    if value is not None:
+                        break
+        return default if value is None else value
+
+    # One (x0, x1, y0, y1, margin) record per detected box.
+    edge_zones: List[tuple] = []
+    for b in boxes:
+        bx = _box_value(b, "x", ("x",), 0)
+        by = _box_value(b, "y", ("y",), 0)
+        bw = _box_value(b, "width", ("w", "width"), 0)
+        bh = _box_value(b, "height", ("h", "height"), 0)
+        bt = _box_value(b, "border_thickness", ("border_thickness",), 3)
+        margin = max(bt * 2, 10) + 6
+        edge_zones.append((bx, bx + bw, by, by + bh, margin))
+
+    def _on_border(cx: float, cy: float) -> bool:
+        for x0, x1, y0, y1, m in edge_zones:
+            # Must be in the vicinity of this box at all ...
+            if not (x0 - m <= cx <= x1 + m and y0 - m <= cy <= y1 + m):
+                continue
+            # ... and close to one of its four edges.
+            nearest_edge = min(
+                abs(cx - x0), abs(cx - x1), abs(cy - y0), abs(cy - y1)
+            )
+            if nearest_edge <= m:
+                return True
+        return False
+
+    def _is_ghost(w: Dict) -> bool:
+        text = (w.get("text") or "").strip()
+        # Cheap character test first; only ghost-looking words need geometry.
+        if not text or not all(c in _GRID_GHOST_CHARS for c in text):
+            return False
+        cx = w["left"] + w["width"] / 2
+        cy = w["top"] + w["height"] / 2
+        return _on_border(cx, cy)
+
+    filtered = [w for w in words if not _is_ghost(w)]
+    return filtered, len(words) - len(filtered)
+
+
 def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
    """Extract all word_boxes from cells into a flat list of word dicts."""
    words: List[Dict] = []
@@ -539,6 +593,14 @@ async def build_grid(session_id: str):
            boxes_detected = len(boxes)

            if boxes:
+                # Filter border ghost words before grid building
+                all_words, ghost_count = _filter_border_ghosts(all_words, boxes)
+                if ghost_count:
+                    logger.info(
+                        "build-grid session %s: removed %d border ghost words",
+                        session_id, ghost_count,
+                    )
+
                # Split page into zones
                page_zones = split_page_into_zones(
                    content_x, content_y, content_w, content_h, boxes