fix: merge inline marker columns + improve ghost edge detection

1. Add _merge_inline_marker_columns(): narrow columns (<80px) whose words have an average length of <=2 chars (bullets, numbering) are merged into the adjacent text column to their right. This fixes box zones getting 2 columns when the bullet points are just indentation markers.
2. Improve the ghost filter: check the word edges (left/right/top/bottom) against the border bands instead of only the word center. This catches the "=" at x=947 whose left edge touches the box border.
3. Add "=" and "+" to _GRID_GHOST_CHARS for border artifact detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 10:10:07 +01:00
parent febd0a2f84
commit 324f39a9cc
1 changed files with 66 additions and 5 deletions
@@ -266,7 +266,7 @@ def _cluster_columns_by_alignment(

 # Characters that are typically OCR artefacts from box border lines.
 # Intentionally excludes ! (red markers) and . , ; (real punctuation).
-_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~")
+_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~=+")


 def _filter_border_ghosts(
@@ -303,10 +303,14 @@ def _filter_border_ghosts(
        text = (w.get("text") or "").strip()
        if not text:
            return False
-        cx = w["left"] + w["width"] / 2
-        cy = w["top"] + w["height"] / 2
-        on_border = any(lo <= cx <= hi for lo, hi in x_bands) or any(
-            lo <= cy <= hi for lo, hi in y_bands
+        # Check if any word edge (not just center) touches a border band
+        w_left = w["left"]
+        w_right = w["left"] + w["width"]
+        w_top = w["top"]
+        w_bottom = w["top"] + w["height"]
+        on_border = (
+            any(lo <= w_left <= hi or lo <= w_right <= hi for lo, hi in x_bands)
+            or any(lo <= w_top <= hi or lo <= w_bottom <= hi for lo, hi in y_bands)
        )
        if not on_border:
            return False
@@ -318,6 +322,59 @@ def _filter_border_ghosts(
    return filtered, len(words) - len(filtered)


+def _merge_inline_marker_columns(
+    columns: List[Dict],
+    words: List[Dict],
+) -> List[Dict]:
+    """Merge narrow marker columns (bullets, numbering) into adjacent text.
+
+    A column narrower than 80px whose words (matched by centre x) average
+    at most 2 characters is folded into its right neighbour by extending
+    that neighbour's ``x_min``; a marker column in last position is kept.
+    Returns ``columns`` itself if it has fewer than two entries, otherwise
+    a re-indexed list of copies, leaving the caller's dicts untouched.
+    """
+    if len(columns) < 2:
+        return columns
+
+    merged: List[Dict] = []
+    skip: set = set()
+
+    for i, col in enumerate(columns):
+        if i in skip:
+            continue
+
+        col_words = [
+            w for w in words
+            if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"]
+        ]
+        col_width = col["x_max"] - col["x_min"]
+
+        if col_words and col_width < 80:
+            # ``text`` may be None or padded, so normalise before measuring.
+            lengths = [len((w.get("text") or "").strip()) for w in col_words]
+            avg_len = sum(lengths) / len(lengths)
+            if avg_len <= 2 and i + 1 < len(columns):
+                next_col = columns[i + 1].copy()
+                next_col["x_min"] = col["x_min"]
+                merged.append(next_col)
+                skip.add(i + 1)
+                logger.info(
+                    "  merged inline marker column %d (w=%d, avg_len=%.1f) "
+                    "into column %d",
+                    i, col_width, avg_len, i + 1,
+                )
+                continue
+
+        merged.append(col.copy())  # copy: keep the caller's dicts unmodified
+
+    for i, col in enumerate(merged):
+        col["index"] = i
+        col["type"] = f"column_{i + 1}" if len(merged) > 1 else "column_text"
+
+    return merged
+
+
 def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
    """Extract all word_boxes from cells into a flat list of word dicts."""
    words: List[Dict] = []
@@ -445,6 +502,10 @@ def _build_zone_grid(
    # Use global columns if provided, otherwise detect per zone
    columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)

+    # Merge inline marker columns (bullets, numbering) into adjacent text
+    if not global_columns:
+        columns = _merge_inline_marker_columns(columns, zone_words)
+
    if not columns or not rows:
        return {
            "columns": [],