fix: merge inline marker columns + improve ghost edge detection

1. Add `_merge_inline_marker_columns()`: narrow columns (<80px) whose words average <=2 characters (bullets, numbering) are merged into the adjacent text column. Fixes box zones getting 2 columns when the bullet points are just indentation markers.
2. Improve the ghost filter: check word edges (left/right/top/bottom) against the border bands instead of the center only. Catches the "=" at x=947 whose left edge touches the box border.
3. Add "=" and "+" to `_GRID_GHOST_CHARS` for border artifact detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 10:10:07 +01:00
parent febd0a2f84
commit 324f39a9cc
1 changed files with 66 additions and 5 deletions
@@ -266,7 +266,7 @@ def _cluster_columns_by_alignment(
 # Characters that are typically OCR artefacts from box border lines.
 # Intentionally excludes ! (red markers) and . , ; (real punctuation).
-_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~")
+_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~=+")
 def _filter_border_ghosts(
@@ -303,10 +303,14 @@ def _filter_border_ghosts(
        text = (w.get("text") or "").strip()
        if not text:
            return False
-        cx = w["left"] + w["width"] / 2
+        # Check if any word edge (not just center) touches a border band
-        cy = w["top"] + w["height"] / 2
+        w_left = w["left"]
-        on_border = any(lo <= cx <= hi for lo, hi in x_bands) or any(
+        w_right = w["left"] + w["width"]
-            lo <= cy <= hi for lo, hi in y_bands
+        w_top = w["top"]
        w_bottom = w["top"] + w["height"]
        on_border = (
            any(lo <= w_left <= hi or lo <= w_right <= hi for lo, hi in x_bands)
            or any(lo <= w_top <= hi or lo <= w_bottom <= hi for lo, hi in y_bands)
        )
        if not on_border:
            return False
@@ -318,6 +322,59 @@ def _filter_border_ghosts(
    return filtered, len(words) - len(filtered)
 def _merge_inline_marker_columns(
    columns: List[Dict],
    words: List[Dict],
 ) -> List[Dict]:
    """Merge narrow marker columns (bullets, numbering) into adjacent text.
    Bullet points (•, *, -) and numbering (1., 2.) create narrow columns
    at the left edge of a zone.  These are inline markers that indent text,
    not real separate columns.  Merge them with their right neighbour.
    """
    if len(columns) < 2:
        return columns
    merged: List[Dict] = []
    skip: set = set()
    for i, col in enumerate(columns):
        if i in skip:
            continue
        # Find words in this column
        col_words = [
            w for w in words
            if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"]
        ]
        col_width = col["x_max"] - col["x_min"]
        # Narrow column with mostly short words → likely inline markers
        if col_words and col_width < 80:
            avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words)
            if avg_len <= 2 and i + 1 < len(columns):
                # Merge into next column
                next_col = columns[i + 1].copy()
                next_col["x_min"] = col["x_min"]
                merged.append(next_col)
                skip.add(i + 1)
                logger.info(
                    "  merged inline marker column %d (w=%d, avg_len=%.1f) "
                    "into column %d",
                    i, col_width, avg_len, i + 1,
                )
                continue
        merged.append(col)
    # Re-index
    for i, col in enumerate(merged):
        col["index"] = i
        col["type"] = f"column_{i + 1}" if len(merged) > 1 else "column_text"
    return merged
 def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
    """Extract all word_boxes from cells into a flat list of word dicts."""
    words: List[Dict] = []
@@ -445,6 +502,10 @@ def _build_zone_grid(
    # Use global columns if provided, otherwise detect per zone
    columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
    # Merge inline marker columns (bullets, numbering) into adjacent text
    if not global_columns:
        columns = _merge_inline_marker_columns(columns, zone_words)
    if not columns or not rows:
        return {
            "columns": [],