fix: merge inline marker columns + improve ghost edge detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
1. Add _merge_inline_marker_columns(): narrow columns (<80px) whose words average <=2 characters (bullets, numbering) are merged into the adjacent text column. This fixes box zones getting two columns when the bullet points are merely indentation markers. 2. Improve the ghost filter: check the word edges (left/right/top/bottom) against the border bands instead of only the word center. This catches the "=" at x=947, whose left edge touches the box border. 3. Add "=" and "+" to _GRID_GHOST_CHARS for border artifact detection. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -266,7 +266,7 @@ def _cluster_columns_by_alignment(
|
||||
|
||||
# Characters that are typically OCR artefacts from box border lines.
|
||||
# Intentionally excludes ! (red markers) and . , ; (real punctuation).
|
||||
_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~")
|
||||
_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~=+")
|
||||
|
||||
|
||||
def _filter_border_ghosts(
|
||||
@@ -303,10 +303,14 @@ def _filter_border_ghosts(
|
||||
text = (w.get("text") or "").strip()
|
||||
if not text:
|
||||
return False
|
||||
cx = w["left"] + w["width"] / 2
|
||||
cy = w["top"] + w["height"] / 2
|
||||
on_border = any(lo <= cx <= hi for lo, hi in x_bands) or any(
|
||||
lo <= cy <= hi for lo, hi in y_bands
|
||||
# Check if any word edge (not just center) touches a border band
|
||||
w_left = w["left"]
|
||||
w_right = w["left"] + w["width"]
|
||||
w_top = w["top"]
|
||||
w_bottom = w["top"] + w["height"]
|
||||
on_border = (
|
||||
any(lo <= w_left <= hi or lo <= w_right <= hi for lo, hi in x_bands)
|
||||
or any(lo <= w_top <= hi or lo <= w_bottom <= hi for lo, hi in y_bands)
|
||||
)
|
||||
if not on_border:
|
||||
return False
|
||||
@@ -318,6 +322,59 @@ def _filter_border_ghosts(
|
||||
return filtered, len(words) - len(filtered)
|
||||
|
||||
|
||||
def _merge_inline_marker_columns(
    columns: List[Dict],
    words: List[Dict],
) -> List[Dict]:
    """Merge narrow marker columns (bullets, numbering) into adjacent text.

    Bullet points (•, *, -) and numbering (1., 2.) create narrow columns
    at the left edge of a zone. These are inline markers that indent text,
    not real separate columns. Merge them with their right neighbour.

    A column is treated as a marker column when all of the following hold:

    * it is narrower than 80 px (``x_max - x_min``),
    * it is not the last column (there is a neighbour to merge into),
    * at least one word has its horizontal centre inside the column, and
    * those words average at most 2 characters after stripping whitespace.

    Args:
        columns: Column dicts carrying ``x_min`` and ``x_max``. Merging uses
            ``columns[i + 1]`` as the right neighbour, so the list is
            assumed to be ordered left to right (not verified here).
        words: Word dicts carrying ``left`` and ``width``; ``text`` may be
            missing or ``None`` and is then counted as an empty string.

    Returns:
        The input list itself when it has fewer than two columns. Otherwise
        a new list of column dict copies with ``index`` and ``type``
        rewritten; the dicts passed in are left untouched.
    """
    if len(columns) < 2:
        return columns

    merged: List[Dict] = []
    absorbed: set = set()
    last_index = len(columns) - 1

    for i, col in enumerate(columns):
        if i in absorbed:
            # Already folded into the marker column on its left.
            continue

        col_width = col["x_max"] - col["x_min"]

        # Only narrow columns that have a right neighbour can be markers,
        # so skip the word scan for everything else.
        if col_width < 80 and i < last_index:
            # ``text`` may be present but None; normalise the same way
            # _filter_border_ghosts does before measuring its length.
            texts = [
                (w.get("text") or "").strip()
                for w in words
                if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"]
            ]
            if texts:
                avg_len = sum(map(len, texts)) / len(texts)
                if avg_len <= 2:
                    # Widen the right neighbour leftwards so it also covers
                    # the marker column, and drop the marker column itself.
                    next_col = columns[i + 1].copy()
                    next_col["x_min"] = col["x_min"]
                    merged.append(next_col)
                    absorbed.add(i + 1)
                    logger.info(
                        "  merged inline marker column %d (w=%d, avg_len=%.1f) "
                        "into column %d",
                        i, col_width, avg_len, i + 1,
                    )
                    continue

        # Copy so that re-indexing below never mutates the caller's dicts.
        merged.append(col.copy())

    # Re-index
    multiple = len(merged) > 1
    for i, col in enumerate(merged):
        col["index"] = i
        col["type"] = f"column_{i + 1}" if multiple else "column_text"

    return merged
|
||||
|
||||
|
||||
def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
|
||||
"""Extract all word_boxes from cells into a flat list of word dicts."""
|
||||
words: List[Dict] = []
|
||||
@@ -445,6 +502,10 @@ def _build_zone_grid(
|
||||
# Use global columns if provided, otherwise detect per zone
|
||||
columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
|
||||
|
||||
# Merge inline marker columns (bullets, numbering) into adjacent text
|
||||
if not global_columns:
|
||||
columns = _merge_inline_marker_columns(columns, zone_words)
|
||||
|
||||
if not columns or not rows:
|
||||
return {
|
||||
"columns": [],
|
||||
|
||||
Reference in New Issue
Block a user