fix: merge inline marker columns + improve ghost edge detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 16s
1. Add _merge_inline_marker_columns(): narrow columns (<80px) whose words average <=2 characters (bullets, numbering) are merged into the adjacent text column. This fixes box zones being split into 2 columns when the bullet points are merely indentation markers.
2. Improve the ghost filter: check the word's edges (left/right/top/bottom) against the border bands instead of only its center. This catches the "=" at x=947, whose left edge touches the box border.
3. Add "=" and "+" to _GRID_GHOST_CHARS for border-artifact detection.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -266,7 +266,7 @@ def _cluster_columns_by_alignment(
|
|||||||
|
|
||||||
# Characters that are typically OCR artefacts from box border lines.
|
# Characters that are typically OCR artefacts from box border lines.
|
||||||
# Intentionally excludes ! (red markers) and . , ; (real punctuation).
|
# Intentionally excludes ! (red markers) and . , ; (real punctuation).
|
||||||
_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~")
|
_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~=+")
|
||||||
|
|
||||||
|
|
||||||
def _filter_border_ghosts(
|
def _filter_border_ghosts(
|
||||||
@@ -303,10 +303,14 @@ def _filter_border_ghosts(
|
|||||||
text = (w.get("text") or "").strip()
|
text = (w.get("text") or "").strip()
|
||||||
if not text:
|
if not text:
|
||||||
return False
|
return False
|
||||||
cx = w["left"] + w["width"] / 2
|
# Check if any word edge (not just center) touches a border band
|
||||||
cy = w["top"] + w["height"] / 2
|
w_left = w["left"]
|
||||||
on_border = any(lo <= cx <= hi for lo, hi in x_bands) or any(
|
w_right = w["left"] + w["width"]
|
||||||
lo <= cy <= hi for lo, hi in y_bands
|
w_top = w["top"]
|
||||||
|
w_bottom = w["top"] + w["height"]
|
||||||
|
on_border = (
|
||||||
|
any(lo <= w_left <= hi or lo <= w_right <= hi for lo, hi in x_bands)
|
||||||
|
or any(lo <= w_top <= hi or lo <= w_bottom <= hi for lo, hi in y_bands)
|
||||||
)
|
)
|
||||||
if not on_border:
|
if not on_border:
|
||||||
return False
|
return False
|
||||||
@@ -318,6 +322,59 @@ def _filter_border_ghosts(
|
|||||||
return filtered, len(words) - len(filtered)
|
return filtered, len(words) - len(filtered)
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_inline_marker_columns(
    columns: List[Dict],
    words: List[Dict],
    *,
    max_marker_width: float = 80,
    max_avg_len: float = 2,
) -> List[Dict]:
    """Merge narrow marker columns (bullets, numbering) into adjacent text.

    Bullet points (•, *, -) and numbering (1., 2.) create narrow columns
    at the left edge of a zone. These are inline markers that indent text,
    not real separate columns. Merge them with their right neighbour.

    Args:
        columns: Column dicts in left-to-right order. Each must provide
            ``x_min`` and ``x_max``; ``index`` and ``type`` are rewritten
            on the returned dicts.
        words: Word dicts providing ``left`` and ``width`` and optionally
            ``text``. A word belongs to a column when its horizontal
            center lies in ``[x_min, x_max)``.
        max_marker_width: A column must be strictly narrower than this
            (same unit as the word coordinates) to count as a marker column.
        max_avg_len: Largest average stripped text length of a column's
            words for it to count as a marker column.

    Returns:
        ``columns`` itself when it holds fewer than two entries. Otherwise
        a new list of shallow-copied column dicts, re-indexed from 0, in
        which every marker column has been absorbed into the column to its
        right (the neighbour's ``x_min`` is extended to the marker's).
    """
    if len(columns) < 2:
        return columns

    merged: List[Dict] = []
    last = len(columns) - 1
    # True while the column being visited was already absorbed by the
    # marker column on its left and therefore must not be emitted again.
    absorbed = False

    for i, col in enumerate(columns):
        if absorbed:
            absorbed = False
            continue

        col_width = col["x_max"] - col["x_min"]

        # The last column has no right neighbour to merge into.
        if i < last and col_width < max_marker_width:
            # Missing or None text counts as length 0, and surrounding
            # whitespace is ignored, so OCR padding cannot hide a marker
            # and a None text cannot raise TypeError.
            lengths = [
                len((w.get("text") or "").strip())
                for w in words
                if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"]
            ]
            if lengths:
                avg_len = sum(lengths) / len(lengths)
                if avg_len <= max_avg_len:
                    # Narrow column with mostly short words: fold it into
                    # the next column by stretching that column leftwards.
                    # NOTE(review): only x_min is adjusted; if column dicts
                    # carry other width-derived keys they stay as they were.
                    next_col = dict(columns[i + 1])
                    next_col["x_min"] = col["x_min"]
                    merged.append(next_col)
                    absorbed = True
                    logger.info(
                        "  merged inline marker column %d (w=%d, avg_len=%.1f) "
                        "into column %d",
                        i, col_width, avg_len, i + 1,
                    )
                    continue

        # Copy so that re-indexing below never rewrites the caller's dicts.
        merged.append(dict(col))

    # Re-index
    single = len(merged) == 1
    for i, col in enumerate(merged):
        col["index"] = i
        col["type"] = "column_text" if single else f"column_{i + 1}"

    return merged
|
||||||
|
|
||||||
|
|
||||||
def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
|
def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]:
|
||||||
"""Extract all word_boxes from cells into a flat list of word dicts."""
|
"""Extract all word_boxes from cells into a flat list of word dicts."""
|
||||||
words: List[Dict] = []
|
words: List[Dict] = []
|
||||||
@@ -445,6 +502,10 @@ def _build_zone_grid(
|
|||||||
# Use global columns if provided, otherwise detect per zone
|
# Use global columns if provided, otherwise detect per zone
|
||||||
columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
|
columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
|
||||||
|
|
||||||
|
# Merge inline marker columns (bullets, numbering) into adjacent text
|
||||||
|
if not global_columns:
|
||||||
|
columns = _merge_inline_marker_columns(columns, zone_words)
|
||||||
|
|
||||||
if not columns or not rows:
|
if not columns or not rows:
|
||||||
return {
|
return {
|
||||||
"columns": [],
|
"columns": [],
|
||||||
|
|||||||
Reference in New Issue
Block a user