fix(sub-columns): convert relative word positions to absolute coords for split
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 17s

Word 'left' values in ColumnGeometry.words are relative to the content
ROI (i.e. they are offset by left_x), whereas geo.x is in absolute image
coordinates. The split position was computed from the relative word
positions and then subtracted from / compared against the absolute
geo.x. This resulted in negative sub-column widths, which the
"width <= 0" guard rejects, so no column was ever split on real data.
Pass left_x through to _detect_sub_columns and add it to the split
position to bridge the two coordinate systems.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 19:16:13 +01:00
parent 6e1a349eed
commit 3904ddb493
3 changed files with 40 additions and 8 deletions

View File

@@ -1037,6 +1037,7 @@ def _detect_columns_by_clustering(
def _detect_sub_columns(
geometries: List[ColumnGeometry],
content_w: int,
left_x: int = 0,
_edge_tolerance: int = 8,
_min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
@@ -1048,6 +1049,10 @@ def _detect_sub_columns(
start. Any words to the left of that bin form a sub-column, provided they
number >= 2 and < 35 % of total.
Word ``left`` values are relative to the content ROI (offset by *left_x*),
while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
bridges the two coordinate systems.
Returns a new list of ColumnGeometry — potentially longer than the input.
"""
if content_w <= 0:
@@ -1101,13 +1106,16 @@ def _detect_sub_columns(
continue
# --- Build two sub-column geometries ---
# Word 'left' values are relative to left_x; geo.x is absolute.
# Convert the split position from relative to absolute coordinates.
max_sub_left = max(w['left'] for w in sub_words)
split_x = (max_sub_left + col_start_bin[2]) // 2
split_rel = (max_sub_left + col_start_bin[2]) // 2
split_abs = split_rel + left_x
sub_x = geo.x
sub_width = split_x - geo.x
main_x = split_x
main_width = (geo.x + geo.width) - split_x
sub_width = split_abs - geo.x
main_x = split_abs
main_width = (geo.x + geo.width) - split_abs
if sub_width <= 0 or main_width <= 0:
result.append(geo)
@@ -1138,8 +1146,9 @@ def _detect_sub_columns(
result.append(main_geo)
logger.info(
f"SubColumnSplit: column idx={geo.index} split at x={split_x}, "
f"sub={len(sub_words)} words (left), main={len(main_words)} words, "
f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
f"(rel={split_rel}), sub={len(sub_words)} words, "
f"main={len(main_words)} words, "
f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
)
@@ -2846,7 +2855,7 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
content_w = right_x - left_x
# Split sub-columns (e.g. page references) before classification
geometries = _detect_sub_columns(geometries, content_w)
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x)
# Phase B: Content-based classification
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,