fix: move column expansion AFTER sub-column split
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s

The narrow column expansion was running inside detect_column_geometry()
on the 4 main columns, but the narrowest columns (marker ~14px,
page_ref ~93px) are only created afterwards by _detect_sub_columns(),
so the expansion step never saw them.

Extracted expand_narrow_columns() as a standalone function and now call it
after sub-column splitting in the columns API endpoint.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-04 10:07:40 +01:00
parent e426de937c
commit 9dd77ab54a
2 changed files with 84 additions and 65 deletions

View File

@@ -1883,35 +1883,51 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: " logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
# --- Step 10: Expand narrow columns into adjacent gaps --- return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
# Narrow columns (marker, page_ref, < 10% width) often lose content at
# image edges due to residual shear. Expand them into the gap toward
# the neighbouring column, but never past 40 % of the gap or past the def expand_narrow_columns(
# nearest word in the neighbour. geometries: List[ColumnGeometry],
_NARROW_THRESHOLD_PCT = 10.0 # columns below this % of content_w are "narrow" content_w: int,
_GAP_CLAIM_RATIO = 0.40 # narrow col may claim up to 40 % of the gap left_x: int,
_MIN_WORD_MARGIN = 4 # always keep 4 px between col edge and nearest word word_dicts: List[Dict],
) -> List[ColumnGeometry]:
"""Expand narrow columns into adjacent whitespace gaps.
Narrow columns (marker, page_ref, < 10% content width) often lose
content at image edges due to residual shear. This expands them toward
the neighbouring column, but never past 40% of the gap or past the
nearest word in the neighbour.
Must be called AFTER _detect_sub_columns() so that sub-column splits
(which create the narrowest columns) have already happened.
"""
_NARROW_THRESHOLD_PCT = 10.0
_GAP_CLAIM_RATIO = 0.40
_MIN_WORD_MARGIN = 4
if len(geometries) < 2:
return geometries
if len(geometries) >= 2:
for i, g in enumerate(geometries): for i, g in enumerate(geometries):
col_pct = g.width / content_w * 100 if content_w > 0 else 100 col_pct = g.width / content_w * 100 if content_w > 0 else 100
if col_pct >= _NARROW_THRESHOLD_PCT: if col_pct >= _NARROW_THRESHOLD_PCT:
continue # not narrow — skip continue
expanded = False expanded = False
orig_pct = col_pct
# --- try expanding to the LEFT (into gap with left neighbor) --- # --- try expanding to the LEFT ---
if i > 0: if i > 0:
left_nb = geometries[i - 1] left_nb = geometries[i - 1]
gap_left = g.x - (left_nb.x + left_nb.width) gap_left = g.x - (left_nb.x + left_nb.width)
if gap_left > _MIN_WORD_MARGIN * 2: if gap_left > _MIN_WORD_MARGIN * 2:
# Find nearest word in left neighbor (right edge)
nb_right_rel = (left_nb.x + left_nb.width) - left_x
nb_words_right = [wd['left'] + wd.get('width', 0) nb_words_right = [wd['left'] + wd.get('width', 0)
for wd in left_nb.words] for wd in left_nb.words]
max_word_right = max(nb_words_right) if nb_words_right else (nb_right_rel - 20) if nb_words_right:
# max_word_right is relative to left_x safe_left_abs = left_x + max(nb_words_right) + _MIN_WORD_MARGIN
safe_left_abs = left_x + max_word_right + _MIN_WORD_MARGIN else:
safe_left_abs = left_nb.x + left_nb.width + _MIN_WORD_MARGIN
max_expand = int(gap_left * _GAP_CLAIM_RATIO) max_expand = int(gap_left * _GAP_CLAIM_RATIO)
new_x = max(safe_left_abs, g.x - max_expand) new_x = max(safe_left_abs, g.x - max_expand)
if new_x < g.x: if new_x < g.x:
@@ -1920,15 +1936,16 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
g.x = new_x g.x = new_x
expanded = True expanded = True
# --- try expanding to the RIGHT (into gap with right neighbor) --- # --- try expanding to the RIGHT ---
if i + 1 < len(geometries): if i + 1 < len(geometries):
right_nb = geometries[i + 1] right_nb = geometries[i + 1]
gap_right = right_nb.x - (g.x + g.width) gap_right = right_nb.x - (g.x + g.width)
if gap_right > _MIN_WORD_MARGIN * 2: if gap_right > _MIN_WORD_MARGIN * 2:
# Find nearest word in right neighbor (left edge)
nb_words_left = [wd['left'] for wd in right_nb.words] nb_words_left = [wd['left'] for wd in right_nb.words]
min_word_left_rel = min(nb_words_left) if nb_words_left else ((right_nb.x - left_x) + 20) if nb_words_left:
safe_right_abs = left_x + min_word_left_rel - _MIN_WORD_MARGIN safe_right_abs = left_x + min(nb_words_left) - _MIN_WORD_MARGIN
else:
safe_right_abs = right_nb.x - _MIN_WORD_MARGIN
max_expand = int(gap_right * _GAP_CLAIM_RATIO) max_expand = int(gap_right * _GAP_CLAIM_RATIO)
new_right = min(safe_right_abs, g.x + g.width + max_expand) new_right = min(safe_right_abs, g.x + g.width + max_expand)
if new_right > g.x + g.width: if new_right > g.x + g.width:
@@ -1936,7 +1953,6 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
expanded = True expanded = True
if expanded: if expanded:
# Re-assign words to this expanded column
col_left_rel = g.x - left_x col_left_rel = g.x - left_x
col_right_rel = col_left_rel + g.width col_right_rel = col_left_rel + g.width
g.words = [wd for wd in word_dicts g.words = [wd for wd in word_dicts
@@ -1944,11 +1960,10 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
g.word_count = len(g.words) g.word_count = len(g.words)
g.width_ratio = g.width / content_w if content_w > 0 else 0.0 g.width_ratio = g.width / content_w if content_w > 0 else 0.0
logger.info( logger.info(
"ColumnGeometry: expanded narrow col %d " "ExpandNarrowCols: col %d (%.1f%%%.1f%%) x=%d w=%d words=%d",
"(%.1f%%%.1f%%) x=%d w=%d", i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count)
i, col_pct, g.width / content_w * 100, g.x, g.width)
return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv) return geometries
# ============================================================================= # =============================================================================

View File

@@ -51,6 +51,7 @@ from cv_vocab_pipeline import (
deskew_image_by_word_alignment, deskew_image_by_word_alignment,
detect_column_geometry, detect_column_geometry,
detect_row_geometry, detect_row_geometry,
expand_narrow_columns,
_apply_shear, _apply_shear,
dewarp_image, dewarp_image,
dewarp_image_manual, dewarp_image_manual,
@@ -802,6 +803,9 @@ async def detect_columns(session_id: str):
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x, geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
top_y=top_y, header_y=header_y, footer_y=footer_y) top_y=top_y, header_y=header_y, footer_y=footer_y)
# Expand narrow columns (sub-columns are often very narrow)
geometries = expand_narrow_columns(geometries, content_w, left_x, word_dicts)
# Phase B: Content-based classification # Phase B: Content-based classification
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y, regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
left_x=left_x, right_x=right_x, inv=inv) left_x=left_x, right_x=right_x, inv=inv)