fix(sub-columns): exclude header/footer words from alignment clustering
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s

Header/footer words (page numbers, chapter titles) could pollute the
left-edge alignment bins and trigger false sub-column splits.
_detect_header_footer_gaps() now runs before sub-column detection, and
its header/footer boundaries are passed to _detect_sub_columns(), which
excludes those words from both the alignment clustering and the split
threshold check.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 07:33:54 +01:00
parent 3904ddb493
commit 6527beae03
3 changed files with 101 additions and 5 deletions

View File

@@ -1038,6 +1038,9 @@ def _detect_sub_columns(
geometries: List[ColumnGeometry],
content_w: int,
left_x: int = 0,
top_y: int = 0,
header_y: Optional[int] = None,
footer_y: Optional[int] = None,
_edge_tolerance: int = 8,
_min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
@@ -1053,6 +1056,11 @@ def _detect_sub_columns(
while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
bridges the two coordinate systems.
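If *header_y* / *footer_y* are provided (absolute y-coordinates), words
whose ``top`` is less than the header boundary or greater than the footer
boundary are excluded from alignment clustering and from the sub-column
word count used in the split threshold check, to avoid polluting the bins
with page numbers or chapter titles. Word ``top`` values are relative to
*top_y*, so both boundaries are converted by subtracting *top_y* before
they are compared.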
Returns a new list of ColumnGeometry — potentially longer than the input.
"""
if content_w <= 0:
@@ -1065,8 +1073,15 @@ def _detect_sub_columns(
result.append(geo)
continue
# Collect left-edges of confident words
confident = [w for w in geo.words if w.get('conf', 0) >= 30]
# Collect left-edges of confident words, excluding header/footer
# Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y)
min_top_rel = (header_y - top_y) if header_y is not None else None
max_top_rel = (footer_y - top_y) if footer_y is not None else None
confident = [w for w in geo.words
if w.get('conf', 0) >= 30
and (min_top_rel is None or w['top'] >= min_top_rel)
and (max_top_rel is None or w['top'] <= max_top_rel)]
if len(confident) < 3:
result.append(geo)
continue
@@ -1101,7 +1116,12 @@ def _detect_sub_columns(
sub_words = [w for w in geo.words if w['left'] < split_threshold]
main_words = [w for w in geo.words if w['left'] >= split_threshold]
if len(sub_words) < 2 or len(sub_words) / len(geo.words) >= 0.35:
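# Count only body words (excluding header/footer) on the sub-column side
# for the threshold check, so that header/footer words don't artificially
# trigger a split. Note: the ratio is still taken against all words in
# the column (len(geo.words)), header/footer words included.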
sub_body = [w for w in sub_words
if (min_top_rel is None or w['top'] >= min_top_rel)
and (max_top_rel is None or w['top'] <= max_top_rel)]
if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
result.append(geo)
continue
@@ -2854,8 +2874,12 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result
content_w = right_x - left_x
# Detect header/footer early so sub-column clustering ignores them
header_y, footer_y = _detect_header_footer_gaps(_inv, w, h) if _inv is not None else (None, None)
# Split sub-columns (e.g. page references) before classification
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x)
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
top_y=top_y, header_y=header_y, footer_y=footer_y)
# Phase B: Content-based classification
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,