fix(sub-columns): exclude header/footer words from alignment clustering
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 24s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s
Header/footer words (page numbers, chapter titles) could pollute the left-edge alignment bins and trigger false sub-column splits. _detect_header_footer_gaps() now runs early, and the boundaries it returns are passed to _detect_sub_columns(), which excludes those words from both the alignment clustering and the split-threshold check. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1038,6 +1038,9 @@ def _detect_sub_columns(
|
||||
geometries: List[ColumnGeometry],
|
||||
content_w: int,
|
||||
left_x: int = 0,
|
||||
top_y: int = 0,
|
||||
header_y: Optional[int] = None,
|
||||
footer_y: Optional[int] = None,
|
||||
_edge_tolerance: int = 8,
|
||||
_min_col_start_ratio: float = 0.10,
|
||||
) -> List[ColumnGeometry]:
|
||||
@@ -1053,6 +1056,11 @@ def _detect_sub_columns(
|
||||
while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
|
||||
bridges the two coordinate systems.
|
||||
|
||||
If *header_y* / *footer_y* are provided (absolute y-coordinates), words
|
||||
in header/footer regions are excluded from alignment clustering to avoid
|
||||
polluting the bins with page numbers or chapter titles. Word ``top``
|
||||
values are relative to *top_y*.
|
||||
|
||||
Returns a new list of ColumnGeometry — potentially longer than the input.
|
||||
"""
|
||||
if content_w <= 0:
|
||||
@@ -1065,8 +1073,15 @@ def _detect_sub_columns(
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
# Collect left-edges of confident words
|
||||
confident = [w for w in geo.words if w.get('conf', 0) >= 30]
|
||||
# Collect left-edges of confident words, excluding header/footer
|
||||
# Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y)
|
||||
min_top_rel = (header_y - top_y) if header_y is not None else None
|
||||
max_top_rel = (footer_y - top_y) if footer_y is not None else None
|
||||
|
||||
confident = [w for w in geo.words
|
||||
if w.get('conf', 0) >= 30
|
||||
and (min_top_rel is None or w['top'] >= min_top_rel)
|
||||
and (max_top_rel is None or w['top'] <= max_top_rel)]
|
||||
if len(confident) < 3:
|
||||
result.append(geo)
|
||||
continue
|
||||
@@ -1101,7 +1116,12 @@ def _detect_sub_columns(
|
||||
sub_words = [w for w in geo.words if w['left'] < split_threshold]
|
||||
main_words = [w for w in geo.words if w['left'] >= split_threshold]
|
||||
|
||||
if len(sub_words) < 2 or len(sub_words) / len(geo.words) >= 0.35:
|
||||
# Count only body words (excluding header/footer) for the threshold check
|
||||
# so that header/footer words don't artificially trigger a split.
|
||||
sub_body = [w for w in sub_words
|
||||
if (min_top_rel is None or w['top'] >= min_top_rel)
|
||||
and (max_top_rel is None or w['top'] <= max_top_rel)]
|
||||
if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
@@ -2854,8 +2874,12 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
||||
geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result
|
||||
content_w = right_x - left_x
|
||||
|
||||
# Detect header/footer early so sub-column clustering ignores them
|
||||
header_y, footer_y = _detect_header_footer_gaps(_inv, w, h) if _inv is not None else (None, None)
|
||||
|
||||
# Split sub-columns (e.g. page references) before classification
|
||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x)
|
||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
||||
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
||||
|
||||
# Phase B: Content-based classification
|
||||
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
||||
|
||||
@@ -34,6 +34,7 @@ from cv_vocab_pipeline import (
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
_cells_to_vocab_entries,
|
||||
_detect_header_footer_gaps,
|
||||
_detect_sub_columns,
|
||||
_fix_character_confusion,
|
||||
_fix_phonetic_brackets,
|
||||
@@ -699,8 +700,12 @@ async def detect_columns(session_id: str):
|
||||
cached["_inv"] = inv
|
||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||
|
||||
# Detect header/footer early so sub-column clustering ignores them
|
||||
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
|
||||
|
||||
# Split sub-columns (e.g. page references) before classification
|
||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x)
|
||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
||||
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
||||
|
||||
# Phase B: Content-based classification
|
||||
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
||||
|
||||
@@ -1330,6 +1330,73 @@ class TestSubColumnDetection:
|
||||
assert result[0].word_count == 3
|
||||
assert result[1].word_count == 40
|
||||
|
||||
def test_header_words_excluded_from_alignment(self):
|
||||
"""Header words (top < header_y) should not participate in alignment clustering.
|
||||
|
||||
Without header_y: 3 header words at left=100 + 40 content words at left=250
|
||||
would cause a split (3 outliers vs 40 main).
|
||||
With header_y: the 3 header words are excluded from clustering, leaving only
|
||||
40 uniform words at left=250 → no split.
|
||||
"""
|
||||
content_w = 1000
|
||||
top_y = 0
|
||||
# Header words: top=5 (relative to top_y=0), well above header_y=50
|
||||
header_words = [{'left': 100, 'top': 5, 'width': 50, 'height': 20,
|
||||
'text': f"Ch.{i}", 'conf': 90} for i in range(3)]
|
||||
# Content words: top=200, below header_y=50
|
||||
content_words = [{'left': 250, 'top': 200, 'width': 50, 'height': 20,
|
||||
'text': f"word{i}", 'conf': 90} for i in range(40)]
|
||||
all_words = header_words + content_words
|
||||
geo = self._make_geo(x=80, width=300, words=all_words, content_w=content_w)
|
||||
|
||||
# Without header_y: split happens (3 outliers at left=100)
|
||||
result_no_filter = _detect_sub_columns([geo], content_w)
|
||||
assert len(result_no_filter) == 2, "Should split without header filtering"
|
||||
|
||||
# With header_y=50: header words excluded, only 40 uniform words remain → no split
|
||||
result_filtered = _detect_sub_columns([geo], content_w, top_y=top_y, header_y=50)
|
||||
assert len(result_filtered) == 1, "Should NOT split with header words excluded"
|
||||
assert result_filtered[0].word_count == 43 # all words still in the geometry
|
||||
|
||||
def test_footer_words_excluded_from_alignment(self):
|
||||
"""Footer words (top > footer_y) should not participate in alignment clustering.
|
||||
|
||||
Analogous to the header test, but with footer words at the bottom.
|
||||
"""
|
||||
content_w = 1000
|
||||
top_y = 0
|
||||
# Content words: top=200, above footer_y=800
|
||||
content_words = [{'left': 250, 'top': 200, 'width': 50, 'height': 20,
|
||||
'text': f"word{i}", 'conf': 90} for i in range(40)]
|
||||
# Footer words: top=900, below footer_y=800
|
||||
footer_words = [{'left': 100, 'top': 900, 'width': 50, 'height': 20,
|
||||
'text': f"p.{i}", 'conf': 90} for i in range(3)]
|
||||
all_words = content_words + footer_words
|
||||
geo = self._make_geo(x=80, width=300, words=all_words, content_w=content_w)
|
||||
|
||||
# Without footer_y: split happens (3 outliers at left=100)
|
||||
result_no_filter = _detect_sub_columns([geo], content_w)
|
||||
assert len(result_no_filter) == 2, "Should split without footer filtering"
|
||||
|
||||
# With footer_y=800: footer words excluded → no split
|
||||
result_filtered = _detect_sub_columns([geo], content_w, top_y=top_y, footer_y=800)
|
||||
assert len(result_filtered) == 1, "Should NOT split with footer words excluded"
|
||||
assert result_filtered[0].word_count == 43
|
||||
|
||||
def test_header_footer_none_no_filtering(self):
|
||||
"""header_y=None, footer_y=None → same behavior as before (no filtering)."""
|
||||
content_w = 1000
|
||||
page_words = [self._make_word(100, f"p.{59+i}") for i in range(3)]
|
||||
vocab_words = [self._make_word(250, f"word{i}") for i in range(40)]
|
||||
all_words = page_words + vocab_words
|
||||
geo = self._make_geo(x=80, width=300, words=all_words, content_w=content_w)
|
||||
|
||||
result = _detect_sub_columns([geo], content_w, header_y=None, footer_y=None)
|
||||
|
||||
assert len(result) == 2, "Should still split with None header/footer"
|
||||
assert result[0].word_count == 3
|
||||
assert result[1].word_count == 40
|
||||
|
||||
|
||||
class TestCellsToVocabEntriesPageRef:
|
||||
"""Test that page_ref cells are mapped to source_page field."""
|
||||
|
||||
Reference in New Issue
Block a user