Fix false positive: exclude first/last rows from single-cell heading detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
Page numbers like "two hundred and twelve" in the last row were falsely detected as headings. The first and last non-header rows are now excluded from single-cell heading detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -706,11 +706,21 @@ def _detect_heading_rows_by_single_cell(
|
||||
if multi_col_rows < len(rows) * 0.4:
|
||||
continue
|
||||
|
||||
# Exclude first and last non-header rows — these are typically
|
||||
# page numbers or footer text, not headings.
|
||||
non_header_rows = [r for r in rows if not r.get("is_header")]
|
||||
if len(non_header_rows) < 3:
|
||||
continue
|
||||
first_ri = non_header_rows[0]["index"]
|
||||
last_ri = non_header_rows[-1]["index"]
|
||||
|
||||
heading_row_indices = []
|
||||
for row in rows:
|
||||
if row.get("is_header"):
|
||||
continue
|
||||
ri = row["index"]
|
||||
if ri == first_ri or ri == last_ri:
|
||||
continue
|
||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
||||
content_cells = [
|
||||
c for c in row_cells
|
||||
|
||||
@@ -675,3 +675,22 @@ class TestDetectHeadingRowsBySingleCell:
|
||||
assert len(heading) == 1
|
||||
assert heading[0]["col_type"] == "heading"
|
||||
assert heading[0]["col_index"] == 1 # Should start at column_2, not 0
|
||||
|
||||
def test_last_row_single_cell_not_heading(self):
    """A lone cell in the final row (e.g. a page number) must NOT become a heading."""
    last_row = 7
    zone = self._make_vocab_zone()

    # Strip every cell from the last row, then give that row exactly one
    # cell in column_2 so it looks like a single-cell heading candidate.
    zone["cells"] = [
        cell for cell in zone["cells"] if cell["row_index"] != last_row
    ]
    page_number_cell = {
        "cell_id": "Z0_R07_C1",
        "zone_index": 0,
        "row_index": last_row,
        "col_index": 1,
        "col_type": "column_2",
        "text": "two hundred and twelve",
        "word_boxes": [self._make_word_box("two", 130, 310, 30, 20)],
    }
    zone["cells"].append(page_number_cell)

    count = _detect_heading_rows_by_single_cell([zone], 800, 1000)

    # Only the fixture's row 4 ("Theme") is expected to be promoted; the
    # last row has to be left alone.
    assert count == 1
    # Re-read zone["cells"] after the call so we inspect the detector's result.
    promoted = (
        cell for cell in zone["cells"] if cell.get("col_type") == "heading"
    )
    assert not any(cell["row_index"] == last_row for cell in promoted)
|
||||
|
||||
Reference in New Issue
Block a user