Fix false positive: exclude first/last rows from single-cell heading detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
Page numbers like "two hundred and twelve" in the last row were falsely detected as headings. The first and last non-header rows are now excluded from single-cell heading detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -706,11 +706,21 @@ def _detect_heading_rows_by_single_cell(
|
|||||||
if multi_col_rows < len(rows) * 0.4:
|
if multi_col_rows < len(rows) * 0.4:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Exclude first and last non-header rows — these are typically
|
||||||
|
# page numbers or footer text, not headings.
|
||||||
|
non_header_rows = [r for r in rows if not r.get("is_header")]
|
||||||
|
if len(non_header_rows) < 3:
|
||||||
|
continue
|
||||||
|
first_ri = non_header_rows[0]["index"]
|
||||||
|
last_ri = non_header_rows[-1]["index"]
|
||||||
|
|
||||||
heading_row_indices = []
|
heading_row_indices = []
|
||||||
for row in rows:
|
for row in rows:
|
||||||
if row.get("is_header"):
|
if row.get("is_header"):
|
||||||
continue
|
continue
|
||||||
ri = row["index"]
|
ri = row["index"]
|
||||||
|
if ri == first_ri or ri == last_ri:
|
||||||
|
continue
|
||||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
row_cells = [c for c in cells if c.get("row_index") == ri]
|
||||||
content_cells = [
|
content_cells = [
|
||||||
c for c in row_cells
|
c for c in row_cells
|
||||||
|
|||||||
@@ -675,3 +675,22 @@ class TestDetectHeadingRowsBySingleCell:
|
|||||||
assert len(heading) == 1
|
assert len(heading) == 1
|
||||||
assert heading[0]["col_type"] == "heading"
|
assert heading[0]["col_type"] == "heading"
|
||||||
assert heading[0]["col_index"] == 1 # Should start at column_2, not 0
|
assert heading[0]["col_index"] == 1 # Should start at column_2, not 0
|
||||||
|
|
||||||
|
def test_last_row_single_cell_not_heading(self):
|
||||||
|
"""Single-cell in last row (e.g. page number '212') → NOT heading."""
|
||||||
|
zone = self._make_vocab_zone()
|
||||||
|
# Make row 7 (the last) have only 1 cell in column_2
|
||||||
|
zone["cells"] = [c for c in zone["cells"] if c["row_index"] != 7]
|
||||||
|
zone["cells"].append({
|
||||||
|
"cell_id": "Z0_R07_C1",
|
||||||
|
"zone_index": 0, "row_index": 7, "col_index": 1,
|
||||||
|
"col_type": "column_2", "text": "two hundred and twelve",
|
||||||
|
"word_boxes": [self._make_word_box("two", 130, 310, 30, 20)],
|
||||||
|
})
|
||||||
|
zones_data = [zone]
|
||||||
|
count = _detect_heading_rows_by_single_cell(zones_data, 800, 1000)
|
||||||
|
# Row 4 "Theme" = heading, but row 7 (last) should NOT be heading
|
||||||
|
assert count == 1
|
||||||
|
heading_cells = [c for c in zone["cells"]
|
||||||
|
if c.get("col_type") == "heading"]
|
||||||
|
assert all(c["row_index"] != 7 for c in heading_cells)
|
||||||
|
|||||||
Reference in New Issue
Block a user