Fix false positive: exclude first/last rows from single-cell heading detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s

Page numbers like "two hundred and twelve" in the last row were incorrectly
detected as headings. The first and last non-header rows are now excluded
from single-cell heading detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 08:06:05 +01:00
parent 7c5d95b858
commit bc5ab29c06
2 changed files with 29 additions and 0 deletions

View File

@@ -706,11 +706,21 @@ def _detect_heading_rows_by_single_cell(
if multi_col_rows < len(rows) * 0.4: if multi_col_rows < len(rows) * 0.4:
continue continue
# Exclude first and last non-header rows — these are typically
# page numbers or footer text, not headings.
non_header_rows = [r for r in rows if not r.get("is_header")]
if len(non_header_rows) < 3:
continue
first_ri = non_header_rows[0]["index"]
last_ri = non_header_rows[-1]["index"]
heading_row_indices = [] heading_row_indices = []
for row in rows: for row in rows:
if row.get("is_header"): if row.get("is_header"):
continue continue
ri = row["index"] ri = row["index"]
if ri == first_ri or ri == last_ri:
continue
row_cells = [c for c in cells if c.get("row_index") == ri] row_cells = [c for c in cells if c.get("row_index") == ri]
content_cells = [ content_cells = [
c for c in row_cells c for c in row_cells

View File

@@ -675,3 +675,22 @@ class TestDetectHeadingRowsBySingleCell:
assert len(heading) == 1 assert len(heading) == 1
assert heading[0]["col_type"] == "heading" assert heading[0]["col_type"] == "heading"
assert heading[0]["col_index"] == 1 # Should start at column_2, not 0 assert heading[0]["col_index"] == 1 # Should start at column_2, not 0
def test_last_row_single_cell_not_heading(self):
    """A lone cell in the final row must not be promoted to a heading.

    The last row (index 7) is reduced to a single ``column_2`` cell
    holding page-number-like text. After detection exactly one heading
    row is expected, and none of the cells marked as ``heading`` may
    belong to that last row.
    """
    last_row = 7
    zone = self._make_vocab_zone()

    # Drop every existing cell of the last row, then re-add just one
    # cell in column_2 so the row looks like a stray page number.
    zone["cells"] = [
        cell for cell in zone["cells"] if cell["row_index"] != last_row
    ]
    page_number_cell = {
        "cell_id": "Z0_R07_C1",
        "zone_index": 0,
        "row_index": last_row,
        "col_index": 1,
        "col_type": "column_2",
        "text": "two hundred and twelve",
        "word_boxes": [self._make_word_box("two", 130, 310, 30, 20)],
    }
    zone["cells"].append(page_number_cell)

    detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)

    # Only the fixture's "Theme" row (row 4) should count as a heading;
    # the single-cell last row must be left alone.
    assert detected == 1
    assert not any(
        cell["row_index"] == last_row
        for cell in zone["cells"]
        if cell.get("col_type") == "heading"
    )