Fix false positive: exclude first/last rows from single-cell heading detection

Spelled-out page numbers such as "two hundred and twelve" in the last row were falsely detected as headings. The first and last non-header rows are now excluded from single-cell heading detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 08:06:05 +01:00
parent 7c5d95b858
commit bc5ab29c06
2 changed files with 29 additions and 0 deletions
@@ -706,11 +706,21 @@ def _detect_heading_rows_by_single_cell(
        if multi_col_rows < len(rows) * 0.4:
            continue
        # Exclude first and last non-header rows — these are typically
        # page numbers or footer text, not headings.
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if len(non_header_rows) < 3:
            continue
        first_ri = non_header_rows[0]["index"]
        last_ri = non_header_rows[-1]["index"]
        heading_row_indices = []
        for row in rows:
            if row.get("is_header"):
                continue
            ri = row["index"]
            if ri == first_ri or ri == last_ri:
                continue
            row_cells = [c for c in cells if c.get("row_index") == ri]
            content_cells = [
                c for c in row_cells
@@ -675,3 +675,22 @@ class TestDetectHeadingRowsBySingleCell:
        assert len(heading) == 1
        assert heading[0]["col_type"] == "heading"
        assert heading[0]["col_index"] == 1  # Should start at column_2, not 0
    def test_last_row_single_cell_not_heading(self):
        """A lone cell in the final row must not be promoted to a heading.

        Row 7 (the last row of the vocab fixture) is reduced to a single
        ``column_2`` cell holding a spelled-out page number. The detector
        is expected to report exactly one heading and to leave row 7 alone.
        """
        last_row = 7
        zone = self._make_vocab_zone()

        # Drop every existing row-7 cell, then re-add the row as one cell.
        remaining = [
            cell for cell in zone["cells"] if cell["row_index"] != last_row
        ]
        page_number_cell = {
            "cell_id": "Z0_R07_C1",
            "zone_index": 0,
            "row_index": last_row,
            "col_index": 1,
            "col_type": "column_2",
            "text": "two hundred and twelve",
            "word_boxes": [self._make_word_box("two", 130, 310, 30, 20)],
        }
        zone["cells"] = [*remaining, page_number_cell]

        detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)

        # Only the fixture's "Theme" row (row 4) should count as a heading.
        assert detected == 1
        promoted = [
            cell for cell in zone["cells"]
            if cell.get("col_type") == "heading"
        ]
        assert not any(cell["row_index"] == last_row for cell in promoted)