Fix false positive: exclude first/last rows from single-cell heading detection

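Page numbers such as "two hundred and twelve" in the last row were falsely detected as headings. The first and last non-header rows are now excluded from single-cell heading detection; when fewer than three non-header rows are present, detection is skipped for that set of rows.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>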
2026-03-20 08:06:05 +01:00
parent 7c5d95b858
commit bc5ab29c06
2 changed files with 29 additions and 0 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -706,11 +706,21 @@ def _detect_heading_rows_by_single_cell(
        if multi_col_rows < len(rows) * 0.4:
            continue

+        # Exclude first and last non-header rows — these are typically
+        # page numbers or footer text, not headings.
+        non_header_rows = [r for r in rows if not r.get("is_header")]
+        if len(non_header_rows) < 3:
+            continue
+        first_ri = non_header_rows[0]["index"]
+        last_ri = non_header_rows[-1]["index"]
+
        heading_row_indices = []
        for row in rows:
            if row.get("is_header"):
                continue
            ri = row["index"]
+            if ri == first_ri or ri == last_ri:
+                continue
            row_cells = [c for c in cells if c.get("row_index") == ri]
            content_cells = [
                c for c in row_cells