Fix heading col_index + detect black single-cell headings like "Theme"

- Color headings now preserve the actual starting col_index instead of a hardcoded 0
- New _detect_heading_rows_by_single_cell: detects rows with only 1 content cell (excl. page_ref) as headings — catches black headings like "Theme" that have normal color/height but are alone in their row
- Runs after Step 5d (IPA continuation) to avoid false positives
- 5 new tests (32 total)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 08:00:06 +01:00
parent 65059471cf
commit 7c5d95b858
2 changed files with 300 additions and 2 deletions
@@ -18,6 +18,7 @@ from grid_editor_api import (
    _filter_border_ghosts,
    _detect_header_rows,
    _detect_heading_rows_by_color,
+    _detect_heading_rows_by_single_cell,
 )
 from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell

@@ -520,3 +521,157 @@ class TestGarbledIpaDetection:
        # But text does NOT start with '[' — Step 5d bracket guard blocks it
        text = "employee [im'ploi:]"
        assert not (text.strip().startswith('[') and text.strip().endswith(']'))
+
+
+# ---------------------------------------------------------------------------
+# _detect_heading_rows_by_single_cell
+# ---------------------------------------------------------------------------
+
+class TestDetectHeadingRowsBySingleCell:
+    """Test heading detection for black single-cell rows (e.g. 'Theme').
+
+    The first four tests exercise ``_detect_heading_rows_by_single_cell``;
+    the last one covers the col_index handling of
+    ``_detect_heading_rows_by_color``. Both functions are called as
+    ``fn(zones_data, 800, 1000)``, update the zone dicts in place and return
+    the number of headings they detected.
+
+    NOTE(review): 800 / 1000 are presumably the page width / height in px
+    (they match the fixture's ``bbox_px``) — confirm against grid_editor_api.
+    """
+
+    def _make_word_box(self, text, left, top, width, height, color="black"):
+        """Return a word-box dict with the given text, geometry and color.
+
+        ``conf`` is fixed at 90 for every box built by this helper.
+        """
+        return {
+            "text": text, "left": left, "top": top,
+            "width": width, "height": height, "color_name": color, "conf": 90,
+        }
+
+    @staticmethod
+    def _heading_cells(zone):
+        """Return the cells of *zone* whose col_type is "heading"."""
+        return [c for c in zone["cells"] if c.get("col_type") == "heading"]
+
+    def _make_vocab_zone(self):
+        """Build a 4-column vocab zone with 8 rows.
+
+        Columns: column_1 (page_ref), column_2 (EN), column_3 (DE), column_4 (example)
+        Rows 0-3 and 5-7 are full vocab rows with one cell in each column.
+        Row 4 has only 1 cell in column_2 → heading candidate ("Theme").
+
+        Rows are 30 px apart starting at y=100 and 20 px tall.
+        """
+        cells = []
+        for ri in range(8):
+            top = 100 + ri * 30
+            # Each spec: (col_index, cell text, word-box text, left, width)
+            if ri == 4:
+                # Single-cell row: "Theme" in column_2 only
+                row_specs = [(1, "Theme", "Theme", 130, 70)]
+            else:
+                # Normal vocab row: one cell in each of the 4 columns
+                row_specs = [
+                    (0, f"p.{70 + ri}", f"p.{70 + ri}", 10, 30),
+                    (1, f"word_{ri}", f"word_{ri}", 130, 80),
+                    (2, f"Wort_{ri}", f"Wort_{ri}", 400, 80),
+                    # The example cell's single word box carries only "Example".
+                    (3, f"Example sentence {ri}.", "Example", 600, 120),
+                ]
+            for ci, text, box_text, left, width in row_specs:
+                cells.append({
+                    "cell_id": f"Z0_R{ri:02d}_C{ci}",
+                    "zone_index": 0, "row_index": ri, "col_index": ci,
+                    "col_type": f"column_{ci + 1}", "text": text,
+                    "word_boxes": [self._make_word_box(box_text, left, top, width, 20)],
+                })
+
+        rows = [
+            {"index": ri, "y_min_px": 100 + ri * 30, "y_max_px": 120 + ri * 30, "is_header": False}
+            for ri in range(8)
+        ]
+        columns = [
+            {"col_index": ci, "col_type": f"column_{ci + 1}"}
+            for ci in range(4)
+        ]
+        return {
+            "zone_index": 0, "zone_type": "content",
+            "bbox_px": {"x": 0, "y": 0, "w": 800, "h": 1000},
+            "cells": cells, "rows": rows, "columns": columns,
+        }
+
+    def test_single_cell_heading_detected(self):
+        """Row with only 1 content cell in column_2 → heading."""
+        zone = self._make_vocab_zone()
+        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)
+        assert count == 1
+        heading_cells = [c for c in zone["cells"] if c["row_index"] == 4]
+        assert len(heading_cells) == 1
+        assert heading_cells[0]["col_type"] == "heading"
+        assert heading_cells[0]["text"] == "Theme"
+        assert heading_cells[0]["col_index"] == 1  # Starts at column_2, not 0
+
+    def test_single_cell_in_last_column_not_heading(self):
+        """Row with only 1 cell in column_4 (last) → NOT heading (continuation)."""
+        zone = self._make_vocab_zone()
+        # Swap row 4's "Theme" cell (column_2) for a lone cell in the last
+        # column, so row 4 is still a single-cell row but in column_4.
+        zone["cells"] = [c for c in zone["cells"]
+                         if not (c["row_index"] == 4 and c["col_index"] == 1)]
+        zone["cells"].append({
+            "cell_id": "Z0_R04_C3",
+            "zone_index": 0, "row_index": 4, "col_index": 3,
+            "col_type": "column_4", "text": "2. Veränderung",
+            "word_boxes": [self._make_word_box("2.", 600, 220, 100, 20)],
+        })
+        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)
+        # Row 4 is the only single-cell row and it sits in the last column,
+        # so nothing in the zone may be reported or marked as a heading.
+        assert count == 0
+        assert self._heading_cells(zone) == []
+
+    def test_ipa_bracket_text_not_heading(self):
+        """Row with single cell starting with '[' → IPA continuation, not heading."""
+        zone = self._make_vocab_zone()
+        # Replace "Theme" with IPA continuation; keep the word box text in
+        # sync with the cell text so the fixture stays self-consistent.
+        for c in zone["cells"]:
+            if c["row_index"] == 4 and c["col_index"] == 1:
+                c["text"] = "[θˈiːm]"
+                c["word_boxes"][0]["text"] = "[θˈiːm]"
+                break
+        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)
+        assert count == 0
+        assert self._heading_cells(zone) == []
+
+    def test_multi_cell_row_not_heading(self):
+        """Normal vocab row with multiple cells → NOT heading."""
+        zone = self._make_vocab_zone()
+        count = _detect_heading_rows_by_single_cell([zone], 800, 1000)
+        # Only row 4 (Theme) should be heading, all other rows have 4 cells
+        assert count == 1
+        # Verify normal rows are NOT marked as heading; listing the offending
+        # cell ids makes a failure self-explanatory.
+        stray = [c.get("cell_id") for c in self._heading_cells(zone)
+                 if c["row_index"] != 4]
+        assert stray == []
+
+    def test_color_heading_preserves_correct_col_index(self):
+        """Color heading starting in column_2 → col_index should be 1, not 0."""
+        zone = self._make_vocab_zone()
+        # Make row 3 a color heading: blue words in column_2 and column_3 only
+        # (no column_1 page_ref for this row)
+        zone["cells"] = [c for c in zone["cells"] if c["row_index"] != 3]
+        zone["cells"].append({
+            "cell_id": "Z0_R03_C1", "zone_index": 0, "row_index": 3,
+            "col_index": 1, "col_type": "column_2", "text": "Unit 4:",
+            "word_boxes": [self._make_word_box("Unit", 130, 190, 50, 26, "blue"),
+                           self._make_word_box("4:", 185, 190, 20, 26, "blue")],
+        })
+        zone["cells"].append({
+            "cell_id": "Z0_R03_C2", "zone_index": 0, "row_index": 3,
+            "col_index": 2, "col_type": "column_3", "text": "Scotland",
+            "word_boxes": [self._make_word_box("Scotland", 400, 190, 100, 26, "blue")],
+        })
+        count = _detect_heading_rows_by_color([zone], 800, 1000)
+        assert count == 1
+        # The two blue cells of row 3 are expected to end up as one heading cell.
+        heading = [c for c in zone["cells"] if c["row_index"] == 3]
+        assert len(heading) == 1
+        assert heading[0]["col_type"] == "heading"
+        assert heading[0]["col_index"] == 1  # Should start at column_2, not 0