Fix heading col_index + detect black single-cell headings like "Theme"

- Color headings now preserve the actual starting col_index instead of a hardcoded 0
- New _detect_heading_rows_by_single_cell: detects rows with only 1 content cell (excl. page_ref) as headings — catches black headings like "Theme" that have normal color/height but are alone in their row
- Runs after Step 5d (IPA continuation) to avoid false positives
- 5 new tests (32 total)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 08:00:06 +01:00
parent 65059471cf
commit 7c5d95b858
2 changed files with 300 additions and 2 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -623,12 +623,14 @@ def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)

+                # Use the actual starting col_index from the first cell
+                first_col = min(hc["col_index"] for hc in header_cells)
                zone_idx = z.get("zone_index", 0)
                z["cells"].append({
-                    "cell_id": f"Z{zone_idx}_R{hri:02d}_C0",
+                    "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}",
                    "zone_index": zone_idx,
                    "row_index": hri,
-                    "col_index": 0,
+                    "col_index": first_col,
                    "col_type": "heading",
                    "text": " ".join(all_text_parts),
                    "confidence": 0.0,
@@ -654,6 +656,139 @@ def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int
    return heading_count


+def _detect_heading_rows_by_single_cell(
+    zones_data: List[Dict], img_w: int, img_h: int,
+) -> int:
+    """Detect heading rows that have only a single content cell.
+
+    Black headings like "Theme" have normal color and height, so they are
+    missed by ``_detect_heading_rows_by_color``.  The distinguishing signal
+    is that they occupy only one column while normal vocabulary rows fill
+    at least 2-3 columns.
+
+    A row qualifies as a heading if:
+    1. It is not already marked as a header/heading.
+    2. It has exactly ONE cell whose col_type starts with ``column_``
+       (excluding column_1 / page_ref which only carries page numbers).
+    3. That single cell is NOT in the last column (continuation/example
+       lines like "2. Veränderung, Wechsel" often sit alone in column_4).
+    4. The text does not start with ``[`` (IPA continuation).
+    5. The zone has ≥3 columns and ≥5 rows (avoids false positives in
+       tiny zones).
+    6. At least 40% of the zone's rows have ≥2 content cells (ensures
+       we are in a multi-column vocab layout).
+    """
+    heading_count = 0
+
+    for z in zones_data:
+        cells = z.get("cells", [])
+        rows = z.get("rows", [])
+        columns = z.get("columns", [])
+        if len(columns) < 3 or len(rows) < 5:
+            continue
+
+        # Determine the last col_index (example/sentence column)
+        col_indices = sorted(set(c.get("col_index", 0) for c in cells))
+        if not col_indices:
+            continue
+        last_col = col_indices[-1]
+
+        # Count content cells per row (column_* but not column_1/page_ref)
+        row_content_counts: Dict[int, int] = {}
+        for cell in cells:
+            ct = cell.get("col_type", "")
+            if ct.startswith("column_") and ct != "column_1":
+                ri = cell.get("row_index", -1)
+                row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
+
+        # At least 40% of the zone's rows must have ≥2 content cells
+        multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
+        if multi_col_rows < len(rows) * 0.4:
+            continue
+
+        heading_row_indices = []
+        for row in rows:
+            if row.get("is_header"):
+                continue
+            ri = row["index"]
+            row_cells = [c for c in cells if c.get("row_index") == ri]
+            content_cells = [
+                c for c in row_cells
+                if c.get("col_type", "").startswith("column_")
+                and c.get("col_type") != "column_1"
+            ]
+            if len(content_cells) != 1:
+                continue
+            cell = content_cells[0]
+            # Not in the last column (continuation/example lines)
+            if cell.get("col_index") == last_col:
+                continue
+            text = (cell.get("text") or "").strip()
+            if not text or text.startswith("["):
+                continue
+            heading_row_indices.append(ri)
+
+        for hri in heading_row_indices:
+            header_cells = [c for c in cells if c.get("row_index") == hri]
+            if not header_cells:
+                continue
+
+            # Collect all word_boxes and text
+            all_wb = []
+            all_text_parts = []
+            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
+                all_wb.extend(hc.get("word_boxes", []))
+                if hc.get("text", "").strip():
+                    all_text_parts.append(hc["text"].strip())
+
+            first_col_idx = min(hc["col_index"] for hc in header_cells)
+
+            # Remove old cells for this row, add spanning heading cell
+            z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
+
+            if all_wb:
+                x_min = min(wb["left"] for wb in all_wb)
+                y_min = min(wb["top"] for wb in all_wb)
+                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
+                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
+            else:
+                # Fallback to first cell bbox
+                bp = header_cells[0].get("bbox_px", {})
+                x_min = bp.get("x", 0)
+                y_min = bp.get("y", 0)
+                x_max = x_min + bp.get("w", 0)
+                y_max = y_min + bp.get("h", 0)
+
+            zone_idx = z.get("zone_index", 0)
+            z["cells"].append({
+                "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}",
+                "zone_index": zone_idx,
+                "row_index": hri,
+                "col_index": first_col_idx,
+                "col_type": "heading",
+                "text": " ".join(all_text_parts),
+                "confidence": 0.0,
+                "bbox_px": {"x": x_min, "y": y_min,
+                            "w": x_max - x_min, "h": y_max - y_min},
+                "bbox_pct": {
+                    "x": round(x_min / img_w * 100, 2) if img_w else 0,
+                    "y": round(y_min / img_h * 100, 2) if img_h else 0,
+                    "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
+                    "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
+                },
+                "word_boxes": all_wb,
+                "ocr_engine": "words_first",
+                "is_bold": False,
+            })
+
+            for row in rows:
+                if row["index"] == hri:
+                    row["is_header"] = True
+            heading_count += 1
+
+    return heading_count
+
+
 def _detect_header_rows(
    rows: List[Dict],
    zone_words: List[Dict],
@@ -1680,6 +1815,14 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
        if ipa_cont_fixed:
            logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)

+    # 5e. Heading detection by single-cell rows — black headings like
+    # "Theme" that have normal color and height but are the ONLY cell
+    # in their row (excluding page_ref column_1).  Must run AFTER 5d
+    # so IPA continuation cells are already processed.
+    single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
+    if single_heading_count:
+        logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
+
    duration = time.time() - t0

    # 6. Build result