From 7c5d95b858bbbaebdbe5adfedf698c71dfe67bc4 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 20 Mar 2026 08:00:06 +0100 Subject: [PATCH] Fix heading col_index + detect black single-cell headings like "Theme" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Color headings now preserve actual starting col_index instead of hardcoded 0 - New _detect_heading_rows_by_single_cell: detects rows with only 1 content cell (excl. page_ref) as headings — catches black headings like "Theme" that have normal color/height but are alone in their row - Runs after Step 5d (IPA continuation) to avoid false positives - 5 new tests (32 total) Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 147 ++++++++++++++++- .../backend/tests/test_grid_editor_api.py | 155 ++++++++++++++++++ 2 files changed, 300 insertions(+), 2 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index cd804be..35bc996 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -623,12 +623,14 @@ def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int x_max = max(wb["left"] + wb["width"] for wb in all_wb) y_max = max(wb["top"] + wb["height"] for wb in all_wb) + # Use the actual starting col_index from the first cell + first_col = min(hc["col_index"] for hc in header_cells) zone_idx = z.get("zone_index", 0) z["cells"].append({ - "cell_id": f"Z{zone_idx}_R{hri:02d}_C0", + "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}", "zone_index": zone_idx, "row_index": hri, - "col_index": 0, + "col_index": first_col, "col_type": "heading", "text": " ".join(all_text_parts), "confidence": 0.0, @@ -654,6 +656,139 @@ def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int return heading_count +def _detect_heading_rows_by_single_cell( + zones_data: List[Dict], img_w: int, img_h: int, +) -> int: + """Detect heading rows that have only a single content cell. + + Black headings like "Theme" have normal color and height, so they are + missed by ``_detect_heading_rows_by_color``. The distinguishing signal + is that they occupy only one column while normal vocabulary rows fill + at least 2-3 columns. + + A row qualifies as a heading if: + 1. It is not already marked as a header/heading. + 2. It has exactly ONE cell whose col_type starts with ``column_`` + (excluding column_1 / page_ref which only carries page numbers). + 3. That single cell is NOT in the last column (continuation/example + lines like "2. Veränderung, Wechsel" often sit alone in column_4). + 4. The text does not start with ``[`` (IPA continuation). + 5. The zone has ≥3 columns and ≥5 rows (avoids false positives in + tiny zones). + 6. The majority of rows in the zone have ≥2 content cells (ensures + we are in a multi-column vocab layout). + """ + heading_count = 0 + + for z in zones_data: + cells = z.get("cells", []) + rows = z.get("rows", []) + columns = z.get("columns", []) + if len(columns) < 3 or len(rows) < 5: + continue + + # Determine the last col_index (example/sentence column) + col_indices = sorted(set(c.get("col_index", 0) for c in cells)) + if not col_indices: + continue + last_col = col_indices[-1] + + # Count content cells per row (column_* but not column_1/page_ref) + row_content_counts: Dict[int, int] = {} + for cell in cells: + ct = cell.get("col_type", "") + if ct.startswith("column_") and ct != "column_1": + ri = cell.get("row_index", -1) + row_content_counts[ri] = row_content_counts.get(ri, 0) + 1 + + # Majority of rows must have ≥2 content cells + multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2) + if multi_col_rows < len(rows) * 0.4: + continue + + heading_row_indices = [] + for row in rows: + if row.get("is_header"): + continue + ri = row["index"] + row_cells = [c for c in cells if c.get("row_index") == ri] + content_cells = [ + c for c in row_cells + if c.get("col_type", "").startswith("column_") + and c.get("col_type") != "column_1" + ] + if len(content_cells) != 1: + continue + cell = content_cells[0] + # Not in the last column (continuation/example lines) + if cell.get("col_index") == last_col: + continue + text = (cell.get("text") or "").strip() + if not text or text.startswith("["): + continue + heading_row_indices.append(ri) + + for hri in heading_row_indices: + header_cells = [c for c in cells if c.get("row_index") == hri] + if not header_cells: + continue + + # Collect all word_boxes and text + all_wb = [] + all_text_parts = [] + for hc in sorted(header_cells, key=lambda c: c["col_index"]): + all_wb.extend(hc.get("word_boxes", [])) + if hc.get("text", "").strip(): + all_text_parts.append(hc["text"].strip()) + + first_col_idx = min(hc["col_index"] for hc in header_cells) + + # Remove old cells for this row, add spanning heading cell + z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri] + + if all_wb: + x_min = min(wb["left"] for wb in all_wb) + y_min = min(wb["top"] for wb in all_wb) + x_max = max(wb["left"] + wb["width"] for wb in all_wb) + y_max = max(wb["top"] + wb["height"] for wb in all_wb) + else: + # Fallback to first cell bbox + bp = header_cells[0].get("bbox_px", {}) + x_min = bp.get("x", 0) + y_min = bp.get("y", 0) + x_max = x_min + bp.get("w", 0) + y_max = y_min + bp.get("h", 0) + + zone_idx = z.get("zone_index", 0) + z["cells"].append({ + "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}", + "zone_index": zone_idx, + "row_index": hri, + "col_index": first_col_idx, + "col_type": "heading", + "text": " ".join(all_text_parts), + "confidence": 0.0, + "bbox_px": {"x": x_min, "y": y_min, + "w": x_max - x_min, "h": y_max - y_min}, + "bbox_pct": { + "x": round(x_min / img_w * 100, 2) if img_w else 0, + "y": round(y_min / img_h * 100, 2) if img_h else 0, + "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, + "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, + }, + "word_boxes": all_wb, + "ocr_engine": "words_first", + "is_bold": False, + }) + + for row in rows: + if row["index"] == hri: + row["is_header"] = True + heading_count += 1 + + return heading_count + + def _detect_header_rows( rows: List[Dict], zone_words: List[Dict], @@ -1680,6 +1815,14 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: if ipa_cont_fixed: logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed) + # 5e. Heading detection by single-cell rows — black headings like + # "Theme" that have normal color and height but are the ONLY cell + # in their row (excluding page_ref column_1). Must run AFTER 5d + # so IPA continuation cells are already processed. + single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h) + if single_heading_count: + logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count) + duration = time.time() - t0 # 6. Build result diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py index 74f0822..b0b0563 100644 --- a/klausur-service/backend/tests/test_grid_editor_api.py +++ b/klausur-service/backend/tests/test_grid_editor_api.py @@ -18,6 +18,7 @@ from grid_editor_api import ( _filter_border_ghosts, _detect_header_rows, _detect_heading_rows_by_color, + _detect_heading_rows_by_single_cell, ) from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell @@ -520,3 +521,157 @@ class TestGarbledIpaDetection: # But text does NOT start with '[' — Step 5d bracket guard blocks it text = "employee [im'ploi:]" assert not (text.strip().startswith('[') and text.strip().endswith(']')) + + +# --------------------------------------------------------------------------- +# _detect_heading_rows_by_single_cell +# --------------------------------------------------------------------------- + +class TestDetectHeadingRowsBySingleCell: + """Test heading detection for black single-cell rows (e.g. 'Theme').""" + + def _make_word_box(self, text, left, top, width, height, color="black"): + return { + "text": text, "left": left, "top": top, + "width": width, "height": height, "color_name": color, "conf": 90, + } + + def _make_vocab_zone(self): + """Build a typical 4-column vocab zone with 8 rows. + + Columns: column_1 (page_ref), column_2 (EN), column_3 (DE), column_4 (example) + Row 4 has only 1 cell in column_2 → heading candidate ("Theme"). + """ + cells = [] + for ri in range(8): + if ri == 4: + # Single-cell row: "Theme" in column_2 only + cells.append({ + "cell_id": f"Z0_R{ri:02d}_C1", + "zone_index": 0, "row_index": ri, "col_index": 1, + "col_type": "column_2", "text": "Theme", + "word_boxes": [self._make_word_box("Theme", 130, 100 + ri * 30, 70, 20)], + }) + continue + # Normal vocab row: 3-4 cells + cells.append({ + "cell_id": f"Z0_R{ri:02d}_C0", + "zone_index": 0, "row_index": ri, "col_index": 0, + "col_type": "column_1", "text": f"p.{70 + ri}", + "word_boxes": [self._make_word_box(f"p.{70+ri}", 10, 100 + ri * 30, 30, 20)], + }) + cells.append({ + "cell_id": f"Z0_R{ri:02d}_C1", + "zone_index": 0, "row_index": ri, "col_index": 1, + "col_type": "column_2", "text": f"word_{ri}", + "word_boxes": [self._make_word_box(f"word_{ri}", 130, 100 + ri * 30, 80, 20)], + }) + cells.append({ + "cell_id": f"Z0_R{ri:02d}_C2", + "zone_index": 0, "row_index": ri, "col_index": 2, + "col_type": "column_3", "text": f"Wort_{ri}", + "word_boxes": [self._make_word_box(f"Wort_{ri}", 400, 100 + ri * 30, 80, 20)], + }) + cells.append({ + "cell_id": f"Z0_R{ri:02d}_C3", + "zone_index": 0, "row_index": ri, "col_index": 3, + "col_type": "column_4", "text": f"Example sentence {ri}.", + "word_boxes": [self._make_word_box(f"Example", 600, 100 + ri * 30, 120, 20)], + }) + + rows = [ + {"index": ri, "y_min_px": 100 + ri * 30, "y_max_px": 120 + ri * 30, "is_header": False} + for ri in range(8) + ] + columns = [ + {"col_index": 0, "col_type": "column_1"}, + {"col_index": 1, "col_type": "column_2"}, + {"col_index": 2, "col_type": "column_3"}, + {"col_index": 3, "col_type": "column_4"}, + ] + return { + "zone_index": 0, "zone_type": "content", + "bbox_px": {"x": 0, "y": 0, "w": 800, "h": 1000}, + "cells": cells, "rows": rows, "columns": columns, + } + + def test_single_cell_heading_detected(self): + """Row with only 1 content cell in column_2 → heading.""" + zone = self._make_vocab_zone() + zones_data = [zone] + count = _detect_heading_rows_by_single_cell(zones_data, 800, 1000) + assert count == 1 + heading_cells = [c for c in zone["cells"] if c["row_index"] == 4] + assert len(heading_cells) == 1 + assert heading_cells[0]["col_type"] == "heading" + assert heading_cells[0]["text"] == "Theme" + assert heading_cells[0]["col_index"] == 1 # Starts at column_2, not 0 + + def test_single_cell_in_last_column_not_heading(self): + """Row with only 1 cell in column_4 (last) → NOT heading (continuation).""" + zone = self._make_vocab_zone() + # Add a single-cell row in the last column (column_4) + zone["cells"].append({ + "cell_id": "Z0_R04_C3", + "zone_index": 0, "row_index": 4, "col_index": 3, + "col_type": "column_4", "text": "2. Veränderung", + "word_boxes": [self._make_word_box("2.", 600, 220, 100, 20)], + }) + # Remove the "Theme" cell from row 4 + zone["cells"] = [c for c in zone["cells"] + if not (c["row_index"] == 4 and c["col_index"] == 1)] + zones_data = [zone] + count = _detect_heading_rows_by_single_cell(zones_data, 800, 1000) + # Row 4 now only has column_4 → should NOT be heading + # But original row 4 "Theme" was removed, so no heading at all + assert count == 0 + + def test_ipa_bracket_text_not_heading(self): + """Row with single cell starting with '[' → IPA continuation, not heading.""" + zone = self._make_vocab_zone() + # Replace "Theme" with IPA continuation + for c in zone["cells"]: + if c["row_index"] == 4 and c["col_index"] == 1: + c["text"] = "[θˈiːm]" + break + zones_data = [zone] + count = _detect_heading_rows_by_single_cell(zones_data, 800, 1000) + assert count == 0 + + def test_multi_cell_row_not_heading(self): + """Normal vocab row with multiple cells → NOT heading.""" + zone = self._make_vocab_zone() + zones_data = [zone] + count = _detect_heading_rows_by_single_cell(zones_data, 800, 1000) + # Only row 4 (Theme) should be heading, other rows have 3-4 cells + assert count == 1 + # Verify normal rows are NOT marked as heading + for ri in [0, 1, 2, 3, 5, 6, 7]: + row_cells = [c for c in zone["cells"] if c["row_index"] == ri] + for c in row_cells: + assert c["col_type"] != "heading" + + def test_color_heading_preserves_correct_col_index(self): + """Color heading starting in column_2 → col_index should be 1, not 0.""" + zone = self._make_vocab_zone() + # Make row 3 a color heading: blue words in column_2 and column_3 only + # (no column_1 page_ref for this row) + zone["cells"] = [c for c in zone["cells"] if c["row_index"] != 3] + zone["cells"].append({ + "cell_id": "Z0_R03_C1", "zone_index": 0, "row_index": 3, + "col_index": 1, "col_type": "column_2", "text": "Unit 4:", + "word_boxes": [self._make_word_box("Unit", 130, 190, 50, 26, "blue"), + self._make_word_box("4:", 185, 190, 20, 26, "blue")], + }) + zone["cells"].append({ + "cell_id": "Z0_R03_C2", "zone_index": 0, "row_index": 3, + "col_index": 2, "col_type": "column_3", "text": "Scotland", + "word_boxes": [self._make_word_box("Scotland", 400, 190, 100, 26, "blue")], + }) + zones_data = [zone] + count = _detect_heading_rows_by_color(zones_data, 800, 1000) + assert count == 1 + heading = [c for c in zone["cells"] if c["row_index"] == 3] + assert len(heading) == 1 + assert heading[0]["col_type"] == "heading" + assert heading[0]["col_index"] == 1 # Should start at column_2, not 0