From f139d0903e5cb599f06ebf1d481df8020f48c5d8 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 18 Mar 2026 11:08:23 +0100
Subject: [PATCH] Preserve alphabetic marker columns, broaden junk filter,
 enable IPA in grid
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- _merge_inline_marker_columns: skip merge when ≥50% of words are
  alphabetic (preserves "to", "in", "der" columns)
- Rule 2 (oversized stub): widen to ≤3 words / ≤5 chars (catches "SEA &")
- IPA phonetics: temporarily relabel the longest-avg-text column as
  column_en so fix_cell_phonetics runs in the grid editor, then restore
  the original col_type
- ocr_pipeline_overlays: add missing split_page_into_zones import

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py    | 81 +++++++++++++++----
 .../backend/ocr_pipeline_overlays.py          |  2 +-
 2 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 6b2be75..792ec4d 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -326,6 +326,9 @@ def _filter_border_ghosts(
     return filtered, len(words) - len(filtered)
 
 
+_MARKER_CHARS = set("•*·-–—|~=+#>→►▸▪◆○●□■✓✗✔✘")
+
+
 def _merge_inline_marker_columns(
     columns: List[Dict],
     words: List[Dict],
@@ -335,6 +338,9 @@ def _merge_inline_marker_columns(
     Bullet points (•, *, -) and numbering (1., 2.) create narrow columns
     at the left edge of a zone.  These are inline markers that indent text,
     not real separate columns.  Merge them with their right neighbour.
+
+    Does NOT merge columns containing alphabetic words like "to", "in",
+    "der", "die", "das" — those are legitimate content columns.
     """
     if len(columns) < 2:
         return columns
@@ -353,21 +359,38 @@ def _merge_inline_marker_columns(
         ]
         col_width = col["x_max"] - col["x_min"]
 
-        # Narrow column with mostly short words → likely inline markers
+        # Narrow column with mostly short words → MIGHT be inline markers
         if col_words and col_width < 80:
             avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words)
             if avg_len <= 2 and i + 1 < len(columns):
-                # Merge into next column
-                next_col = columns[i + 1].copy()
-                next_col["x_min"] = col["x_min"]
-                merged.append(next_col)
-                skip.add(i + 1)
-                logger.info(
-                    "  merged inline marker column %d (w=%d, avg_len=%.1f) "
-                    "into column %d",
-                    i, col_width, avg_len, i + 1,
+                # Check if words are actual markers (symbols/numbers) vs
+                # real alphabetic words like "to", "in", "der", "die"
+                texts = [(w.get("text") or "").strip() for w in col_words]
+                alpha_count = sum(
+                    1 for t in texts
+                    if t and t[0].isalpha() and t not in _MARKER_CHARS
                 )
-                continue
+                alpha_ratio = alpha_count / len(texts) if texts else 0
+
+                # If ≥50% of words are alphabetic, this is a real column
+                if alpha_ratio >= 0.5:
+                    logger.info(
+                        "  kept narrow column %d (w=%d, avg_len=%.1f, "
+                        "alpha=%.0f%%) — contains real words",
+                        i, col_width, avg_len, alpha_ratio * 100,
+                    )
+                else:
+                    # Merge into next column
+                    next_col = columns[i + 1].copy()
+                    next_col["x_min"] = col["x_min"]
+                    merged.append(next_col)
+                    skip.add(i + 1)
+                    logger.info(
+                        "  merged inline marker column %d (w=%d, avg_len=%.1f) "
+                        "into column %d",
+                        i, col_width, avg_len, i + 1,
+                    )
+                    continue
 
         merged.append(col)
 
@@ -1096,12 +1119,13 @@ async def build_grid(session_id: str):
                 junk_row_indices.add(ri)
                 continue
 
-            # Rule 2: oversized stub — ≤2 words, all short text (≤2 chars),
-            # and word height > 1.8× median (page numbers, stray marks)
-            if len(row_wbs) <= 2:
+            # Rule 2: oversized stub — ≤3 words, short total text,
+            # and word height > 1.8× median (page numbers, stray marks,
+            # OCR from illustration labels like "SEA &")
+            if len(row_wbs) <= 3:
                 total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
                 max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
-                if len(total_text) <= 3 and max_h > median_wb_h * 1.8:
+                if len(total_text) <= 5 and max_h > median_wb_h * 1.8:
                     junk_row_indices.add(ri)
                     continue
 
@@ -1141,8 +1165,35 @@ async def build_grid(session_id: str):
 
     # 5c. IPA phonetic correction — replace garbled OCR phonetics with
     # correct IPA from the dictionary (same as in the OCR pipeline).
+    # The grid uses generic col_types (column_1, column_2, ...) but
+    # fix_cell_phonetics expects column_en / column_text.  Temporarily relabel
+    # the likely English headword column (longest average text) as column_en.
     all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
+    # Collect text lengths per col_type; highest mean → English headwords
+    col_avg_len: Dict[str, List[int]] = {}
+    for cell in all_cells:
+        ct = cell.get("col_type", "")
+        txt = cell.get("text", "")
+        col_avg_len.setdefault(ct, []).append(len(txt))
+    en_col_type = None
+    best_avg = 0
+    for ct, lengths in col_avg_len.items():
+        if not ct.startswith("column_"):
+            continue
+        avg = sum(lengths) / len(lengths) if lengths else 0
+        if avg > best_avg:
+            best_avg = avg
+            en_col_type = ct
+    if en_col_type:
+        for cell in all_cells:
+            if cell.get("col_type") == en_col_type:
+                cell["_orig_col_type"] = en_col_type
+                cell["col_type"] = "column_en"
     fix_cell_phonetics(all_cells, pronunciation="british")
+    for cell in all_cells:
+        orig = cell.pop("_orig_col_type", None)
+        if orig:
+            cell["col_type"] = orig
 
     duration = time.time() - t0
 
diff --git a/klausur-service/backend/ocr_pipeline_overlays.py b/klausur-service/backend/ocr_pipeline_overlays.py
index e63ead7..2789557 100644
--- a/klausur-service/backend/ocr_pipeline_overlays.py
+++ b/klausur-service/backend/ocr_pipeline_overlays.py
@@ -25,7 +25,7 @@ from ocr_pipeline_common import (
 )
 from ocr_pipeline_session_store import get_session_db, get_session_image
 from cv_color_detect import _COLOR_HEX, _COLOR_RANGES
-from cv_box_detect import detect_boxes
+from cv_box_detect import detect_boxes, split_page_into_zones
 from ocr_pipeline_rows import _draw_box_exclusion_overlay
 
 logger = logging.getLogger(__name__)