From 1fae39dbb84e0b960b6d74514a61c63469a4528f Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Sun, 22 Mar 2026 07:44:03 +0100
Subject: [PATCH] fix: lower secondary column threshold + strip pipe chars from
 word_boxes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dictionary pages have two dictionary columns, each with an article and a
headword sub-column. The right article column (die/der at x≈626) had only
14.3% row coverage — below the 20% secondary threshold. Lowered
MIN_COVERAGE_SECONDARY from 0.20 to 0.12 so dictionary article columns
qualify. Also remove every pipe character from individual word_box text
(not just cell text) to drop OCR syllable-separation marks
(e.g. "zu|trau|en" → "zutrauen").

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index b2dc95c..6facc19 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -207,7 +207,7 @@ def _cluster_columns_by_alignment(
     # text (random inter-word gaps) while still detecting real columns in
     # vocabulary worksheets (which typically have >80% row coverage).
     MIN_COVERAGE_PRIMARY = 0.35
-    MIN_COVERAGE_SECONDARY = 0.20
+    MIN_COVERAGE_SECONDARY = 0.12
     MIN_WORDS_SECONDARY = 4
     MIN_DISTINCT_ROWS = 3
 
@@ -1956,10 +1956,14 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                 removed_pipes, z.get("zone_index", 0),
             )
 
-    # Also strip leading/trailing pipe chars from cell text that may remain
-    # from word_boxes that contained mixed text like "word|" or "|word".
+    # Also strip pipe chars from word_box text and cell text that may remain
+    # from OCR reading syllable-separation marks (e.g. "zu|trau|en" → "zutrauen").
     for z in zones_data:
         for cell in z.get("cells", []):
+            for wb in cell.get("word_boxes", []):
+                wbt = wb.get("text", "")
+                if "|" in wbt:
+                    wb["text"] = wbt.replace("|", "")
             text = cell.get("text", "")
             if "|" in text:
                 cleaned = text.replace("|", "").strip()