diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index b2dc95c..6facc19 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -207,7 +207,7 @@ def _cluster_columns_by_alignment( # text (random inter-word gaps) while still detecting real columns in # vocabulary worksheets (which typically have >80% row coverage). MIN_COVERAGE_PRIMARY = 0.35 - MIN_COVERAGE_SECONDARY = 0.20 + MIN_COVERAGE_SECONDARY = 0.12 MIN_WORDS_SECONDARY = 4 MIN_DISTINCT_ROWS = 3 @@ -1956,10 +1956,14 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: removed_pipes, z.get("zone_index", 0), ) - # Also strip leading/trailing pipe chars from cell text that may remain - # from word_boxes that contained mixed text like "word|" or "|word". + # Also strip pipe chars from word_box text and cell text that may remain + # from OCR reading syllable-separation marks (e.g. "zu|trau|en" → "zutrauen"). for z in zones_data: for cell in z.get("cells", []): + for wb in cell.get("word_boxes", []): + wbt = wb.get("text", "") + if "|" in wbt: + wb["text"] = wbt.replace("|", "") text = cell.get("text", "") if "|" in text: cleaned = text.replace("|", "").strip()