From a8773d5b00c8fe8e01cc62d52cac5a96c859746e Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Fri, 27 Mar 2026 15:24:35 +0100
Subject: [PATCH] Fix 4 Grid Editor bugs: syllable modes, heading detection,
 word gaps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Syllable "Original" (auto) mode: only normalize cells that already
   have | from OCR — don't add new syllable marks via pyphen to words
   without printed dividers on the original scan.

2. Syllable "Aus" (none) mode: strip residual | chars from OCR text
   so cells display clean (e.g. "Zel|le" → "Zelle").

3. Heading detection: add text length guard in single-cell heuristic —
   words > 4 alpha chars starting lowercase (like "zentral") are regular
   vocabulary, not section headings.

4. Word-gap merge: new merge_word_gaps_in_zones() step with relaxed
   threshold (6 chars) fixes OCR splits like "zerknit tert" → "zerknittert".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_syllable_detect.py | 92 +++++++++++++++++++
 klausur-service/backend/grid_editor_api.py    | 16 ++++
 .../backend/grid_editor_helpers.py            |  7 ++
 3 files changed, 115 insertions(+)

diff --git a/klausur-service/backend/cv_syllable_detect.py b/klausur-service/backend/cv_syllable_detect.py
index c1e1acc..8a47314 100644
--- a/klausur-service/backend/cv_syllable_detect.py
+++ b/klausur-service/backend/cv_syllable_detect.py
@@ -139,6 +139,92 @@ def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
     return ' '.join(result)
 
 
+def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
+    """Rejoin words that OCR split across word boxes, cell by cell.
+
+    Example: a cell reading "zerknit tert" becomes "zerknittert".  Only
+    cells whose ``col_type`` starts with ``column_`` and whose text contains
+    a space are considered; cells matching ``_IPA_RE`` outside of square
+    brackets are left alone.  Cell texts are rewritten in place.
+
+    The merging itself is delegated to ``_try_merge_word_gaps``, the
+    relaxed-threshold counterpart of ``_try_merge_pipe_gaps``.
+
+    Returns the number of cells whose text changed (0 when no ``hyph_de``
+    hyphenator is available).  ``session_id`` is only used for logging.
+    """
+    german_hyph, _unused = _get_hyphenators()
+    if german_hyph is None:
+        return 0
+
+    changed = 0
+    for zone in zones_data:
+        for cell in zone.get("cells", []):
+            if not cell.get("col_type", "").startswith("column_"):
+                continue
+            original = cell.get("text", "")
+            if not (original and " " in original):
+                continue
+
+            # Bracketed spans are ignored when testing for IPA characters.
+            if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', original)):
+                continue
+
+            merged = _try_merge_word_gaps(original, german_hyph)
+            if merged != original:
+                cell["text"] = merged
+                changed += 1
+
+    if changed > 0:
+        logger.info(
+            "build-grid session %s: merged word gaps in %d cells",
+            session_id, changed,
+        )
+    return changed
+
+
+def _try_merge_word_gaps(text: str, hyph_de) -> str:
+    """Merge adjacent OCR word fragments, e.g. "zerknit tert" -> "zerknittert".
+
+    Relaxed variant of ``_try_merge_pipe_gaps``: the shorter fragment may
+    have up to 6 letters.  ``hyph_de`` is a pyphen hyphenator; a pair is only
+    joined when it finds a hyphenation point in the combined letters.
+    """
+    parts = text.split(' ')
+    if len(parts) < 2:
+        return text
+
+    non_letters = re.compile(r'[^a-zA-ZäöüÄÖÜßẞ]')
+    result = [parts[0]]
+    for curr in parts[1:]:
+        prev = result[-1]
+        prev_alpha = non_letters.sub('', prev)
+        curr_alpha = non_letters.sub('', curr)
+
+        should_try = (
+            prev == prev_alpha
+            and prev_alpha and curr_alpha
+            # A continuation fragment starts with a letter; tokens such as
+            # "(ging)" or "[ˈlaʊfn̩]" must never be glued onto the word.
+            and curr[0] == curr_alpha[0]
+            and prev_alpha.lower() not in _STOP_WORDS
+            and curr_alpha.lower() not in _STOP_WORDS
+            and min(len(prev_alpha), len(curr_alpha)) <= 6
+            and len(prev_alpha) + len(curr_alpha) >= 4
+        )
+
+        # NOTE: pyphen applies hyphenation patterns, so a hyphenation point
+        # is a heuristic signal, not a dictionary membership test.
+        if should_try and '-' in hyph_de.inserted(
+            prev_alpha + curr_alpha, hyphen='-'
+        ):
+            result[-1] = prev + curr
+        else:
+            result.append(curr)
+
+    return ' '.join(result)
+
+
 def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
     """Syllabify all significant words in a text string.
 
@@ -259,6 +345,12 @@ def insert_syllable_dividers(
             if not text:
                 continue
 
+            # In auto mode (force=False), only normalize cells that already
+            # have | from OCR (i.e. printed syllable dividers on the original
+            # scan).  Don't add new syllable marks to other words.
+            if not force and "|" not in text:
+                continue
+
             new_text = _syllabify_text(text, hyph_de, hyph_en)
             if new_text != text:
                 cell["text"] = new_text
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 8708804..2aa5ce3 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1593,6 +1593,13 @@ async def _build_grid_core(
     except Exception as e:
         logger.warning("Dictionary detection failed: %s", e)
 
+    # --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
+    try:
+        from cv_syllable_detect import merge_word_gaps_in_zones
+        merge_word_gaps_in_zones(zones_data, session_id)
+    except Exception as e:
+        logger.warning("Word-gap merge failed: %s", e)
+
     # --- Syllable divider insertion for dictionary pages ---
     # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
     #   "all" = force on all content words, "en" = English column only,
@@ -1626,6 +1633,15 @@ async def _build_grid_core(
             except Exception as e:
                 logger.warning("Syllable insertion failed: %s", e)
 
+    # When syllable mode is "none", strip any residual | from OCR so
+    # that the displayed text is clean (e.g. "Zel|le" → "Zelle").
+    if syllable_mode == "none":
+        for z in zones_data:
+            for cell in z.get("cells", []):
+                t = cell.get("text", "")
+                if "|" in t:
+                    cell["text"] = t.replace("|", "")
+
     # Clean up internal flags before returning
     for z in zones_data:
         for cell in z.get("cells", []):
diff --git a/klausur-service/backend/grid_editor_helpers.py b/klausur-service/backend/grid_editor_helpers.py
index 40c3b19..ad8fbff 100644
--- a/klausur-service/backend/grid_editor_helpers.py
+++ b/klausur-service/backend/grid_editor_helpers.py
@@ -912,6 +912,13 @@ def _detect_heading_rows_by_single_cell(
             _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
             if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
                 continue
+            # Guard: dictionary section headings are short (1-4 alpha chars
+            # like "A", "Ab", "Zi", "Sch").  Longer text that starts
+            # lowercase is a regular vocabulary word (e.g. "zentral") that
+            # happens to appear alone in its row.
+            alpha_only = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', text)
+            if len(alpha_only) > 4 and text[0].islower():
+                continue
             heading_row_indices.append(ri)
 
         # Guard: if >25% of eligible rows would become headings, the