From d9b2aa82e99efe2e0b98ef59863e961b5f9db8a4 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Tue, 24 Mar 2026 14:31:16 +0100
Subject: [PATCH] fix: CV-gated syllable insertion + grid editor scroll

1. Syllable dividers now require CV validation: morphological vertical
   line detection checks whether any word_box image in the cell actually
   shows thin, isolated pipe lines before pyphen is applied. Only the
   first word of each cell gets pipes (matching dictionary print layout).

2. Grid editor scroll: changed maxHeight from 80vh to
   calc(100vh - 200px) so the editor remains scrollable after edits.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../ocr-pipeline/StepGridReview.tsx           |   2 +-
 klausur-service/backend/grid_editor_api.py    | 111 ++++++++++++------
 2 files changed, 73 insertions(+), 40 deletions(-)
diff --git a/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx b/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx
index 388d078..91be3ab 100644
--- a/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx
+++ b/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx
@@ -314,7 +314,7 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
         )}
 
         {/* Right: Grid with row-accept buttons */}
-        <div className="space-y-3 overflow-y-auto" style={{ maxHeight: '80vh' }}>
+        <div className="space-y-3 overflow-y-auto" style={{ maxHeight: 'calc(100vh - 200px)' }}>
 
           {/* Zone tables with row-accept buttons */}
           {(() => {
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 5b96be4..5c4eaa8 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -2803,16 +2803,62 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
 
     # --- Syllable divider insertion for dictionary pages ---
     # Dictionary pages show syllable breaks as "|" (e.g. "Ka|me|rad").
-    # OCR engines rarely detect "|", so we insert them via pyphen
-    # hyphenation rules when the page is confirmed as a dictionary.
+    # OCR engines rarely detect "|", so we use a two-step approach:
+    #   1. CV: detect if a word_box image contains thin vertical pipe lines
+    #   2. pyphen: insert syllable breaks at linguistically correct positions
+    # Only the FIRST word of a cell gets pipes (print layout); first words < 4 chars are skipped.
     syllable_insertions = 0
-    if dict_detection.get("is_dictionary"):
+    if dict_detection.get("is_dictionary") and img_bgr is not None:
         try:
             import pyphen
             _hyph_de = pyphen.Pyphen(lang='de_DE')
             _hyph_en = pyphen.Pyphen(lang='en_US')
-            # IPA/bracket pattern — don't hyphenate phonetic transcriptions
             _ipa_re = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
+            img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+
+            def _word_has_pipe_lines(wb: Dict) -> bool:
+                """CV check: does this word_box image show thin vertical dividers?"""
+                x = wb.get("left", 0)
+                y = wb.get("top", 0)
+                w = wb.get("width", 0)
+                h = wb.get("height", 0)
+                if w < 30 or h < 12:
+                    return False
+                ih, iw = img_gray.shape[:2]
+                y1, y2 = max(0, y), min(ih, y + h)
+                x1, x2 = max(0, x), min(iw, x + w)
+                roi = img_gray[y1:y2, x1:x2]
+                if roi.size == 0:
+                    return False
+                rh, rw = roi.shape
+                # Binarize (ink = white)
+                _, binary = cv2.threshold(
+                    roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+                )
+                # Morphological opening: keep only tall vertical structures
+                kern_h = max(int(rh * 0.55), 8)
+                kernel = np.ones((kern_h, 1), np.uint8)
+                vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
+                # Find surviving contours
+                contours, _ = cv2.findContours(
+                    vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+                )
+                margin = max(int(rw * 0.08), 3)
+                for cnt in contours:
+                    cx, cy, cw, ch = cv2.boundingRect(cnt)
+                    if cw > 4:
+                        continue  # too wide
+                    if cx < margin or cx + cw > rw - margin:
+                        continue  # at word edge (l, I, 1)
+                    # Check isolation: adjacent columns should be mostly empty
+                    left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
+                    right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
+                    left_ink = np.mean(left_zone) if left_zone.size else 255
+                    right_ink = np.mean(right_zone) if right_zone.size else 255
+                    if left_ink < 80 and right_ink < 80:
+                        return True  # isolated thin vertical line = pipe
+                return False
+
             for z in zones_data:
                 for cell in z.get("cells", []):
                     ct = cell.get("col_type", "")
@@ -2820,45 +2866,32 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                         continue
                     text = cell.get("text", "")
                     if not text or "|" in text:
-                        continue  # already has pipes or empty
+                        continue
                     if _ipa_re.search(text):
-                        continue  # IPA content — skip
-                    # Split on commas/semicolons to handle "Kabel, die Kabel"
-                    parts = re.split(r'([,;]\s*)', text)
-                    new_parts = []
-                    changed = False
-                    for part in parts:
-                        if re.match(r'^[,;]\s*$', part):
-                            new_parts.append(part)
-                            continue
-                        # Process individual words in each part
-                        words_in = re.split(r'(\s+)', part)
-                        new_words = []
-                        for w in words_in:
-                            if re.match(r'^\s+$', w):
-                                new_words.append(w)
-                                continue
-                            # Only hyphenate words ≥ 4 chars, skip articles/short
-                            clean = re.sub(r'[().\-]', '', w)
-                            if len(clean) < 4:
-                                new_words.append(w)
-                                continue
-                            # Try DE first, then EN
-                            hyph = _hyph_de.inserted(w, hyphen='|')
-                            if '|' not in hyph:
-                                hyph = _hyph_en.inserted(w, hyphen='|')
-                            if '|' in hyph and hyph != w:
-                                new_words.append(hyph)
-                                changed = True
-                            else:
-                                new_words.append(w)
-                        new_parts.append(''.join(new_words))
-                    if changed:
-                        cell["text"] = ''.join(new_parts)
+                        continue
+                    # CV gate: check if ANY word_box in this cell has pipe lines
+                    wbs = cell.get("word_boxes") or []
+                    has_pipes = any(_word_has_pipe_lines(wb) for wb in wbs)
+                    if not has_pipes:
+                        continue
+                    # Apply pyphen to the cell's FIRST word only; skip the cell
+                    # if that word is < 4 chars (dictionary layout: headword only)
+                    match = re.match(r'^(\s*)([\w\-äöüÄÖÜß]+)(.*)', text, re.DOTALL)
+                    if not match:
+                        continue
+                    prefix, first_word, rest = match.groups()
+                    if len(first_word) < 4:
+                        continue
+                    hyph = _hyph_de.inserted(first_word, hyphen='|')
+                    if '|' not in hyph:
+                        hyph = _hyph_en.inserted(first_word, hyphen='|')
+                    if '|' in hyph and hyph != first_word:
+                        cell["text"] = prefix + hyph + rest
                         syllable_insertions += 1
             if syllable_insertions:
                 logger.info(
-                    "build-grid session %s: inserted syllable dividers in %d cells",
+                    "build-grid session %s: inserted syllable dividers in %d cells "
+                    "(CV-validated)",
                     session_id, syllable_insertions,
                 )
         except ImportError: