diff --git a/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx b/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx
index 388d078..91be3ab 100644
--- a/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx
+++ b/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx
@@ -314,7 +314,7 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
)}
{/* Right: Grid with row-accept buttons */}
-
+
{/* Zone tables with row-accept buttons */}
{(() => {
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 5b96be4..5c4eaa8 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -2803,16 +2803,62 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# --- Syllable divider insertion for dictionary pages ---
# Dictionary pages show syllable breaks as "|" (e.g. "Ka|me|rad").
- # OCR engines rarely detect "|", so we insert them via pyphen
- # hyphenation rules when the page is confirmed as a dictionary.
+ # OCR engines rarely detect "|", so we use a two-step approach:
+ # 1. CV: detect if a word_box image contains thin vertical pipe lines
+ # 2. pyphen: insert syllable breaks at linguistically correct positions
+ # Only the leading word of each cell gets pipes (matching print layout); cells whose leading word is shorter than 4 characters are left unchanged.
syllable_insertions = 0
- if dict_detection.get("is_dictionary"):
+ if dict_detection.get("is_dictionary") and img_bgr is not None:
try:
import pyphen
_hyph_de = pyphen.Pyphen(lang='de_DE')
_hyph_en = pyphen.Pyphen(lang='en_US')
- # IPA/bracket pattern — don't hyphenate phonetic transcriptions
_ipa_re = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
+ img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+
+ def _word_has_pipe_lines(wb: Dict) -> bool:
+ """CV check: does this word_box image show thin vertical dividers?"""
+ x = wb.get("left", 0)  # box geometry; a missing key falls back to 0
+ y = wb.get("top", 0)
+ w = wb.get("width", 0)
+ h = wb.get("height", 0)
+ if w < 30 or h < 12:  # box narrower than 30 or lower than 12: not analysed, report no pipes
+ return False
+ ih, iw = img_gray.shape[:2]
+ y1, y2 = max(0, y), min(ih, y + h)  # clamp the box to the image bounds
+ x1, x2 = max(0, x), min(iw, x + w)  # NOTE(review): x + w / y + h below 0 would yield a negative slice end - confirm boxes never end before the image origin
+ roi = img_gray[y1:y2, x1:x2]  # assumes box coordinates are integer pixel values - TODO confirm
+ if roi.size == 0:  # clamped box is empty (e.g. box starts beyond the right/bottom image edge)
+ return False
+ rh, rw = roi.shape
+ # Binarize with an Otsu-selected threshold, inverted so that dark pixels become 255 (ink = white)
+ _, binary = cv2.threshold(
+ roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
+ )
+ # Morphological opening with a 1-pixel-wide column kernel (55% of ROI height, at least 8 px): keep only tall vertical structures
+ kern_h = max(int(rh * 0.55), 8)
+ kernel = np.ones((kern_h, 1), np.uint8)
+ vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
+ # Find the outer contours of whatever survived the opening
+ contours, _ = cv2.findContours(
+ vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+ )
+ margin = max(int(rw * 0.08), 3)  # edge band: 8% of ROI width, at least 3 px
+ for cnt in contours:
+ cx, cy, cw, ch = cv2.boundingRect(cnt)
+ if cw > 4:
+ continue # bounding box wider than 4 px: too wide for a divider line
+ if cx < margin or cx + cw > rw - margin:
+ continue # inside the edge band: treated as a stroke of the first/last character (l, I, 1), not a divider
+ # Check isolation: the strips (up to 3 px wide) directly left and right of the stroke should be mostly empty
+ left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
+ right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
+ left_ink = np.mean(left_zone) if left_zone.size else 255  # an empty strip counts as fully inked (255)
+ right_ink = np.mean(right_zone) if right_zone.size else 255
+ if left_ink < 80 and right_ink < 80:  # mean of the 0/255 mask below 80 on both sides
+ return True # isolated thin vertical line = pipe
+ return False
+
for z in zones_data:
for cell in z.get("cells", []):
ct = cell.get("col_type", "")
@@ -2820,45 +2866,32 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
continue
text = cell.get("text", "")
if not text or "|" in text:
- continue # already has pipes or empty
+ continue
if _ipa_re.search(text):
- continue # IPA content — skip
- # Split on commas/semicolons to handle "Kabel, die Kabel"
- parts = re.split(r'([,;]\s*)', text)
- new_parts = []
- changed = False
- for part in parts:
- if re.match(r'^[,;]\s*$', part):
- new_parts.append(part)
- continue
- # Process individual words in each part
- words_in = re.split(r'(\s+)', part)
- new_words = []
- for w in words_in:
- if re.match(r'^\s+$', w):
- new_words.append(w)
- continue
- # Only hyphenate words ≥ 4 chars, skip articles/short
- clean = re.sub(r'[().\-]', '', w)
- if len(clean) < 4:
- new_words.append(w)
- continue
- # Try DE first, then EN
- hyph = _hyph_de.inserted(w, hyphen='|')
- if '|' not in hyph:
- hyph = _hyph_en.inserted(w, hyphen='|')
- if '|' in hyph and hyph != w:
- new_words.append(hyph)
- changed = True
- else:
- new_words.append(w)
- new_parts.append(''.join(new_words))
- if changed:
- cell["text"] = ''.join(new_parts)
+ continue
+ # CV gate: check if ANY word_box in this cell has pipe lines
+ wbs = cell.get("word_boxes") or []
+ has_pipes = any(_word_has_pipe_lines(wb) for wb in wbs)
+ if not has_pipes:
+ continue
+ # Apply pyphen to the leading word only; if that word is shorter than
+ # 4 characters the cell is skipped (dictionary layout: only headword gets pipes)
+ match = re.match(r'^(\s*)([\w\-äöüÄÖÜß]+)(.*)', text, re.DOTALL)
+ if not match:
+ continue
+ prefix, first_word, rest = match.groups()
+ if len(first_word) < 4:
+ continue
+ hyph = _hyph_de.inserted(first_word, hyphen='|')
+ if '|' not in hyph:
+ hyph = _hyph_en.inserted(first_word, hyphen='|')
+ if '|' in hyph and hyph != first_word:
+ cell["text"] = prefix + hyph + rest
syllable_insertions += 1
if syllable_insertions:
logger.info(
- "build-grid session %s: inserted syllable dividers in %d cells",
+ "build-grid session %s: inserted syllable dividers in %d cells "
+ "(CV-validated)",
session_id, syllable_insertions,
)
except ImportError: