From d9b2aa82e99efe2e0b98ef59863e961b5f9db8a4 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 24 Mar 2026 14:31:16 +0100 Subject: [PATCH] fix: CV-gated syllable insertion + grid editor scroll 1. Syllable dividers now require CV validation: morphological vertical line detection checks if word_box image actually shows thin isolated pipe lines before applying pyphen. Only first word per cell gets pipes (matching dictionary print layout). 2. Grid editor scroll: changed maxHeight from 80vh to calc(100vh-200px) so editor remains scrollable after edits. Co-Authored-By: Claude Opus 4.6 --- .../ocr-pipeline/StepGridReview.tsx | 2 +- klausur-service/backend/grid_editor_api.py | 111 ++++++++++++------ 2 files changed, 73 insertions(+), 40 deletions(-) diff --git a/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx b/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx index 388d078..91be3ab 100644 --- a/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepGridReview.tsx @@ -314,7 +314,7 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro )} {/* Right: Grid with row-accept buttons */} -
+
{/* Zone tables with row-accept buttons */} {(() => { diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 5b96be4..5c4eaa8 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -2803,16 +2803,62 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # --- Syllable divider insertion for dictionary pages --- # Dictionary pages show syllable breaks as "|" (e.g. "Ka|me|rad"). - # OCR engines rarely detect "|", so we insert them via pyphen - # hyphenation rules when the page is confirmed as a dictionary. + # OCR engines rarely detect "|", so we use a two-step approach: + # 1. CV: detect if a word_box image contains thin vertical pipe lines + # 2. pyphen: insert syllable breaks at linguistically correct positions + # Only the FIRST significant word per cell gets pipes (matching print layout). syllable_insertions = 0 - if dict_detection.get("is_dictionary"): + if dict_detection.get("is_dictionary") and img_bgr is not None: try: import pyphen _hyph_de = pyphen.Pyphen(lang='de_DE') _hyph_en = pyphen.Pyphen(lang='en_US') - # IPA/bracket pattern — don't hyphenate phonetic transcriptions _ipa_re = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]') + img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) + + def _word_has_pipe_lines(wb: Dict) -> bool: + """CV check: does this word_box image show thin vertical dividers?""" + x = wb.get("left", 0) + y = wb.get("top", 0) + w = wb.get("width", 0) + h = wb.get("height", 0) + if w < 30 or h < 12: + return False + ih, iw = img_gray.shape[:2] + y1, y2 = max(0, y), min(ih, y + h) + x1, x2 = max(0, x), min(iw, x + w) + roi = img_gray[y1:y2, x1:x2] + if roi.size == 0: + return False + rh, rw = roi.shape + # Binarize (ink = white) + _, binary = cv2.threshold( + roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU + ) + # Morphological opening: keep only tall vertical structures + kern_h = max(int(rh * 0.55), 8) + kernel = np.ones((kern_h, 1), np.uint8) + vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel) + # Find surviving contours + contours, _ = cv2.findContours( + vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) + margin = max(int(rw * 0.08), 3) + for cnt in contours: + cx, cy, cw, ch = cv2.boundingRect(cnt) + if cw > 4: + continue # too wide + if cx < margin or cx + cw > rw - margin: + continue # at word edge (l, I, 1) + # Check isolation: adjacent columns should be mostly empty + left_zone = binary[cy:cy + ch, max(0, cx - 3):cx] + right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)] + left_ink = np.mean(left_zone) if left_zone.size else 255 + right_ink = np.mean(right_zone) if right_zone.size else 255 + if left_ink < 80 and right_ink < 80: + return True # isolated thin vertical line = pipe + return False + for z in zones_data: for cell in z.get("cells", []): ct = cell.get("col_type", "") @@ -2820,45 +2866,32 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: continue text = cell.get("text", "") if not text or "|" in text: - continue # already has pipes or empty + continue if _ipa_re.search(text): - continue # IPA content — skip - # Split on commas/semicolons to handle "Kabel, die Kabel" - parts = re.split(r'([,;]\s*)', text) - new_parts = [] - changed = False - for part in parts: - if re.match(r'^[,;]\s*$', part): - new_parts.append(part) - continue - # Process individual words in each part - words_in = re.split(r'(\s+)', part) - new_words = [] - for w in words_in: - if re.match(r'^\s+$', w): - new_words.append(w) - continue - # Only hyphenate words ≥ 4 chars, skip articles/short - clean = re.sub(r'[().\-]', '', w) - if len(clean) < 4: - new_words.append(w) - continue - # Try DE first, then EN - hyph = _hyph_de.inserted(w, hyphen='|') - if '|' not in hyph: - hyph = _hyph_en.inserted(w, hyphen='|') - if '|' in hyph and hyph != w: - new_words.append(hyph) - changed = True - else: - new_words.append(w) - new_parts.append(''.join(new_words)) - if changed: - cell["text"] = ''.join(new_parts) + continue + # CV gate: check if ANY word_box in this cell has pipe lines + wbs = cell.get("word_boxes") or [] + has_pipes = any(_word_has_pipe_lines(wb) for wb in wbs) + if not has_pipes: + continue + # Apply pyphen to FIRST significant word only + # (dictionary layout: only headword gets pipes) + match = re.match(r'^(\s*)([\w\-äöüÄÖÜß]+)(.*)', text, re.DOTALL) + if not match: + continue + prefix, first_word, rest = match.groups() + if len(first_word) < 4: + continue + hyph = _hyph_de.inserted(first_word, hyphen='|') + if '|' not in hyph: + hyph = _hyph_en.inserted(first_word, hyphen='|') + if '|' in hyph and hyph != first_word: + cell["text"] = prefix + hyph + rest syllable_insertions += 1 if syllable_insertions: logger.info( - "build-grid session %s: inserted syllable dividers in %d cells", + "build-grid session %s: inserted syllable dividers in %d cells " + "(CV-validated)", session_id, syllable_insertions, ) except ImportError: