From a8773d5b00c8fe8e01cc62d52cac5a96c859746e Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Fri, 27 Mar 2026 15:24:35 +0100
Subject: [PATCH] Fix 4 Grid Editor bugs: syllable modes, heading detection, word gaps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Syllable "Original" (auto) mode: only normalize cells that already
   have | from OCR — don't add new syllable marks via pyphen to words
   without printed dividers on the original scan.

2. Syllable "Aus" (none) mode: strip residual | chars from OCR text
   so cells display clean (e.g. "Zel|le" → "Zelle").

3. Heading detection: add text length guard in single-cell heuristic —
   words > 4 alpha chars starting lowercase (like "zentral") are regular
   vocabulary, not section headings.

4. Word-gap merge: new merge_word_gaps_in_zones() step with relaxed
   threshold (6 chars) fixes OCR splits like "zerknit tert" →
   "zerknittert".

Co-Authored-By: Claude Opus 4.6
---
Reviewer notes (text between the "---" line and the diffstat is not part
of the commit message):

* The line structure of this patch was rebuilt from a copy whose newlines
  and runs of whitespace had been collapsed. The number of added and
  context lines in every hunk agrees with its @@ header, but leading
  indentation and the position of blank context lines are inferred.
  Run `git apply --check` against the target tree before using it.
* The From: and Co-Authored-By: lines carry no e-mail address in the copy
  this was rebuilt from.
* NOTE(review): _try_merge_word_gaps() accepts a merge whenever
  hyph_de.inserted() returns at least one hyphen. If the hyphenator is
  pattern-based rather than a word list, this does not prove that the
  merged string is a real word — confirm against the pyphen
  documentation.

 klausur-service/backend/cv_syllable_detect.py | 92 +++++++++++++++++++
 klausur-service/backend/grid_editor_api.py    | 16 ++++
 .../backend/grid_editor_helpers.py            |  7 ++
 3 files changed, 115 insertions(+)

diff --git a/klausur-service/backend/cv_syllable_detect.py b/klausur-service/backend/cv_syllable_detect.py
index c1e1acc..8a47314 100644
--- a/klausur-service/backend/cv_syllable_detect.py
+++ b/klausur-service/backend/cv_syllable_detect.py
@@ -139,6 +139,92 @@ def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
     return ' '.join(result)
 
 
+def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
+    """Merge OCR word-gap fragments in cell texts using pyphen validation.
+
+    OCR often splits words at syllable boundaries into separate word_boxes,
+    producing text like "zerknit tert" instead of "zerknittert". This
+    function tries to merge adjacent fragments in every content cell.
+
+    More permissive than ``_try_merge_pipe_gaps`` (threshold 6 instead of 3)
+    but still guarded by pyphen dictionary lookup and stop-word exclusion.
+
+    Returns the number of cells modified.
+    """
+    hyph_de, _ = _get_hyphenators()
+    if hyph_de is None:
+        return 0
+
+    modified = 0
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            ct = cell.get("col_type", "")
+            if not ct.startswith("column_"):
+                continue
+            text = cell.get("text", "")
+            if not text or " " not in text:
+                continue
+
+            # Skip IPA cells
+            text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
+            if _IPA_RE.search(text_no_brackets):
+                continue
+
+            new_text = _try_merge_word_gaps(text, hyph_de)
+            if new_text != text:
+                cell["text"] = new_text
+                modified += 1
+
+    if modified:
+        logger.info(
+            "build-grid session %s: merged word gaps in %d cells",
+            session_id, modified,
+        )
+    return modified
+
+
+def _try_merge_word_gaps(text: str, hyph_de) -> str:
+    """Merge OCR word fragments with relaxed threshold (max_short=6).
+
+    Similar to ``_try_merge_pipe_gaps`` but allows longer fragments to be
+    merged. Still requires pyphen to recognize the merged word.
+    """
+    parts = text.split(' ')
+    if len(parts) < 2:
+        return text
+
+    result = [parts[0]]
+    i = 1
+    while i < len(parts):
+        prev = result[-1]
+        curr = parts[i]
+
+        prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
+        curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
+
+        should_try = (
+            prev == prev_alpha
+            and prev_alpha and curr_alpha
+            and prev_alpha.lower() not in _STOP_WORDS
+            and curr_alpha.lower() not in _STOP_WORDS
+            and min(len(prev_alpha), len(curr_alpha)) <= 6
+            and len(prev_alpha) + len(curr_alpha) >= 4
+        )
+
+        if should_try:
+            merged_alpha = prev_alpha + curr_alpha
+            hyph = hyph_de.inserted(merged_alpha, hyphen='-')
+            if '-' in hyph:
+                result[-1] = prev + curr
+                i += 1
+                continue
+
+        result.append(curr)
+        i += 1
+
+    return ' '.join(result)
+
+
 def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
     """Syllabify all significant words in a text string.
 
@@ -259,6 +345,12 @@ def insert_syllable_dividers(
             if not text:
                 continue
 
+            # In auto mode (force=False), only normalize cells that already
+            # have | from OCR (i.e. printed syllable dividers on the original
+            # scan). Don't add new syllable marks to other words.
+            if not force and "|" not in text:
+                continue
+
             new_text = _syllabify_text(text, hyph_de, hyph_en)
             if new_text != text:
                 cell["text"] = new_text
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 8708804..2aa5ce3 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1593,6 +1593,13 @@ async def _build_grid_core(
     except Exception as e:
         logger.warning("Dictionary detection failed: %s", e)
 
+    # --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
+    try:
+        from cv_syllable_detect import merge_word_gaps_in_zones
+        merge_word_gaps_in_zones(zones_data, session_id)
+    except Exception as e:
+        logger.warning("Word-gap merge failed: %s", e)
+
     # --- Syllable divider insertion for dictionary pages ---
     # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
     # "all" = force on all content words, "en" = English column only,
@@ -1626,6 +1633,15 @@ async def _build_grid_core(
     except Exception as e:
         logger.warning("Syllable insertion failed: %s", e)
 
+    # When syllable mode is "none", strip any residual | from OCR so
+    # that the displayed text is clean (e.g. "Zel|le" → "Zelle").
+    if syllable_mode == "none":
+        for z in zones_data:
+            for cell in z.get("cells", []):
+                t = cell.get("text", "")
+                if "|" in t:
+                    cell["text"] = t.replace("|", "")
+
     # Clean up internal flags before returning
     for z in zones_data:
         for cell in z.get("cells", []):
diff --git a/klausur-service/backend/grid_editor_helpers.py b/klausur-service/backend/grid_editor_helpers.py
index 40c3b19..ad8fbff 100644
--- a/klausur-service/backend/grid_editor_helpers.py
+++ b/klausur-service/backend/grid_editor_helpers.py
@@ -912,6 +912,13 @@ def _detect_heading_rows_by_single_cell(
         _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
         if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
             continue
+        # Guard: dictionary section headings are short (1-4 alpha chars
+        # like "A", "Ab", "Zi", "Sch"). Longer text that starts
+        # lowercase is a regular vocabulary word (e.g. "zentral") that
+        # happens to appear alone in its row.
+        alpha_only = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', text)
+        if len(alpha_only) > 4 and text[0].islower():
+            continue
         heading_row_indices.append(ri)
 
     # Guard: if >25% of eligible rows would become headings, the