diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 895c172..5b96be4 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -2801,6 +2801,71 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: except Exception as e: logger.warning("Dictionary detection failed: %s", e) + # --- Syllable divider insertion for dictionary pages --- + # Dictionary pages show syllable breaks as "|" (e.g. "Ka|me|rad"). + # OCR engines rarely detect "|", so we insert them via pyphen + # hyphenation rules when the page is confirmed as a dictionary. + syllable_insertions = 0 + if dict_detection.get("is_dictionary"): + try: + import pyphen + _hyph_de = pyphen.Pyphen(lang='de_DE') + _hyph_en = pyphen.Pyphen(lang='en_US') + # IPA/bracket pattern — don't hyphenate phonetic transcriptions + _ipa_re = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]') + for z in zones_data: + for cell in z.get("cells", []): + ct = cell.get("col_type", "") + if not ct.startswith("column_"): + continue + text = cell.get("text", "") + if not text or "|" in text: + continue # already has pipes or empty + if _ipa_re.search(text): + continue # IPA content — skip + # Split on commas/semicolons to handle "Kabel, die Kabel" + parts = re.split(r'([,;]\s*)', text) + new_parts = [] + changed = False + for part in parts: + if re.match(r'^[,;]\s*$', part): + new_parts.append(part) + continue + # Process individual words in each part + words_in = re.split(r'(\s+)', part) + new_words = [] + for w in words_in: + if re.match(r'^\s+$', w): + new_words.append(w) + continue + # Only hyphenate words ≥ 4 chars, skip articles/short + clean = re.sub(r'[().\-]', '', w) + if len(clean) < 4: + new_words.append(w) + continue + # Try DE first, then EN + hyph = _hyph_de.inserted(w, hyphen='|') + if '|' not in hyph: + hyph = _hyph_en.inserted(w, hyphen='|') + if '|' in hyph and hyph != w: + new_words.append(hyph) + changed = True + else: + new_words.append(w) + new_parts.append(''.join(new_words)) + if changed: + cell["text"] = ''.join(new_parts) + syllable_insertions += 1 + if syllable_insertions: + logger.info( + "build-grid session %s: inserted syllable dividers in %d cells", + session_id, syllable_insertions, + ) + except ImportError: + logger.warning("pyphen not installed — skipping syllable insertion") + except Exception as e: + logger.warning("Syllable insertion failed: %s", e) + result = { "session_id": session_id, "image_width": img_w, diff --git a/klausur-service/backend/requirements.txt b/klausur-service/backend/requirements.txt index 24a8fbb..a922d5f 100644 --- a/klausur-service/backend/requirements.txt +++ b/klausur-service/backend/requirements.txt @@ -38,6 +38,9 @@ eng-to-ipa # Spell-checker for rule-based OCR correction (MIT license) pyspellchecker>=0.8.1 +# Syllable hyphenation for dictionary pipe-divider insertion (MIT license) +pyphen>=0.16.0 + # PostgreSQL (for metrics storage) psycopg2-binary>=2.9.0 asyncpg>=0.29.0