feat: auto-insert syllable dividers via pyphen on dictionary pages

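OCR engines don't detect the | (pipe) characters that dictionaries print as syllable dividers. When dictionary detection reports is_dictionary, use pyphen (MIT) to insert syllable breaks into the text of cells whose col_type starts with "column_". Each word is hyphenated with the DE patterns first, falling back to EN if DE yields no break. Skips whole cells that contain IPA characters or square brackets, or that already contain |, and skips individual words shorter than 4 characters (not counting parentheses, periods and hyphens). Also adds pyphen>=0.16.0 to requirements.txt. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>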
2026-03-24 14:17:26 +01:00
parent fe754398c0
commit 364086b86e
2 changed files with 68 additions and 0 deletions
@@ -2801,6 +2801,71 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
    except Exception as e:
        logger.warning("Dictionary detection failed: %s", e)

+    # --- Syllable divider insertion for dictionary pages ---
+    # Dictionary pages show syllable breaks as "|" (e.g. "Ka|me|rad").
+    # OCR engines rarely detect "|", so we insert them via pyphen
+    # hyphenation rules when the page is confirmed as a dictionary.
+    syllable_insertions = 0
+    if dict_detection.get("is_dictionary"):
+        try:
+            import pyphen
+            _hyph_de = pyphen.Pyphen(lang='de_DE')
+            _hyph_en = pyphen.Pyphen(lang='en_US')
+            # IPA/bracket pattern — don't hyphenate phonetic transcriptions
+            _ipa_re = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
+            for z in zones_data:
+                for cell in z.get("cells", []):
+                    ct = cell.get("col_type", "")
+                    if not ct.startswith("column_"):
+                        continue
+                    text = cell.get("text", "")
+                    if not text or "|" in text:
+                        continue  # already has pipes or empty
+                    if _ipa_re.search(text):
+                        continue  # IPA content — skip
+                    # Split on commas/semicolons to handle "Kabel, die Kabel"
+                    parts = re.split(r'([,;]\s*)', text)
+                    new_parts = []
+                    changed = False
+                    for part in parts:
+                        if re.match(r'^[,;]\s*$', part):
+                            new_parts.append(part)
+                            continue
+                        # Process individual words in each part
+                        words_in = re.split(r'(\s+)', part)
+                        new_words = []
+                        for w in words_in:
+                            if re.match(r'^\s+$', w):
+                                new_words.append(w)
+                                continue
+                            # Only hyphenate words ≥ 4 chars, skip articles/short
+                            clean = re.sub(r'[().\-]', '', w)
+                            if len(clean) < 4:
+                                new_words.append(w)
+                                continue
+                            # Try DE first, then EN
+                            hyph = _hyph_de.inserted(w, hyphen='|')
+                            if '|' not in hyph:
+                                hyph = _hyph_en.inserted(w, hyphen='|')
+                            if '|' in hyph and hyph != w:
+                                new_words.append(hyph)
+                                changed = True
+                            else:
+                                new_words.append(w)
+                        new_parts.append(''.join(new_words))
+                    if changed:
+                        cell["text"] = ''.join(new_parts)
+                        syllable_insertions += 1
+            if syllable_insertions:
+                logger.info(
+                    "build-grid session %s: inserted syllable dividers in %d cells",
+                    session_id, syllable_insertions,
+                )
+        except ImportError:
+            logger.warning("pyphen not installed — skipping syllable insertion")
+        except Exception as e:
+            logger.warning("Syllable insertion failed: %s", e)
+
    result = {
        "session_id": session_id,
        "image_width": img_w,
@@ -38,6 +38,9 @@ eng-to-ipa
 # Spell-checker for rule-based OCR correction (MIT license)
 pyspellchecker>=0.8.1

+# Syllable hyphenation for dictionary pipe-divider insertion (MIT license)
+pyphen>=0.16.0
+
 # PostgreSQL (for metrics storage)
 psycopg2-binary>=2.9.0
 asyncpg>=0.29.0