Improve syllable divider insertion for dictionary pages

Rewrite cv_syllable_detect.py with pyphen-first approach:
- Remove unreliable CV gate (morphological pipe detection)
- Strip existing pipes and re-syllabify via pyphen (DE then EN)
- Merge pipe-gap spaces where OCR split words at divider positions
- Guard merges with function word blacklist and punctuation checks

Add false-positive prevention:
- Pre-check: skip if <5% of cells have existing | from OCR
- Call-site check: require article_col_index (der/die/das column)
- Prevents syllabification of synonym dictionaries and word lists

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 19:44:29 +01:00
parent 7fbcae954b
commit ed7fc99fc4
2 changed files with 221 additions and 112 deletions
@@ -1,11 +1,15 @@
 """
-CV-based syllable divider detection and insertion for dictionary pages.
+Syllable divider insertion for dictionary pages.

-Two-step approach:
-  1. CV: morphological vertical line detection checks if a word_box image
-     contains thin, isolated pipe-like vertical lines (syllable dividers).
-  2. pyphen: inserts syllable breaks at linguistically correct positions
-     for words where CV confirmed the presence of dividers.
+For confirmed dictionary pages (is_dictionary=True), processes all content
+column cells:
+  1. Strips existing | dividers for clean normalization
+  2. Merges pipe-gap spaces (where OCR split a word at a divider position)
+  3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN)
+  4. Only modifies words that pyphen recognizes — garbled OCR stays as-is
+
+No CV gate needed — the dictionary detection confidence is sufficient.
+pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable.

 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
@@ -13,94 +17,223 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.

 import logging
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple

-import cv2
 import numpy as np

 logger = logging.getLogger(__name__)

-
-def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
-    """CV check: does this word_box image show thin vertical pipe dividers?
-
-    Uses morphological opening with a tall thin kernel to isolate vertical
-    structures, then filters for thin (≤4px), isolated contours that are
-    NOT at the word edges (those would be l, I, 1 etc.).
-    """
-    x = wb.get("left", 0)
-    y = wb.get("top", 0)
-    w = wb.get("width", 0)
-    h = wb.get("height", 0)
-    if w < 30 or h < 12:
-        return False
-    ih, iw = img_gray.shape[:2]
-    y1, y2 = max(0, y), min(ih, y + h)
-    x1, x2 = max(0, x), min(iw, x + w)
-    roi = img_gray[y1:y2, x1:x2]
-    if roi.size == 0:
-        return False
-    rh, rw = roi.shape
-
-    # Binarize (ink = white on black background)
-    _, binary = cv2.threshold(
-        roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
-    )
-
-    # Morphological opening: keep only tall vertical structures (≥55% height)
-    kern_h = max(int(rh * 0.55), 8)
-    kernel = np.ones((kern_h, 1), np.uint8)
-    vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
-
-    # Find surviving contours
-    contours, _ = cv2.findContours(
-        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
-    )
-
-    margin = max(int(rw * 0.08), 3)
-    for cnt in contours:
-        cx, cy, cw, ch = cv2.boundingRect(cnt)
-        if cw > 4:
-            continue  # too wide for a pipe
-        if cx < margin or cx + cw > rw - margin:
-            continue  # at word edge — likely l, I, 1
-        # Check isolation: adjacent columns should be mostly empty (ink-free)
-        left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
-        right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
-        left_ink = np.mean(left_zone) if left_zone.size else 255
-        right_ink = np.mean(right_zone) if right_zone.size else 255
-        if left_ink < 80 and right_ink < 80:
-            return True  # isolated thin vertical line = pipe divider
-    return False
-
-
-# IPA/phonetic bracket pattern — don't hyphenate transcriptions
+# IPA/phonetic characters — skip cells containing these
 _IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')

+# Common German words that should NOT be merged with adjacent tokens.
+# These are function words that appear as standalone words between
+# headwords/definitions on dictionary pages.
+_STOP_WORDS = frozenset([
+    # Articles
+    'der', 'die', 'das', 'dem', 'den', 'des',
+    'ein', 'eine', 'einem', 'einen', 'einer',
+    # Pronouns
+    'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
+    # Prepositions
+    'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
+    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
+    'zwischen', 'ohne', 'gegen',
+    # Conjunctions
+    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
+    # Adverbs
+    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
+    # Verbs
+    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
+    'sein', 'haben',
+    # Other
+    'kein', 'keine', 'keinem', 'keinen', 'keiner',
+])
+
+# Cached hyphenators
+_hyph_de = None
+_hyph_en = None
+
+
+def _get_hyphenators():
+    """Lazy-load pyphen hyphenators (cached across calls)."""
+    global _hyph_de, _hyph_en
+    if _hyph_de is not None:
+        return _hyph_de, _hyph_en
+    try:
+        import pyphen
+    except ImportError:
+        return None, None
+    _hyph_de = pyphen.Pyphen(lang='de_DE')
+    _hyph_en = pyphen.Pyphen(lang='en_US')
+    return _hyph_de, _hyph_en
+
+
+def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
+    """Try to hyphenate a word using DE then EN dictionary.
+
+    Returns word with | separators, or None if not recognized.
+    """
+    hyph = hyph_de.inserted(word, hyphen='|')
+    if '|' in hyph:
+        return hyph
+    hyph = hyph_en.inserted(word, hyphen='|')
+    if '|' in hyph:
+        return hyph
+    return None
+
+
+def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
+    """Merge fragments separated by single spaces where OCR split at a pipe.
+
+    Example: "Kaf fee" -> "Kaffee" (pyphen recognizes the merged word).
+    Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".
+
+    Guards against false merges:
+    - The FIRST token must be pure alpha (word start — no attached punctuation)
+    - The second token may have trailing punctuation (comma, period) which
+      stays attached to the merged word: "Kä" + "fer," -> "Käfer,"
+    - Common German function words (der, die, das, ...) are never merged
+    - At least one fragment must be very short (<=3 alpha chars)
+    """
+    parts = text.split(' ')
+    if len(parts) < 2:
+        return text
+
+    result = [parts[0]]
+    i = 1
+    while i < len(parts):
+        prev = result[-1]
+        curr = parts[i]
+
+        # Extract alpha-only core for lookup
+        prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
+        curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
+
+        # Guard 1: first token must be pure alpha (word-start fragment)
+        #          second token may have trailing punctuation
+        # Guard 2: neither alpha core can be a common German function word
+        # Guard 3: the shorter fragment must be <= 3 chars (pipe-gap signal)
+        # Guard 4: combined length must be >= 4
+        should_try = (
+            prev == prev_alpha  # first token: pure alpha (word start)
+            and prev_alpha and curr_alpha
+            and prev_alpha.lower() not in _STOP_WORDS
+            and curr_alpha.lower() not in _STOP_WORDS
+            and min(len(prev_alpha), len(curr_alpha)) <= 3
+            and len(prev_alpha) + len(curr_alpha) >= 4
+        )
+
+        if should_try:
+            merged_alpha = prev_alpha + curr_alpha
+            hyph = hyph_de.inserted(merged_alpha, hyphen='-')
+            if '-' in hyph:
+                # pyphen recognizes merged word — collapse the space
+                result[-1] = prev + curr
+                i += 1
+                continue
+
+        result.append(curr)
+        i += 1
+
+    return ' '.join(result)
+
+
+def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
+    """Syllabify all significant words in a text string.
+
+    1. Strip existing | dividers
+    2. Merge pipe-gap spaces where possible
+    3. Apply pyphen to each word >= 3 alphabetic chars
+    4. Words pyphen doesn't recognize stay as-is (no bad guesses)
+    """
+    if not text:
+        return text
+
+    # Skip cells that contain IPA transcription characters
+    if _IPA_RE.search(text):
+        return text
+
+    # Phase 1: strip existing pipe dividers for clean normalization
+    clean = text.replace('|', '')
+
+    # Phase 2: merge pipe-gap spaces (OCR fragments from pipe splitting)
+    clean = _try_merge_pipe_gaps(clean, hyph_de)
+
+    # Phase 3: tokenize and syllabify each word
+    # Split on whitespace and comma/semicolon sequences, keeping separators
+    tokens = re.split(r'(\s+|[,;:]+\s*)', clean)
+
+    result = []
+    for tok in tokens:
+        if not tok or re.match(r'^[\s,;:]+$', tok):
+            result.append(tok)
+            continue
+
+        # Strip trailing/leading punctuation for pyphen lookup
+        m = re.match(r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', tok)
+        if not m:
+            result.append(tok)
+            continue
+        lead, word, trail = m.group(1), m.group(2), m.group(3)
+
+        if len(word) < 3 or not re.search(r'[a-zA-ZäöüÄÖÜß]', word):
+            result.append(tok)
+            continue
+
+        hyph = _hyphenate_word(word, hyph_de, hyph_en)
+        if hyph:
+            result.append(lead + hyph + trail)
+        else:
+            result.append(tok)
+
+    return ''.join(result)
+

 def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
 ) -> int:
-    """Insert pipe syllable dividers into dictionary cells where CV confirms them.
+    """Insert pipe syllable dividers into dictionary cells.

-    For each cell on a dictionary page:
-      1. Check if ANY word_box has CV-detected pipe lines
-      2. If yes, apply pyphen to EACH word (≥4 chars) in the cell
-      3. Try DE hyphenation first, then EN
+    For dictionary pages: process all content column cells, strip existing
+    pipes, merge pipe-gap spaces, and re-syllabify using pyphen.
+
+    Pre-check: at least 5% of content cells must already contain ``|`` from
+    OCR.  This guards against false-positive dictionary detection on pages
+    like synonym dictionaries or alphabetical word lists that have no actual
+    syllable divider lines.

    Returns the number of cells modified.
    """
-    try:
-        import pyphen
-    except ImportError:
+    hyph_de, hyph_en = _get_hyphenators()
+    if hyph_de is None:
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

-    _hyph_de = pyphen.Pyphen(lang='de_DE')
-    _hyph_en = pyphen.Pyphen(lang='en_US')
-    img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+    # Pre-check: count cells that already have | from OCR.
+    # Real dictionary pages with printed syllable dividers will have OCR-
+    # detected pipes in many cells.  Pages without syllable dividers will
+    # have zero — skip those to avoid false syllabification.
+    total_col_cells = 0
+    cells_with_pipes = 0
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            if cell.get("col_type", "").startswith("column_"):
+                total_col_cells += 1
+                if "|" in cell.get("text", ""):
+                    cells_with_pipes += 1
+
+    if total_col_cells > 0:
+        pipe_ratio = cells_with_pipes / total_col_cells
+        if pipe_ratio < 0.05:
+            logger.info(
+                "build-grid session %s: skipping syllable insertion — "
+                "only %.1f%% of cells have existing pipes (need >=5%%)",
+                session_id, pipe_ratio * 100,
+            )
+            return 0

    insertions = 0
    for z in zones_data:
@@ -109,47 +242,18 @@ def insert_syllable_dividers(
            if not ct.startswith("column_"):
                continue
            text = cell.get("text", "")
-            if not text or "|" in text:
-                continue
-            if _IPA_RE.search(text):
+            if not text:
                continue

-            # CV gate: check if ANY word_box in this cell has pipe lines
-            wbs = cell.get("word_boxes") or []
-            if not any(_word_has_pipe_lines(img_gray, wb) for wb in wbs):
-                continue
-
-            # Apply pyphen to each significant word in the cell
-            tokens = re.split(r'(\s+|[,;]+\s*)', text)
-            new_tokens = []
-            changed = False
-            for tok in tokens:
-                # Skip whitespace/punctuation separators
-                if re.match(r'^[\s,;]+$', tok):
-                    new_tokens.append(tok)
-                    continue
-                # Only hyphenate words ≥ 4 alpha chars
-                clean = re.sub(r'[().\-]', '', tok)
-                if len(clean) < 4 or not re.search(r'[a-zA-ZäöüÄÖÜß]', clean):
-                    new_tokens.append(tok)
-                    continue
-                # Try DE first, then EN
-                hyph = _hyph_de.inserted(tok, hyphen='|')
-                if '|' not in hyph:
-                    hyph = _hyph_en.inserted(tok, hyphen='|')
-                if '|' in hyph and hyph != tok:
-                    new_tokens.append(hyph)
-                    changed = True
-                else:
-                    new_tokens.append(tok)
-            if changed:
-                cell["text"] = ''.join(new_tokens)
+            new_text = _syllabify_text(text, hyph_de, hyph_en)
+            if new_text != text:
+                cell["text"] = new_text
                insertions += 1

    if insertions:
        logger.info(
-            "build-grid session %s: inserted syllable dividers in %d cells "
-            "(CV-validated)",
+            "build-grid session %s: syllable dividers inserted/normalized "
+            "in %d cells (pyphen)",
            session_id, insertions,
        )
    return insertions
@@ -1456,10 +1456,15 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
        logger.warning("Dictionary detection failed: %s", e)

    # --- Syllable divider insertion for dictionary pages ---
-    # CV-validated: only inserts "|" where image shows thin vertical lines.
-    # See cv_syllable_detect.py for the detection + insertion logic.
+    # Only on confirmed dictionary pages with article columns (der/die/das).
+    # The article_col_index check avoids false positives on synonym lists,
+    # word frequency tables, and other alphabetically sorted non-dictionary pages.
+    # Additionally, insert_syllable_dividers has its own pre-check for existing
+    # pipe characters in cells (OCR must have already found some).
    syllable_insertions = 0
-    if dict_detection.get("is_dictionary") and img_bgr is not None:
+    if (dict_detection.get("is_dictionary")
+            and dict_detection.get("article_col_index") is not None
+            and img_bgr is not None):
        try:
            from cv_syllable_detect import insert_syllable_dividers
            syllable_insertions = insert_syllable_dividers(