From ed7fc99fc4f5db149555c1d1c5ca051540e8a1b7 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 24 Mar 2026 19:44:29 +0100 Subject: [PATCH] Improve syllable divider insertion for dictionary pages Rewrite cv_syllable_detect.py with pyphen-first approach: - Remove unreliable CV gate (morphological pipe detection) - Strip existing pipes and re-syllabify via pyphen (DE then EN) - Merge pipe-gap spaces where OCR split words at divider positions - Guard merges with function word blacklist and punctuation checks Add false-positive prevention: - Pre-check: skip if <5% of cells have existing | from OCR - Call-site check: require article_col_index (der/die/das column) - Prevents syllabification of synonym dictionaries and word lists Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_syllable_detect.py | 322 ++++++++++++------ klausur-service/backend/grid_editor_api.py | 11 +- 2 files changed, 221 insertions(+), 112 deletions(-) diff --git a/klausur-service/backend/cv_syllable_detect.py b/klausur-service/backend/cv_syllable_detect.py index fc3bdb8..42fb162 100644 --- a/klausur-service/backend/cv_syllable_detect.py +++ b/klausur-service/backend/cv_syllable_detect.py @@ -1,11 +1,15 @@ """ -CV-based syllable divider detection and insertion for dictionary pages. +Syllable divider insertion for dictionary pages. -Two-step approach: - 1. CV: morphological vertical line detection checks if a word_box image - contains thin, isolated pipe-like vertical lines (syllable dividers). - 2. pyphen: inserts syllable breaks at linguistically correct positions - for words where CV confirmed the presence of dividers. +For confirmed dictionary pages (is_dictionary=True), processes all content +column cells: + 1. Strips existing | dividers for clean normalization + 2. Merges pipe-gap spaces (where OCR split a word at a divider position) + 3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN) + 4. Only modifies words that pyphen recognizes — garbled OCR stays as-is + +No CV gate needed — the dictionary detection confidence is sufficient. +pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. @@ -13,94 +17,223 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. import logging import re -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional, Tuple -import cv2 import numpy as np logger = logging.getLogger(__name__) - -def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool: - """CV check: does this word_box image show thin vertical pipe dividers? - - Uses morphological opening with a tall thin kernel to isolate vertical - structures, then filters for thin (≤4px), isolated contours that are - NOT at the word edges (those would be l, I, 1 etc.). - """ - x = wb.get("left", 0) - y = wb.get("top", 0) - w = wb.get("width", 0) - h = wb.get("height", 0) - if w < 30 or h < 12: - return False - ih, iw = img_gray.shape[:2] - y1, y2 = max(0, y), min(ih, y + h) - x1, x2 = max(0, x), min(iw, x + w) - roi = img_gray[y1:y2, x1:x2] - if roi.size == 0: - return False - rh, rw = roi.shape - - # Binarize (ink = white on black background) - _, binary = cv2.threshold( - roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU - ) - - # Morphological opening: keep only tall vertical structures (≥55% height) - kern_h = max(int(rh * 0.55), 8) - kernel = np.ones((kern_h, 1), np.uint8) - vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel) - - # Find surviving contours - contours, _ = cv2.findContours( - vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE - ) - - margin = max(int(rw * 0.08), 3) - for cnt in contours: - cx, cy, cw, ch = cv2.boundingRect(cnt) - if cw > 4: - continue # too wide for a pipe - if cx < margin or cx + cw > rw - margin: - continue # at word edge — likely l, I, 1 - # Check isolation: adjacent columns should be mostly empty (ink-free) - left_zone = binary[cy:cy + ch, max(0, cx - 3):cx] - right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)] - left_ink = np.mean(left_zone) if left_zone.size else 255 - right_ink = np.mean(right_zone) if right_zone.size else 255 - if left_ink < 80 and right_ink < 80: - return True # isolated thin vertical line = pipe divider - return False - - -# IPA/phonetic bracket pattern — don't hyphenate transcriptions +# IPA/phonetic characters — skip cells containing these _IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]') +# Common German words that should NOT be merged with adjacent tokens. +# These are function words that appear as standalone words between +# headwords/definitions on dictionary pages. +_STOP_WORDS = frozenset([ + # Articles + 'der', 'die', 'das', 'dem', 'den', 'des', + 'ein', 'eine', 'einem', 'einen', 'einer', + # Pronouns + 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich', + # Prepositions + 'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im', + 'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter', + 'zwischen', 'ohne', 'gegen', + # Conjunctions + 'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber', + # Adverbs + 'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht', + # Verbs + 'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf', + 'sein', 'haben', + # Other + 'kein', 'keine', 'keinem', 'keinen', 'keiner', +]) + +# Cached hyphenators +_hyph_de = None +_hyph_en = None + + +def _get_hyphenators(): + """Lazy-load pyphen hyphenators (cached across calls).""" + global _hyph_de, _hyph_en + if _hyph_de is not None: + return _hyph_de, _hyph_en + try: + import pyphen + except ImportError: + return None, None + _hyph_de = pyphen.Pyphen(lang='de_DE') + _hyph_en = pyphen.Pyphen(lang='en_US') + return _hyph_de, _hyph_en + + +def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]: + """Try to hyphenate a word using DE then EN dictionary. + + Returns word with | separators, or None if not recognized. + """ + hyph = hyph_de.inserted(word, hyphen='|') + if '|' in hyph: + return hyph + hyph = hyph_en.inserted(word, hyphen='|') + if '|' in hyph: + return hyph + return None + + +def _try_merge_pipe_gaps(text: str, hyph_de) -> str: + """Merge fragments separated by single spaces where OCR split at a pipe. + + Example: "Kaf fee" -> "Kaffee" (pyphen recognizes the merged word). + Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau". + + Guards against false merges: + - The FIRST token must be pure alpha (word start — no attached punctuation) + - The second token may have trailing punctuation (comma, period) which + stays attached to the merged word: "Kä" + "fer," -> "Käfer," + - Common German function words (der, die, das, ...) are never merged + - At least one fragment must be very short (<=3 alpha chars) + """ + parts = text.split(' ') + if len(parts) < 2: + return text + + result = [parts[0]] + i = 1 + while i < len(parts): + prev = result[-1] + curr = parts[i] + + # Extract alpha-only core for lookup + prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev) + curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr) + + # Guard 1: first token must be pure alpha (word-start fragment) + # second token may have trailing punctuation + # Guard 2: neither alpha core can be a common German function word + # Guard 3: the shorter fragment must be <= 3 chars (pipe-gap signal) + # Guard 4: combined length must be >= 4 + should_try = ( + prev == prev_alpha # first token: pure alpha (word start) + and prev_alpha and curr_alpha + and prev_alpha.lower() not in _STOP_WORDS + and curr_alpha.lower() not in _STOP_WORDS + and min(len(prev_alpha), len(curr_alpha)) <= 3 + and len(prev_alpha) + len(curr_alpha) >= 4 + ) + + if should_try: + merged_alpha = prev_alpha + curr_alpha + hyph = hyph_de.inserted(merged_alpha, hyphen='-') + if '-' in hyph: + # pyphen recognizes merged word — collapse the space + result[-1] = prev + curr + i += 1 + continue + + result.append(curr) + i += 1 + + return ' '.join(result) + + +def _syllabify_text(text: str, hyph_de, hyph_en) -> str: + """Syllabify all significant words in a text string. + + 1. Strip existing | dividers + 2. Merge pipe-gap spaces where possible + 3. Apply pyphen to each word >= 3 alphabetic chars + 4. Words pyphen doesn't recognize stay as-is (no bad guesses) + """ + if not text: + return text + + # Skip cells that contain IPA transcription characters + if _IPA_RE.search(text): + return text + + # Phase 1: strip existing pipe dividers for clean normalization + clean = text.replace('|', '') + + # Phase 2: merge pipe-gap spaces (OCR fragments from pipe splitting) + clean = _try_merge_pipe_gaps(clean, hyph_de) + + # Phase 3: tokenize and syllabify each word + # Split on whitespace and comma/semicolon sequences, keeping separators + tokens = re.split(r'(\s+|[,;:]+\s*)', clean) + + result = [] + for tok in tokens: + if not tok or re.match(r'^[\s,;:]+$', tok): + result.append(tok) + continue + + # Strip trailing/leading punctuation for pyphen lookup + m = re.match(r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', tok) + if not m: + result.append(tok) + continue + lead, word, trail = m.group(1), m.group(2), m.group(3) + + if len(word) < 3 or not re.search(r'[a-zA-ZäöüÄÖÜß]', word): + result.append(tok) + continue + + hyph = _hyphenate_word(word, hyph_de, hyph_en) + if hyph: + result.append(lead + hyph + trail) + else: + result.append(tok) + + return ''.join(result) + def insert_syllable_dividers( zones_data: List[Dict], img_bgr: np.ndarray, session_id: str, ) -> int: - """Insert pipe syllable dividers into dictionary cells where CV confirms them. + """Insert pipe syllable dividers into dictionary cells. - For each cell on a dictionary page: - 1. Check if ANY word_box has CV-detected pipe lines - 2. If yes, apply pyphen to EACH word (≥4 chars) in the cell - 3. Try DE hyphenation first, then EN + For dictionary pages: process all content column cells, strip existing + pipes, merge pipe-gap spaces, and re-syllabify using pyphen. + + Pre-check: at least 5% of content cells must already contain ``|`` from + OCR. This guards against false-positive dictionary detection on pages + like synonym dictionaries or alphabetical word lists that have no actual + syllable divider lines. Returns the number of cells modified. """ - try: - import pyphen - except ImportError: + hyph_de, hyph_en = _get_hyphenators() + if hyph_de is None: logger.warning("pyphen not installed — skipping syllable insertion") return 0 - _hyph_de = pyphen.Pyphen(lang='de_DE') - _hyph_en = pyphen.Pyphen(lang='en_US') - img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) + # Pre-check: count cells that already have | from OCR. + # Real dictionary pages with printed syllable dividers will have OCR- + # detected pipes in many cells. Pages without syllable dividers will + # have zero — skip those to avoid false syllabification. + total_col_cells = 0 + cells_with_pipes = 0 + for z in zones_data: + for cell in z.get("cells", []): + if cell.get("col_type", "").startswith("column_"): + total_col_cells += 1 + if "|" in cell.get("text", ""): + cells_with_pipes += 1 + + if total_col_cells > 0: + pipe_ratio = cells_with_pipes / total_col_cells + if pipe_ratio < 0.05: + logger.info( + "build-grid session %s: skipping syllable insertion — " + "only %.1f%% of cells have existing pipes (need >=5%%)", + session_id, pipe_ratio * 100, + ) + return 0 insertions = 0 for z in zones_data: @@ -109,47 +242,18 @@ def insert_syllable_dividers( if not ct.startswith("column_"): continue text = cell.get("text", "") - if not text or "|" in text: - continue - if _IPA_RE.search(text): + if not text: continue - # CV gate: check if ANY word_box in this cell has pipe lines - wbs = cell.get("word_boxes") or [] - if not any(_word_has_pipe_lines(img_gray, wb) for wb in wbs): - continue - - # Apply pyphen to each significant word in the cell - tokens = re.split(r'(\s+|[,;]+\s*)', text) - new_tokens = [] - changed = False - for tok in tokens: - # Skip whitespace/punctuation separators - if re.match(r'^[\s,;]+$', tok): - new_tokens.append(tok) - continue - # Only hyphenate words ≥ 4 alpha chars - clean = re.sub(r'[().\-]', '', tok) - if len(clean) < 4 or not re.search(r'[a-zA-ZäöüÄÖÜß]', clean): - new_tokens.append(tok) - continue - # Try DE first, then EN - hyph = _hyph_de.inserted(tok, hyphen='|') - if '|' not in hyph: - hyph = _hyph_en.inserted(tok, hyphen='|') - if '|' in hyph and hyph != tok: - new_tokens.append(hyph) - changed = True - else: - new_tokens.append(tok) - if changed: - cell["text"] = ''.join(new_tokens) + new_text = _syllabify_text(text, hyph_de, hyph_en) + if new_text != text: + cell["text"] = new_text insertions += 1 if insertions: logger.info( - "build-grid session %s: inserted syllable dividers in %d cells " - "(CV-validated)", + "build-grid session %s: syllable dividers inserted/normalized " + "in %d cells (pyphen)", session_id, insertions, ) return insertions diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 8eb1bfc..1fa99d0 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1456,10 +1456,15 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: logger.warning("Dictionary detection failed: %s", e) # --- Syllable divider insertion for dictionary pages --- - # CV-validated: only inserts "|" where image shows thin vertical lines. - # See cv_syllable_detect.py for the detection + insertion logic. + # Only on confirmed dictionary pages with article columns (der/die/das). + # The article_col_index check avoids false positives on synonym lists, + # word frequency tables, and other alphabetically sorted non-dictionary pages. + # Additionally, insert_syllable_dividers has its own pre-check for existing + # pipe characters in cells (OCR must have already found some). syllable_insertions = 0 - if dict_detection.get("is_dictionary") and img_bgr is not None: + if (dict_detection.get("is_dictionary") + and dict_detection.get("article_col_index") is not None + and img_bgr is not None): try: from cv_syllable_detect import insert_syllable_dividers syllable_insertions = insert_syllable_dividers(