Improve syllable divider insertion for dictionary pages

Rewrite cv_syllable_detect.py with a pyphen-first approach:
- Remove unreliable CV gate (morphological pipe detection)
- Strip existing pipes and re-syllabify via pyphen (DE then EN)
- Merge pipe-gap spaces where OCR split words at divider positions
- Guard merges with a function-word blacklist and punctuation checks (see the sketch below)
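
The approach hinges on one pyphen property: inserted() returns the word
unchanged when its hyphenation patterns find no break point, so unknown or
garbled tokens are left alone. A minimal sketch of the DE-then-EN fallback
(illustrative; the committed logic lives in _hyphenate_word in the diff below,
and "hyphenate" here is a hypothetical name):

    import pyphen
    from typing import Optional

    hyph_de = pyphen.Pyphen(lang='de_DE')
    hyph_en = pyphen.Pyphen(lang='en_US')

    def hyphenate(word: str) -> Optional[str]:
        # German patterns first, then English; None means no break found
        for hyph in (hyph_de, hyph_en):
            result = hyph.inserted(word, hyphen='|')
            if '|' in result:
                return result
        return None

    hyphenate("Kaffee")  # -> "Kaf|fee"
    hyphenate("xqzt")    # -> None (token stays untouched downstream)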

Add false-positive prevention:
- Pre-check: skip if <5% of cells have existing | from OCR
- Call-site check: require article_col_index (der/die/das column)
- Prevents syllabification of synonym dictionaries and word lists; see the condensed gate sketch below
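
Condensed, the two gates combine like this (a sketch distilled from the diff
below, not new logic; ocr_gate and dict_gate are hypothetical names, and the
dict_detection keys are the ones used at the call site):

    # OCR gate (inside insert_syllable_dividers): enough pre-existing pipes?
    # e.g. 3 pipe cells out of 120 content cells = 2.5% -> below 5%, skip.
    # The real code only computes this when total_col_cells > 0.
    pipe_ratio = cells_with_pipes / total_col_cells
    ocr_gate = pipe_ratio >= 0.05

    # Call-site gate: confirmed dictionary AND an article column was found
    dict_gate = (dict_detection.get("is_dictionary")
                 and dict_detection.get("article_col_index") is not None)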

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Benjamin Admin committed on 2026-03-24 19:44:29 +01:00
commit ed7fc99fc4 (parent 7fbcae954b)
2 changed files with 221 additions and 112 deletions

--- a/cv_syllable_detect.py
+++ b/cv_syllable_detect.py

@@ -1,11 +1,15 @@
 """
-CV-based syllable divider detection and insertion for dictionary pages.
+Syllable divider insertion for dictionary pages.
 
-Two-step approach:
-1. CV: morphological vertical line detection checks if a word_box image
-   contains thin, isolated pipe-like vertical lines (syllable dividers).
-2. pyphen: inserts syllable breaks at linguistically correct positions
-   for words where CV confirmed the presence of dividers.
+For confirmed dictionary pages (is_dictionary=True), processes all content
+column cells:
+1. Strips existing | dividers for clean normalization
+2. Merges pipe-gap spaces (where OCR split a word at a divider position)
+3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN)
+4. Only modifies words that pyphen recognizes — garbled OCR stays as-is
+
+No CV gate needed — the dictionary detection confidence is sufficient.
+pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable.
 
 License: Apache 2.0 (commercially usable)
 PRIVACY: All processing happens locally.
@@ -13,94 +17,223 @@ PRIVACY: All processing happens locally.
 
 import logging
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
 
-import cv2
 import numpy as np
 
 logger = logging.getLogger(__name__)
 
-
-def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
-    """CV check: does this word_box image show thin vertical pipe dividers?
-
-    Uses morphological opening with a tall thin kernel to isolate vertical
-    structures, then filters for thin (≤4px), isolated contours that are
-    NOT at the word edges (those would be l, I, 1 etc.).
-    """
-    x = wb.get("left", 0)
-    y = wb.get("top", 0)
-    w = wb.get("width", 0)
-    h = wb.get("height", 0)
-    if w < 30 or h < 12:
-        return False
-    ih, iw = img_gray.shape[:2]
-    y1, y2 = max(0, y), min(ih, y + h)
-    x1, x2 = max(0, x), min(iw, x + w)
-    roi = img_gray[y1:y2, x1:x2]
-    if roi.size == 0:
-        return False
-    rh, rw = roi.shape
-    # Binarize (ink = white on black background)
-    _, binary = cv2.threshold(
-        roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
-    )
-    # Morphological opening: keep only tall vertical structures (≥55% height)
-    kern_h = max(int(rh * 0.55), 8)
-    kernel = np.ones((kern_h, 1), np.uint8)
-    vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
-    # Find surviving contours
-    contours, _ = cv2.findContours(
-        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
-    )
-    margin = max(int(rw * 0.08), 3)
-    for cnt in contours:
-        cx, cy, cw, ch = cv2.boundingRect(cnt)
-        if cw > 4:
-            continue  # too wide for a pipe
-        if cx < margin or cx + cw > rw - margin:
-            continue  # at word edge — likely l, I, 1
-        # Check isolation: adjacent columns should be mostly empty (ink-free)
-        left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
-        right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
-        left_ink = np.mean(left_zone) if left_zone.size else 255
-        right_ink = np.mean(right_zone) if right_zone.size else 255
-        if left_ink < 80 and right_ink < 80:
-            return True  # isolated thin vertical line = pipe divider
-    return False
-
-
-# IPA/phonetic bracket pattern — don't hyphenate transcriptions
+# IPA/phonetic characters — skip cells containing these
 _IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
+
+# Common German words that should NOT be merged with adjacent tokens.
+# These are function words that appear as standalone words between
+# headwords/definitions on dictionary pages.
+_STOP_WORDS = frozenset([
+    # Articles
+    'der', 'die', 'das', 'dem', 'den', 'des',
+    'ein', 'eine', 'einem', 'einen', 'einer',
+    # Pronouns
+    'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
+    # Prepositions
+    'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
+    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
+    'zwischen', 'ohne', 'gegen',
+    # Conjunctions
+    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
+    # Adverbs
+    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
+    # Verbs
+    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
+    'sein', 'haben',
+    # Other
+    'kein', 'keine', 'keinem', 'keinen', 'keiner',
+])
+
+# Cached hyphenators
+_hyph_de = None
+_hyph_en = None
+
+
+def _get_hyphenators():
+    """Lazy-load pyphen hyphenators (cached across calls)."""
+    global _hyph_de, _hyph_en
+    if _hyph_de is not None:
+        return _hyph_de, _hyph_en
+    try:
+        import pyphen
+    except ImportError:
+        return None, None
+    _hyph_de = pyphen.Pyphen(lang='de_DE')
+    _hyph_en = pyphen.Pyphen(lang='en_US')
+    return _hyph_de, _hyph_en
+
+
+def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
+    """Try to hyphenate a word using DE then EN dictionary.
+
+    Returns word with | separators, or None if not recognized.
+    """
+    hyph = hyph_de.inserted(word, hyphen='|')
+    if '|' in hyph:
+        return hyph
+    hyph = hyph_en.inserted(word, hyphen='|')
+    if '|' in hyph:
+        return hyph
+    return None
+
+
+def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
+    """Merge fragments separated by single spaces where OCR split at a pipe.
+
+    Example: "Kaf fee" -> "Kaffee" (pyphen recognizes the merged word).
+    Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".
+
+    Guards against false merges:
+    - The FIRST token must be pure alpha (word start — no attached punctuation)
+    - The second token may have trailing punctuation (comma, period) which
+      stays attached to the merged word: "Kä" + "fer," -> "Käfer,"
+    - Common German function words (der, die, das, ...) are never merged
+    - At least one fragment must be very short (<= 3 alpha chars)
+    """
+    parts = text.split(' ')
+    if len(parts) < 2:
+        return text
+    result = [parts[0]]
+    i = 1
+    while i < len(parts):
+        prev = result[-1]
+        curr = parts[i]
+        # Extract alpha-only core for lookup
+        prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
+        curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
+        # Guard 1: first token must be pure alpha (word-start fragment);
+        #          second token may have trailing punctuation
+        # Guard 2: neither alpha core can be a common German function word
+        # Guard 3: the shorter fragment must be <= 3 chars (pipe-gap signal)
+        # Guard 4: combined length must be >= 4
+        should_try = (
+            prev == prev_alpha  # first token: pure alpha (word start)
+            and prev_alpha and curr_alpha
+            and prev_alpha.lower() not in _STOP_WORDS
+            and curr_alpha.lower() not in _STOP_WORDS
+            and min(len(prev_alpha), len(curr_alpha)) <= 3
+            and len(prev_alpha) + len(curr_alpha) >= 4
+        )
+        if should_try:
+            merged_alpha = prev_alpha + curr_alpha
+            hyph = hyph_de.inserted(merged_alpha, hyphen='-')
+            if '-' in hyph:
+                # pyphen recognizes merged word — collapse the space
+                result[-1] = prev + curr
+                i += 1
+                continue
+        result.append(curr)
+        i += 1
+    return ' '.join(result)
+
+
+def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
+    """Syllabify all significant words in a text string.
+
+    1. Strip existing | dividers
+    2. Merge pipe-gap spaces where possible
+    3. Apply pyphen to each word >= 3 alphabetic chars
+    4. Words pyphen doesn't recognize stay as-is (no bad guesses)
+    """
+    if not text:
+        return text
+    # Skip cells that contain IPA transcription characters
+    if _IPA_RE.search(text):
+        return text
+    # Phase 1: strip existing pipe dividers for clean normalization
+    clean = text.replace('|', '')
+    # Phase 2: merge pipe-gap spaces (OCR fragments from pipe splitting)
+    clean = _try_merge_pipe_gaps(clean, hyph_de)
+    # Phase 3: tokenize and syllabify each word.
+    # Split on whitespace and comma/semicolon/colon runs, keeping separators
+    tokens = re.split(r'(\s+|[,;:]+\s*)', clean)
+    result = []
+    for tok in tokens:
+        if not tok or re.match(r'^[\s,;:]+$', tok):
+            result.append(tok)
+            continue
+        # Strip trailing/leading punctuation for pyphen lookup
+        m = re.match(r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', tok)
+        if not m:
+            result.append(tok)
+            continue
+        lead, word, trail = m.group(1), m.group(2), m.group(3)
+        if len(word) < 3 or not re.search(r'[a-zA-ZäöüÄÖÜß]', word):
+            result.append(tok)
+            continue
+        hyph = _hyphenate_word(word, hyph_de, hyph_en)
+        if hyph:
+            result.append(lead + hyph + trail)
+        else:
+            result.append(tok)
+    return ''.join(result)
 
 
 def insert_syllable_dividers(
     zones_data: List[Dict],
     img_bgr: np.ndarray,
     session_id: str,
 ) -> int:
-    """Insert pipe syllable dividers into dictionary cells where CV confirms them.
+    """Insert pipe syllable dividers into dictionary cells.
 
-    For each cell on a dictionary page:
-    1. Check if ANY word_box has CV-detected pipe lines
-    2. If yes, apply pyphen to EACH word (≥4 chars) in the cell
-    3. Try DE hyphenation first, then EN
+    For dictionary pages: process all content column cells, strip existing
+    pipes, merge pipe-gap spaces, and re-syllabify using pyphen.
+
+    Pre-check: at least 5% of content cells must already contain ``|`` from
+    OCR. This guards against false-positive dictionary detection on pages
+    like synonym dictionaries or alphabetical word lists that have no actual
+    syllable divider lines.
 
     Returns the number of cells modified.
     """
-    try:
-        import pyphen
-    except ImportError:
+    hyph_de, hyph_en = _get_hyphenators()
+    if hyph_de is None:
         logger.warning("pyphen not installed — skipping syllable insertion")
         return 0
 
-    _hyph_de = pyphen.Pyphen(lang='de_DE')
-    _hyph_en = pyphen.Pyphen(lang='en_US')
-    img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+    # Pre-check: count cells that already have | from OCR.
+    # Real dictionary pages with printed syllable dividers will have OCR-
+    # detected pipes in many cells. Pages without syllable dividers will
+    # have zero — skip those to avoid false syllabification.
+    total_col_cells = 0
+    cells_with_pipes = 0
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            if cell.get("col_type", "").startswith("column_"):
+                total_col_cells += 1
+                if "|" in cell.get("text", ""):
+                    cells_with_pipes += 1
+    if total_col_cells > 0:
+        pipe_ratio = cells_with_pipes / total_col_cells
+        if pipe_ratio < 0.05:
+            logger.info(
+                "build-grid session %s: skipping syllable insertion — "
+                "only %.1f%% of cells have existing pipes (need >=5%%)",
+                session_id, pipe_ratio * 100,
+            )
+            return 0
 
     insertions = 0
     for z in zones_data:
@@ -109,47 +242,18 @@ def insert_syllable_dividers(
             if not ct.startswith("column_"):
                 continue
             text = cell.get("text", "")
-            if not text or "|" in text:
+            if not text:
                 continue
-            if _IPA_RE.search(text):
-                continue
 
-            # CV gate: check if ANY word_box in this cell has pipe lines
-            wbs = cell.get("word_boxes") or []
-            if not any(_word_has_pipe_lines(img_gray, wb) for wb in wbs):
-                continue
-
-            # Apply pyphen to each significant word in the cell
-            tokens = re.split(r'(\s+|[,;]+\s*)', text)
-            new_tokens = []
-            changed = False
-            for tok in tokens:
-                # Skip whitespace/punctuation separators
-                if re.match(r'^[\s,;]+$', tok):
-                    new_tokens.append(tok)
-                    continue
-                # Only hyphenate words ≥ 4 alpha chars
-                clean = re.sub(r'[().\-]', '', tok)
-                if len(clean) < 4 or not re.search(r'[a-zA-ZäöüÄÖÜß]', clean):
-                    new_tokens.append(tok)
-                    continue
-                # Try DE first, then EN
-                hyph = _hyph_de.inserted(tok, hyphen='|')
-                if '|' not in hyph:
-                    hyph = _hyph_en.inserted(tok, hyphen='|')
-                if '|' in hyph and hyph != tok:
-                    new_tokens.append(hyph)
-                    changed = True
-                else:
-                    new_tokens.append(tok)
-
-            if changed:
-                cell["text"] = ''.join(new_tokens)
+            new_text = _syllabify_text(text, hyph_de, hyph_en)
+            if new_text != text:
+                cell["text"] = new_text
                 insertions += 1
 
     if insertions:
         logger.info(
-            "build-grid session %s: inserted syllable dividers in %d cells "
-            "(CV-validated)",
+            "build-grid session %s: syllable dividers inserted/normalized "
+            "in %d cells (pyphen)",
             session_id, insertions,
         )
     return insertions
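
For review, a sketch of the intended end-to-end behavior of _syllabify_text
(the expected outputs assume the de_DE patterns split these words at the
standard positions, e.g. Kaf-fee, Wa-gen, Ka-bel-jau; the helpers are
private, so calling them directly is for illustration only):

    from cv_syllable_detect import _get_hyphenators, _syllabify_text

    hyph_de, hyph_en = _get_hyphenators()

    # Existing dividers are stripped, then re-inserted at pyphen positions
    _syllabify_text("Kaf|fee", hyph_de, hyph_en)     # -> "Kaf|fee"

    # Pipe-gap space: fragments merge first, then get re-syllabified
    _syllabify_text("Kaf fee", hyph_de, hyph_en)     # -> "Kaf|fee"

    # Multi-step merge: "Ka bel jau" -> "Kabeljau" -> "Ka|bel|jau"
    _syllabify_text("Ka bel jau", hyph_de, hyph_en)  # -> "Ka|bel|jau"

    # Function words are blacklisted: "der" is never merged into a neighbor
    _syllabify_text("der Wagen", hyph_de, hyph_en)   # -> "der Wa|gen"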


@@ -1456,10 +1456,15 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
logger.warning("Dictionary detection failed: %s", e) logger.warning("Dictionary detection failed: %s", e)
# --- Syllable divider insertion for dictionary pages --- # --- Syllable divider insertion for dictionary pages ---
# CV-validated: only inserts "|" where image shows thin vertical lines. # Only on confirmed dictionary pages with article columns (der/die/das).
# See cv_syllable_detect.py for the detection + insertion logic. # The article_col_index check avoids false positives on synonym lists,
# word frequency tables, and other alphabetically sorted non-dictionary pages.
# Additionally, insert_syllable_dividers has its own pre-check for existing
# pipe characters in cells (OCR must have already found some).
syllable_insertions = 0 syllable_insertions = 0
if dict_detection.get("is_dictionary") and img_bgr is not None: if (dict_detection.get("is_dictionary")
and dict_detection.get("article_col_index") is not None
and img_bgr is not None):
try: try:
from cv_syllable_detect import insert_syllable_dividers from cv_syllable_detect import insert_syllable_dividers
syllable_insertions = insert_syllable_dividers( syllable_insertions = insert_syllable_dividers(