Improve syllable divider insertion for dictionary pages
Rewrite cv_syllable_detect.py with a pyphen-first approach:
- Remove the unreliable CV gate (morphological pipe detection)
- Strip existing pipes and re-syllabify via pyphen (DE, then EN)
- Merge pipe-gap spaces where OCR split words at divider positions
- Guard merges with a function-word blacklist and punctuation checks

Add false-positive prevention:
- Pre-check: skip if <5% of cells have an existing | from OCR
- Call-site check: require article_col_index (der/die/das column)
- Prevents syllabification of synonym dictionaries and word lists

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,15 @@
|
|||||||
"""
|
"""
|
||||||
CV-based syllable divider detection and insertion for dictionary pages.
|
Syllable divider insertion for dictionary pages.
|
||||||
|
|
||||||
Two-step approach:
|
For confirmed dictionary pages (is_dictionary=True), processes all content
|
||||||
1. CV: morphological vertical line detection checks if a word_box image
|
column cells:
|
||||||
contains thin, isolated pipe-like vertical lines (syllable dividers).
|
1. Strips existing | dividers for clean normalization
|
||||||
2. pyphen: inserts syllable breaks at linguistically correct positions
|
2. Merges pipe-gap spaces (where OCR split a word at a divider position)
|
||||||
for words where CV confirmed the presence of dividers.
|
3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN)
|
||||||
|
4. Only modifies words that pyphen recognizes — garbled OCR stays as-is
|
||||||
|
|
||||||
|
No CV gate needed — the dictionary detection confidence is sufficient.
|
||||||
|
pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable.
|
||||||
|
|
||||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
@@ -13,94 +17,223 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import Any, Dict, List
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import cv2
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# IPA/phonetic characters — skip cells containing these
|
||||||
def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
    """Report whether a word box contains a thin, free-standing vertical stroke.

    The box is cropped from ``img_gray``, binarized, and opened with a
    one-pixel-wide tall kernel so that only tall vertical structures remain.
    A surviving contour counts as a pipe divider when it is at most 4 px
    wide, sits away from the left/right edges of the crop, and has mostly
    ink-free neighbourhoods on both sides.

    Args:
        img_gray: Grayscale page image the word box refers to.
        wb: Word box dict; reads ``left``, ``top``, ``width`` and ``height``
            (each defaulting to 0).

    Returns:
        True as soon as one isolated thin vertical stroke is found,
        otherwise False (also for boxes smaller than 30x12 or empty crops).
    """
    box_w, box_h = wb.get("width", 0), wb.get("height", 0)
    # Tiny boxes cannot hold a divider between two syllables.
    if box_w < 30 or box_h < 12:
        return False

    box_x, box_y = wb.get("left", 0), wb.get("top", 0)
    img_h, img_w = img_gray.shape[:2]
    # Clamp the box to the image so the crop never indexes out of bounds.
    rows = slice(max(0, box_y), min(img_h, box_y + box_h))
    cols = slice(max(0, box_x), min(img_w, box_x + box_w))
    roi = img_gray[rows, cols]
    if roi.size == 0:
        return False
    roi_h, roi_w = roi.shape

    # Inverse Otsu threshold: ink becomes white (255) on a black background.
    _, ink = cv2.threshold(
        roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Opening with a 1-px-wide kernel spanning >= 55% of the crop height
    # (but at least 8 px) erases everything except tall vertical strokes.
    tall_kernel = np.ones((max(int(roi_h * 0.55), 8), 1), np.uint8)
    strokes = cv2.morphologyEx(ink, cv2.MORPH_OPEN, tall_kernel)

    contours, _ = cv2.findContours(
        strokes, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    edge_margin = max(int(roi_w * 0.08), 3)

    def _mean_ink(zone):
        # An empty neighbourhood is treated as fully inked (not isolated).
        return np.mean(zone) if zone.size else 255

    for contour in contours:
        bx, by, bw, bh = cv2.boundingRect(contour)
        # Strokes touching the crop edge are likely letters such as l, I, 1.
        inside_word = bx >= edge_margin and bx + bw <= roi_w - edge_margin
        if bw > 4 or not inside_word:
            continue

        # Isolation test: up to 3 columns on either side must be mostly blank.
        band = ink[by:by + bh]
        left_ink = _mean_ink(band[:, max(0, bx - 3):bx])
        right_ink = _mean_ink(band[:, bx + bw:min(roi_w, bx + bw + 3)])
        if left_ink < 80 and right_ink < 80:
            return True  # isolated thin vertical line = pipe divider

    return False
|
|
||||||
|
|
||||||
|
|
||||||
# IPA/phonetic bracket pattern — don't hyphenate transcriptions
|
|
||||||
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
|
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
|
||||||
|
|
||||||
|
# German function words that must never be glued onto a neighbouring token.
# On dictionary pages they show up as free-standing words between headwords
# and definitions, so a space next to one of them is a real word boundary.
_STOP_WORDS = frozenset({
    # Articles
    'der', 'die', 'das', 'dem', 'den', 'des',
    'ein', 'eine', 'einem', 'einen', 'einer',
    # Pronouns
    'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
    # Prepositions
    'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
    'zwischen', 'ohne', 'gegen',
    # Conjunctions
    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
    # Adverbs
    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
    # Verbs
    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
    'sein', 'haben',
    # Negative articles
    'kein', 'keine', 'keinem', 'keinen', 'keiner',
})

# Module-level hyphenator cache; stays None until the hyphenators are built.
_hyph_de = None
_hyph_en = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_hyphenators():
    """Return the cached pyphen hyphenators, building them on first use.

    Returns:
        A ``(hyph_de, hyph_en)`` tuple of pyphen hyphenators for ``de_DE``
        and ``en_US``, or ``(None, None)`` when pyphen is not installed.

    Any exception raised while constructing a hyphenator propagates to the
    caller; in that case the cache is left untouched.
    """
    global _hyph_de, _hyph_en
    # Only trust the cache when BOTH entries are present.
    if _hyph_de is not None and _hyph_en is not None:
        return _hyph_de, _hyph_en
    try:
        import pyphen
    except ImportError:
        return None, None
    # Build both hyphenators before publishing either one.  Assigning the
    # German one first would leave a half-filled cache if the English one
    # failed, and later calls would then hand out (hyph_de, None).
    hyph_de = pyphen.Pyphen(lang='de_DE')
    hyph_en = pyphen.Pyphen(lang='en_US')
    _hyph_de, _hyph_en = hyph_de, hyph_en
    return _hyph_de, _hyph_en
|
||||||
|
|
||||||
|
|
||||||
|
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    """Insert ``|`` syllable breaks into *word*, preferring German over English.

    Args:
        word: The bare word to hyphenate.
        hyph_de: Hyphenator tried first; must offer ``inserted(word, hyphen=...)``.
        hyph_en: Hyphenator tried only when the first one adds no break.

    Returns:
        The word with ``|`` separators from the first hyphenator that
        produced at least one, or None when neither did.
    """
    for hyphenator in (hyph_de, hyph_en):
        candidate = hyphenator.inserted(word, hyphen='|')
        if '|' in candidate:
            return candidate
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
    """Merge fragments separated by single spaces where OCR split at a pipe.

    Example: "Kaf fee" -> "Kaffee" (the hyphenator finds a break in the
    merged word).  Merging is cumulative, so "Ka bel jau" -> "Kabel jau"
    -> "Kabeljau".

    A pair of neighbouring tokens is only merged when all guards hold:
      - the first token is pure alpha (a word-start fragment without any
        attached punctuation),
      - the second token starts with its letters and carries at most
        trailing punctuation, which stays attached to the merged word:
        "Kä" + "fer," -> "Käfer,"; tokens such as "(bel)" or "2bel" are
        never merged,
      - neither fragment is a common German function word (_STOP_WORDS),
      - the shorter fragment has at most 3 letters,
      - together the fragments have at least 4 letters,
      - ``hyph_de`` places at least one break in the merged letters.

    Args:
        text: Cell text, already stripped of ``|`` dividers by the caller.
        hyph_de: German hyphenator offering ``inserted(word, hyphen=...)``.

    Returns:
        The text with qualifying single-space gaps removed; all other
        spacing is preserved.
    """
    parts = text.split(' ')
    if len(parts) < 2:
        return text

    non_alpha = re.compile(r'[^a-zA-ZäöüÄÖÜßẞ]')

    result = [parts[0]]
    for curr in parts[1:]:
        prev = result[-1]

        # Letters-only cores used for the length, stop-word and pyphen checks.
        prev_alpha = non_alpha.sub('', prev)
        curr_alpha = non_alpha.sub('', curr)

        should_try = (
            bool(prev_alpha) and bool(curr_alpha)
            # First token: pure alpha (word start).
            and prev == prev_alpha
            # Second token: letters first, then only trailing punctuation.
            # Without this, "Ka (bel)" would collapse into "Ka(bel)".
            and curr.startswith(curr_alpha)
            and prev_alpha.lower() not in _STOP_WORDS
            and curr_alpha.lower() not in _STOP_WORDS
            # A very short fragment is the signal for a pipe gap.
            and min(len(prev_alpha), len(curr_alpha)) <= 3
            and len(prev_alpha) + len(curr_alpha) >= 4
        )

        if should_try and '-' in hyph_de.inserted(
            prev_alpha + curr_alpha, hyphen='-'
        ):
            # Hyphenator accepts the merged word: collapse the space.
            result[-1] = prev + curr
        else:
            result.append(curr)

    return ' '.join(result)
|
||||||
|
|
||||||
|
|
||||||
|
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
    """Return *text* with ``|`` syllable dividers re-inserted word by word.

    Processing order:
      1. Empty text and text containing IPA characters is returned unchanged.
      2. Existing ``|`` dividers are removed.
      3. Pipe-gap spaces are merged via ``_try_merge_pipe_gaps``.
      4. The text is split on whitespace and ``,;:`` runs; every token whose
         core (leading/trailing non-letters stripped) is at least 3 chars
         long is passed to ``_hyphenate_word``.
      5. Tokens the hyphenators add no break to are emitted unchanged.

    Args:
        text: Cell text to normalize.
        hyph_de: German hyphenator (tried first).
        hyph_en: English hyphenator (fallback).

    Returns:
        The re-syllabified text.
    """
    # Nothing to do for empty cells; phonetic transcriptions are left alone.
    if not text or _IPA_RE.search(text):
        return text

    # Drop old dividers, then close the gaps OCR left at divider positions.
    normalized = _try_merge_pipe_gaps(text.replace('|', ''), hyph_de)

    pieces = []
    # The capturing group keeps the separators so the text can be rebuilt.
    for piece in re.split(r'(\s+|[,;:]+\s*)', normalized):
        if not piece or re.match(r'^[\s,;:]+$', piece):
            pieces.append(piece)
            continue

        # Peel non-letter characters off both ends before the lookup.
        parsed = re.match(
            r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', piece
        )
        if not parsed:
            pieces.append(piece)
            continue
        prefix, core, suffix = parsed.groups()

        eligible = len(core) >= 3 and re.search(r'[a-zA-ZäöüÄÖÜß]', core)
        divided = _hyphenate_word(core, hyph_de, hyph_en) if eligible else None
        pieces.append((prefix + divided + suffix) if divided else piece)

    return ''.join(pieces)
|
||||||
|
|
||||||
|
|
||||||
def insert_syllable_dividers(
|
def insert_syllable_dividers(
|
||||||
zones_data: List[Dict],
|
zones_data: List[Dict],
|
||||||
img_bgr: np.ndarray,
|
img_bgr: np.ndarray,
|
||||||
session_id: str,
|
session_id: str,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Insert pipe syllable dividers into dictionary cells where CV confirms them.
|
"""Insert pipe syllable dividers into dictionary cells.
|
||||||
|
|
||||||
For each cell on a dictionary page:
|
For dictionary pages: process all content column cells, strip existing
|
||||||
1. Check if ANY word_box has CV-detected pipe lines
|
pipes, merge pipe-gap spaces, and re-syllabify using pyphen.
|
||||||
2. If yes, apply pyphen to EACH word (≥4 chars) in the cell
|
|
||||||
3. Try DE hyphenation first, then EN
|
Pre-check: at least 5% of content cells must already contain ``|`` from
|
||||||
|
OCR. This guards against false-positive dictionary detection on pages
|
||||||
|
like synonym dictionaries or alphabetical word lists that have no actual
|
||||||
|
syllable divider lines.
|
||||||
|
|
||||||
Returns the number of cells modified.
|
Returns the number of cells modified.
|
||||||
"""
|
"""
|
||||||
try:
|
hyph_de, hyph_en = _get_hyphenators()
|
||||||
import pyphen
|
if hyph_de is None:
|
||||||
except ImportError:
|
|
||||||
logger.warning("pyphen not installed — skipping syllable insertion")
|
logger.warning("pyphen not installed — skipping syllable insertion")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
_hyph_de = pyphen.Pyphen(lang='de_DE')
|
# Pre-check: count cells that already have | from OCR.
|
||||||
_hyph_en = pyphen.Pyphen(lang='en_US')
|
# Real dictionary pages with printed syllable dividers will have OCR-
|
||||||
img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
# detected pipes in many cells. Pages without syllable dividers will
|
||||||
|
# have zero — skip those to avoid false syllabification.
|
||||||
|
total_col_cells = 0
|
||||||
|
cells_with_pipes = 0
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
if cell.get("col_type", "").startswith("column_"):
|
||||||
|
total_col_cells += 1
|
||||||
|
if "|" in cell.get("text", ""):
|
||||||
|
cells_with_pipes += 1
|
||||||
|
|
||||||
|
if total_col_cells > 0:
|
||||||
|
pipe_ratio = cells_with_pipes / total_col_cells
|
||||||
|
if pipe_ratio < 0.05:
|
||||||
|
logger.info(
|
||||||
|
"build-grid session %s: skipping syllable insertion — "
|
||||||
|
"only %.1f%% of cells have existing pipes (need >=5%%)",
|
||||||
|
session_id, pipe_ratio * 100,
|
||||||
|
)
|
||||||
|
return 0
|
||||||
|
|
||||||
insertions = 0
|
insertions = 0
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
@@ -109,47 +242,18 @@ def insert_syllable_dividers(
|
|||||||
if not ct.startswith("column_"):
|
if not ct.startswith("column_"):
|
||||||
continue
|
continue
|
||||||
text = cell.get("text", "")
|
text = cell.get("text", "")
|
||||||
if not text or "|" in text:
|
if not text:
|
||||||
continue
|
|
||||||
if _IPA_RE.search(text):
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# CV gate: check if ANY word_box in this cell has pipe lines
|
new_text = _syllabify_text(text, hyph_de, hyph_en)
|
||||||
wbs = cell.get("word_boxes") or []
|
if new_text != text:
|
||||||
if not any(_word_has_pipe_lines(img_gray, wb) for wb in wbs):
|
cell["text"] = new_text
|
||||||
continue
|
|
||||||
|
|
||||||
# Apply pyphen to each significant word in the cell
|
|
||||||
tokens = re.split(r'(\s+|[,;]+\s*)', text)
|
|
||||||
new_tokens = []
|
|
||||||
changed = False
|
|
||||||
for tok in tokens:
|
|
||||||
# Skip whitespace/punctuation separators
|
|
||||||
if re.match(r'^[\s,;]+$', tok):
|
|
||||||
new_tokens.append(tok)
|
|
||||||
continue
|
|
||||||
# Only hyphenate words ≥ 4 alpha chars
|
|
||||||
clean = re.sub(r'[().\-]', '', tok)
|
|
||||||
if len(clean) < 4 or not re.search(r'[a-zA-ZäöüÄÖÜß]', clean):
|
|
||||||
new_tokens.append(tok)
|
|
||||||
continue
|
|
||||||
# Try DE first, then EN
|
|
||||||
hyph = _hyph_de.inserted(tok, hyphen='|')
|
|
||||||
if '|' not in hyph:
|
|
||||||
hyph = _hyph_en.inserted(tok, hyphen='|')
|
|
||||||
if '|' in hyph and hyph != tok:
|
|
||||||
new_tokens.append(hyph)
|
|
||||||
changed = True
|
|
||||||
else:
|
|
||||||
new_tokens.append(tok)
|
|
||||||
if changed:
|
|
||||||
cell["text"] = ''.join(new_tokens)
|
|
||||||
insertions += 1
|
insertions += 1
|
||||||
|
|
||||||
if insertions:
|
if insertions:
|
||||||
logger.info(
|
logger.info(
|
||||||
"build-grid session %s: inserted syllable dividers in %d cells "
|
"build-grid session %s: syllable dividers inserted/normalized "
|
||||||
"(CV-validated)",
|
"in %d cells (pyphen)",
|
||||||
session_id, insertions,
|
session_id, insertions,
|
||||||
)
|
)
|
||||||
return insertions
|
return insertions
|
||||||
|
|||||||
@@ -1456,10 +1456,15 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
logger.warning("Dictionary detection failed: %s", e)
|
logger.warning("Dictionary detection failed: %s", e)
|
||||||
|
|
||||||
# --- Syllable divider insertion for dictionary pages ---
|
# --- Syllable divider insertion for dictionary pages ---
|
||||||
# CV-validated: only inserts "|" where image shows thin vertical lines.
|
# Only on confirmed dictionary pages with article columns (der/die/das).
|
||||||
# See cv_syllable_detect.py for the detection + insertion logic.
|
# The article_col_index check avoids false positives on synonym lists,
|
||||||
|
# word frequency tables, and other alphabetically sorted non-dictionary pages.
|
||||||
|
# Additionally, insert_syllable_dividers has its own pre-check for existing
|
||||||
|
# pipe characters in cells (OCR must have already found some).
|
||||||
syllable_insertions = 0
|
syllable_insertions = 0
|
||||||
if dict_detection.get("is_dictionary") and img_bgr is not None:
|
if (dict_detection.get("is_dictionary")
|
||||||
|
and dict_detection.get("article_col_index") is not None
|
||||||
|
and img_bgr is not None):
|
||||||
try:
|
try:
|
||||||
from cv_syllable_detect import insert_syllable_dividers
|
from cv_syllable_detect import insert_syllable_dividers
|
||||||
syllable_insertions = insert_syllable_dividers(
|
syllable_insertions = insert_syllable_dividers(
|
||||||
|
|||||||
Reference in New Issue
Block a user