1. Extracted 1367 lines of helper functions from grid_editor_api.py (3051→1620 lines) into grid_editor_helpers.py (filters, detectors, zone grid building). 2. Created cv_syllable_detect.py with generic CV+pyphen logic: - Checks EVERY word_box for vertical pipe lines (not just first word) - No article-column dependency — works with any dictionary layout - CV morphological detection gates pyphen insertion 3. Grid editor scroll: calc(100vh-200px) for reliable scrolling. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
156 lines
5.3 KiB
Python
156 lines
5.3 KiB
Python
"""
|
|
CV-based syllable divider detection and insertion for dictionary pages.
|
|
|
|
Two-step approach:
|
|
1. CV: morphological vertical line detection checks if a word_box image
|
|
contains thin, isolated pipe-like vertical lines (syllable dividers).
|
|
2. pyphen: inserts syllable breaks at linguistically correct positions
|
|
for words where CV confirmed the presence of dividers.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List
|
|
|
|
import cv2
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
    """Report whether a word box crop contains a thin, isolated vertical stroke.

    The crop is binarized (Otsu, inverted so ink is white), then opened with
    a 1-pixel-wide kernel spanning at least 55% of the crop height so that
    only tall vertical structures survive.  A surviving contour counts as a
    syllable-divider pipe when it is at most 4 px wide, does not touch the
    left/right margin of the crop (strokes there are more likely l, I, 1),
    and the three columns on either side of it are mostly ink-free.

    Args:
        img_gray: Single-channel page image the box coordinates refer to.
        wb: Word box with ``left``, ``top``, ``width`` and ``height`` keys
            (pixels).  Missing or ``None`` values are treated as 0 and
            floats are truncated to int.

    Returns:
        True if at least one pipe-like stroke is found, otherwise False.
        Boxes whose visible part (after clipping to the image) is smaller
        than 30x12 px always yield False.
    """
    min_w, min_h = 30, 12

    # Coerce defensively: slicing needs ints, and a key that is present but
    # None would otherwise blow up in the size comparison below.
    x = int(wb.get("left") or 0)
    y = int(wb.get("top") or 0)
    w = int(wb.get("width") or 0)
    h = int(wb.get("height") or 0)
    if w < min_w or h < min_h:
        return False

    ih, iw = img_gray.shape[:2]
    x1, x2 = max(0, x), min(iw, x + w)
    y1, y2 = max(0, y), min(ih, y + h)
    # Gate on the *clipped* extent.  This also rejects boxes that lie fully
    # outside the image: there x2/y2 can go negative, and a negative slice
    # end would silently wrap around and select an unrelated image region.
    if x2 - x1 < min_w or y2 - y1 < min_h:
        return False

    roi = img_gray[y1:y2, x1:x2]
    rh, rw = roi.shape[:2]

    # Binarize (ink = white on black background)
    _, binary = cv2.threshold(
        roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Morphological opening: keep only tall vertical structures (>=55% height)
    kern_h = max(int(rh * 0.55), 8)
    kernel = np.ones((kern_h, 1), np.uint8)
    vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    contours, _ = cv2.findContours(
        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    margin = max(int(rw * 0.08), 3)
    for cnt in contours:
        cx, cy, cw, ch = cv2.boundingRect(cnt)
        if cw > 4:
            continue  # too wide for a pipe
        if cx < margin or cx + cw > rw - margin:
            continue  # at word edge - likely l, I, 1

        # Isolation check: the 3 columns on each side must be mostly ink-free
        # (mean below 80 of 255).  An empty zone counts as fully inked.
        left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
        right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
        left_ink = np.mean(left_zone) if left_zone.size else 255
        right_ink = np.mean(right_zone) if right_zone.size else 255
        if left_ink < 80 and right_ink < 80:
            return True  # isolated thin vertical line = pipe divider

    return False
|
|
|
|
|
|
# Matches any square bracket, IPA stress/length mark or IPA letter.  Text that
# contains one of these is treated as a phonetic transcription and must not
# receive syllable dividers.
_IPA_RE = re.compile(
    r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]'
)
|
|
|
|
|
|
# Tokenising patterns, compiled once instead of on every token.
_TOKEN_SPLIT_RE = re.compile(r'(\s+|[,;]+\s*)')  # capture group keeps separators
_SEPARATOR_RE = re.compile(r'^[\s,;]+$')
_WORD_NOISE_RE = re.compile(r'[().\-]')
_LETTER_RE = re.compile(r'[a-zA-ZäöüÄÖÜß]')


def _is_syllable_candidate(cell: Dict) -> bool:
    """Text-only pre-filter: may this cell receive syllable dividers at all?"""
    # `or ""` rather than a .get() default: the key may exist with value None.
    col_type = cell.get("col_type") or ""
    if not col_type.startswith("column_"):
        return False
    text = cell.get("text") or ""
    if not text or "|" in text:
        return False  # nothing to do, or dividers are already present
    return _IPA_RE.search(text) is None  # never hyphenate transcriptions


def _hyphenate_text(text: str, hyph_de: Any, hyph_en: Any) -> str:
    """Return *text* with '|' inserted at the syllable breaks of each word.

    Separators (whitespace, commas, semicolons) and tokens with fewer than
    four characters once ``( ) . -`` are removed, or without any letter,
    are copied through unchanged.  German hyphenation is tried first and
    English is used only when German yields no break.
    """
    pieces = []
    for tok in _TOKEN_SPLIT_RE.split(text):
        if _SEPARATOR_RE.match(tok):
            pieces.append(tok)
            continue
        core = _WORD_NOISE_RE.sub('', tok)
        if len(core) < 4 or not _LETTER_RE.search(core):
            pieces.append(tok)
            continue
        hyph = hyph_de.inserted(tok, hyphen='|')
        if '|' not in hyph:
            hyph = hyph_en.inserted(tok, hyphen='|')
        # Keep the original token verbatim when no break was found.
        pieces.append(hyph if '|' in hyph else tok)
    return ''.join(pieces)


def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
) -> int:
    """Insert pipe syllable dividers into cells where CV confirms them.

    A cell is processed when its ``col_type`` starts with ``"column_"``, its
    text is non-empty, contains no ``|`` yet and no IPA/bracket characters,
    and at least one of its ``word_boxes`` shows a pipe-like vertical line
    in the page image.  Every word of such a cell is then hyphenated with
    pyphen (German first, English as fallback) using ``|`` as the break
    marker, and ``cell["text"]`` is updated in place.

    Args:
        zones_data: Zone dicts, each optionally holding a ``cells`` list.
            Mutated in place.
        img_bgr: Page image the word boxes refer to.  A 2-D (already
            grayscale) array is used as-is; otherwise it is converted with
            ``cv2.COLOR_BGR2GRAY``.
        session_id: Only used in log messages.

    Returns:
        The number of cells whose text was modified.  0 is returned without
        touching anything when no cell qualifies, when the image is missing,
        or when pyphen is not installed.
    """
    # Cheap text filters first, so the hyphenation dictionaries and the
    # grayscale conversion are only paid for when something can change.
    candidates = [
        cell
        for zone in (zones_data or [])
        for cell in (zone.get("cells") or [])
        if _is_syllable_candidate(cell)
    ]
    if not candidates:
        return 0

    if img_bgr is None:
        logger.warning(
            "build-grid session %s: no image — skipping syllable insertion",
            session_id,
        )
        return 0

    try:
        import pyphen
    except ImportError:
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

    hyph_de = pyphen.Pyphen(lang='de_DE')
    hyph_en = pyphen.Pyphen(lang='en_US')

    # BGR2GRAY rejects single-channel input, so pass grayscale through.
    if img_bgr.ndim == 2:
        img_gray = img_bgr
    elif img_bgr.ndim == 3 and img_bgr.shape[2] == 1:
        img_gray = img_bgr[:, :, 0]
    else:
        img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    insertions = 0
    for cell in candidates:
        # CV gate: at least one word box must actually show a pipe line.
        word_boxes = cell.get("word_boxes") or []
        if not any(_word_has_pipe_lines(img_gray, wb) for wb in word_boxes):
            continue

        original = cell["text"]
        updated = _hyphenate_text(original, hyph_de, hyph_en)
        if updated != original:
            cell["text"] = updated
            insertions += 1

    if insertions:
        logger.info(
            "build-grid session %s: inserted syllable dividers in %d cells "
            "(CV-validated)",
            session_id, insertions,
        )
    return insertions
|