refactor: extract grid helpers + generic CV-gated syllable insertion

1. Extracted 1367 lines of helper functions from grid_editor_api.py
   (3051→1620 lines) into grid_editor_helpers.py (filters, detectors,
   zone grid building).

2. Created cv_syllable_detect.py with generic CV+pyphen logic:
   - Checks EVERY word_box for vertical pipe lines (not just first word)
   - No article-column dependency — works with any dictionary layout
   - CV morphological detection gates pyphen insertion

3. Grid editor scroll: calc(100vh-200px) for reliable scrolling.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-24 14:39:33 +01:00
parent d9b2aa82e9
commit 12b4c61bac
3 changed files with 1572 additions and 1459 deletions

View File

@@ -0,0 +1,155 @@
"""
CV-based syllable divider detection and insertion for dictionary pages.
Two-step approach:
1. CV: morphological vertical line detection checks if a word_box image
contains thin, isolated pipe-like vertical lines (syllable dividers).
2. pyphen: inserts syllable breaks at linguistically correct positions
for words where CV confirmed the presence of dividers.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing is performed locally.
"""
import logging
import re
from typing import Any, Dict, List
import cv2
import numpy as np
logger = logging.getLogger(__name__)
def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
    """Report whether a word box crop contains a thin, isolated vertical stroke.

    The crop is Otsu-binarized (ink becomes 255), opened with a one-pixel-wide
    column kernel so that only tall vertical strokes survive, and every
    surviving contour is then tested: it must be at most 4 px wide, lie away
    from the left/right edges of the crop, and have almost no ink in the
    3 px strips on either side.

    Args:
        img_gray: Single-channel page image the box coordinates refer to.
        wb: Word box dict; the keys "left", "top", "width" and "height" are
            read, each defaulting to 0 when missing.

    Returns:
        True as soon as one contour passes all checks, otherwise False.
        Boxes narrower than 30 px or shorter than 12 px, and boxes whose
        clipped crop is empty, always yield False.
    """
    left, top, width, height = (
        wb.get(key, 0) for key in ("left", "top", "width", "height")
    )
    if width < 30 or height < 12:
        return False

    # Clip the box to the image bounds before cropping.
    img_h, img_w = img_gray.shape[:2]
    patch = img_gray[
        max(0, top):min(img_h, top + height),
        max(0, left):min(img_w, left + width),
    ]
    if patch.size == 0:
        return False
    patch_h, patch_w = patch.shape

    # Inverted Otsu threshold: ink pixels become 255, background 0.
    _, ink = cv2.threshold(
        patch, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Opening with a 1-px-wide column (55% of crop height, at least 8 px)
    # removes everything that is not a tall vertical run of ink.
    column_kernel = np.ones((max(int(patch_h * 0.55), 8), 1), np.uint8)
    strokes = cv2.morphologyEx(ink, cv2.MORPH_OPEN, column_kernel)

    outlines, _ = cv2.findContours(
        strokes, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    edge_margin = max(int(patch_w * 0.08), 3)
    for outline in outlines:
        bx, by, bw, bh = cv2.boundingRect(outline)

        is_thin = bw <= 4
        # Strokes touching the crop edges are treated as letters (l, I, 1).
        is_interior = bx >= edge_margin and bx + bw <= patch_w - edge_margin
        if not (is_thin and is_interior):
            continue

        # Isolation test: mean ink level in the 3-px strips on both sides.
        rows = slice(by, by + bh)
        before = ink[rows, max(0, bx - 3):bx]
        after = ink[rows, bx + bw:min(patch_w, bx + bw + 3)]
        before_level = np.mean(before) if before.size else 255
        after_level = np.mean(after) if after.size else 255
        if before_level < 80 and after_level < 80:
            return True

    return False
# IPA/phonetic bracket pattern — don't hyphenate transcriptions.
# Matches a single square bracket or one of the listed IPA symbols (stress
# marks, length mark, phonetic letters); used with .search(), so one hit
# anywhere in a text is enough to classify it as a transcription.
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
) -> int:
    """Add '|' syllable dividers to cell texts whose image shows pipe lines.

    A cell is processed only when all of the following hold:
      * its "col_type" starts with "column_",
      * its "text" is non-empty, contains no '|' yet and no IPA/bracket
        character,
      * at least one of its "word_boxes" passes the CV pipe-line check.

    The text of such a cell is split on whitespace and comma/semicolon
    separators; every remaining token with at least 4 characters (after
    dropping parentheses, dots and hyphens) and at least one letter is
    hyphenated with pyphen, German first and English as fallback. Cells
    are updated in place via cell["text"].

    Args:
        zones_data: Zone dicts, each optionally holding a "cells" list.
        img_bgr: BGR page image; converted to grayscale once for the CV check.
        session_id: Only used in the summary log message.

    Returns:
        Number of cells whose text was changed; 0 when pyphen is missing.
    """
    try:
        import pyphen
    except ImportError:
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

    german = pyphen.Pyphen(lang='de_DE')
    english = pyphen.Pyphen(lang='en_US')
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    def _divide(token):
        """Return the token with '|' dividers, or unchanged if not applicable."""
        # Separators (whitespace, commas, semicolons) pass through untouched.
        if re.match(r'^[\s,;]+$', token):
            return token
        # Length/letter test ignores parentheses, dots and hyphens.
        stripped = re.sub(r'[().\-]', '', token)
        if len(stripped) < 4 or not re.search(r'[a-zA-ZäöüÄÖÜß]', stripped):
            return token
        # German dictionary first; English only if German found no break.
        candidate = german.inserted(token, hyphen='|')
        if '|' not in candidate:
            candidate = english.inserted(token, hyphen='|')
        if '|' in candidate and candidate != token:
            return candidate
        return token

    modified_cells = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            if not cell.get("col_type", "").startswith("column_"):
                continue

            original = cell.get("text", "")
            if not original or "|" in original or _IPA_RE.search(original):
                continue

            # CV gate: at least one word box must show pipe lines.
            boxes = cell.get("word_boxes") or []
            if not any(_word_has_pipe_lines(gray, box) for box in boxes):
                continue

            # The capturing group keeps the separators so the text can be
            # reassembled exactly around the hyphenated words.
            parts = re.split(r'(\s+|[,;]+\s*)', original)
            divided = [_divide(part) for part in parts]
            if divided != parts:
                cell["text"] = ''.join(divided)
                modified_cells += 1

    if modified_cells:
        logger.info(
            "build-grid session %s: inserted syllable dividers in %d cells "
            "(CV-validated)",
            session_id, modified_cells,
        )
    return modified_cells

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff