refactor: extract grid helpers + generic CV-gated syllable insertion
1. Extracted 1367 lines of helper functions from grid_editor_api.py (3051→1620 lines) into grid_editor_helpers.py (filters, detectors, zone grid building). 2. Created cv_syllable_detect.py with generic CV+pyphen logic: - Checks EVERY word_box for vertical pipe lines (not just first word) - No article-column dependency — works with any dictionary layout - CV morphological detection gates pyphen insertion 3. Grid editor scroll: calc(100vh-200px) for reliable scrolling. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
155
klausur-service/backend/cv_syllable_detect.py
Normal file
155
klausur-service/backend/cv_syllable_detect.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
CV-based syllable divider detection and insertion for dictionary pages.
|
||||
|
||||
Two-step approach:
|
||||
1. CV: morphological vertical line detection checks if a word_box image
|
||||
contains thin, isolated pipe-like vertical lines (syllable dividers).
|
||||
2. pyphen: inserts syllable breaks at linguistically correct positions
|
||||
for words where CV confirmed the presence of dividers.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
    """Report whether a word box shows a thin, isolated vertical pipe line.

    The word box region is cut out of ``img_gray``, binarized with an
    inverted Otsu threshold (ink becomes 255) and opened with a
    one-pixel-wide vertical kernel so that only tall vertical strokes
    survive. A surviving stroke counts as a syllable divider when it is

    * at most 4 px wide,
    * not within the left/right margin of the word (strokes at the word
      edges are more likely letters such as l, I or 1), and
    * flanked on both sides by mostly ink-free columns.

    Args:
        img_gray: Two-dimensional (single-channel) grayscale page image.
        wb: Word box mapping with ``left``, ``top``, ``width`` and
            ``height`` entries, used as pixel coordinates into
            ``img_gray``. Missing or ``None`` entries count as 0 and
            fractional values are truncated to integers.

    Returns:
        ``True`` if at least one isolated thin vertical line is found,
        ``False`` otherwise — including boxes that are too small or that
        lie (almost) entirely outside the image.
    """
    # Coerce defensively: None / float fields would otherwise raise in the
    # comparisons or in the array slicing below.
    x, y, w, h = (
        int(wb.get(key) or 0) for key in ("left", "top", "width", "height")
    )
    if w < 30 or h < 12:
        return False

    # Clip the box to the image bounds before slicing.
    ih, iw = img_gray.shape[:2]
    y1, y2 = max(0, y), min(ih, y + h)
    x1, x2 = max(0, x), min(iw, x + w)
    roi = img_gray[y1:y2, x1:x2]
    if roi.size == 0:
        return False
    rh, rw = roi.shape

    # Re-apply the minimum size to the *clipped* region. A box hanging over
    # the image border can leave a ROI shorter than the opening kernel; with
    # OpenCV's default morphology border such a ROI lets any short ink
    # column survive the opening and be mistaken for a pipe.
    if rw < 30 or rh < 12:
        return False

    # Binarize (ink = white on black background)
    _, binary = cv2.threshold(
        roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Morphological opening: keep only tall vertical structures
    # (at least 55% of the ROI height, and never less than 8 px).
    kern_h = max(int(rh * 0.55), 8)
    kernel = np.ones((kern_h, 1), np.uint8)
    vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Find surviving contours
    contours, _ = cv2.findContours(
        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Horizontal margin (8% of the width, at least 3 px) treated as
    # "word edge".
    margin = max(int(rw * 0.08), 3)
    for cnt in contours:
        cx, cy, cw, ch = cv2.boundingRect(cnt)
        if cw > 4:
            continue  # too wide for a pipe
        if cx < margin or cx + cw > rw - margin:
            continue  # at word edge — likely l, I, 1

        # Isolation check: the up-to-3 columns on either side of the stroke
        # (over the stroke's own height) should be mostly ink-free. An empty
        # zone is treated as fully inked so it can never pass the check.
        left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
        right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
        left_ink = np.mean(left_zone) if left_zone.size else 255
        right_ink = np.mean(right_zone) if right_zone.size else 255
        if left_ink < 80 and right_ink < 80:
            return True  # isolated thin vertical line = pipe divider
    return False
|
||||
|
||||
|
||||
# Matches a square bracket or any one of a fixed set of IPA/phonetic symbols.
# Text containing a match is treated as a phonetic transcription and must not
# be hyphenated.
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
|
||||
|
||||
|
||||
def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
) -> int:
    """Insert ``|`` syllable dividers into cell texts where CV confirms them.

    A cell is processed only if all of the following hold:

    * its ``col_type`` starts with ``"column_"``,
    * its ``text`` is non-empty, contains no ``|`` yet and does not match
      the IPA/phonetic pattern ``_IPA_RE``,
    * at least one of its ``word_boxes`` passes ``_word_has_pipe_lines``.

    The text of such a cell is split on whitespace and ``,``/``;``
    separators. Every token that, after removing parentheses, dots and
    hyphens, is at least 4 characters long and contains at least one
    letter (a-z, A-Z, German umlauts, ß) is hyphenated with pyphen using
    ``|`` as the divider: German (``de_DE``) first, English (``en_US``) as
    fallback when German yields no break. Separators and all other tokens
    are kept unchanged.

    Args:
        zones_data: Zone dicts; each may carry a ``cells`` list of cell
            dicts. Matching cells have their ``text`` replaced in place.
        img_bgr: Page image. A three-channel image is converted from BGR
            to grayscale; a two-dimensional image is used as-is.
        session_id: Identifier used only in the summary log message.

    Returns:
        The number of cells whose text was modified. Returns 0 (after
        logging a warning) when pyphen is not installed.
    """
    try:
        import pyphen
    except ImportError:
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

    hyph_de = pyphen.Pyphen(lang='de_DE')
    hyph_en = pyphen.Pyphen(lang='en_US')

    # An already single-channel image cannot go through BGR2GRAY.
    if img_bgr.ndim == 2:
        img_gray = img_bgr
    else:
        img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Loop-invariant patterns, compiled once.
    split_re = re.compile(r'(\s+|[,;]+\s*)')        # capturing: keeps separators
    separator_re = re.compile(r'^[\s,;]+$')
    strip_re = re.compile(r'[().\-]')
    letter_re = re.compile(r'[a-zA-ZäöüÄÖÜß]')

    insertions = 0
    for z in zones_data:
        # "or" guards against keys that are present but set to None.
        for cell in z.get("cells") or []:
            ct = cell.get("col_type") or ""
            if not ct.startswith("column_"):
                continue
            text = cell.get("text") or ""
            if not text or "|" in text:
                continue
            if _IPA_RE.search(text):
                continue

            # CV gate: check if ANY word_box in this cell has pipe lines
            wbs = cell.get("word_boxes") or []
            if not any(_word_has_pipe_lines(img_gray, wb) for wb in wbs):
                continue

            # Apply pyphen to each significant word in the cell
            new_tokens = []
            changed = False
            for tok in split_re.split(text):
                # Whitespace/punctuation separators pass through untouched.
                if separator_re.match(tok):
                    new_tokens.append(tok)
                    continue
                # Only hyphenate tokens with >= 4 characters (ignoring
                # parentheses, dots, hyphens) that contain a letter.
                clean = strip_re.sub('', tok)
                if len(clean) < 4 or not letter_re.search(clean):
                    new_tokens.append(tok)
                    continue
                # Try DE first, then EN
                hyph = hyph_de.inserted(tok, hyphen='|')
                if '|' not in hyph:
                    hyph = hyph_en.inserted(tok, hyphen='|')
                # The cell text contained no '|' beforehand, so any '|' in
                # the result was inserted by pyphen.
                if '|' in hyph:
                    new_tokens.append(hyph)
                    changed = True
                else:
                    new_tokens.append(tok)

            if changed:
                cell["text"] = ''.join(new_tokens)
                insertions += 1

    if insertions:
        logger.info(
            "build-grid session %s: inserted syllable dividers in %d cells "
            "(CV-validated)",
            session_id, insertions,
        )
    return insertions
|
||||
File diff suppressed because it is too large
Load Diff
1389
klausur-service/backend/grid_editor_helpers.py
Normal file
1389
klausur-service/backend/grid_editor_helpers.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user