fix: CV-gated syllable insertion + grid editor scroll

1. Syllable dividers now require CV (computer vision) validation:
   morphological vertical-line detection checks whether a word_box image
   in the cell actually shows thin, isolated pipe lines before pyphen is
   applied. Only the first word per cell gets pipes (matching the
   dictionary print layout).

2. Grid editor scroll: changed maxHeight from 80vh to calc(100vh - 200px)
   so editor remains scrollable after edits.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-24 14:31:16 +01:00
parent 364086b86e
commit d9b2aa82e9
2 changed files with 73 additions and 40 deletions

View File

@@ -314,7 +314,7 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
)}
{/* Right: Grid with row-accept buttons */}
<div className="space-y-3 overflow-y-auto" style={{ maxHeight: '80vh' }}>
<div className="space-y-3 overflow-y-auto" style={{ maxHeight: 'calc(100vh - 200px)' }}>
{/* Zone tables with row-accept buttons */}
{(() => {

View File

@@ -2803,16 +2803,62 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# --- Syllable divider insertion for dictionary pages ---
# Dictionary pages show syllable breaks as "|" (e.g. "Ka|me|rad").
# OCR engines rarely detect "|", so we insert them via pyphen
# hyphenation rules when the page is confirmed as a dictionary.
# OCR engines rarely detect "|", so we use a two-step approach:
# 1. CV: detect if a word_box image contains thin vertical pipe lines
# 2. pyphen: insert syllable breaks at linguistically correct positions
# Only the FIRST significant word per cell gets pipes (matching print layout).
syllable_insertions = 0
if dict_detection.get("is_dictionary"):
if dict_detection.get("is_dictionary") and img_bgr is not None:
try:
import pyphen
_hyph_de = pyphen.Pyphen(lang='de_DE')
_hyph_en = pyphen.Pyphen(lang='en_US')
# IPA/bracket pattern — don't hyphenate phonetic transcriptions
_ipa_re = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
def _word_has_pipe_lines(wb: Dict) -> bool:
    """Report whether a word box's image region shows a thin, isolated vertical stroke.

    The box geometry is read from the ``left``, ``top``, ``width`` and
    ``height`` keys of *wb* (each defaulting to 0) and used to crop the
    enclosing scope's ``img_gray``.  The crop is binarized, reduced to its
    tall vertical structures by a morphological opening, and every
    surviving contour is tested for width, position and isolation.

    Args:
        wb: Word-box mapping carrying the geometry keys listed above.

    Returns:
        True as soon as one contour qualifies as a pipe-like divider;
        False when the box is narrower than 30 or shorter than 12, when the
        clipped crop is empty, or when no contour qualifies.
    """
    left = wb.get("left", 0)
    top = wb.get("top", 0)
    box_w = wb.get("width", 0)
    box_h = wb.get("height", 0)

    # Boxes below this size are not inspected at all.
    if box_w < 30 or box_h < 12:
        return False

    # Clip the box to the image bounds before cropping.
    img_h, img_w = img_gray.shape[:2]
    row_lo = max(0, top)
    row_hi = min(img_h, top + box_h)
    col_lo = max(0, left)
    col_hi = min(img_w, left + box_w)
    roi = img_gray[row_lo:row_hi, col_lo:col_hi]
    if roi.size == 0:
        return False
    roi_h, roi_w = roi.shape

    # Inverse Otsu threshold: dark (ink) pixels become 255.
    binary = cv2.threshold(
        roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]

    # Opening with a one-pixel-wide column kernel (55% of the crop height,
    # at least 8 px) keeps only tall vertical structures.
    column_kernel = np.ones((max(int(roi_h * 0.55), 8), 1), np.uint8)
    tall_strokes = cv2.morphologyEx(binary, cv2.MORPH_OPEN, column_kernel)

    candidates, _ = cv2.findContours(
        tall_strokes, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    def _mean_ink(zone) -> float:
        # An empty strip counts as fully inked (255), i.e. not isolated.
        return np.mean(zone) if zone.size else 255

    edge_margin = max(int(roi_w * 0.08), 3)
    for contour in candidates:
        bx, by, bw, bh = cv2.boundingRect(contour)

        # Reject strokes wider than 4 px, and strokes inside the edge
        # margin (at the word edge they are likely letters: l, I, 1).
        if bw > 4 or bx < edge_margin or bx + bw > roi_w - edge_margin:
            continue

        # Isolation test: the 3-px strips on either side of the stroke
        # must both be mostly free of ink (mean below 80).
        rows = slice(by, by + bh)
        ink_left = _mean_ink(binary[rows, max(0, bx - 3):bx])
        ink_right = _mean_ink(binary[rows, bx + bw:min(roi_w, bx + bw + 3)])
        if ink_left < 80 and ink_right < 80:
            return True  # isolated thin vertical line = pipe

    return False
for z in zones_data:
for cell in z.get("cells", []):
ct = cell.get("col_type", "")
@@ -2820,45 +2866,32 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
continue
text = cell.get("text", "")
if not text or "|" in text:
continue # already has pipes or empty
continue
if _ipa_re.search(text):
continue # IPA content — skip
# Split on commas/semicolons to handle "Kabel, die Kabel"
parts = re.split(r'([,;]\s*)', text)
new_parts = []
changed = False
for part in parts:
if re.match(r'^[,;]\s*$', part):
new_parts.append(part)
continue
# Process individual words in each part
words_in = re.split(r'(\s+)', part)
new_words = []
for w in words_in:
if re.match(r'^\s+$', w):
new_words.append(w)
# CV gate: check if ANY word_box in this cell has pipe lines
wbs = cell.get("word_boxes") or []
has_pipes = any(_word_has_pipe_lines(wb) for wb in wbs)
if not has_pipes:
continue
# Only hyphenate words ≥ 4 chars, skip articles/short
clean = re.sub(r'[().\-]', '', w)
if len(clean) < 4:
new_words.append(w)
# Apply pyphen to FIRST significant word only
# (dictionary layout: only headword gets pipes)
match = re.match(r'^(\s*)([\w\-äöüÄÖÜß]+)(.*)', text, re.DOTALL)
if not match:
continue
# Try DE first, then EN
hyph = _hyph_de.inserted(w, hyphen='|')
prefix, first_word, rest = match.groups()
if len(first_word) < 4:
continue
hyph = _hyph_de.inserted(first_word, hyphen='|')
if '|' not in hyph:
hyph = _hyph_en.inserted(w, hyphen='|')
if '|' in hyph and hyph != w:
new_words.append(hyph)
changed = True
else:
new_words.append(w)
new_parts.append(''.join(new_words))
if changed:
cell["text"] = ''.join(new_parts)
hyph = _hyph_en.inserted(first_word, hyphen='|')
if '|' in hyph and hyph != first_word:
cell["text"] = prefix + hyph + rest
syllable_insertions += 1
if syllable_insertions:
logger.info(
"build-grid session %s: inserted syllable dividers in %d cells",
"build-grid session %s: inserted syllable dividers in %d cells "
"(CV-validated)",
session_id, syllable_insertions,
)
except ImportError: