fix: CV-gated syllable insertion + grid editor scroll
1. Syllable dividers now require CV validation: a morphological vertical-line detector checks whether the word_box image actually shows thin, isolated pipe lines before pyphen is applied. Only the first word per cell gets pipes (matching the dictionary print layout).
2. Grid editor scroll: changed maxHeight from 80vh to calc(100vh - 200px) so the editor remains scrollable after edits.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -314,7 +314,7 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
|
||||
)}
|
||||
|
||||
{/* Right: Grid with row-accept buttons */}
|
||||
<div className="space-y-3 overflow-y-auto" style={{ maxHeight: '80vh' }}>
|
||||
<div className="space-y-3 overflow-y-auto" style={{ maxHeight: 'calc(100vh - 200px)' }}>
|
||||
|
||||
{/* Zone tables with row-accept buttons */}
|
||||
{(() => {
|
||||
|
||||
@@ -2803,16 +2803,62 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
|
||||
# --- Syllable divider insertion for dictionary pages ---
|
||||
# Dictionary pages show syllable breaks as "|" (e.g. "Ka|me|rad").
|
||||
# OCR engines rarely detect "|", so we insert them via pyphen
|
||||
# hyphenation rules when the page is confirmed as a dictionary.
|
||||
# OCR engines rarely detect "|", so we use a two-step approach:
|
||||
# 1. CV: detect if a word_box image contains thin vertical pipe lines
|
||||
# 2. pyphen: insert syllable breaks at linguistically correct positions
|
||||
# Only the FIRST significant word per cell gets pipes (matching print layout).
|
||||
syllable_insertions = 0
|
||||
if dict_detection.get("is_dictionary"):
|
||||
if dict_detection.get("is_dictionary") and img_bgr is not None:
|
||||
try:
|
||||
import pyphen
|
||||
_hyph_de = pyphen.Pyphen(lang='de_DE')
|
||||
_hyph_en = pyphen.Pyphen(lang='en_US')
|
||||
# IPA/bracket pattern — don't hyphenate phonetic transcriptions
|
||||
_ipa_re = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
|
||||
img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
||||
|
||||
def _word_has_pipe_lines(wb: Dict) -> bool:
    """CV check: does this word_box image show thin vertical dividers?

    Crops the box described by ``wb`` ("left", "top", "width", "height")
    out of the enclosing ``img_gray`` image and looks for a narrow,
    tall, isolated vertical stroke inside the word.

    Args:
        wb: Word-box mapping. Missing or ``None`` coordinates count as 0;
            values are truncated to ``int`` so they are valid slice bounds.

    Returns:
        True if at least one stroke is found that is at most 4 px wide,
        survives a vertical morphological opening, lies away from the
        left/right edge margin, and has mostly blank 3-px strips on both
        sides. False otherwise, including for boxes narrower than 30 px,
        shorter than 12 px, empty after clipping, or with coordinates
        that cannot be converted to integers.
    """
    try:
        x, y, w, h = (
            int(wb.get(key) or 0)
            for key in ("left", "top", "width", "height")
        )
    except (TypeError, ValueError, OverflowError):
        # Unparseable geometry: treat as "no pipes" instead of crashing
        # the whole grid build.
        return False

    if w < 30 or h < 12:
        return False

    # Clip the box to the image bounds before cropping.
    ih, iw = img_gray.shape[:2]
    y1, y2 = max(0, y), min(ih, y + h)
    x1, x2 = max(0, x), min(iw, x + w)
    roi = img_gray[y1:y2, x1:x2]
    if roi.size == 0:
        return False
    rh, rw = roi.shape

    # Binarize (ink = white)
    _, binary = cv2.threshold(
        roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Morphological opening: keep only tall vertical structures
    # (at least 55% of the crop height, never less than 8 px).
    kern_h = max(int(rh * 0.55), 8)
    kernel = np.ones((kern_h, 1), np.uint8)
    vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Find surviving contours
    contours, _ = cv2.findContours(
        vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    margin = max(int(rw * 0.08), 3)
    for cnt in contours:
        cx, cy, cw, ch = cv2.boundingRect(cnt)
        if cw > 4:
            continue  # too wide
        if cx < margin or cx + cw > rw - margin:
            continue  # at word edge (l, I, 1)

        # Check isolation: adjacent columns should be mostly empty.
        # An empty strip counts as fully inked (255), i.e. not isolated.
        left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
        right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
        left_ink = np.mean(left_zone) if left_zone.size else 255
        right_ink = np.mean(right_zone) if right_zone.size else 255
        if left_ink < 80 and right_ink < 80:
            return True  # isolated thin vertical line = pipe
    return False
|
||||
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
ct = cell.get("col_type", "")
|
||||
@@ -2820,45 +2866,32 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
continue
|
||||
text = cell.get("text", "")
|
||||
if not text or "|" in text:
|
||||
continue # already has pipes or empty
|
||||
continue
|
||||
if _ipa_re.search(text):
|
||||
continue # IPA content — skip
|
||||
# Split on commas/semicolons to handle "Kabel, die Kabel"
|
||||
parts = re.split(r'([,;]\s*)', text)
|
||||
new_parts = []
|
||||
changed = False
|
||||
for part in parts:
|
||||
if re.match(r'^[,;]\s*$', part):
|
||||
new_parts.append(part)
|
||||
continue
|
||||
# Process individual words in each part
|
||||
words_in = re.split(r'(\s+)', part)
|
||||
new_words = []
|
||||
for w in words_in:
|
||||
if re.match(r'^\s+$', w):
|
||||
new_words.append(w)
|
||||
# CV gate: check if ANY word_box in this cell has pipe lines
|
||||
wbs = cell.get("word_boxes") or []
|
||||
has_pipes = any(_word_has_pipe_lines(wb) for wb in wbs)
|
||||
if not has_pipes:
|
||||
continue
|
||||
# Only hyphenate words ≥ 4 chars, skip articles/short
|
||||
clean = re.sub(r'[().\-]', '', w)
|
||||
if len(clean) < 4:
|
||||
new_words.append(w)
|
||||
# Apply pyphen to FIRST significant word only
|
||||
# (dictionary layout: only headword gets pipes)
|
||||
match = re.match(r'^(\s*)([\w\-äöüÄÖÜß]+)(.*)', text, re.DOTALL)
|
||||
if not match:
|
||||
continue
|
||||
# Try DE first, then EN
|
||||
hyph = _hyph_de.inserted(w, hyphen='|')
|
||||
prefix, first_word, rest = match.groups()
|
||||
if len(first_word) < 4:
|
||||
continue
|
||||
hyph = _hyph_de.inserted(first_word, hyphen='|')
|
||||
if '|' not in hyph:
|
||||
hyph = _hyph_en.inserted(w, hyphen='|')
|
||||
if '|' in hyph and hyph != w:
|
||||
new_words.append(hyph)
|
||||
changed = True
|
||||
else:
|
||||
new_words.append(w)
|
||||
new_parts.append(''.join(new_words))
|
||||
if changed:
|
||||
cell["text"] = ''.join(new_parts)
|
||||
hyph = _hyph_en.inserted(first_word, hyphen='|')
|
||||
if '|' in hyph and hyph != first_word:
|
||||
cell["text"] = prefix + hyph + rest
|
||||
syllable_insertions += 1
|
||||
if syllable_insertions:
|
||||
logger.info(
|
||||
"build-grid session %s: inserted syllable dividers in %d cells",
|
||||
"build-grid session %s: inserted syllable dividers in %d cells "
|
||||
"(CV-validated)",
|
||||
session_id, syllable_insertions,
|
||||
)
|
||||
except ImportError:
|
||||
|
||||
Reference in New Issue
Block a user