Fix 4 Grid Editor bugs: syllable modes, heading detection, word gaps
1. Syllable "Original" (auto) mode: only normalize cells that already have | from OCR — don't add new syllable marks via pyphen to words without printed dividers on the original scan. 2. Syllable "Aus" (none) mode: strip residual | chars from OCR text so cells display clean (e.g. "Zel|le" → "Zelle"). 3. Heading detection: add text length guard in single-cell heuristic — words > 4 alpha chars starting lowercase (like "zentral") are regular vocabulary, not section headings. 4. Word-gap merge: new merge_word_gaps_in_zones() step with relaxed threshold (6 chars) fixes OCR splits like "zerknit tert" → "zerknittert". Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -139,6 +139,92 @@ def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
|
|||||||
return ' '.join(result)
|
return ' '.join(result)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
    """Repair OCR word-gap splits in the text of every content cell.

    OCR can break a single word into several space-separated fragments
    (e.g. "zerknit tert" for "zerknittert").  Each eligible cell's text is
    passed to ``_try_merge_word_gaps`` and rewritten in place when that
    helper returns something different.

    A cell is eligible when:

    * its ``col_type`` starts with ``"column_"``,
    * its ``text`` is non-empty and contains at least one space, and
    * ``_IPA_RE`` does not match the text once ``[...]`` segments have been
      removed.

    Args:
        zones_data: Zone dicts; each may carry a ``"cells"`` list of cell
            dicts.  Cell ``"text"`` values are mutated in place.
        session_id: Used only in the summary log message.

    Returns:
        The number of cells whose text was changed.  ``0`` is returned
        immediately when no German hyphenator is available.
    """
    german_hyphenator, _unused = _get_hyphenators()
    if german_hyphenator is None:
        return 0

    changed_cells = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            if not cell.get("col_type", "").startswith("column_"):
                continue

            original = cell.get("text", "")
            if not original or " " not in original:
                continue

            # Bracketed segments are ignored for the IPA test; a match in
            # the remainder means the cell is left untouched.
            if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', original)):
                continue

            merged = _try_merge_word_gaps(original, german_hyphenator)
            if merged == original:
                continue

            cell["text"] = merged
            changed_cells += 1

    if changed_cells:
        logger.info(
            "build-grid session %s: merged word gaps in %d cells",
            session_id, changed_cells,
        )
    return changed_cells
|
||||||
|
|
||||||
|
|
||||||
|
def _try_merge_word_gaps(text: str, hyph_de) -> str:
|
||||||
|
"""Merge OCR word fragments with relaxed threshold (max_short=6).
|
||||||
|
|
||||||
|
Similar to ``_try_merge_pipe_gaps`` but allows longer fragments to be
|
||||||
|
merged. Still requires pyphen to recognize the merged word.
|
||||||
|
"""
|
||||||
|
parts = text.split(' ')
|
||||||
|
if len(parts) < 2:
|
||||||
|
return text
|
||||||
|
|
||||||
|
result = [parts[0]]
|
||||||
|
i = 1
|
||||||
|
while i < len(parts):
|
||||||
|
prev = result[-1]
|
||||||
|
curr = parts[i]
|
||||||
|
|
||||||
|
prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
|
||||||
|
curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
|
||||||
|
|
||||||
|
should_try = (
|
||||||
|
prev == prev_alpha
|
||||||
|
and prev_alpha and curr_alpha
|
||||||
|
and prev_alpha.lower() not in _STOP_WORDS
|
||||||
|
and curr_alpha.lower() not in _STOP_WORDS
|
||||||
|
and min(len(prev_alpha), len(curr_alpha)) <= 6
|
||||||
|
and len(prev_alpha) + len(curr_alpha) >= 4
|
||||||
|
)
|
||||||
|
|
||||||
|
if should_try:
|
||||||
|
merged_alpha = prev_alpha + curr_alpha
|
||||||
|
hyph = hyph_de.inserted(merged_alpha, hyphen='-')
|
||||||
|
if '-' in hyph:
|
||||||
|
result[-1] = prev + curr
|
||||||
|
i += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
result.append(curr)
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
return ' '.join(result)
|
||||||
|
|
||||||
|
|
||||||
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
|
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
|
||||||
"""Syllabify all significant words in a text string.
|
"""Syllabify all significant words in a text string.
|
||||||
|
|
||||||
@@ -259,6 +345,12 @@ def insert_syllable_dividers(
|
|||||||
if not text:
|
if not text:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# In auto mode (force=False), only normalize cells that already
|
||||||
|
# have | from OCR (i.e. printed syllable dividers on the original
|
||||||
|
# scan). Don't add new syllable marks to other words.
|
||||||
|
if not force and "|" not in text:
|
||||||
|
continue
|
||||||
|
|
||||||
new_text = _syllabify_text(text, hyph_de, hyph_en)
|
new_text = _syllabify_text(text, hyph_de, hyph_en)
|
||||||
if new_text != text:
|
if new_text != text:
|
||||||
cell["text"] = new_text
|
cell["text"] = new_text
|
||||||
|
|||||||
@@ -1593,6 +1593,13 @@ async def _build_grid_core(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Dictionary detection failed: %s", e)
|
logger.warning("Dictionary detection failed: %s", e)
|
||||||
|
|
||||||
|
# --- Word-gap merge: fix OCR splits like "zerknit tert" → "zerknittert" ---
|
||||||
|
try:
|
||||||
|
from cv_syllable_detect import merge_word_gaps_in_zones
|
||||||
|
merge_word_gaps_in_zones(zones_data, session_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Word-gap merge failed: %s", e)
|
||||||
|
|
||||||
# --- Syllable divider insertion for dictionary pages ---
|
# --- Syllable divider insertion for dictionary pages ---
|
||||||
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
||||||
# "all" = force on all content words, "en" = English column only,
|
# "all" = force on all content words, "en" = English column only,
|
||||||
@@ -1626,6 +1633,15 @@ async def _build_grid_core(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Syllable insertion failed: %s", e)
|
logger.warning("Syllable insertion failed: %s", e)
|
||||||
|
|
||||||
|
# When syllable mode is "none", strip any residual | from OCR so
|
||||||
|
# that the displayed text is clean (e.g. "Zel|le" → "Zelle").
|
||||||
|
if syllable_mode == "none":
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
t = cell.get("text", "")
|
||||||
|
if "|" in t:
|
||||||
|
cell["text"] = t.replace("|", "")
|
||||||
|
|
||||||
# Clean up internal flags before returning
|
# Clean up internal flags before returning
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
for cell in z.get("cells", []):
|
for cell in z.get("cells", []):
|
||||||
|
|||||||
@@ -912,6 +912,13 @@ def _detect_heading_rows_by_single_cell(
|
|||||||
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
||||||
if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
|
if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
|
||||||
continue
|
continue
|
||||||
|
# Guard: dictionary section headings are short (1-4 alpha chars
|
||||||
|
# like "A", "Ab", "Zi", "Sch"). Longer text that starts
|
||||||
|
# lowercase is a regular vocabulary word (e.g. "zentral") that
|
||||||
|
# happens to appear alone in its row.
|
||||||
|
alpha_only = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', text)
|
||||||
|
if len(alpha_only) > 4 and text[0].islower():
|
||||||
|
continue
|
||||||
heading_row_indices.append(ri)
|
heading_row_indices.append(ri)
|
||||||
|
|
||||||
# Guard: if >25% of eligible rows would become headings, the
|
# Guard: if >25% of eligible rows would become headings, the
|
||||||
|
|||||||
Reference in New Issue
Block a user