"""
|
|
Syllable Merge — word gap merging, syllabification, divider insertion.
|
|
|
|
Extracted from cv_syllable_detect.py for modularity.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import numpy as np
|
|
|
|
from cv_syllable_core import (
|
|
_get_hyphenators,
|
|
_hyphenate_word,
|
|
_IPA_RE,
|
|
_STOP_WORDS,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
    """Merge fragments separated by single spaces where OCR split at a pipe.

    Example: "Kaf fee" -> "Kaffee" (pyphen recognizes the merged word).
    Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".

    Guards against false merges:
    - The FIRST token must be pure alpha (word start -- no attached punctuation)
    - The second token may have trailing punctuation (comma, period) which
      stays attached to the merged word: "Ka" + "fer," -> "Kafer,"
    - Common German function words (der, die, das, ...) are never merged
    - At least one fragment must be very short (<=3 alpha chars)
    """
    parts = text.split(' ')
    if len(parts) < 2:
        return text

    result = [parts[0]]
    i = 1
    while i < len(parts):
        prev = result[-1]
        curr = parts[i]

        # Extract alpha-only core for lookup
        prev_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', prev)
        curr_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', curr)

        # Guard 1: first token must be pure alpha (word-start fragment);
        #          the second token may have trailing punctuation.
        # Guard 2: neither alpha core can be a common German function word.
        # Guard 3: the shorter fragment must be <= 3 chars (pipe-gap signal).
        # Guard 4: combined length must be >= 4.
        should_try = (
            prev == prev_alpha  # first token: pure alpha (word start)
            and prev_alpha and curr_alpha
            and prev_alpha.lower() not in _STOP_WORDS
            and curr_alpha.lower() not in _STOP_WORDS
            and min(len(prev_alpha), len(curr_alpha)) <= 3
            and len(prev_alpha) + len(curr_alpha) >= 4
        )

        if should_try:
            merged_alpha = prev_alpha + curr_alpha
            hyph = hyph_de.inserted(merged_alpha, hyphen='-')
            if '-' in hyph:
                # pyphen recognizes merged word -- collapse the space
                result[-1] = prev + curr
                i += 1
                continue

        result.append(curr)
        i += 1

    return ' '.join(result)
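

# Usage sketch for _try_merge_pipe_gaps (illustrative only, not called by the
# pipeline). It assumes a de_DE pyphen dictionary is installed; building a
# Pyphen object directly here merely stands in for what _get_hyphenators()
# normally provides.
#
#   import pyphen
#   hyph_de = pyphen.Pyphen(lang='de_DE')
#   _try_merge_pipe_gaps('Kaf fee', hyph_de)      # -> 'Kaffee'
#   _try_merge_pipe_gaps('der Hund', hyph_de)     # -> 'der Hund' (stop word)
#   _try_merge_pipe_gaps('Ka bel jau', hyph_de)   # -> 'Kabeljau'
#                                                 #    (merged left to right
#                                                 #     within a single call)
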
def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
    """Merge OCR word-gap fragments in cell texts using pyphen validation.

    OCR often splits words at syllable boundaries into separate word_boxes,
    producing text like "zerknit tert" instead of "zerknittert". This
    function tries to merge adjacent fragments in every content cell.

    More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3)
    but still guarded by pyphen dictionary lookup and stop-word exclusion.

    Returns the number of cells modified.
    """
    hyph_de, _ = _get_hyphenators()
    if hyph_de is None:
        return 0

    modified = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            text = cell.get("text", "")
            if not text or " " not in text:
                continue

            # Skip IPA cells
            text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
            if _IPA_RE.search(text_no_brackets):
                continue

            new_text = _try_merge_word_gaps(text, hyph_de)
            if new_text != text:
                cell["text"] = new_text
                modified += 1

    if modified:
        logger.info(
            "build-grid session %s: merged word gaps in %d cells",
            session_id, modified,
        )
    return modified
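

# Data-shape sketch (assumed from the dict accesses above; not a formal
# schema): zones_data is a list of zone dicts, each carrying a "cells" list,
# and every cell has at least "col_type" and "text". The "header" col_type
# below is just a stand-in for any value that does not start with "column_".
#
#   zones_data = [
#       {"cells": [
#           {"col_type": "column_1", "text": "zerknit tert"},
#           {"col_type": "header", "text": "Wort"},   # non-content: skipped
#       ]},
#   ]
#   merge_word_gaps_in_zones(zones_data, session_id="demo")
#   # -> 1 if pyphen recognizes "zerknittert"; the first cell is rewritten
#   #    to "zerknittert" in place, the header cell stays untouched.
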
def _try_merge_word_gaps(text: str, hyph_de) -> str:
    """Merge OCR word fragments with relaxed threshold (max_short=5).

    Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
    (max_short=5 instead of 3). Still requires pyphen to recognize the
    merged word.
    """
    parts = text.split(' ')
    if len(parts) < 2:
        return text

    result = [parts[0]]
    i = 1
    while i < len(parts):
        prev = result[-1]
        curr = parts[i]

        prev_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', prev)
        curr_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', curr)

        should_try = (
            prev == prev_alpha
            and prev_alpha and curr_alpha
            and prev_alpha.lower() not in _STOP_WORDS
            and curr_alpha.lower() not in _STOP_WORDS
            and min(len(prev_alpha), len(curr_alpha)) <= 5
            and len(prev_alpha) + len(curr_alpha) >= 4
        )

        if should_try:
            merged_alpha = prev_alpha + curr_alpha
            hyph = hyph_de.inserted(merged_alpha, hyphen='-')
            if '-' in hyph:
                result[-1] = prev + curr
                i += 1
                continue

        result.append(curr)
        i += 1

    return ' '.join(result)
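

# Contrast with _try_merge_pipe_gaps (illustrative): in "zerknit tert" the
# shorter fragment has 4 letters, so the strict variant (max_short=3) leaves
# the gap alone while this relaxed variant (max_short=5) closes it, provided
# pyphen recognizes the merged word:
#
#   _try_merge_pipe_gaps('zerknit tert', hyph_de)   # -> 'zerknit tert'
#   _try_merge_word_gaps('zerknit tert', hyph_de)   # -> 'zerknittert'
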
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
    """Syllabify all significant words in a text string.

    1. Strip existing | dividers
    2. Merge pipe-gap spaces where possible
    3. Apply pyphen to each word >= 3 alphabetic chars
    4. Words pyphen doesn't recognize stay as-is (no bad guesses)
    """
    if not text:
        return text

    # Skip cells that contain IPA transcription characters outside brackets.
    text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
    if _IPA_RE.search(text_no_brackets):
        return text

    # Phase 1: strip existing pipe dividers for clean normalization
    clean = text.replace('|', '')

    # Phase 2: merge pipe-gap spaces (OCR fragments from pipe splitting)
    clean = _try_merge_pipe_gaps(clean, hyph_de)

    # Phase 3: tokenize and syllabify each word.
    # Split on whitespace and comma/semicolon sequences, keeping separators.
    tokens = re.split(r'(\s+|[,;:]+\s*)', clean)

    result = []
    for tok in tokens:
        if not tok or re.match(r'^[\s,;:]+$', tok):
            result.append(tok)
            continue

        # Strip trailing/leading punctuation for pyphen lookup
        m = re.match(r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)(.*?)([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$', tok)
        if not m:
            result.append(tok)
            continue
        lead, word, trail = m.group(1), m.group(2), m.group(3)

        if len(word) < 3 or not re.search(r'[a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df]', word):
            result.append(tok)
            continue

        hyph = _hyphenate_word(word, hyph_de, hyph_en)
        if hyph:
            result.append(lead + hyph + trail)
        else:
            result.append(tok)

    return ''.join(result)
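

# Illustrative expectation for _syllabify_text, assuming _hyphenate_word
# returns the word with '|' at pyphen's break points (hyph_de / hyph_en come
# from _get_hyphenators(); the exact break points depend on the installed
# dictionaries):
#
#   _syllabify_text('Kaf fee, Banane', hyph_de, hyph_en)
#   # -> roughly 'Kaf|fee, Ba|na|ne': the pipe-gap space is merged first,
#   #    then every word of >= 3 letters is re-syllabified; separators and
#   #    unrecognized words pass through unchanged.
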
def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
    *,
    force: bool = False,
    col_filter: Optional[set] = None,
) -> int:
    """Insert pipe syllable dividers into dictionary cells.

    For dictionary pages: process all content column cells, strip existing
    pipes, merge pipe-gap spaces, and re-syllabify using pyphen.

    Pre-check: at least 1% of content cells must already contain ``|`` from
    OCR. This guards against pages with zero pipe characters.

    Args:
        force: If True, skip the pipe-ratio pre-check and syllabify all
            content words regardless of whether the original has pipe dividers.
        col_filter: If set, only process cells whose col_type is in this set.
            None means process all content columns.

    Returns the number of cells modified.
    """
    hyph_de, hyph_en = _get_hyphenators()
    if hyph_de is None:
        logger.warning("pyphen not installed -- skipping syllable insertion")
        return 0

    # Pre-check: count cells that already have | from OCR.
    if not force:
        total_col_cells = 0
        cells_with_pipes = 0
        for z in zones_data:
            for cell in z.get("cells", []):
                if cell.get("col_type", "").startswith("column_"):
                    total_col_cells += 1
                    if "|" in cell.get("text", ""):
                        cells_with_pipes += 1

        if total_col_cells > 0:
            pipe_ratio = cells_with_pipes / total_col_cells
            if pipe_ratio < 0.01:
                logger.info(
                    "build-grid session %s: skipping syllable insertion -- "
                    "only %.1f%% of cells have existing pipes (need >=1%%)",
                    session_id, pipe_ratio * 100,
                )
                return 0

    insertions = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            if col_filter is not None and ct not in col_filter:
                continue
            text = cell.get("text", "")
            if not text:
                continue

            # In auto mode (force=False), only normalize cells that already
            # have | from OCR (i.e. printed syllable dividers on the original
            # scan). Don't add new syllable marks to other words.
            if not force and "|" not in text:
                continue

            new_text = _syllabify_text(text, hyph_de, hyph_en)
            if new_text != text:
                cell["text"] = new_text
                insertions += 1

    if insertions:
        logger.info(
            "build-grid session %s: syllable dividers inserted/normalized "
            "in %d cells (pyphen)",
            session_id, insertions,
        )
    return insertions
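

if __name__ == "__main__":
    # Minimal smoke test (illustrative only). Real callers pass OCR-derived
    # zones and the page image; the zone/cell shape below is assumed from the
    # dict accesses in this module, and a dummy array stands in for img_bgr,
    # which insert_syllable_dividers accepts but does not inspect.
    logging.basicConfig(level=logging.INFO)
    demo_zones = [
        {"cells": [
            {"col_type": "column_1", "text": "Kaf fee"},
            {"col_type": "column_2", "text": "der Hund"},
        ]},
    ]
    dummy_img = np.zeros((10, 10, 3), dtype=np.uint8)
    changed = insert_syllable_dividers(
        demo_zones, dummy_img, session_id="demo", force=True,
    )
    print(changed, [c["text"] for z in demo_zones for c in z["cells"]])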