fix(ocr-pipeline): merge phonetic-only rows and fix bracket noise filter
Two fixes: 1. Tokens ending with ] (e.g. "serva]") were stripped by the noise filter because ] was not in the allowed punctuation list. 2. Rows containing only phonetic transcription (e.g. ['mani serva]) are now merged into the previous vocab entry instead of creating a separate (invalid) entry. This prevents the LLM from trying to "correct" phonetic fragments. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3281,9 +3281,11 @@ def _is_noise_tail_token(token: str) -> bool:
|
||||
if t in ('...', '…'):
|
||||
return False
|
||||
|
||||
# Keep phonetic brackets: [eg], [maus], ["a:mand], etc.
|
||||
# Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
|
||||
if t.startswith('[') or t.startswith('["') or t.startswith("['"):
|
||||
return False
|
||||
if t.endswith(']'):
|
||||
return False
|
||||
|
||||
# Pure non-alpha → noise ("3", ")", "|")
|
||||
alpha_chars = _RE_ALPHA.findall(t)
|
||||
@@ -3792,6 +3794,81 @@ def _cells_to_vocab_entries(
|
||||
return entries
|
||||
|
||||
|
||||
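# Regex intended to match text that is phonetic bracket content only
# (no real word before the bracket).
# NOTE(review): not referenced by the code visible in this change, and the
# pattern also matches bracket-free text such as "Mandel" (the middle
# [^\]]* group accepts any run of non-"]" characters) -- confirm before use.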
|
||||
_PHONETIC_ONLY_RE = re.compile(
|
||||
r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
|
||||
)
|
||||
|
||||
|
||||
def _is_phonetic_only_text(text: str) -> bool:
    """Tell whether *text* is nothing but a bracketed phonetic transcription.

    Examples:
        ['mani serva]      → True
        [dɑːns]            → True
        ["a:mand]          → True
        almond ['a:mand]   → False (a real word precedes the bracket)
        Mandel             → False (no bracket at all)

    Args:
        text: Cell text to inspect; surrounding whitespace is ignored.

    Returns:
        True when the text contains a square bracket and, once every
        ``[...]`` group and all stray brackets, quotes, parentheses and
        whitespace are removed, fewer than two characters matched by
        ``_RE_ALPHA`` (defined elsewhere in this module) are left over.
    """
    stripped = text.strip()

    # Empty text, or text without any square bracket, is never phonetic-only.
    if not stripped or not ('[' in stripped or ']' in stripped):
        return False

    # First drop each complete [...] group (non-greedy, so several groups on
    # one line are removed separately), then any leftover bracket, quote,
    # parenthesis or whitespace character.
    residue = re.sub(r"\[.*?\]", '', stripped)
    residue = re.sub(r"[\[\]'\"()\s]", '', residue)

    # A single stray letter outside the brackets is tolerated.
    leftover_alpha = ''.join(_RE_ALPHA.findall(residue))
    return len(leftover_alpha) <= 1
|
||||
|
||||
|
||||
def _merge_phonetic_continuation_rows(
|
||||
entries: List[Dict[str, Any]],
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Merge rows that contain only phonetic transcription into previous entry.
|
||||
|
||||
In dictionary pages, phonetic transcription sometimes wraps to the next
|
||||
row. E.g.:
|
||||
Row 28: EN="it's a money-saver" DE="es spart Kosten"
|
||||
Row 29: EN="['mani serva]" DE=""
|
||||
|
||||
Row 29 is phonetic-only → merge into row 28's EN field.
|
||||
"""
|
||||
if len(entries) < 2:
|
||||
return entries
|
||||
|
||||
merged: List[Dict[str, Any]] = []
|
||||
for entry in entries:
|
||||
en = (entry.get('english') or '').strip()
|
||||
de = (entry.get('german') or '').strip()
|
||||
ex = (entry.get('example') or '').strip()
|
||||
|
||||
# Check if this entry is phonetic-only (EN has only phonetics, DE empty)
|
||||
if merged and _is_phonetic_only_text(en) and not de:
|
||||
prev = merged[-1]
|
||||
prev_en = (prev.get('english') or '').strip()
|
||||
# Append phonetic to previous entry's EN
|
||||
if prev_en:
|
||||
prev['english'] = prev_en + ' ' + en
|
||||
else:
|
||||
prev['english'] = en
|
||||
# If there was an example, append to previous too
|
||||
if ex:
|
||||
prev_ex = (prev.get('example') or '').strip()
|
||||
prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
|
||||
logger.debug(
|
||||
f"Merged phonetic row {entry.get('row_index')} "
|
||||
f"into previous entry: {prev['english']!r}"
|
||||
)
|
||||
continue
|
||||
|
||||
merged.append(entry)
|
||||
|
||||
return merged
|
||||
|
||||
|
||||
def build_word_grid(
|
||||
ocr_img: np.ndarray,
|
||||
column_regions: List[PageRegion],
|
||||
@@ -3843,6 +3920,9 @@ def build_word_grid(
|
||||
# --- Post-processing pipeline (deterministic, no LLM) ---
|
||||
n_raw = len(entries)
|
||||
|
||||
# 0. Merge phonetic-only continuation rows into previous entry
|
||||
entries = _merge_phonetic_continuation_rows(entries)
|
||||
|
||||
# 1. Fix character confusion (I/1/l based on context)
|
||||
entries = _fix_character_confusion(entries)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user