fix(ocr-pipeline): merge phonetic-only rows and fix bracket noise filter
Two fixes:
1. Tokens ending with ] (e.g. "serva]") were stripped by the noise filter because ] was not in the allowed punctuation list; such tokens are now kept.
2. Rows containing only phonetic transcription (e.g. ['mani serva]) are now merged into the previous vocab entry instead of creating a separate (invalid) entry. This prevents the LLM from trying to "correct" phonetic fragments.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3281,9 +3281,11 @@ def _is_noise_tail_token(token: str) -> bool:
|
|||||||
if t in ('...', '…'):
|
if t in ('...', '…'):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Keep phonetic brackets: [eg], [maus], ["a:mand], etc.
|
# Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
|
||||||
if t.startswith('[') or t.startswith('["') or t.startswith("['"):
|
if t.startswith('[') or t.startswith('["') or t.startswith("['"):
|
||||||
return False
|
return False
|
||||||
|
if t.endswith(']'):
|
||||||
|
return False
|
||||||
|
|
||||||
# Pure non-alpha → noise ("3", ")", "|")
|
# Pure non-alpha → noise ("3", ")", "|")
|
||||||
alpha_chars = _RE_ALPHA.findall(t)
|
alpha_chars = _RE_ALPHA.findall(t)
|
||||||
@@ -3792,6 +3794,81 @@ def _cells_to_vocab_entries(
|
|||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
|
# Regex: line starts with phonetic bracket content only (no real word before it)
|
||||||
|
_PHONETIC_ONLY_RE = re.compile(
|
||||||
|
r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_phonetic_only_text(text: str) -> bool:
    """Tell whether *text* is nothing but a bracketed phonetic transcription.

    Balanced ``[...]`` groups are discarded first, followed by any stray
    brackets, quotes, parentheses and whitespace. The text is reported as
    phonetic-only when fewer than two characters matched by ``_RE_ALPHA``
    survive that stripping.

    Examples:
        ['mani serva]      -> True
        [dɑːns]            -> True
        ["a:mand]          -> True
        almond ['a:mand]   -> False  (a real word precedes the bracket)
        Mandel             -> False  (no bracket at all)

    Args:
        text: Raw cell text; surrounding whitespace is ignored.

    Returns:
        True if the text contains at least one square bracket and at most
        one ``_RE_ALPHA`` character outside balanced brackets.
    """
    stripped = text.strip()
    # Blank input, or input without any square bracket, is never phonetic-only.
    if not stripped or not ('[' in stripped or ']' in stripped):
        return False

    # Drop every balanced [...] group, then leftover delimiters and spacing.
    outside = re.sub(r"\[.*?\]", '', stripped)
    outside = re.sub(r"[\[\]'\"()\s]", '', outside)

    # A single leftover character is tolerated; two or more count as a word.
    leftover_letters = ''.join(_RE_ALPHA.findall(outside))
    return len(leftover_letters) < 2
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_phonetic_continuation_rows(
|
||||||
|
entries: List[Dict[str, Any]],
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
|
"""Merge rows that contain only phonetic transcription into previous entry.
|
||||||
|
|
||||||
|
In dictionary pages, phonetic transcription sometimes wraps to the next
|
||||||
|
row. E.g.:
|
||||||
|
Row 28: EN="it's a money-saver" DE="es spart Kosten"
|
||||||
|
Row 29: EN="['mani serva]" DE=""
|
||||||
|
|
||||||
|
Row 29 is phonetic-only → merge into row 28's EN field.
|
||||||
|
"""
|
||||||
|
if len(entries) < 2:
|
||||||
|
return entries
|
||||||
|
|
||||||
|
merged: List[Dict[str, Any]] = []
|
||||||
|
for entry in entries:
|
||||||
|
en = (entry.get('english') or '').strip()
|
||||||
|
de = (entry.get('german') or '').strip()
|
||||||
|
ex = (entry.get('example') or '').strip()
|
||||||
|
|
||||||
|
# Check if this entry is phonetic-only (EN has only phonetics, DE empty)
|
||||||
|
if merged and _is_phonetic_only_text(en) and not de:
|
||||||
|
prev = merged[-1]
|
||||||
|
prev_en = (prev.get('english') or '').strip()
|
||||||
|
# Append phonetic to previous entry's EN
|
||||||
|
if prev_en:
|
||||||
|
prev['english'] = prev_en + ' ' + en
|
||||||
|
else:
|
||||||
|
prev['english'] = en
|
||||||
|
# If there was an example, append to previous too
|
||||||
|
if ex:
|
||||||
|
prev_ex = (prev.get('example') or '').strip()
|
||||||
|
prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
|
||||||
|
logger.debug(
|
||||||
|
f"Merged phonetic row {entry.get('row_index')} "
|
||||||
|
f"into previous entry: {prev['english']!r}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
merged.append(entry)
|
||||||
|
|
||||||
|
return merged
|
||||||
|
|
||||||
|
|
||||||
def build_word_grid(
|
def build_word_grid(
|
||||||
ocr_img: np.ndarray,
|
ocr_img: np.ndarray,
|
||||||
column_regions: List[PageRegion],
|
column_regions: List[PageRegion],
|
||||||
@@ -3843,6 +3920,9 @@ def build_word_grid(
|
|||||||
# --- Post-processing pipeline (deterministic, no LLM) ---
|
# --- Post-processing pipeline (deterministic, no LLM) ---
|
||||||
n_raw = len(entries)
|
n_raw = len(entries)
|
||||||
|
|
||||||
|
# 0. Merge phonetic-only continuation rows into previous entry
|
||||||
|
entries = _merge_phonetic_continuation_rows(entries)
|
||||||
|
|
||||||
# 1. Fix character confusion (I/1/l based on context)
|
# 1. Fix character confusion (I/1/l based on context)
|
||||||
entries = _fix_character_confusion(entries)
|
entries = _fix_character_confusion(entries)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user