fix(ocr-pipeline): stop noise filter from stripping parenthesized words
_is_noise_tail_token() treated words with unbalanced parentheses, such as "selbst)" or "(wir", as OCR noise because the stray parenthesis was counted as "internal noise". The function now strips leading and trailing parentheses before the noise check, so legitimate words in example sentences like "We baked ... (wir ... selbst)" are preserved.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3289,19 +3289,22 @@ def _is_noise_tail_token(token: str) -> bool:
|
|||||||
t_check = stripped_punct if stripped_punct else t
|
t_check = stripped_punct if stripped_punct else t
|
||||||
|
|
||||||
# Check for legitimate punctuation patterns vs. real noise.
|
# Check for legitimate punctuation patterns vs. real noise.
|
||||||
# Legitimate: "(auf)", "under-", "e.g.", "(on)"
|
# Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir"
|
||||||
# Noise: "Es)", "3d", "B|"
|
# Noise: "Es)", "3d", "B|"
|
||||||
# Strategy: strip balanced parens & trailing hyphens, THEN check residual.
|
# Strategy: strip parentheses & trailing hyphens, THEN check residual.
|
||||||
t_inner = t_check
|
t_inner = t_check
|
||||||
# Remove balanced parentheses wrapping the token: "(auf)" → "auf"
|
# Remove balanced parentheses wrapping the token: "(auf)" → "auf"
|
||||||
if t_inner.startswith('(') and t_inner.endswith(')'):
|
if t_inner.startswith('(') and t_inner.endswith(')'):
|
||||||
t_inner = t_inner[1:-1]
|
t_inner = t_inner[1:-1]
|
||||||
|
# Remove unbalanced parentheses at start/end (common in example sentences):
|
||||||
|
# "(wir" → "wir", "selbst)" → "selbst", "(selbst))" → "selbst"
|
||||||
|
t_inner = t_inner.lstrip('(').rstrip(')')
|
||||||
# Remove trailing hyphen (word continuation): "under-" → "under"
|
# Remove trailing hyphen (word continuation): "under-" → "under"
|
||||||
if t_inner.endswith('-'):
|
if t_inner.endswith('-'):
|
||||||
t_inner = t_inner[:-1]
|
t_inner = t_inner[:-1]
|
||||||
# Now check: does the inner form still have non-alpha noise?
|
# Now check: does the inner form still have non-alpha noise?
|
||||||
inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
|
inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
|
||||||
has_internal_noise = len(t_inner) > len(inner_alpha)
|
has_internal_noise = len(t_inner) > len(inner_alpha) if t_inner else False
|
||||||
|
|
||||||
# Long alpha words (4+ chars) without internal noise are likely real
|
# Long alpha words (4+ chars) without internal noise are likely real
|
||||||
if len(cleaned) >= 4 and not has_internal_noise:
|
if len(cleaned) >= 4 and not has_internal_noise:
|
||||||
|
|||||||
Reference in New Issue
Block a user