fix(ocr-pipeline): tolerate dictionary punctuation in noise filter

The noise filter was stripping words containing hyphens, parentheses, slashes, and dots (e.g. "money-saver", "Schild(chen)", "(Salat-)Gurke", "Tanz(veranstaltung)"). It now removes all common dictionary punctuation (parentheses, hyphens, slashes, dots, and the characters , ; : ! ?) from a token before checking whether the remainder contains internal noise characters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 13:12:40 +01:00
parent 40a77a82f6
commit 650f15bc1b
1 changed file with 10 additions and 13 deletions
@@ -3302,22 +3302,19 @@ def _is_noise_tail_token(token: str) -> bool:
    t_check = stripped_punct if stripped_punct else t

    # Check for legitimate punctuation patterns vs. real noise.
-    # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir"
-    # Noise: "Es)", "3d", "B|"
-    # Strategy: strip parentheses & trailing hyphens, THEN check residual.
+    # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir",
+    #             "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen"
+    # Noise: "3d", "B|", "x7"
+    # Strategy: strip common dictionary punctuation (parens, hyphens, slashes),
+    # THEN check if residual contains only alpha characters.
    t_inner = t_check
-    # Remove balanced parentheses wrapping the token: "(auf)" → "auf"
-    if t_inner.startswith('(') and t_inner.endswith(')'):
-        t_inner = t_inner[1:-1]
-    # Remove unbalanced parentheses at start/end (common in example sentences):
-    # "(wir" → "wir", "selbst)" → "selbst", "(selbst))" → "selbst"
-    t_inner = t_inner.lstrip('(').rstrip(')')
-    # Remove trailing hyphen (word continuation): "under-" → "under"
-    if t_inner.endswith('-'):
-        t_inner = t_inner[:-1]
+    # Remove all parentheses, hyphens, slashes, and dots — these are normal
+    # in dictionary entries: "(Salat-)Gurke", "Tanz(veranstaltung)",
+    # "(zer)brechen", "wir/uns", "e.g."
+    t_inner = re.sub(r'[()\-/.,;:!?]', '', t_inner)
    # Now check: does the inner form still have non-alpha noise?
    inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
-    has_internal_noise = len(t_inner) > len(inner_alpha) if t_inner else False
+    has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False

    # Long alpha words (4+ chars) without internal noise are likely real
    if len(cleaned) >= 4 and not has_internal_noise: