From 87931c35e43f2930b830144c407595ca0245c079 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 12:51:28 +0100 Subject: [PATCH] fix(ocr-pipeline): stop noise filter from stripping parenthesized words _is_noise_tail_token() treated words with unbalanced parentheses like "selbst)" or "(wir" as OCR noise because the parenthesis counted as "internal noise". Now strips leading/trailing parentheses before the noise check, so legitimate words in example sentences like "We baked ... (wir ... selbst)" are preserved. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 883aaff..4dee8c5 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3289,19 +3289,22 @@ def _is_noise_tail_token(token: str) -> bool: t_check = stripped_punct if stripped_punct else t # Check for legitimate punctuation patterns vs. real noise. - # Legitimate: "(auf)", "under-", "e.g.", "(on)" + # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir" # Noise: "Es)", "3d", "B|" - # Strategy: strip balanced parens & trailing hyphens, THEN check residual. + # Strategy: strip parentheses & trailing hyphens, THEN check residual. t_inner = t_check # Remove balanced parentheses wrapping the token: "(auf)" → "auf" if t_inner.startswith('(') and t_inner.endswith(')'): t_inner = t_inner[1:-1] + # Remove unbalanced parentheses at start/end (common in example sentences): + # "(wir" → "wir", "selbst)" → "selbst", "(selbst))" → "selbst" + t_inner = t_inner.lstrip('(').rstrip(')') # Remove trailing hyphen (word continuation): "under-" → "under" if t_inner.endswith('-'): t_inner = t_inner[:-1] # Now check: does the inner form still have non-alpha noise? inner_alpha = ''.join(_RE_ALPHA.findall(t_inner)) - has_internal_noise = len(t_inner) > len(inner_alpha) + has_internal_noise = len(t_inner) > len(inner_alpha) if t_inner else False # Long alpha words (4+ chars) without internal noise are likely real if len(cleaned) >= 4 and not has_internal_noise: