diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index a53d3af..d906c0a 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3302,22 +3302,19 @@ def _is_noise_tail_token(token: str) -> bool: t_check = stripped_punct if stripped_punct else t # Check for legitimate punctuation patterns vs. real noise. - # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir" - # Noise: "Es)", "3d", "B|" - # Strategy: strip parentheses & trailing hyphens, THEN check residual. + # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir", + # "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen" + # Noise: "3d", "B|", "x7" + # Strategy: strip common dictionary punctuation (parens, hyphens, slashes), + # THEN check if residual contains only alpha characters. t_inner = t_check - # Remove balanced parentheses wrapping the token: "(auf)" → "auf" - if t_inner.startswith('(') and t_inner.endswith(')'): - t_inner = t_inner[1:-1] - # Remove unbalanced parentheses at start/end (common in example sentences): - # "(wir" → "wir", "selbst)" → "selbst", "(selbst))" → "selbst" - t_inner = t_inner.lstrip('(').rstrip(')') - # Remove trailing hyphen (word continuation): "under-" → "under" - if t_inner.endswith('-'): - t_inner = t_inner[:-1] + # Remove all parentheses, hyphens, slashes, and dots — these are normal + # in dictionary entries: "(Salat-)Gurke", "Tanz(veranstaltung)", + # "(zer)brechen", "wir/uns", "e.g." + t_inner = re.sub(r'[()\-/.,;:!?]', '', t_inner) # Now check: does the inner form still have non-alpha noise? inner_alpha = ''.join(_RE_ALPHA.findall(t_inner)) - has_internal_noise = len(t_inner) > len(inner_alpha) if t_inner else False + has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False # Long alpha words (4+ chars) without internal noise are likely real if len(cleaned) >= 4 and not has_internal_noise: