diff --git a/klausur-service/backend/cv_review.py b/klausur-service/backend/cv_review.py
index f026a65..35accc2 100644
--- a/klausur-service/backend/cv_review.py
+++ b/klausur-service/backend/cv_review.py
@@ -735,36 +735,39 @@ def _try_split_merged_word(token: str) -> Optional[str]:
     lower = token.lower()
     n = len(lower)
 
-    # dp[i] = shortest list of word lengths that covers lower[:i], or None
+    # dp[i] = (word_lengths_list, score) for best split of lower[:i], or None
+    # Score: (-word_count, sum_of_squared_lengths) — fewer words first,
+    # then prefer longer words (e.g. "come on" over "com eon")
     dp: list = [None] * (n + 1)
-    dp[0] = []
+    dp[0] = ([], 0)
 
     for i in range(1, n + 1):
-        # Try all possible last-word lengths (2..min(i, 20))
-        # Allow single-char words only for 'a' and 'I'
-        min_len = 1
         for j in range(max(0, i - 20), i):
             if dp[j] is None:
                 continue
-            word_len = i - j
             candidate = lower[j:i]
+            word_len = i - j
             if word_len == 1 and candidate not in ('a', 'i'):
                 continue
-            if word_len < 2 and candidate not in ('a', 'i'):
-                continue
             if _spell_dict_knows(candidate):
-                new_split = dp[j] + [word_len]
-                # Prefer fewer words (shorter split)
-                if dp[i] is None or len(new_split) < len(dp[i]):
-                    dp[i] = new_split
+                prev_words, prev_sq = dp[j]
+                new_words = prev_words + [word_len]
+                new_sq = prev_sq + word_len * word_len
+                new_key = (-len(new_words), new_sq)
+                if dp[i] is None:
+                    dp[i] = (new_words, new_sq)
+                else:
+                    old_key = (-len(dp[i][0]), dp[i][1])
+                    if new_key > old_key:
+                        dp[i] = (new_words, new_sq)
 
-    if dp[n] is None or len(dp[n]) < 2:
+    if dp[n] is None or len(dp[n][0]) < 2:
         return None
 
     # Reconstruct with original casing
     result = []
     pos = 0
-    for wlen in dp[n]:
+    for wlen in dp[n][0]:
         result.append(token[pos:pos + wlen])
         pos += wlen