Fix word split scoring: prefer longer words over short ones
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Failing after 20s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m41s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 30s

"Comeon" was split as "Com eon" instead of "Come on" because both
are 2-word splits and the old scoring compared word count only, so the
first split found won the tie. Now uses the sum of squared word lengths
as a tiebreaker:
"come"(16) + "on"(4) = 20 > "com"(9) + "eon"(9) = 18.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-11 14:14:23 +02:00
parent 9e2c301723
commit aed0edbf6d

View File

@@ -735,36 +735,39 @@ def _try_split_merged_word(token: str) -> Optional[str]:
lower = token.lower()
n = len(lower)
# dp[i] = shortest list of word lengths that covers lower[:i], or None
# dp[i] = (word_lengths_list, score) for best split of lower[:i], or None
# Score: (-word_count, sum_of_squared_lengths) — fewer words first,
# then prefer longer words (e.g. "come on" over "com eon")
dp: list = [None] * (n + 1)
dp[0] = []
dp[0] = ([], 0)
for i in range(1, n + 1):
# Try all possible last-word lengths (2..min(i, 20))
# Allow single-char words only for 'a' and 'I'
min_len = 1
for j in range(max(0, i - 20), i):
if dp[j] is None:
continue
word_len = i - j
candidate = lower[j:i]
word_len = i - j
if word_len == 1 and candidate not in ('a', 'i'):
continue
if word_len < 2 and candidate not in ('a', 'i'):
continue
if _spell_dict_knows(candidate):
new_split = dp[j] + [word_len]
# Prefer fewer words (shorter split)
if dp[i] is None or len(new_split) < len(dp[i]):
dp[i] = new_split
prev_words, prev_sq = dp[j]
new_words = prev_words + [word_len]
new_sq = prev_sq + word_len * word_len
new_key = (-len(new_words), new_sq)
if dp[i] is None:
dp[i] = (new_words, new_sq)
else:
old_key = (-len(dp[i][0]), dp[i][1])
if new_key > old_key:
dp[i] = (new_words, new_sq)
if dp[n] is None or len(dp[n]) < 2:
if dp[n] is None or len(dp[n][0]) < 2:
return None
# Reconstruct with original casing
result = []
pos = 0
for wlen in dp[n]:
for wlen in dp[n][0]:
result.append(token[pos:pos + wlen])
pos += wlen