Fix word split scoring: prefer longer words over short ones
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Failing after 20s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m41s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 30s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Failing after 20s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m41s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 30s
"Comeon" was split as "Com eon" instead of "Come on" because both are 2-word splits. Now uses sum-of-squared-lengths as a tiebreaker: "come"(16) + "on"(4) = 20 > "com"(9) + "eon"(9) = 18. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -735,36 +735,39 @@ def _try_split_merged_word(token: str) -> Optional[str]:
|
||||
lower = token.lower()
|
||||
n = len(lower)
|
||||
|
||||
# dp[i] = shortest list of word lengths that covers lower[:i], or None
|
||||
# dp[i] = (word_lengths_list, score) for best split of lower[:i], or None
|
||||
# Score: (-word_count, sum_of_squared_lengths) — fewer words first,
|
||||
# then prefer longer words (e.g. "come on" over "com eon")
|
||||
dp: list = [None] * (n + 1)
|
||||
dp[0] = []
|
||||
dp[0] = ([], 0)
|
||||
|
||||
for i in range(1, n + 1):
|
||||
# Try all possible last-word lengths (2..min(i, 20))
|
||||
# Allow single-char words only for 'a' and 'I'
|
||||
min_len = 1
|
||||
for j in range(max(0, i - 20), i):
|
||||
if dp[j] is None:
|
||||
continue
|
||||
word_len = i - j
|
||||
candidate = lower[j:i]
|
||||
word_len = i - j
|
||||
if word_len == 1 and candidate not in ('a', 'i'):
|
||||
continue
|
||||
if word_len < 2 and candidate not in ('a', 'i'):
|
||||
continue
|
||||
if _spell_dict_knows(candidate):
|
||||
new_split = dp[j] + [word_len]
|
||||
# Prefer fewer words (shorter split)
|
||||
if dp[i] is None or len(new_split) < len(dp[i]):
|
||||
dp[i] = new_split
|
||||
prev_words, prev_sq = dp[j]
|
||||
new_words = prev_words + [word_len]
|
||||
new_sq = prev_sq + word_len * word_len
|
||||
new_key = (-len(new_words), new_sq)
|
||||
if dp[i] is None:
|
||||
dp[i] = (new_words, new_sq)
|
||||
else:
|
||||
old_key = (-len(dp[i][0]), dp[i][1])
|
||||
if new_key > old_key:
|
||||
dp[i] = (new_words, new_sq)
|
||||
|
||||
if dp[n] is None or len(dp[n]) < 2:
|
||||
if dp[n] is None or len(dp[n][0]) < 2:
|
||||
return None
|
||||
|
||||
# Reconstruct with original casing
|
||||
result = []
|
||||
pos = 0
|
||||
for wlen in dp[n]:
|
||||
for wlen in dp[n][0]:
|
||||
result.append(token[pos:pos + wlen])
|
||||
pos += wlen
|
||||
|
||||
|
||||
Reference in New Issue
Block a user