Fix word split scoring: prefer longer words over short ones
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Failing after 20s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m41s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 30s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Failing after 20s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m41s
CI / test-python-agent-core (push) Successful in 24s
CI / test-nodejs-website (push) Successful in 30s
"Comeon" was split as "Com eon" instead of "Come on" because both are 2-word splits. Now uses sum-of-squared-lengths as a tiebreaker: "come"(16) + "on"(4) = 20 > "com"(9) + "eon"(9) = 18.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -735,36 +735,39 @@ def _try_split_merged_word(token: str) -> Optional[str]:
|
|||||||
lower = token.lower()
|
lower = token.lower()
|
||||||
n = len(lower)
|
n = len(lower)
|
||||||
|
|
||||||
# dp[i] = shortest list of word lengths that covers lower[:i], or None
|
# dp[i] = (word_lengths_list, score) for best split of lower[:i], or None
|
||||||
|
# Score: (-word_count, sum_of_squared_lengths) — fewer words first,
|
||||||
|
# then prefer longer words (e.g. "come on" over "com eon")
|
||||||
dp: list = [None] * (n + 1)
|
dp: list = [None] * (n + 1)
|
||||||
dp[0] = []
|
dp[0] = ([], 0)
|
||||||
|
|
||||||
for i in range(1, n + 1):
|
for i in range(1, n + 1):
|
||||||
# Try all possible last-word lengths (2..min(i, 20))
|
|
||||||
# Allow single-char words only for 'a' and 'I'
|
|
||||||
min_len = 1
|
|
||||||
for j in range(max(0, i - 20), i):
|
for j in range(max(0, i - 20), i):
|
||||||
if dp[j] is None:
|
if dp[j] is None:
|
||||||
continue
|
continue
|
||||||
word_len = i - j
|
|
||||||
candidate = lower[j:i]
|
candidate = lower[j:i]
|
||||||
|
word_len = i - j
|
||||||
if word_len == 1 and candidate not in ('a', 'i'):
|
if word_len == 1 and candidate not in ('a', 'i'):
|
||||||
continue
|
continue
|
||||||
if word_len < 2 and candidate not in ('a', 'i'):
|
|
||||||
continue
|
|
||||||
if _spell_dict_knows(candidate):
|
if _spell_dict_knows(candidate):
|
||||||
new_split = dp[j] + [word_len]
|
prev_words, prev_sq = dp[j]
|
||||||
# Prefer fewer words (shorter split)
|
new_words = prev_words + [word_len]
|
||||||
if dp[i] is None or len(new_split) < len(dp[i]):
|
new_sq = prev_sq + word_len * word_len
|
||||||
dp[i] = new_split
|
new_key = (-len(new_words), new_sq)
|
||||||
|
if dp[i] is None:
|
||||||
|
dp[i] = (new_words, new_sq)
|
||||||
|
else:
|
||||||
|
old_key = (-len(dp[i][0]), dp[i][1])
|
||||||
|
if new_key > old_key:
|
||||||
|
dp[i] = (new_words, new_sq)
|
||||||
|
|
||||||
if dp[n] is None or len(dp[n]) < 2:
|
if dp[n] is None or len(dp[n][0]) < 2:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Reconstruct with original casing
|
# Reconstruct with original casing
|
||||||
result = []
|
result = []
|
||||||
pos = 0
|
pos = 0
|
||||||
for wlen in dp[n]:
|
for wlen in dp[n][0]:
|
||||||
result.append(token[pos:pos + wlen])
|
result.append(token[pos:pos + wlen])
|
||||||
pos += wlen
|
pos += wlen
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user