Fix gutter repair: detect short fragments + show spell alternatives
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 48s
CI / test-go-edu-search (push) Successful in 49s
CI / test-python-klausur (push) Failing after 2m37s
CI / test-python-agent-core (push) Successful in 35s
CI / test-nodejs-website (push) Successful in 35s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 48s
CI / test-go-edu-search (push) Successful in 49s
CI / test-python-klausur (push) Failing after 2m37s
CI / test-python-agent-core (push) Successful in 35s
CI / test-nodejs-website (push) Successful in 35s
- Lower min word length from 3→2 for hyphen-join candidates so fragments like "ve" (from "ver-künden") are no longer skipped
- Return all spellchecker candidates instead of just top-1, so the user can pick the correct form (e.g. "stammeln" vs "stammelt")
- Frontend shows clickable alternative buttons for spell_fix suggestions
- Backend accepts text_overrides in apply endpoint for user-selected alternatives

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,7 @@ interface GutterSuggestion {
|
||||
next_row_text: string
|
||||
missing_chars: string
|
||||
display_parts: string[]
|
||||
alternatives: string[]
|
||||
confidence: number
|
||||
reason: string
|
||||
}
|
||||
@@ -50,6 +51,7 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
|
||||
const [result, setResult] = useState<GutterRepairResult | null>(null)
|
||||
const [accepted, setAccepted] = useState<Set<string>>(new Set())
|
||||
const [rejected, setRejected] = useState<Set<string>>(new Set())
|
||||
const [selectedText, setSelectedText] = useState<Record<string, string>>({})
|
||||
const [applied, setApplied] = useState(false)
|
||||
const [error, setError] = useState('')
|
||||
const [applyMessage, setApplyMessage] = useState('')
|
||||
@@ -128,7 +130,10 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
|
||||
{
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ accepted: Array.from(accepted) }),
|
||||
body: JSON.stringify({
|
||||
accepted: Array.from(accepted),
|
||||
text_overrides: selectedText,
|
||||
}),
|
||||
},
|
||||
)
|
||||
if (!res.ok) {
|
||||
@@ -304,14 +309,38 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
|
||||
)}
|
||||
</div>
|
||||
) : (
|
||||
<div className="flex items-center gap-2 text-sm">
|
||||
<span className="font-mono text-red-600 dark:text-red-400 line-through">
|
||||
{s.original_text}
|
||||
</span>
|
||||
<span className="text-gray-400">→</span>
|
||||
<span className="font-mono text-green-600 dark:text-green-400 font-semibold">
|
||||
{s.suggested_text}
|
||||
</span>
|
||||
<div className="space-y-1">
|
||||
<div className="flex items-center gap-2 text-sm">
|
||||
<span className="font-mono text-red-600 dark:text-red-400 line-through">
|
||||
{s.original_text}
|
||||
</span>
|
||||
<span className="text-gray-400">→</span>
|
||||
<span className="font-mono text-green-600 dark:text-green-400 font-semibold">
|
||||
{selectedText[s.id] || s.suggested_text}
|
||||
</span>
|
||||
</div>
|
||||
{/* Alternatives: show other candidates the user can pick */}
|
||||
{s.alternatives && s.alternatives.length > 0 && !applied && (
|
||||
<div className="flex items-center gap-1.5 flex-wrap">
|
||||
<span className="text-[10px] text-gray-400">Alternativen:</span>
|
||||
{[s.suggested_text, ...s.alternatives].map((alt) => {
|
||||
const isSelected = (selectedText[s.id] || s.suggested_text) === alt
|
||||
return (
|
||||
<button
|
||||
key={alt}
|
||||
onClick={() => setSelectedText(prev => ({ ...prev, [s.id]: alt }))}
|
||||
className={`px-1.5 py-0.5 text-[11px] font-mono rounded transition-colors ${
|
||||
isSelected
|
||||
? 'bg-green-200 dark:bg-green-800 text-green-800 dark:text-green-200 font-semibold'
|
||||
: 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-300 hover:bg-gray-200 dark:hover:bg-gray-600'
|
||||
}`}
|
||||
>
|
||||
{alt}
|
||||
</button>
|
||||
)
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
@@ -55,31 +55,40 @@ def _is_known(word: str) -> bool:
|
||||
return bool(_spell_de.known([w])) or bool(_spell_en.known([w]))
|
||||
|
||||
|
||||
def _spell_correction(word: str, lang: str = "both") -> Optional[str]:
|
||||
"""Get best spellchecker correction for a word."""
|
||||
def _spell_candidates(word: str, lang: str = "both") -> List[str]:
|
||||
"""Get all plausible spellchecker candidates for a word (deduplicated)."""
|
||||
_init_spellcheckers()
|
||||
if not _SPELL_AVAILABLE:
|
||||
return None
|
||||
return []
|
||||
w = word.lower()
|
||||
result = None
|
||||
if lang in ("de", "both") and _spell_de:
|
||||
result = _spell_de.correction(w)
|
||||
if result and result != w and _spell_de.known([result]):
|
||||
return result
|
||||
if lang in ("en", "both") and _spell_en:
|
||||
result = _spell_en.correction(w)
|
||||
if result and result != w and _spell_en.known([result]):
|
||||
return result
|
||||
return None
|
||||
seen: set = set()
|
||||
results: List[str] = []
|
||||
|
||||
for checker in ([_spell_de, _spell_en] if lang == "both"
|
||||
else [_spell_de] if lang == "de"
|
||||
else [_spell_en]):
|
||||
if checker is None:
|
||||
continue
|
||||
cands = checker.candidates(w)
|
||||
if cands:
|
||||
for c in cands:
|
||||
if c and c != w and c not in seen:
|
||||
seen.add(c)
|
||||
results.append(c)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gutter position detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Minimum word length to consider for repair (very short words are often
|
||||
# legitimate: "a", "to", "in", etc.)
|
||||
_MIN_WORD_LEN = 3
|
||||
# Minimum word length for spell-fix (very short words are often legitimate)
|
||||
_MIN_WORD_LEN_SPELL = 3
|
||||
|
||||
# Minimum word length for hyphen-join candidates (fragments at the gutter
|
||||
# can be as short as 2 chars, e.g. "ve" from "ver-künden"; 1-char fragments are still skipped)
|
||||
_MIN_WORD_LEN_HYPHEN = 2
|
||||
|
||||
# How close to the right column edge a word must be to count as "gutter-adjacent".
|
||||
# Expressed as fraction of column width (e.g. 0.75 = rightmost 25%).
|
||||
@@ -138,6 +147,8 @@ class GutterSuggestion:
|
||||
next_row_text: str = ""
|
||||
missing_chars: str = ""
|
||||
display_parts: List[str] = field(default_factory=list)
|
||||
# Alternatives (other plausible corrections the user can pick from)
|
||||
alternatives: List[str] = field(default_factory=list)
|
||||
# Meta:
|
||||
confidence: float = 0.0
|
||||
reason: str = "" # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"
|
||||
@@ -186,12 +197,16 @@ def _try_hyphen_join(
|
||||
return None
|
||||
|
||||
|
||||
def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, float]]:
|
||||
def _try_spell_fix(
|
||||
word_text: str, col_type: str = "",
|
||||
) -> Optional[Tuple[str, float, List[str]]]:
|
||||
"""Try to fix a single garbled gutter word via spellchecker.
|
||||
|
||||
Returns (corrected_word, confidence) or None.
|
||||
Returns (best_correction, confidence, alternatives_list) or None.
|
||||
The alternatives list contains other plausible corrections the user
|
||||
can choose from (e.g. "stammelt" vs "stammeln").
|
||||
"""
|
||||
if len(word_text) < _MIN_WORD_LEN:
|
||||
if len(word_text) < _MIN_WORD_LEN_SPELL:
|
||||
return None
|
||||
|
||||
# Determine language priority from column type
|
||||
@@ -202,21 +217,38 @@ def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, fl
|
||||
else:
|
||||
lang = "both"
|
||||
|
||||
correction = _spell_correction(word_text, lang=lang)
|
||||
if not correction:
|
||||
# Try the other language too
|
||||
correction = _spell_correction(word_text, lang="both")
|
||||
candidates = _spell_candidates(word_text, lang=lang)
|
||||
if not candidates and lang != "both":
|
||||
candidates = _spell_candidates(word_text, lang="both")
|
||||
|
||||
if correction and correction.lower() != word_text.lower():
|
||||
# Preserve original casing of first letter
|
||||
if word_text[0].isupper():
|
||||
correction = correction[0].upper() + correction[1:]
|
||||
# Confidence based on edit distance
|
||||
dist = _edit_distance(word_text.lower(), correction.lower())
|
||||
conf = max(0.5, 1.0 - dist * 0.15)
|
||||
return (correction, conf)
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
return None
|
||||
# Preserve original casing
|
||||
is_upper = word_text[0].isupper()
|
||||
|
||||
def _preserve_case(w: str) -> str:
|
||||
if is_upper and w:
|
||||
return w[0].upper() + w[1:]
|
||||
return w
|
||||
|
||||
# Sort candidates by edit distance (closest first)
|
||||
scored = []
|
||||
for c in candidates:
|
||||
dist = _edit_distance(word_text.lower(), c.lower())
|
||||
scored.append((dist, c))
|
||||
scored.sort(key=lambda x: x[0])
|
||||
|
||||
best_dist, best = scored[0]
|
||||
best = _preserve_case(best)
|
||||
conf = max(0.5, 1.0 - best_dist * 0.15)
|
||||
|
||||
# Build alternatives (all other candidates, also case-preserved)
|
||||
alts = [_preserve_case(c) for _, c in scored[1:] if c.lower() != best.lower()]
|
||||
# Limit to top 5 alternatives
|
||||
alts = alts[:5]
|
||||
|
||||
return (best, conf, alts)
|
||||
|
||||
|
||||
def _edit_distance(a: str, b: str) -> int:
|
||||
@@ -299,7 +331,7 @@ def analyse_grid_for_gutter_repair(
|
||||
# right edge of its column AND not a known word.
|
||||
for (ri, ci), cell in cell_map.items():
|
||||
text = (cell.get("text") or "").strip()
|
||||
if not text or len(text) < _MIN_WORD_LEN:
|
||||
if not text:
|
||||
continue
|
||||
if _is_ipa_text(text):
|
||||
continue
|
||||
@@ -318,12 +350,12 @@ def analyse_grid_for_gutter_repair(
|
||||
|
||||
last_word = cell_words[-1]
|
||||
|
||||
# Skip stopwords and very short words
|
||||
# Skip stopwords
|
||||
if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
|
||||
continue
|
||||
|
||||
last_word_clean = last_word.rstrip(".,;:!?")
|
||||
if len(last_word_clean) < _MIN_WORD_LEN:
|
||||
if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
|
||||
continue
|
||||
|
||||
# Check if the last word is at the gutter edge
|
||||
@@ -382,11 +414,6 @@ def analyse_grid_for_gutter_repair(
|
||||
else:
|
||||
display_p1 += "-"
|
||||
|
||||
# Reconstruct cell texts after join
|
||||
# Current cell: replace last word with first part (hyphenated)
|
||||
# Next cell: remove first word
|
||||
remaining_next = " ".join(next_words[1:])
|
||||
|
||||
suggestion = GutterSuggestion(
|
||||
type="hyphen_join",
|
||||
zone_index=zi,
|
||||
@@ -407,10 +434,10 @@ def analyse_grid_for_gutter_repair(
|
||||
suggestions.append(suggestion)
|
||||
continue # skip spell_fix if hyphen_join found
|
||||
|
||||
# --- Strategy 2: Single-word spell fix ---
|
||||
# --- Strategy 2: Single-word spell fix (only for longer words) ---
|
||||
fix_result = _try_spell_fix(last_word_clean, col_type)
|
||||
if fix_result:
|
||||
corrected, conf = fix_result
|
||||
corrected, conf, alts = fix_result
|
||||
suggestion = GutterSuggestion(
|
||||
type="spell_fix",
|
||||
zone_index=zi,
|
||||
@@ -420,6 +447,7 @@ def analyse_grid_for_gutter_repair(
|
||||
cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
|
||||
original_text=last_word,
|
||||
suggested_text=corrected,
|
||||
alternatives=alts,
|
||||
confidence=conf,
|
||||
reason="gutter_blur",
|
||||
)
|
||||
|
||||
@@ -1923,9 +1923,20 @@ async def gutter_repair_apply(session_id: str, request: Request):
|
||||
if not accepted_ids:
|
||||
return {"applied_count": 0, "changes": []}
|
||||
|
||||
# text_overrides: { suggestion_id: "alternative_text" }
|
||||
# Allows the user to pick a different correction from the alternatives list
|
||||
text_overrides = body.get("text_overrides", {})
|
||||
|
||||
from cv_gutter_repair import apply_gutter_suggestions
|
||||
|
||||
suggestions = gutter_result.get("suggestions", [])
|
||||
|
||||
# Apply user-selected alternatives before passing to apply
|
||||
for s in suggestions:
|
||||
sid = s.get("id", "")
|
||||
if sid in text_overrides and text_overrides[sid]:
|
||||
s["suggested_text"] = text_overrides[sid]
|
||||
|
||||
result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)
|
||||
|
||||
# Save updated grid back to session
|
||||
|
||||
@@ -13,6 +13,8 @@ from cv_gutter_repair import (
|
||||
_try_spell_fix,
|
||||
_edit_distance,
|
||||
_word_is_at_gutter_edge,
|
||||
_MIN_WORD_LEN_SPELL,
|
||||
_MIN_WORD_LEN_HYPHEN,
|
||||
analyse_grid_for_gutter_repair,
|
||||
apply_gutter_suggestions,
|
||||
)
|
||||
@@ -127,27 +129,33 @@ class TestTryHyphenJoin:
|
||||
|
||||
@needs_spellchecker
|
||||
class TestTrySpellFix:
|
||||
def test_fix_garbled_ending(self):
|
||||
# "stammeli" should suggest "stammeln"
|
||||
def test_fix_garbled_ending_returns_alternatives(self):
|
||||
# "stammeli" should return a correction with alternatives
|
||||
result = _try_spell_fix("stammeli", col_type="column_de")
|
||||
assert result is not None
|
||||
corrected, conf = result
|
||||
assert corrected == "stammeln"
|
||||
corrected, conf, alts = result
|
||||
# The best correction is one of the valid forms
|
||||
all_options = [corrected] + alts
|
||||
all_lower = [w.lower() for w in all_options]
|
||||
# "stammeln" must be among the candidates
|
||||
assert "stammeln" in all_lower, f"Expected 'stammeln' in {all_options}"
|
||||
|
||||
def test_known_word_not_fixed(self):
|
||||
# "Haus" is correct — no fix needed
|
||||
result = _try_spell_fix("Haus", col_type="column_de")
|
||||
# Should be None since the word is correct
|
||||
# (unless spellchecker suggests something else)
|
||||
# Either None or same word is acceptable
|
||||
if result is not None:
|
||||
corrected, _ = result
|
||||
corrected, _, _ = result
|
||||
assert corrected.lower() == "haus"
|
||||
|
||||
def test_short_word_skipped(self):
|
||||
result = _try_spell_fix("ab")
|
||||
assert result is None
|
||||
|
||||
def test_min_word_len_thresholds(self):
|
||||
assert _MIN_WORD_LEN_HYPHEN == 2
|
||||
assert _MIN_WORD_LEN_SPELL == 3
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Grid analysis tests
|
||||
|
||||
Reference in New Issue
Block a user