Fix gutter repair: detect short fragments + show spell alternatives
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 48s
CI / test-go-edu-search (push) Successful in 49s
CI / test-python-klausur (push) Failing after 2m37s
CI / test-python-agent-core (push) Successful in 35s
CI / test-nodejs-website (push) Successful in 35s

- Lower the minimum word length for hyphen-join candidates from 3 to 2 so
  fragments like "ve" (from "ver-künden") are no longer skipped
- Return the best spellchecker candidate plus up to five alternatives
  instead of just top-1, so the user can pick the correct form
  (e.g. "stammeln" vs "stammelt")
- Frontend shows clickable alternative buttons for spell_fix suggestions
- Backend accepts text_overrides in apply endpoint for user-selected alternatives

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-10 19:09:12 +02:00
parent 71e1b10ac7
commit d1e7dd1c4a
4 changed files with 134 additions and 58 deletions

View File

@@ -19,6 +19,7 @@ interface GutterSuggestion {
next_row_text: string next_row_text: string
missing_chars: string missing_chars: string
display_parts: string[] display_parts: string[]
alternatives: string[]
confidence: number confidence: number
reason: string reason: string
} }
@@ -50,6 +51,7 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
const [result, setResult] = useState<GutterRepairResult | null>(null) const [result, setResult] = useState<GutterRepairResult | null>(null)
const [accepted, setAccepted] = useState<Set<string>>(new Set()) const [accepted, setAccepted] = useState<Set<string>>(new Set())
const [rejected, setRejected] = useState<Set<string>>(new Set()) const [rejected, setRejected] = useState<Set<string>>(new Set())
const [selectedText, setSelectedText] = useState<Record<string, string>>({})
const [applied, setApplied] = useState(false) const [applied, setApplied] = useState(false)
const [error, setError] = useState('') const [error, setError] = useState('')
const [applyMessage, setApplyMessage] = useState('') const [applyMessage, setApplyMessage] = useState('')
@@ -128,7 +130,10 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
{ {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ accepted: Array.from(accepted) }), body: JSON.stringify({
accepted: Array.from(accepted),
text_overrides: selectedText,
}),
}, },
) )
if (!res.ok) { if (!res.ok) {
@@ -304,14 +309,38 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
)} )}
</div> </div>
) : ( ) : (
<div className="flex items-center gap-2 text-sm"> <div className="space-y-1">
<span className="font-mono text-red-600 dark:text-red-400 line-through"> <div className="flex items-center gap-2 text-sm">
{s.original_text} <span className="font-mono text-red-600 dark:text-red-400 line-through">
</span> {s.original_text}
<span className="text-gray-400">&rarr;</span> </span>
<span className="font-mono text-green-600 dark:text-green-400 font-semibold"> <span className="text-gray-400">&rarr;</span>
{s.suggested_text} <span className="font-mono text-green-600 dark:text-green-400 font-semibold">
</span> {selectedText[s.id] || s.suggested_text}
</span>
</div>
{/* Alternatives: show other candidates the user can pick */}
{s.alternatives && s.alternatives.length > 0 && !applied && (
<div className="flex items-center gap-1.5 flex-wrap">
<span className="text-[10px] text-gray-400">Alternativen:</span>
{[s.suggested_text, ...s.alternatives].map((alt) => {
const isSelected = (selectedText[s.id] || s.suggested_text) === alt
return (
<button
key={alt}
onClick={() => setSelectedText(prev => ({ ...prev, [s.id]: alt }))}
className={`px-1.5 py-0.5 text-[11px] font-mono rounded transition-colors ${
isSelected
? 'bg-green-200 dark:bg-green-800 text-green-800 dark:text-green-200 font-semibold'
: 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-300 hover:bg-gray-200 dark:hover:bg-gray-600'
}`}
>
{alt}
</button>
)
})}
</div>
)}
</div> </div>
)} )}
</div> </div>

View File

@@ -55,31 +55,40 @@ def _is_known(word: str) -> bool:
return bool(_spell_de.known([w])) or bool(_spell_en.known([w])) return bool(_spell_de.known([w])) or bool(_spell_en.known([w]))
def _spell_correction(word: str, lang: str = "both") -> Optional[str]: def _spell_candidates(word: str, lang: str = "both") -> List[str]:
"""Get best spellchecker correction for a word.""" """Get all plausible spellchecker candidates for a word (deduplicated)."""
_init_spellcheckers() _init_spellcheckers()
if not _SPELL_AVAILABLE: if not _SPELL_AVAILABLE:
return None return []
w = word.lower() w = word.lower()
result = None seen: set = set()
if lang in ("de", "both") and _spell_de: results: List[str] = []
result = _spell_de.correction(w)
if result and result != w and _spell_de.known([result]): for checker in ([_spell_de, _spell_en] if lang == "both"
return result else [_spell_de] if lang == "de"
if lang in ("en", "both") and _spell_en: else [_spell_en]):
result = _spell_en.correction(w) if checker is None:
if result and result != w and _spell_en.known([result]): continue
return result cands = checker.candidates(w)
return None if cands:
for c in cands:
if c and c != w and c not in seen:
seen.add(c)
results.append(c)
return results
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Gutter position detection # Gutter position detection
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Minimum word length to consider for repair (very short words are often # Minimum word length for spell-fix (very short words are often legitimate)
# legitimate: "a", "to", "in", etc.) _MIN_WORD_LEN_SPELL = 3
_MIN_WORD_LEN = 3
# Minimum word length for hyphen-join candidates (fragments at the gutter
# can be as short as 2 chars, e.g. "ve" from "ver-künden"; single-character
# fragments are still skipped)
_MIN_WORD_LEN_HYPHEN = 2
# How close to the right column edge a word must be to count as "gutter-adjacent". # How close to the right column edge a word must be to count as "gutter-adjacent".
# Expressed as fraction of column width (e.g. 0.75 = rightmost 25%). # Expressed as fraction of column width (e.g. 0.75 = rightmost 25%).
@@ -138,6 +147,8 @@ class GutterSuggestion:
next_row_text: str = "" next_row_text: str = ""
missing_chars: str = "" missing_chars: str = ""
display_parts: List[str] = field(default_factory=list) display_parts: List[str] = field(default_factory=list)
# Alternatives (other plausible corrections the user can pick from)
alternatives: List[str] = field(default_factory=list)
# Meta: # Meta:
confidence: float = 0.0 confidence: float = 0.0
reason: str = "" # "gutter_truncation" | "gutter_blur" | "hyphen_continuation" reason: str = "" # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"
@@ -186,12 +197,16 @@ def _try_hyphen_join(
return None return None
def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, float]]: def _try_spell_fix(
word_text: str, col_type: str = "",
) -> Optional[Tuple[str, float, List[str]]]:
"""Try to fix a single garbled gutter word via spellchecker. """Try to fix a single garbled gutter word via spellchecker.
Returns (corrected_word, confidence) or None. Returns (best_correction, confidence, alternatives_list) or None.
The alternatives list contains other plausible corrections the user
can choose from (e.g. "stammelt" vs "stammeln").
""" """
if len(word_text) < _MIN_WORD_LEN: if len(word_text) < _MIN_WORD_LEN_SPELL:
return None return None
# Determine language priority from column type # Determine language priority from column type
@@ -202,21 +217,38 @@ def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, fl
else: else:
lang = "both" lang = "both"
correction = _spell_correction(word_text, lang=lang) candidates = _spell_candidates(word_text, lang=lang)
if not correction: if not candidates and lang != "both":
# Try the other language too candidates = _spell_candidates(word_text, lang="both")
correction = _spell_correction(word_text, lang="both")
if correction and correction.lower() != word_text.lower(): if not candidates:
# Preserve original casing of first letter return None
if word_text[0].isupper():
correction = correction[0].upper() + correction[1:]
# Confidence based on edit distance
dist = _edit_distance(word_text.lower(), correction.lower())
conf = max(0.5, 1.0 - dist * 0.15)
return (correction, conf)
return None # Preserve original casing
is_upper = word_text[0].isupper()
def _preserve_case(w: str) -> str:
if is_upper and w:
return w[0].upper() + w[1:]
return w
# Sort candidates by edit distance (closest first)
scored = []
for c in candidates:
dist = _edit_distance(word_text.lower(), c.lower())
scored.append((dist, c))
scored.sort(key=lambda x: x[0])
best_dist, best = scored[0]
best = _preserve_case(best)
conf = max(0.5, 1.0 - best_dist * 0.15)
# Build alternatives (all other candidates, also case-preserved)
alts = [_preserve_case(c) for _, c in scored[1:] if c.lower() != best.lower()]
# Limit to top 5 alternatives
alts = alts[:5]
return (best, conf, alts)
def _edit_distance(a: str, b: str) -> int: def _edit_distance(a: str, b: str) -> int:
@@ -299,7 +331,7 @@ def analyse_grid_for_gutter_repair(
# right edge of its column AND not a known word. # right edge of its column AND not a known word.
for (ri, ci), cell in cell_map.items(): for (ri, ci), cell in cell_map.items():
text = (cell.get("text") or "").strip() text = (cell.get("text") or "").strip()
if not text or len(text) < _MIN_WORD_LEN: if not text:
continue continue
if _is_ipa_text(text): if _is_ipa_text(text):
continue continue
@@ -318,12 +350,12 @@ def analyse_grid_for_gutter_repair(
last_word = cell_words[-1] last_word = cell_words[-1]
# Skip stopwords and very short words # Skip stopwords
if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS: if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
continue continue
last_word_clean = last_word.rstrip(".,;:!?") last_word_clean = last_word.rstrip(".,;:!?")
if len(last_word_clean) < _MIN_WORD_LEN: if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
continue continue
# Check if the last word is at the gutter edge # Check if the last word is at the gutter edge
@@ -382,11 +414,6 @@ def analyse_grid_for_gutter_repair(
else: else:
display_p1 += "-" display_p1 += "-"
# Reconstruct cell texts after join
# Current cell: replace last word with first part (hyphenated)
# Next cell: remove first word
remaining_next = " ".join(next_words[1:])
suggestion = GutterSuggestion( suggestion = GutterSuggestion(
type="hyphen_join", type="hyphen_join",
zone_index=zi, zone_index=zi,
@@ -407,10 +434,10 @@ def analyse_grid_for_gutter_repair(
suggestions.append(suggestion) suggestions.append(suggestion)
continue # skip spell_fix if hyphen_join found continue # skip spell_fix if hyphen_join found
# --- Strategy 2: Single-word spell fix --- # --- Strategy 2: Single-word spell fix (only for longer words) ---
fix_result = _try_spell_fix(last_word_clean, col_type) fix_result = _try_spell_fix(last_word_clean, col_type)
if fix_result: if fix_result:
corrected, conf = fix_result corrected, conf, alts = fix_result
suggestion = GutterSuggestion( suggestion = GutterSuggestion(
type="spell_fix", type="spell_fix",
zone_index=zi, zone_index=zi,
@@ -420,6 +447,7 @@ def analyse_grid_for_gutter_repair(
cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"), cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
original_text=last_word, original_text=last_word,
suggested_text=corrected, suggested_text=corrected,
alternatives=alts,
confidence=conf, confidence=conf,
reason="gutter_blur", reason="gutter_blur",
) )

View File

@@ -1923,9 +1923,20 @@ async def gutter_repair_apply(session_id: str, request: Request):
if not accepted_ids: if not accepted_ids:
return {"applied_count": 0, "changes": []} return {"applied_count": 0, "changes": []}
# text_overrides: { suggestion_id: "alternative_text" }
# Allows the user to pick a different correction from the alternatives list
text_overrides = body.get("text_overrides", {})
from cv_gutter_repair import apply_gutter_suggestions from cv_gutter_repair import apply_gutter_suggestions
suggestions = gutter_result.get("suggestions", []) suggestions = gutter_result.get("suggestions", [])
# Apply user-selected alternatives before passing to apply
for s in suggestions:
sid = s.get("id", "")
if sid in text_overrides and text_overrides[sid]:
s["suggested_text"] = text_overrides[sid]
result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions) result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)
# Save updated grid back to session # Save updated grid back to session

View File

@@ -13,6 +13,8 @@ from cv_gutter_repair import (
_try_spell_fix, _try_spell_fix,
_edit_distance, _edit_distance,
_word_is_at_gutter_edge, _word_is_at_gutter_edge,
_MIN_WORD_LEN_SPELL,
_MIN_WORD_LEN_HYPHEN,
analyse_grid_for_gutter_repair, analyse_grid_for_gutter_repair,
apply_gutter_suggestions, apply_gutter_suggestions,
) )
@@ -127,27 +129,33 @@ class TestTryHyphenJoin:
@needs_spellchecker @needs_spellchecker
class TestTrySpellFix: class TestTrySpellFix:
def test_fix_garbled_ending(self): def test_fix_garbled_ending_returns_alternatives(self):
# "stammeli" should suggest "stammeln" # "stammeli" should return a correction with alternatives
result = _try_spell_fix("stammeli", col_type="column_de") result = _try_spell_fix("stammeli", col_type="column_de")
assert result is not None assert result is not None
corrected, conf = result corrected, conf, alts = result
assert corrected == "stammeln" # The best correction is one of the valid forms
all_options = [corrected] + alts
all_lower = [w.lower() for w in all_options]
# "stammeln" must be among the candidates
assert "stammeln" in all_lower, f"Expected 'stammeln' in {all_options}"
def test_known_word_not_fixed(self): def test_known_word_not_fixed(self):
# "Haus" is correct — no fix needed # "Haus" is correct — no fix needed
result = _try_spell_fix("Haus", col_type="column_de") result = _try_spell_fix("Haus", col_type="column_de")
# Should be None since the word is correct # Should be None since the word is correct
# (unless spellchecker suggests something else)
# Either None or same word is acceptable
if result is not None: if result is not None:
corrected, _ = result corrected, _, _ = result
assert corrected.lower() == "haus" assert corrected.lower() == "haus"
def test_short_word_skipped(self): def test_short_word_skipped(self):
result = _try_spell_fix("ab") result = _try_spell_fix("ab")
assert result is None assert result is None
def test_min_word_len_thresholds(self):
assert _MIN_WORD_LEN_HYPHEN == 2
assert _MIN_WORD_LEN_SPELL == 3
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Grid analysis tests # Grid analysis tests