Fix gutter repair: detect short fragments + show spell alternatives
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 48s
CI / test-go-edu-search (push) Successful in 49s
CI / test-python-klausur (push) Failing after 2m37s
CI / test-python-agent-core (push) Successful in 35s
CI / test-nodejs-website (push) Successful in 35s

- Lower min word length from 3→2 for hyphen-join candidates so fragments
  like "ve" (from "ver-künden") are no longer skipped
- Return all spellchecker candidates instead of just top-1, so user can
  pick the correct form (e.g. "stammeln" vs "stammelt")
- Frontend shows clickable alternative buttons for spell_fix suggestions
- Backend accepts text_overrides in apply endpoint for user-selected alternatives

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-10 19:09:12 +02:00
parent 71e1b10ac7
commit d1e7dd1c4a
4 changed files with 134 additions and 58 deletions

View File

@@ -19,6 +19,7 @@ interface GutterSuggestion {
next_row_text: string
missing_chars: string
display_parts: string[]
alternatives: string[]
confidence: number
reason: string
}
@@ -50,6 +51,7 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
const [result, setResult] = useState<GutterRepairResult | null>(null)
const [accepted, setAccepted] = useState<Set<string>>(new Set())
const [rejected, setRejected] = useState<Set<string>>(new Set())
const [selectedText, setSelectedText] = useState<Record<string, string>>({})
const [applied, setApplied] = useState(false)
const [error, setError] = useState('')
const [applyMessage, setApplyMessage] = useState('')
@@ -128,7 +130,10 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
{
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ accepted: Array.from(accepted) }),
body: JSON.stringify({
accepted: Array.from(accepted),
text_overrides: selectedText,
}),
},
)
if (!res.ok) {
@@ -304,14 +309,38 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
)}
</div>
) : (
<div className="flex items-center gap-2 text-sm">
<span className="font-mono text-red-600 dark:text-red-400 line-through">
{s.original_text}
</span>
<span className="text-gray-400">&rarr;</span>
<span className="font-mono text-green-600 dark:text-green-400 font-semibold">
{s.suggested_text}
</span>
<div className="space-y-1">
<div className="flex items-center gap-2 text-sm">
<span className="font-mono text-red-600 dark:text-red-400 line-through">
{s.original_text}
</span>
<span className="text-gray-400">&rarr;</span>
<span className="font-mono text-green-600 dark:text-green-400 font-semibold">
{selectedText[s.id] || s.suggested_text}
</span>
</div>
{/* Alternatives: show other candidates the user can pick */}
{s.alternatives && s.alternatives.length > 0 && !applied && (
<div className="flex items-center gap-1.5 flex-wrap">
<span className="text-[10px] text-gray-400">Alternativen:</span>
{[s.suggested_text, ...s.alternatives].map((alt) => {
const isSelected = (selectedText[s.id] || s.suggested_text) === alt
return (
<button
key={alt}
onClick={() => setSelectedText(prev => ({ ...prev, [s.id]: alt }))}
className={`px-1.5 py-0.5 text-[11px] font-mono rounded transition-colors ${
isSelected
? 'bg-green-200 dark:bg-green-800 text-green-800 dark:text-green-200 font-semibold'
: 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-300 hover:bg-gray-200 dark:hover:bg-gray-600'
}`}
>
{alt}
</button>
)
})}
</div>
)}
</div>
)}
</div>

View File

@@ -55,31 +55,40 @@ def _is_known(word: str) -> bool:
return bool(_spell_de.known([w])) or bool(_spell_en.known([w]))
def _spell_correction(word: str, lang: str = "both") -> Optional[str]:
"""Get best spellchecker correction for a word."""
def _spell_candidates(word: str, lang: str = "both") -> List[str]:
"""Get all plausible spellchecker candidates for a word (deduplicated)."""
_init_spellcheckers()
if not _SPELL_AVAILABLE:
return None
return []
w = word.lower()
result = None
if lang in ("de", "both") and _spell_de:
result = _spell_de.correction(w)
if result and result != w and _spell_de.known([result]):
return result
if lang in ("en", "both") and _spell_en:
result = _spell_en.correction(w)
if result and result != w and _spell_en.known([result]):
return result
return None
seen: set = set()
results: List[str] = []
for checker in ([_spell_de, _spell_en] if lang == "both"
else [_spell_de] if lang == "de"
else [_spell_en]):
if checker is None:
continue
cands = checker.candidates(w)
if cands:
for c in cands:
if c and c != w and c not in seen:
seen.add(c)
results.append(c)
return results
# ---------------------------------------------------------------------------
# Gutter position detection
# ---------------------------------------------------------------------------
# Minimum word length to consider for repair (very short words are often
# legitimate: "a", "to", "in", etc.)
_MIN_WORD_LEN = 3
# Minimum word length for spell-fix (very short words are often legitimate)
_MIN_WORD_LEN_SPELL = 3
# Minimum word length for hyphen-join candidates (fragments at the gutter
# can be as short as 1-2 chars, e.g. "ve" from "ver-künden")
_MIN_WORD_LEN_HYPHEN = 2
# How close to the right column edge a word must be to count as "gutter-adjacent".
# Expressed as fraction of column width (e.g. 0.75 = rightmost 25%).
@@ -138,6 +147,8 @@ class GutterSuggestion:
next_row_text: str = ""
missing_chars: str = ""
display_parts: List[str] = field(default_factory=list)
# Alternatives (other plausible corrections the user can pick from)
alternatives: List[str] = field(default_factory=list)
# Meta:
confidence: float = 0.0
reason: str = "" # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"
@@ -186,12 +197,16 @@ def _try_hyphen_join(
return None
def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, float]]:
def _try_spell_fix(
word_text: str, col_type: str = "",
) -> Optional[Tuple[str, float, List[str]]]:
"""Try to fix a single garbled gutter word via spellchecker.
Returns (corrected_word, confidence) or None.
Returns (best_correction, confidence, alternatives_list) or None.
The alternatives list contains other plausible corrections the user
can choose from (e.g. "stammelt" vs "stammeln").
"""
if len(word_text) < _MIN_WORD_LEN:
if len(word_text) < _MIN_WORD_LEN_SPELL:
return None
# Determine language priority from column type
@@ -202,21 +217,38 @@ def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, fl
else:
lang = "both"
correction = _spell_correction(word_text, lang=lang)
if not correction:
# Try the other language too
correction = _spell_correction(word_text, lang="both")
candidates = _spell_candidates(word_text, lang=lang)
if not candidates and lang != "both":
candidates = _spell_candidates(word_text, lang="both")
if correction and correction.lower() != word_text.lower():
# Preserve original casing of first letter
if word_text[0].isupper():
correction = correction[0].upper() + correction[1:]
# Confidence based on edit distance
dist = _edit_distance(word_text.lower(), correction.lower())
conf = max(0.5, 1.0 - dist * 0.15)
return (correction, conf)
if not candidates:
return None
return None
# Preserve original casing
is_upper = word_text[0].isupper()
def _preserve_case(w: str) -> str:
if is_upper and w:
return w[0].upper() + w[1:]
return w
# Sort candidates by edit distance (closest first)
scored = []
for c in candidates:
dist = _edit_distance(word_text.lower(), c.lower())
scored.append((dist, c))
scored.sort(key=lambda x: x[0])
best_dist, best = scored[0]
best = _preserve_case(best)
conf = max(0.5, 1.0 - best_dist * 0.15)
# Build alternatives (all other candidates, also case-preserved)
alts = [_preserve_case(c) for _, c in scored[1:] if c.lower() != best.lower()]
# Limit to top 5 alternatives
alts = alts[:5]
return (best, conf, alts)
def _edit_distance(a: str, b: str) -> int:
@@ -299,7 +331,7 @@ def analyse_grid_for_gutter_repair(
# right edge of its column AND not a known word.
for (ri, ci), cell in cell_map.items():
text = (cell.get("text") or "").strip()
if not text or len(text) < _MIN_WORD_LEN:
if not text:
continue
if _is_ipa_text(text):
continue
@@ -318,12 +350,12 @@ def analyse_grid_for_gutter_repair(
last_word = cell_words[-1]
# Skip stopwords and very short words
# Skip stopwords
if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
continue
last_word_clean = last_word.rstrip(".,;:!?")
if len(last_word_clean) < _MIN_WORD_LEN:
if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
continue
# Check if the last word is at the gutter edge
@@ -382,11 +414,6 @@ def analyse_grid_for_gutter_repair(
else:
display_p1 += "-"
# Reconstruct cell texts after join
# Current cell: replace last word with first part (hyphenated)
# Next cell: remove first word
remaining_next = " ".join(next_words[1:])
suggestion = GutterSuggestion(
type="hyphen_join",
zone_index=zi,
@@ -407,10 +434,10 @@ def analyse_grid_for_gutter_repair(
suggestions.append(suggestion)
continue # skip spell_fix if hyphen_join found
# --- Strategy 2: Single-word spell fix ---
# --- Strategy 2: Single-word spell fix (only for longer words) ---
fix_result = _try_spell_fix(last_word_clean, col_type)
if fix_result:
corrected, conf = fix_result
corrected, conf, alts = fix_result
suggestion = GutterSuggestion(
type="spell_fix",
zone_index=zi,
@@ -420,6 +447,7 @@ def analyse_grid_for_gutter_repair(
cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
original_text=last_word,
suggested_text=corrected,
alternatives=alts,
confidence=conf,
reason="gutter_blur",
)

View File

@@ -1923,9 +1923,20 @@ async def gutter_repair_apply(session_id: str, request: Request):
if not accepted_ids:
return {"applied_count": 0, "changes": []}
# text_overrides: { suggestion_id: "alternative_text" }
# Allows the user to pick a different correction from the alternatives list
text_overrides = body.get("text_overrides", {})
from cv_gutter_repair import apply_gutter_suggestions
suggestions = gutter_result.get("suggestions", [])
# Apply user-selected alternatives before passing to apply
for s in suggestions:
sid = s.get("id", "")
if sid in text_overrides and text_overrides[sid]:
s["suggested_text"] = text_overrides[sid]
result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)
# Save updated grid back to session

View File

@@ -13,6 +13,8 @@ from cv_gutter_repair import (
_try_spell_fix,
_edit_distance,
_word_is_at_gutter_edge,
_MIN_WORD_LEN_SPELL,
_MIN_WORD_LEN_HYPHEN,
analyse_grid_for_gutter_repair,
apply_gutter_suggestions,
)
@@ -127,27 +129,33 @@ class TestTryHyphenJoin:
@needs_spellchecker
class TestTrySpellFix:
def test_fix_garbled_ending(self):
# "stammeli" should suggest "stammeln"
def test_fix_garbled_ending_returns_alternatives(self):
# "stammeli" should return a correction with alternatives
result = _try_spell_fix("stammeli", col_type="column_de")
assert result is not None
corrected, conf = result
assert corrected == "stammeln"
corrected, conf, alts = result
# The best correction is one of the valid forms
all_options = [corrected] + alts
all_lower = [w.lower() for w in all_options]
# "stammeln" must be among the candidates
assert "stammeln" in all_lower, f"Expected 'stammeln' in {all_options}"
def test_known_word_not_fixed(self):
# "Haus" is correct — no fix needed
result = _try_spell_fix("Haus", col_type="column_de")
# Should be None since the word is correct
# (unless spellchecker suggests something else)
# Either None or same word is acceptable
if result is not None:
corrected, _ = result
corrected, _, _ = result
assert corrected.lower() == "haus"
def test_short_word_skipped(self):
result = _try_spell_fix("ab")
assert result is None
def test_min_word_len_thresholds(self):
assert _MIN_WORD_LEN_HYPHEN == 2
assert _MIN_WORD_LEN_SPELL == 3
# ---------------------------------------------------------------------------
# Grid analysis tests