Fix gutter repair: detect short fragments + show spell alternatives

- Lower min word length from 3→2 for hyphen-join candidates so fragments like "ve" (from "ver-künden") are no longer skipped
- Return all spellchecker candidates instead of just top-1, so user can pick the correct form (e.g. "stammeln" vs "stammelt")
- Frontend shows clickable alternative buttons for spell_fix suggestions
- Backend accepts text_overrides in apply endpoint for user-selected alternatives

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 19:09:12 +02:00
parent 71e1b10ac7
commit d1e7dd1c4a
4 changed files with 134 additions and 58 deletions
@@ -19,6 +19,7 @@ interface GutterSuggestion {
  next_row_text: string
  missing_chars: string
  display_parts: string[]
+  alternatives: string[]
  confidence: number
  reason: string
 }
@@ -50,6 +51,7 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
  const [result, setResult] = useState<GutterRepairResult | null>(null)
  const [accepted, setAccepted] = useState<Set<string>>(new Set())
  const [rejected, setRejected] = useState<Set<string>>(new Set())
+  const [selectedText, setSelectedText] = useState<Record<string, string>>({})
  const [applied, setApplied] = useState(false)
  const [error, setError] = useState('')
  const [applyMessage, setApplyMessage] = useState('')
@@ -128,7 +130,10 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
        {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
-          body: JSON.stringify({ accepted: Array.from(accepted) }),
+          body: JSON.stringify({
+            accepted: Array.from(accepted),
+            text_overrides: selectedText,
+          }),
        },
      )
      if (!res.ok) {
@@ -304,14 +309,38 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
                          )}
                        </div>
                      ) : (
-                        <div className="flex items-center gap-2 text-sm">
-                          <span className="font-mono text-red-600 dark:text-red-400 line-through">
-                            {s.original_text}
-                          </span>
-                          <span className="text-gray-400">&rarr;</span>
-                          <span className="font-mono text-green-600 dark:text-green-400 font-semibold">
-                            {s.suggested_text}
-                          </span>
+                        <div className="space-y-1">
+                          <div className="flex items-center gap-2 text-sm">
+                            <span className="font-mono text-red-600 dark:text-red-400 line-through">
+                              {s.original_text}
+                            </span>
+                            <span className="text-gray-400">&rarr;</span>
+                            <span className="font-mono text-green-600 dark:text-green-400 font-semibold">
+                              {selectedText[s.id] || s.suggested_text}
+                            </span>
+                          </div>
+                          {/* Alternatives: show other candidates the user can pick */}
+                          {s.alternatives && s.alternatives.length > 0 && !applied && (
+                            <div className="flex items-center gap-1.5 flex-wrap">
+                              <span className="text-[10px] text-gray-400">Alternativen:</span>
+                              {[s.suggested_text, ...s.alternatives].map((alt) => {
+                                const isSelected = (selectedText[s.id] || s.suggested_text) === alt
+                                return (
+                                  <button
+                                    key={alt}
+                                    onClick={() => setSelectedText(prev => ({ ...prev, [s.id]: alt }))}
+                                    className={`px-1.5 py-0.5 text-[11px] font-mono rounded transition-colors ${
+                                      isSelected
+                                        ? 'bg-green-200 dark:bg-green-800 text-green-800 dark:text-green-200 font-semibold'
+                                        : 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-300 hover:bg-gray-200 dark:hover:bg-gray-600'
+                                    }`}
+                                  >
+                                    {alt}
+                                  </button>
+                                )
+                              })}
+                            </div>
+                          )}
                        </div>
                      )}
                    </div>
@@ -55,31 +55,40 @@ def _is_known(word: str) -> bool:
    return bool(_spell_de.known([w])) or bool(_spell_en.known([w]))


-def _spell_correction(word: str, lang: str = "both") -> Optional[str]:
-    """Get best spellchecker correction for a word."""
+def _spell_candidates(word: str, lang: str = "both") -> List[str]:
+    """Get all plausible spellchecker candidates for a word (deduplicated)."""
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
-        return None
+        return []
    w = word.lower()
-    result = None
-    if lang in ("de", "both") and _spell_de:
-        result = _spell_de.correction(w)
-        if result and result != w and _spell_de.known([result]):
-            return result
-    if lang in ("en", "both") and _spell_en:
-        result = _spell_en.correction(w)
-        if result and result != w and _spell_en.known([result]):
-            return result
-    return None
+    seen: set = set()
+    results: List[str] = []
+
+    for checker in ([_spell_de, _spell_en] if lang == "both"
+                    else [_spell_de] if lang == "de"
+                    else [_spell_en]):
+        if checker is None:
+            continue
+        cands = checker.candidates(w)
+        if cands:
+            for c in cands:
+                if c and c != w and c not in seen:
+                    seen.add(c)
+                    results.append(c)
+
+    return results


 # ---------------------------------------------------------------------------
 # Gutter position detection
 # ---------------------------------------------------------------------------

-# Minimum word length to consider for repair (very short words are often
-# legitimate: "a", "to", "in", etc.)
-_MIN_WORD_LEN = 3
+# Minimum word length for spell-fix (very short words are often legitimate)
+_MIN_WORD_LEN_SPELL = 3
+
+# Minimum word length for hyphen-join candidates (gutter fragments can be
+# as short as 2 chars, e.g. "ve" from "ver-künden"; 1-char ones are skipped)
+_MIN_WORD_LEN_HYPHEN = 2

 # How close to the right column edge a word must be to count as "gutter-adjacent".
 # Expressed as fraction of column width (e.g. 0.75 = rightmost 25%).
@@ -138,6 +147,8 @@ class GutterSuggestion:
    next_row_text: str = ""
    missing_chars: str = ""
    display_parts: List[str] = field(default_factory=list)
+    # Alternatives (other plausible corrections the user can pick from)
+    alternatives: List[str] = field(default_factory=list)
    # Meta:
    confidence: float = 0.0
    reason: str = ""           # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"
@@ -186,12 +197,16 @@ def _try_hyphen_join(
    return None


-def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, float]]:
+def _try_spell_fix(
+    word_text: str, col_type: str = "",
+) -> Optional[Tuple[str, float, List[str]]]:
    """Try to fix a single garbled gutter word via spellchecker.

-    Returns (corrected_word, confidence) or None.
+    Returns (best_correction, confidence, alternatives_list) or None.
+    The alternatives list contains other plausible corrections the user
+    can choose from (e.g. "stammelt" vs "stammeln").
    """
-    if len(word_text) < _MIN_WORD_LEN:
+    if len(word_text) < _MIN_WORD_LEN_SPELL:
        return None

    # Determine language priority from column type
@@ -202,21 +217,38 @@ def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, fl
    else:
        lang = "both"

-    correction = _spell_correction(word_text, lang=lang)
-    if not correction:
-        # Try the other language too
-        correction = _spell_correction(word_text, lang="both")
+    candidates = _spell_candidates(word_text, lang=lang)
+    if not candidates and lang != "both":
+        candidates = _spell_candidates(word_text, lang="both")

-    if correction and correction.lower() != word_text.lower():
-        # Preserve original casing of first letter
-        if word_text[0].isupper():
-            correction = correction[0].upper() + correction[1:]
-        # Confidence based on edit distance
-        dist = _edit_distance(word_text.lower(), correction.lower())
-        conf = max(0.5, 1.0 - dist * 0.15)
-        return (correction, conf)
+    if not candidates:
+        return None

-    return None
+    # Preserve original casing
+    is_upper = word_text[0].isupper()
+
+    def _preserve_case(w: str) -> str:
+        if is_upper and w:
+            return w[0].upper() + w[1:]
+        return w
+
+    # Sort candidates by edit distance (closest first)
+    scored = []
+    for c in candidates:
+        dist = _edit_distance(word_text.lower(), c.lower())
+        scored.append((dist, c))
+    scored.sort(key=lambda x: x[0])
+
+    best_dist, best = scored[0]
+    best = _preserve_case(best)
+    conf = max(0.5, 1.0 - best_dist * 0.15)
+
+    # Build alternatives (all other candidates, also case-preserved)
+    alts = [_preserve_case(c) for _, c in scored[1:] if c.lower() != best.lower()]
+    # Limit to top 5 alternatives
+    alts = alts[:5]
+
+    return (best, conf, alts)


 def _edit_distance(a: str, b: str) -> int:
@@ -299,7 +331,7 @@ def analyse_grid_for_gutter_repair(
        # right edge of its column AND not a known word.
        for (ri, ci), cell in cell_map.items():
            text = (cell.get("text") or "").strip()
-            if not text or len(text) < _MIN_WORD_LEN:
+            if not text:
                continue
            if _is_ipa_text(text):
                continue
@@ -318,12 +350,12 @@ def analyse_grid_for_gutter_repair(

            last_word = cell_words[-1]

-            # Skip stopwords and very short words
+            # Skip stopwords
            if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
                continue

            last_word_clean = last_word.rstrip(".,;:!?")
-            if len(last_word_clean) < _MIN_WORD_LEN:
+            if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
                continue

            # Check if the last word is at the gutter edge
@@ -382,11 +414,6 @@ def analyse_grid_for_gutter_repair(
                                else:
                                    display_p1 += "-"

-                            # Reconstruct cell texts after join
-                            # Current cell: replace last word with first part (hyphenated)
-                            # Next cell: remove first word
-                            remaining_next = " ".join(next_words[1:])
-
                            suggestion = GutterSuggestion(
                                type="hyphen_join",
                                zone_index=zi,
@@ -407,10 +434,10 @@ def analyse_grid_for_gutter_repair(
                            suggestions.append(suggestion)
                            continue  # skip spell_fix if hyphen_join found

-            # --- Strategy 2: Single-word spell fix ---
+            # --- Strategy 2: Single-word spell fix (only for longer words) ---
            fix_result = _try_spell_fix(last_word_clean, col_type)
            if fix_result:
-                corrected, conf = fix_result
+                corrected, conf, alts = fix_result
                suggestion = GutterSuggestion(
                    type="spell_fix",
                    zone_index=zi,
@@ -420,6 +447,7 @@ def analyse_grid_for_gutter_repair(
                    cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
                    original_text=last_word,
                    suggested_text=corrected,
+                    alternatives=alts,
                    confidence=conf,
                    reason="gutter_blur",
                )
@@ -1923,9 +1923,20 @@ async def gutter_repair_apply(session_id: str, request: Request):
    if not accepted_ids:
        return {"applied_count": 0, "changes": []}

+    # text_overrides: { suggestion_id: "alternative_text" }
+    # Allows the user to pick a different correction from the alternatives list
+    text_overrides = body.get("text_overrides", {})
+
    from cv_gutter_repair import apply_gutter_suggestions

    suggestions = gutter_result.get("suggestions", [])
+
+    # Apply user-selected alternatives before passing to apply
+    for s in suggestions:
+        sid = s.get("id", "")
+        if sid in text_overrides and text_overrides[sid]:
+            s["suggested_text"] = text_overrides[sid]
+
    result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)

    # Save updated grid back to session
@@ -13,6 +13,8 @@ from cv_gutter_repair import (
    _try_spell_fix,
    _edit_distance,
    _word_is_at_gutter_edge,
+    _MIN_WORD_LEN_SPELL,
+    _MIN_WORD_LEN_HYPHEN,
    analyse_grid_for_gutter_repair,
    apply_gutter_suggestions,
 )
@@ -127,27 +129,33 @@ class TestTryHyphenJoin:

@needs_spellchecker
 class TestTrySpellFix:
-    def test_fix_garbled_ending(self):
-        # "stammeli" should suggest "stammeln"
+    def test_fix_garbled_ending_returns_alternatives(self):
+        # "stammeli" should return a correction with alternatives
        result = _try_spell_fix("stammeli", col_type="column_de")
        assert result is not None
-        corrected, conf = result
-        assert corrected == "stammeln"
+        corrected, conf, alts = result
+        # The best correction is one of the valid forms
+        all_options = [corrected] + alts
+        all_lower = [w.lower() for w in all_options]
+        # "stammeln" must be among the candidates
+        assert "stammeln" in all_lower, f"Expected 'stammeln' in {all_options}"

    def test_known_word_not_fixed(self):
        # "Haus" is correct — no fix needed
        result = _try_spell_fix("Haus", col_type="column_de")
        # Should be None since the word is correct
-        # (unless spellchecker suggests something else)
-        # Either None or same word is acceptable
        if result is not None:
-            corrected, _ = result
+            corrected, _, _ = result
            assert corrected.lower() == "haus"

    def test_short_word_skipped(self):
        result = _try_spell_fix("ab")
        assert result is None

+    def test_min_word_len_thresholds(self):
+        assert _MIN_WORD_LEN_HYPHEN == 2
+        assert _MIN_WORD_LEN_SPELL == 3
+

 # ---------------------------------------------------------------------------
 # Grid analysis tests