diff --git a/admin-lehrer/components/ocr-kombi/StepGutterRepair.tsx b/admin-lehrer/components/ocr-kombi/StepGutterRepair.tsx index 5302237..37e71d5 100644 --- a/admin-lehrer/components/ocr-kombi/StepGutterRepair.tsx +++ b/admin-lehrer/components/ocr-kombi/StepGutterRepair.tsx @@ -19,6 +19,7 @@ interface GutterSuggestion { next_row_text: string missing_chars: string display_parts: string[] + alternatives: string[] confidence: number reason: string } @@ -50,6 +51,7 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) { const [result, setResult] = useState(null) const [accepted, setAccepted] = useState>(new Set()) const [rejected, setRejected] = useState>(new Set()) + const [selectedText, setSelectedText] = useState>({}) const [applied, setApplied] = useState(false) const [error, setError] = useState('') const [applyMessage, setApplyMessage] = useState('') @@ -128,7 +130,10 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) { { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ accepted: Array.from(accepted) }), + body: JSON.stringify({ + accepted: Array.from(accepted), + text_overrides: selectedText, + }), }, ) if (!res.ok) { @@ -304,14 +309,38 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) { )} ) : ( -
- - {s.original_text} - - - - {s.suggested_text} - +
+
+ + {s.original_text} + + + + {selectedText[s.id] || s.suggested_text} + +
+ {/* Alternatives: show other candidates the user can pick */} + {s.alternatives && s.alternatives.length > 0 && !applied && ( +
+ Alternativen: + {[s.suggested_text, ...s.alternatives].map((alt) => { + const isSelected = (selectedText[s.id] || s.suggested_text) === alt + return ( + + ) + })} +
+ )}
)}
diff --git a/klausur-service/backend/cv_gutter_repair.py b/klausur-service/backend/cv_gutter_repair.py index bc0780b..c9d4fea 100644 --- a/klausur-service/backend/cv_gutter_repair.py +++ b/klausur-service/backend/cv_gutter_repair.py @@ -55,31 +55,40 @@ def _is_known(word: str) -> bool: return bool(_spell_de.known([w])) or bool(_spell_en.known([w])) -def _spell_correction(word: str, lang: str = "both") -> Optional[str]: - """Get best spellchecker correction for a word.""" +def _spell_candidates(word: str, lang: str = "both") -> List[str]: + """Get all plausible spellchecker candidates for a word (deduplicated).""" _init_spellcheckers() if not _SPELL_AVAILABLE: - return None + return [] w = word.lower() - result = None - if lang in ("de", "both") and _spell_de: - result = _spell_de.correction(w) - if result and result != w and _spell_de.known([result]): - return result - if lang in ("en", "both") and _spell_en: - result = _spell_en.correction(w) - if result and result != w and _spell_en.known([result]): - return result - return None + seen: set = set() + results: List[str] = [] + + for checker in ([_spell_de, _spell_en] if lang == "both" + else [_spell_de] if lang == "de" + else [_spell_en]): + if checker is None: + continue + cands = checker.candidates(w) + if cands: + for c in cands: + if c and c != w and c not in seen: + seen.add(c) + results.append(c) + + return results # --------------------------------------------------------------------------- # Gutter position detection # --------------------------------------------------------------------------- -# Minimum word length to consider for repair (very short words are often -# legitimate: "a", "to", "in", etc.) -_MIN_WORD_LEN = 3 +# Minimum word length for spell-fix (very short words are often legitimate) +_MIN_WORD_LEN_SPELL = 3 + +# Minimum word length for hyphen-join candidates (fragments at the gutter +# can be as short as 1-2 chars, e.g. "ve" from "ver-künden") +_MIN_WORD_LEN_HYPHEN = 2 # How close to the right column edge a word must be to count as "gutter-adjacent". # Expressed as fraction of column width (e.g. 0.75 = rightmost 25%). @@ -138,6 +147,8 @@ class GutterSuggestion: next_row_text: str = "" missing_chars: str = "" display_parts: List[str] = field(default_factory=list) + # Alternatives (other plausible corrections the user can pick from) + alternatives: List[str] = field(default_factory=list) # Meta: confidence: float = 0.0 reason: str = "" # "gutter_truncation" | "gutter_blur" | "hyphen_continuation" @@ -186,12 +197,16 @@ def _try_hyphen_join( return None -def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, float]]: +def _try_spell_fix( + word_text: str, col_type: str = "", +) -> Optional[Tuple[str, float, List[str]]]: """Try to fix a single garbled gutter word via spellchecker. - Returns (corrected_word, confidence) or None. + Returns (best_correction, confidence, alternatives_list) or None. + The alternatives list contains other plausible corrections the user + can choose from (e.g. "stammelt" vs "stammeln"). """ - if len(word_text) < _MIN_WORD_LEN: + if len(word_text) < _MIN_WORD_LEN_SPELL: return None # Determine language priority from column type @@ -202,21 +217,38 @@ def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, fl else: lang = "both" - correction = _spell_correction(word_text, lang=lang) - if not correction: - # Try the other language too - correction = _spell_correction(word_text, lang="both") + candidates = _spell_candidates(word_text, lang=lang) + if not candidates and lang != "both": + candidates = _spell_candidates(word_text, lang="both") - if correction and correction.lower() != word_text.lower(): - # Preserve original casing of first letter - if word_text[0].isupper(): - correction = correction[0].upper() + correction[1:] - # Confidence based on edit distance - dist = _edit_distance(word_text.lower(), correction.lower()) - conf = max(0.5, 1.0 - dist * 0.15) - return (correction, conf) + if not candidates: + return None - return None + # Preserve original casing + is_upper = word_text[0].isupper() + + def _preserve_case(w: str) -> str: + if is_upper and w: + return w[0].upper() + w[1:] + return w + + # Sort candidates by edit distance (closest first) + scored = [] + for c in candidates: + dist = _edit_distance(word_text.lower(), c.lower()) + scored.append((dist, c)) + scored.sort(key=lambda x: x[0]) + + best_dist, best = scored[0] + best = _preserve_case(best) + conf = max(0.5, 1.0 - best_dist * 0.15) + + # Build alternatives (all other candidates, also case-preserved) + alts = [_preserve_case(c) for _, c in scored[1:] if c.lower() != best.lower()] + # Limit to top 5 alternatives + alts = alts[:5] + + return (best, conf, alts) def _edit_distance(a: str, b: str) -> int: @@ -299,7 +331,7 @@ def analyse_grid_for_gutter_repair( # right edge of its column AND not a known word. for (ri, ci), cell in cell_map.items(): text = (cell.get("text") or "").strip() - if not text or len(text) < _MIN_WORD_LEN: + if not text: continue if _is_ipa_text(text): continue @@ -318,12 +350,12 @@ def analyse_grid_for_gutter_repair( last_word = cell_words[-1] - # Skip stopwords and very short words + # Skip stopwords if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS: continue last_word_clean = last_word.rstrip(".,;:!?") - if len(last_word_clean) < _MIN_WORD_LEN: + if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN: continue # Check if the last word is at the gutter edge @@ -382,11 +414,6 @@ def analyse_grid_for_gutter_repair( else: display_p1 += "-" - # Reconstruct cell texts after join - # Current cell: replace last word with first part (hyphenated) - # Next cell: remove first word - remaining_next = " ".join(next_words[1:]) - suggestion = GutterSuggestion( type="hyphen_join", zone_index=zi, @@ -407,10 +434,10 @@ def analyse_grid_for_gutter_repair( suggestions.append(suggestion) continue # skip spell_fix if hyphen_join found - # --- Strategy 2: Single-word spell fix --- + # --- Strategy 2: Single-word spell fix (only for longer words) --- fix_result = _try_spell_fix(last_word_clean, col_type) if fix_result: - corrected, conf = fix_result + corrected, conf, alts = fix_result suggestion = GutterSuggestion( type="spell_fix", zone_index=zi, @@ -420,6 +447,7 @@ def analyse_grid_for_gutter_repair( cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"), original_text=last_word, suggested_text=corrected, + alternatives=alts, confidence=conf, reason="gutter_blur", ) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 33e16f8..f2bd0ac 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1923,9 +1923,20 @@ async def gutter_repair_apply(session_id: str, request: Request): if not accepted_ids: return {"applied_count": 0, "changes": []} + # text_overrides: { suggestion_id: "alternative_text" } + # Allows the user to pick a different correction from the alternatives list + text_overrides = body.get("text_overrides", {}) + from cv_gutter_repair import apply_gutter_suggestions suggestions = gutter_result.get("suggestions", []) + + # Apply user-selected alternatives before passing to apply + for s in suggestions: + sid = s.get("id", "") + if sid in text_overrides and text_overrides[sid]: + s["suggested_text"] = text_overrides[sid] + result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions) # Save updated grid back to session diff --git a/klausur-service/backend/tests/test_gutter_repair.py b/klausur-service/backend/tests/test_gutter_repair.py index b39bf82..b56aee9 100644 --- a/klausur-service/backend/tests/test_gutter_repair.py +++ b/klausur-service/backend/tests/test_gutter_repair.py @@ -13,6 +13,8 @@ from cv_gutter_repair import ( _try_spell_fix, _edit_distance, _word_is_at_gutter_edge, + _MIN_WORD_LEN_SPELL, + _MIN_WORD_LEN_HYPHEN, analyse_grid_for_gutter_repair, apply_gutter_suggestions, ) @@ -127,27 +129,33 @@ class TestTryHyphenJoin: @needs_spellchecker class TestTrySpellFix: - def test_fix_garbled_ending(self): - # "stammeli" should suggest "stammeln" + def test_fix_garbled_ending_returns_alternatives(self): + # "stammeli" should return a correction with alternatives result = _try_spell_fix("stammeli", col_type="column_de") assert result is not None - corrected, conf = result - assert corrected == "stammeln" + corrected, conf, alts = result + # The best correction is one of the valid forms + all_options = [corrected] + alts + all_lower = [w.lower() for w in all_options] + # "stammeln" must be among the candidates + assert "stammeln" in all_lower, f"Expected 'stammeln' in {all_options}" def test_known_word_not_fixed(self): # "Haus" is correct — no fix needed result = _try_spell_fix("Haus", col_type="column_de") # Should be None since the word is correct - # (unless spellchecker suggests something else) - # Either None or same word is acceptable if result is not None: - corrected, _ = result + corrected, _, _ = result assert corrected.lower() == "haus" def test_short_word_skipped(self): result = _try_spell_fix("ab") assert result is None + def test_min_word_len_thresholds(self): + assert _MIN_WORD_LEN_HYPHEN == 2 + assert _MIN_WORD_LEN_SPELL == 3 + # --------------------------------------------------------------------------- # Grid analysis tests