Fix gutter repair: detect short fragments + show spell alternatives
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 48s
CI / test-go-edu-search (push) Successful in 49s
CI / test-python-klausur (push) Failing after 2m37s
CI / test-python-agent-core (push) Successful in 35s
CI / test-nodejs-website (push) Successful in 35s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 48s
CI / test-go-edu-search (push) Successful in 49s
CI / test-python-klausur (push) Failing after 2m37s
CI / test-python-agent-core (push) Successful in 35s
CI / test-nodejs-website (push) Successful in 35s
- Lower min word length from 3→2 for hyphen-join candidates so fragments
  like "ve" (from "ver-künden") are no longer skipped
- Return all spellchecker candidates instead of just top-1, so the user can
  pick the correct form (e.g. "stammeln" vs "stammelt")
- Frontend shows clickable alternative buttons for spell_fix suggestions
- Backend accepts text_overrides in the apply endpoint for user-selected
  alternatives

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,7 @@ interface GutterSuggestion {
|
|||||||
next_row_text: string
|
next_row_text: string
|
||||||
missing_chars: string
|
missing_chars: string
|
||||||
display_parts: string[]
|
display_parts: string[]
|
||||||
|
alternatives: string[]
|
||||||
confidence: number
|
confidence: number
|
||||||
reason: string
|
reason: string
|
||||||
}
|
}
|
||||||
@@ -50,6 +51,7 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
|
|||||||
const [result, setResult] = useState<GutterRepairResult | null>(null)
|
const [result, setResult] = useState<GutterRepairResult | null>(null)
|
||||||
const [accepted, setAccepted] = useState<Set<string>>(new Set())
|
const [accepted, setAccepted] = useState<Set<string>>(new Set())
|
||||||
const [rejected, setRejected] = useState<Set<string>>(new Set())
|
const [rejected, setRejected] = useState<Set<string>>(new Set())
|
||||||
|
const [selectedText, setSelectedText] = useState<Record<string, string>>({})
|
||||||
const [applied, setApplied] = useState(false)
|
const [applied, setApplied] = useState(false)
|
||||||
const [error, setError] = useState('')
|
const [error, setError] = useState('')
|
||||||
const [applyMessage, setApplyMessage] = useState('')
|
const [applyMessage, setApplyMessage] = useState('')
|
||||||
@@ -128,7 +130,10 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
|
|||||||
{
|
{
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
body: JSON.stringify({ accepted: Array.from(accepted) }),
|
body: JSON.stringify({
|
||||||
|
accepted: Array.from(accepted),
|
||||||
|
text_overrides: selectedText,
|
||||||
|
}),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
@@ -304,14 +309,38 @@ export function StepGutterRepair({ sessionId, onNext }: StepGutterRepairProps) {
|
|||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
) : (
|
) : (
|
||||||
<div className="flex items-center gap-2 text-sm">
|
<div className="space-y-1">
|
||||||
<span className="font-mono text-red-600 dark:text-red-400 line-through">
|
<div className="flex items-center gap-2 text-sm">
|
||||||
{s.original_text}
|
<span className="font-mono text-red-600 dark:text-red-400 line-through">
|
||||||
</span>
|
{s.original_text}
|
||||||
<span className="text-gray-400">→</span>
|
</span>
|
||||||
<span className="font-mono text-green-600 dark:text-green-400 font-semibold">
|
<span className="text-gray-400">→</span>
|
||||||
{s.suggested_text}
|
<span className="font-mono text-green-600 dark:text-green-400 font-semibold">
|
||||||
</span>
|
{selectedText[s.id] || s.suggested_text}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
{/* Alternatives: show other candidates the user can pick */}
|
||||||
|
{s.alternatives && s.alternatives.length > 0 && !applied && (
|
||||||
|
<div className="flex items-center gap-1.5 flex-wrap">
|
||||||
|
<span className="text-[10px] text-gray-400">Alternativen:</span>
|
||||||
|
{[s.suggested_text, ...s.alternatives].map((alt) => {
|
||||||
|
const isSelected = (selectedText[s.id] || s.suggested_text) === alt
|
||||||
|
return (
|
||||||
|
<button
|
||||||
|
key={alt}
|
||||||
|
onClick={() => setSelectedText(prev => ({ ...prev, [s.id]: alt }))}
|
||||||
|
className={`px-1.5 py-0.5 text-[11px] font-mono rounded transition-colors ${
|
||||||
|
isSelected
|
||||||
|
? 'bg-green-200 dark:bg-green-800 text-green-800 dark:text-green-200 font-semibold'
|
||||||
|
: 'bg-gray-100 dark:bg-gray-700 text-gray-600 dark:text-gray-300 hover:bg-gray-200 dark:hover:bg-gray-600'
|
||||||
|
}`}
|
||||||
|
>
|
||||||
|
{alt}
|
||||||
|
</button>
|
||||||
|
)
|
||||||
|
})}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -55,31 +55,40 @@ def _is_known(word: str) -> bool:
|
|||||||
return bool(_spell_de.known([w])) or bool(_spell_en.known([w]))
|
return bool(_spell_de.known([w])) or bool(_spell_en.known([w]))
|
||||||
|
|
||||||
|
|
||||||
def _spell_correction(word: str, lang: str = "both") -> Optional[str]:
|
def _spell_candidates(word: str, lang: str = "both") -> List[str]:
|
||||||
"""Get best spellchecker correction for a word."""
|
"""Get all plausible spellchecker candidates for a word (deduplicated)."""
|
||||||
_init_spellcheckers()
|
_init_spellcheckers()
|
||||||
if not _SPELL_AVAILABLE:
|
if not _SPELL_AVAILABLE:
|
||||||
return None
|
return []
|
||||||
w = word.lower()
|
w = word.lower()
|
||||||
result = None
|
seen: set = set()
|
||||||
if lang in ("de", "both") and _spell_de:
|
results: List[str] = []
|
||||||
result = _spell_de.correction(w)
|
|
||||||
if result and result != w and _spell_de.known([result]):
|
for checker in ([_spell_de, _spell_en] if lang == "both"
|
||||||
return result
|
else [_spell_de] if lang == "de"
|
||||||
if lang in ("en", "both") and _spell_en:
|
else [_spell_en]):
|
||||||
result = _spell_en.correction(w)
|
if checker is None:
|
||||||
if result and result != w and _spell_en.known([result]):
|
continue
|
||||||
return result
|
cands = checker.candidates(w)
|
||||||
return None
|
if cands:
|
||||||
|
for c in cands:
|
||||||
|
if c and c != w and c not in seen:
|
||||||
|
seen.add(c)
|
||||||
|
results.append(c)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Gutter position detection
|
# Gutter position detection
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
# Minimum word length to consider for repair (very short words are often
|
# Minimum word length for spell-fix (very short words are often legitimate)
|
||||||
# legitimate: "a", "to", "in", etc.)
|
_MIN_WORD_LEN_SPELL = 3
|
||||||
_MIN_WORD_LEN = 3
|
|
||||||
|
# Minimum word length for hyphen-join candidates (fragments at the gutter
|
||||||
|
# can be as short as 1-2 chars, e.g. "ve" from "ver-künden")
|
||||||
|
_MIN_WORD_LEN_HYPHEN = 2
|
||||||
|
|
||||||
# How close to the right column edge a word must be to count as "gutter-adjacent".
|
# How close to the right column edge a word must be to count as "gutter-adjacent".
|
||||||
# Expressed as fraction of column width (e.g. 0.75 = rightmost 25%).
|
# Expressed as fraction of column width (e.g. 0.75 = rightmost 25%).
|
||||||
@@ -138,6 +147,8 @@ class GutterSuggestion:
|
|||||||
next_row_text: str = ""
|
next_row_text: str = ""
|
||||||
missing_chars: str = ""
|
missing_chars: str = ""
|
||||||
display_parts: List[str] = field(default_factory=list)
|
display_parts: List[str] = field(default_factory=list)
|
||||||
|
# Alternatives (other plausible corrections the user can pick from)
|
||||||
|
alternatives: List[str] = field(default_factory=list)
|
||||||
# Meta:
|
# Meta:
|
||||||
confidence: float = 0.0
|
confidence: float = 0.0
|
||||||
reason: str = "" # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"
|
reason: str = "" # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"
|
||||||
@@ -186,12 +197,16 @@ def _try_hyphen_join(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, float]]:
|
def _try_spell_fix(
|
||||||
|
word_text: str, col_type: str = "",
|
||||||
|
) -> Optional[Tuple[str, float, List[str]]]:
|
||||||
"""Try to fix a single garbled gutter word via spellchecker.
|
"""Try to fix a single garbled gutter word via spellchecker.
|
||||||
|
|
||||||
Returns (corrected_word, confidence) or None.
|
Returns (best_correction, confidence, alternatives_list) or None.
|
||||||
|
The alternatives list contains other plausible corrections the user
|
||||||
|
can choose from (e.g. "stammelt" vs "stammeln").
|
||||||
"""
|
"""
|
||||||
if len(word_text) < _MIN_WORD_LEN:
|
if len(word_text) < _MIN_WORD_LEN_SPELL:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Determine language priority from column type
|
# Determine language priority from column type
|
||||||
@@ -202,21 +217,38 @@ def _try_spell_fix(word_text: str, col_type: str = "") -> Optional[Tuple[str, fl
|
|||||||
else:
|
else:
|
||||||
lang = "both"
|
lang = "both"
|
||||||
|
|
||||||
correction = _spell_correction(word_text, lang=lang)
|
candidates = _spell_candidates(word_text, lang=lang)
|
||||||
if not correction:
|
if not candidates and lang != "both":
|
||||||
# Try the other language too
|
candidates = _spell_candidates(word_text, lang="both")
|
||||||
correction = _spell_correction(word_text, lang="both")
|
|
||||||
|
|
||||||
if correction and correction.lower() != word_text.lower():
|
if not candidates:
|
||||||
# Preserve original casing of first letter
|
return None
|
||||||
if word_text[0].isupper():
|
|
||||||
correction = correction[0].upper() + correction[1:]
|
|
||||||
# Confidence based on edit distance
|
|
||||||
dist = _edit_distance(word_text.lower(), correction.lower())
|
|
||||||
conf = max(0.5, 1.0 - dist * 0.15)
|
|
||||||
return (correction, conf)
|
|
||||||
|
|
||||||
return None
|
# Preserve original casing
|
||||||
|
is_upper = word_text[0].isupper()
|
||||||
|
|
||||||
|
def _preserve_case(w: str) -> str:
|
||||||
|
if is_upper and w:
|
||||||
|
return w[0].upper() + w[1:]
|
||||||
|
return w
|
||||||
|
|
||||||
|
# Sort candidates by edit distance (closest first)
|
||||||
|
scored = []
|
||||||
|
for c in candidates:
|
||||||
|
dist = _edit_distance(word_text.lower(), c.lower())
|
||||||
|
scored.append((dist, c))
|
||||||
|
scored.sort(key=lambda x: x[0])
|
||||||
|
|
||||||
|
best_dist, best = scored[0]
|
||||||
|
best = _preserve_case(best)
|
||||||
|
conf = max(0.5, 1.0 - best_dist * 0.15)
|
||||||
|
|
||||||
|
# Build alternatives (all other candidates, also case-preserved)
|
||||||
|
alts = [_preserve_case(c) for _, c in scored[1:] if c.lower() != best.lower()]
|
||||||
|
# Limit to top 5 alternatives
|
||||||
|
alts = alts[:5]
|
||||||
|
|
||||||
|
return (best, conf, alts)
|
||||||
|
|
||||||
|
|
||||||
def _edit_distance(a: str, b: str) -> int:
|
def _edit_distance(a: str, b: str) -> int:
|
||||||
@@ -299,7 +331,7 @@ def analyse_grid_for_gutter_repair(
|
|||||||
# right edge of its column AND not a known word.
|
# right edge of its column AND not a known word.
|
||||||
for (ri, ci), cell in cell_map.items():
|
for (ri, ci), cell in cell_map.items():
|
||||||
text = (cell.get("text") or "").strip()
|
text = (cell.get("text") or "").strip()
|
||||||
if not text or len(text) < _MIN_WORD_LEN:
|
if not text:
|
||||||
continue
|
continue
|
||||||
if _is_ipa_text(text):
|
if _is_ipa_text(text):
|
||||||
continue
|
continue
|
||||||
@@ -318,12 +350,12 @@ def analyse_grid_for_gutter_repair(
|
|||||||
|
|
||||||
last_word = cell_words[-1]
|
last_word = cell_words[-1]
|
||||||
|
|
||||||
# Skip stopwords and very short words
|
# Skip stopwords
|
||||||
if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
|
if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
last_word_clean = last_word.rstrip(".,;:!?")
|
last_word_clean = last_word.rstrip(".,;:!?")
|
||||||
if len(last_word_clean) < _MIN_WORD_LEN:
|
if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check if the last word is at the gutter edge
|
# Check if the last word is at the gutter edge
|
||||||
@@ -382,11 +414,6 @@ def analyse_grid_for_gutter_repair(
|
|||||||
else:
|
else:
|
||||||
display_p1 += "-"
|
display_p1 += "-"
|
||||||
|
|
||||||
# Reconstruct cell texts after join
|
|
||||||
# Current cell: replace last word with first part (hyphenated)
|
|
||||||
# Next cell: remove first word
|
|
||||||
remaining_next = " ".join(next_words[1:])
|
|
||||||
|
|
||||||
suggestion = GutterSuggestion(
|
suggestion = GutterSuggestion(
|
||||||
type="hyphen_join",
|
type="hyphen_join",
|
||||||
zone_index=zi,
|
zone_index=zi,
|
||||||
@@ -407,10 +434,10 @@ def analyse_grid_for_gutter_repair(
|
|||||||
suggestions.append(suggestion)
|
suggestions.append(suggestion)
|
||||||
continue # skip spell_fix if hyphen_join found
|
continue # skip spell_fix if hyphen_join found
|
||||||
|
|
||||||
# --- Strategy 2: Single-word spell fix ---
|
# --- Strategy 2: Single-word spell fix (only for longer words) ---
|
||||||
fix_result = _try_spell_fix(last_word_clean, col_type)
|
fix_result = _try_spell_fix(last_word_clean, col_type)
|
||||||
if fix_result:
|
if fix_result:
|
||||||
corrected, conf = fix_result
|
corrected, conf, alts = fix_result
|
||||||
suggestion = GutterSuggestion(
|
suggestion = GutterSuggestion(
|
||||||
type="spell_fix",
|
type="spell_fix",
|
||||||
zone_index=zi,
|
zone_index=zi,
|
||||||
@@ -420,6 +447,7 @@ def analyse_grid_for_gutter_repair(
|
|||||||
cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
|
cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
|
||||||
original_text=last_word,
|
original_text=last_word,
|
||||||
suggested_text=corrected,
|
suggested_text=corrected,
|
||||||
|
alternatives=alts,
|
||||||
confidence=conf,
|
confidence=conf,
|
||||||
reason="gutter_blur",
|
reason="gutter_blur",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1923,9 +1923,20 @@ async def gutter_repair_apply(session_id: str, request: Request):
|
|||||||
if not accepted_ids:
|
if not accepted_ids:
|
||||||
return {"applied_count": 0, "changes": []}
|
return {"applied_count": 0, "changes": []}
|
||||||
|
|
||||||
|
# text_overrides: { suggestion_id: "alternative_text" }
|
||||||
|
# Allows the user to pick a different correction from the alternatives list
|
||||||
|
text_overrides = body.get("text_overrides", {})
|
||||||
|
|
||||||
from cv_gutter_repair import apply_gutter_suggestions
|
from cv_gutter_repair import apply_gutter_suggestions
|
||||||
|
|
||||||
suggestions = gutter_result.get("suggestions", [])
|
suggestions = gutter_result.get("suggestions", [])
|
||||||
|
|
||||||
|
# Apply user-selected alternatives before passing to apply
|
||||||
|
for s in suggestions:
|
||||||
|
sid = s.get("id", "")
|
||||||
|
if sid in text_overrides and text_overrides[sid]:
|
||||||
|
s["suggested_text"] = text_overrides[sid]
|
||||||
|
|
||||||
result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)
|
result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions)
|
||||||
|
|
||||||
# Save updated grid back to session
|
# Save updated grid back to session
|
||||||
|
|||||||
@@ -13,6 +13,8 @@ from cv_gutter_repair import (
|
|||||||
_try_spell_fix,
|
_try_spell_fix,
|
||||||
_edit_distance,
|
_edit_distance,
|
||||||
_word_is_at_gutter_edge,
|
_word_is_at_gutter_edge,
|
||||||
|
_MIN_WORD_LEN_SPELL,
|
||||||
|
_MIN_WORD_LEN_HYPHEN,
|
||||||
analyse_grid_for_gutter_repair,
|
analyse_grid_for_gutter_repair,
|
||||||
apply_gutter_suggestions,
|
apply_gutter_suggestions,
|
||||||
)
|
)
|
||||||
@@ -127,27 +129,33 @@ class TestTryHyphenJoin:
|
|||||||
|
|
||||||
@needs_spellchecker
|
@needs_spellchecker
|
||||||
class TestTrySpellFix:
|
class TestTrySpellFix:
|
||||||
def test_fix_garbled_ending(self):
|
def test_fix_garbled_ending_returns_alternatives(self):
|
||||||
# "stammeli" should suggest "stammeln"
|
# "stammeli" should return a correction with alternatives
|
||||||
result = _try_spell_fix("stammeli", col_type="column_de")
|
result = _try_spell_fix("stammeli", col_type="column_de")
|
||||||
assert result is not None
|
assert result is not None
|
||||||
corrected, conf = result
|
corrected, conf, alts = result
|
||||||
assert corrected == "stammeln"
|
# The best correction is one of the valid forms
|
||||||
|
all_options = [corrected] + alts
|
||||||
|
all_lower = [w.lower() for w in all_options]
|
||||||
|
# "stammeln" must be among the candidates
|
||||||
|
assert "stammeln" in all_lower, f"Expected 'stammeln' in {all_options}"
|
||||||
|
|
||||||
def test_known_word_not_fixed(self):
|
def test_known_word_not_fixed(self):
|
||||||
# "Haus" is correct — no fix needed
|
# "Haus" is correct — no fix needed
|
||||||
result = _try_spell_fix("Haus", col_type="column_de")
|
result = _try_spell_fix("Haus", col_type="column_de")
|
||||||
# Should be None since the word is correct
|
# Should be None since the word is correct
|
||||||
# (unless spellchecker suggests something else)
|
|
||||||
# Either None or same word is acceptable
|
|
||||||
if result is not None:
|
if result is not None:
|
||||||
corrected, _ = result
|
corrected, _, _ = result
|
||||||
assert corrected.lower() == "haus"
|
assert corrected.lower() == "haus"
|
||||||
|
|
||||||
def test_short_word_skipped(self):
|
def test_short_word_skipped(self):
|
||||||
result = _try_spell_fix("ab")
|
result = _try_spell_fix("ab")
|
||||||
assert result is None
|
assert result is None
|
||||||
|
|
||||||
|
def test_min_word_len_thresholds(self):
|
||||||
|
assert _MIN_WORD_LEN_HYPHEN == 2
|
||||||
|
assert _MIN_WORD_LEN_SPELL == 3
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Grid analysis tests
|
# Grid analysis tests
|
||||||
|
|||||||
Reference in New Issue
Block a user