Add SmartSpellChecker + refactor vocab-worksheet page.tsx

SmartSpellChecker (klausur-service):
- Language-aware OCR post-correction without LLMs
- Dual-dictionary heuristic for EN/DE language detection
- Context-based a/I disambiguation via bigram lookup
- Multi-digit substitution (sch00l→school)
- Cross-language guard (don't false-correct DE words in EN column)
- Umlaut correction (Schuler→Schüler, uber→über)
- Integrated into spell_review_entries_sync() pipeline
- 31 tests, 9ms/100 corrections

Vocab-worksheet refactoring (studio-v2):
- Split 2337-line page.tsx into 14 files
- Custom hook useVocabWorksheet.ts (all state + logic)
- 9 components in components/ directory
- types.ts, constants.ts for shared definitions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 12:25:01 +02:00
parent 04fa01661c
commit 909d0729f6
17 changed files with 3545 additions and 2228 deletions
--- a/klausur-service/backend/cv_review.py
+++ b/klausur-service/backend/cv_review.py
@@ -881,10 +881,25 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Rule-based OCR correction: spell-checker + structural heuristics.

    Deterministic — never translates, never touches IPA, never hallucinates.
+    Uses SmartSpellChecker for language-aware corrections with context-based
+    disambiguation (a/I), multi-digit substitution, and cross-language guard.
    """
    t0 = time.time()
    changes: List[Dict] = []
    all_corrected: List[Dict] = []
+
+    # Use SmartSpellChecker if available, fall back to legacy _spell_fix_field
+    _smart = None
+    try:
+        from smart_spell import SmartSpellChecker
+        _smart = SmartSpellChecker()
+        logger.debug("spell_review: using SmartSpellChecker")
+    except Exception:
+        logger.debug("spell_review: SmartSpellChecker not available, using legacy")
+
+    # Map field names → language codes for SmartSpellChecker
+    _LANG_MAP = {"english": "en", "german": "de", "example": "auto"}
+
    for i, entry in enumerate(entries):
        e = dict(entry)
        # Page-ref normalization (always, regardless of review status)
@@ -907,9 +922,18 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
            old_val = (e.get(field_name) or "").strip()
            if not old_val:
                continue
-            # example field is mixed-language — try German first (for umlauts)
-            lang = "german" if field_name in ("german", "example") else "english"
-            new_val, was_changed = _spell_fix_field(old_val, field=lang)
+
+            if _smart:
+                # SmartSpellChecker path — language-aware, context-based
+                lang_code = _LANG_MAP.get(field_name, "en")
+                result = _smart.correct_text(old_val, lang=lang_code)
+                new_val = result.corrected
+                was_changed = result.changed
+            else:
+                # Legacy path
+                lang = "german" if field_name in ("german", "example") else "english"
+                new_val, was_changed = _spell_fix_field(old_val, field=lang)
+
            if was_changed and new_val != old_val:
                changes.append({
                    "row_index": e.get("row_index", i),
@@ -921,12 +945,13 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
                e["llm_corrected"] = True
        all_corrected.append(e)
    duration_ms = int((time.time() - t0) * 1000)
+    model_name = "smart-spell-checker" if _smart else "spell-checker"
    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": 0,
-        "model_used": "spell-checker",
+        "model_used": model_name,
        "duration_ms": duration_ms,
    }