From 21ea458fcf32595054ee51c44375fb718f6a8669 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 3 Mar 2026 15:04:27 +0100 Subject: [PATCH] feat(ocr-review): replace LLM with rule-based spell-checker (REVIEW_ENGINE=spell) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add pyspellchecker (MIT) to requirements for EN+DE dictionary lookup - New spell_review_entries_sync() + spell_review_entries_streaming(): - Dictionary-backed substitution: checks if corrected word is known - Structural rule: digit at pos 0 + lowercase rest → most likely letter (e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld") - Pattern rule: "|." → "1." for numbered list prefixes - Standalone "|" → "I" (capital I) - IPA entries still protected via existing _entry_needs_review filter - Headings/untranslated words (e.g. "Story") are untouched (no susp. chars) - llm_review_entries + llm_review_entries_streaming: route via REVIEW_ENGINE env var ("spell" default, "llm" to restore previous behaviour) - docker-compose.yml: REVIEW_ENGINE=${REVIEW_ENGINE:-spell} - LLM code preserved for fallback (set REVIEW_ENGINE=llm in .env) Co-Authored-By: Claude Sonnet 4.6 --- docker-compose.yml | 1 + klausur-service/backend/cv_vocab_pipeline.py | 183 ++++++++++++++++++- klausur-service/backend/requirements.txt | 3 + 3 files changed, 185 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 7a3686a..befbd3b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -235,6 +235,7 @@ services: OLLAMA_CORRECTION_MODEL: ${OLLAMA_CORRECTION_MODEL:-llama3.2} OLLAMA_REVIEW_MODEL: ${OLLAMA_REVIEW_MODEL:-qwen3:0.6b} OLLAMA_REVIEW_BATCH_SIZE: ${OLLAMA_REVIEW_BATCH_SIZE:-20} + REVIEW_ENGINE: ${REVIEW_ENGINE:-spell} OCR_ENGINE: ${OCR_ENGINE:-auto} OLLAMA_HTR_MODEL: ${OLLAMA_HTR_MODEL:-qwen2.5vl:32b} HTR_FALLBACK_MODEL: ${HTR_FALLBACK_MODEL:-trocr-large} diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 3f4578f..36fe9ed 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -5540,11 +5540,183 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict return changes, entries_out +# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ──────────────────────────── + +REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm" + +try: + from spellchecker import SpellChecker as _SpellChecker + _en_spell = _SpellChecker(language='en', distance=1) + _de_spell = _SpellChecker(language='de', distance=1) + _SPELL_AVAILABLE = True + logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE) +except ImportError: + _SPELL_AVAILABLE = False + logger.warning("pyspellchecker not installed — falling back to LLM review") + +# Suspicious OCR chars → ordered list of most-likely correct replacements +_SPELL_SUBS: Dict[str, List[str]] = { + '0': ['O', 'o'], + '1': ['l', 'I'], + '5': ['S', 's'], + '6': ['G', 'g'], + '8': ['B', 'b'], + '|': ['I', 'l', '1'], +} +_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys()) + +# Tokenizer: word tokens (letters + pipe) alternating with separators +_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)') + + +def _spell_dict_knows(word: str) -> bool: + """True if word is known in EN or DE dictionary.""" + if not _SPELL_AVAILABLE: + return False + w = word.lower() + return bool(_en_spell.known([w])) or bool(_de_spell.known([w])) + + +def _spell_fix_token(token: str) -> Optional[str]: + """Return corrected form of token, or None if no fix needed/possible.""" + if not any(ch in _SPELL_SUSPICIOUS for ch in token): + return None + # Standalone pipe → capital I + if token == '|': + return 'I' + # Original is already a valid word → leave it + if _spell_dict_knows(token): + return None + # Dictionary-backed single-char substitution + for i, ch in enumerate(token): + if ch not in _SPELL_SUBS: + continue + for replacement in _SPELL_SUBS[ch]: + candidate = token[:i] + replacement + token[i + 1:] + if _spell_dict_knows(candidate): + return candidate + # Structural rule: suspicious char at position 0 + rest is all lowercase letters + # e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld" + first = token[0] + if first in _SPELL_SUBS and len(token) >= 2: + rest = token[1:] + if rest.isalpha() and rest.islower(): + candidate = _SPELL_SUBS[first][0] + rest + if not candidate[0].isdigit(): + return candidate + return None + + +def _spell_fix_field(text: str) -> Tuple[str, bool]: + """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).""" + if not text or not any(ch in text for ch in _SPELL_SUSPICIOUS): + return text, False + # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ") + fixed = _re.sub(r'(? Dict: + """Rule-based OCR correction: spell-checker + structural heuristics. + + Deterministic — never translates, never touches IPA, never hallucinates. + """ + t0 = time.time() + changes: List[Dict] = [] + all_corrected: List[Dict] = [] + for i, entry in enumerate(entries): + e = dict(entry) + if not _entry_needs_review(e): + all_corrected.append(e) + continue + for field_name in ("english", "german"): + old_val = (e.get(field_name) or "").strip() + if not old_val: + continue + new_val, was_changed = _spell_fix_field(old_val) + if was_changed and new_val != old_val: + changes.append({ + "row_index": e.get("row_index", i), + "field": field_name, + "old": old_val, + "new": new_val, + }) + e[field_name] = new_val + e["llm_corrected"] = True + all_corrected.append(e) + duration_ms = int((time.time() - t0) * 1000) + return { + "entries_original": entries, + "entries_corrected": all_corrected, + "changes": changes, + "skipped_count": 0, + "model_used": "spell-checker", + "duration_ms": duration_ms, + } + + +async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50): + """Async generator yielding SSE-compatible events for spell-checker review.""" + total = len(entries) + yield { + "type": "meta", + "total_entries": total, + "to_review": total, + "skipped": 0, + "model": "spell-checker", + "batch_size": batch_size, + } + result = spell_review_entries_sync(entries) + changes = result["changes"] + yield { + "type": "batch", + "batch_index": 0, + "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)], + "changes": changes, + "duration_ms": result["duration_ms"], + "progress": {"current": total, "total": total}, + } + yield { + "type": "complete", + "changes": changes, + "model_used": "spell-checker", + "duration_ms": result["duration_ms"], + "total_entries": total, + "reviewed": total, + "skipped": 0, + "corrections_found": len(changes), + "entries_corrected": result["entries_corrected"], + } + +# ─── End Spell-Checker ──────────────────────────────────────────────────────── + + async def llm_review_entries( entries: List[Dict], model: str = None, ) -> Dict: - """Send vocab entries to a local LLM for OCR error correction (single batch).""" + """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm).""" + if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: + return spell_review_entries_sync(entries) + if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE: + logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM") + model = model or OLLAMA_REVIEW_MODEL # Filter: only entries that need review @@ -5616,7 +5788,14 @@ async def llm_review_entries_streaming( model: str = None, batch_size: int = _REVIEW_BATCH_SIZE, ): - """Async generator: yield SSE events while reviewing entries in batches.""" + """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.""" + if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: + async for event in spell_review_entries_streaming(entries, batch_size): + yield event + return + if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE: + logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM") + model = model or OLLAMA_REVIEW_MODEL # Separate reviewable from skipped entries diff --git a/klausur-service/backend/requirements.txt b/klausur-service/backend/requirements.txt index bc2511f..fc2e9f8 100644 --- a/klausur-service/backend/requirements.txt +++ b/klausur-service/backend/requirements.txt @@ -35,6 +35,9 @@ onnxruntime # IPA pronunciation dictionary lookup (MIT license, bundled CMU dict ~134k words) eng-to-ipa +# Spell-checker for rule-based OCR correction (MIT license) +pyspellchecker>=0.8.1 + # PostgreSQL (for metrics storage) psycopg2-binary>=2.9.0 asyncpg>=0.29.0