feat(ocr-review): replace LLM with rule-based spell-checker (REVIEW_ENGINE=spell)

- Add pyspellchecker (MIT) to requirements for EN+DE dictionary lookup
- New spell_review_entries_sync() + spell_review_entries_streaming():
  - Dictionary-backed substitution: checks if corrected word is known
  - Structural rule: digit at pos 0 + lowercase rest → most likely letter
    (e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld")
  - Pattern rule: "|." → "1." for numbered list prefixes
  - Standalone "|" → "I" (capital I)
  - IPA entries still protected via existing _entry_needs_review filter
  - Headings/untranslated words (e.g. "Story") are untouched (they contain no suspicious characters)
- llm_review_entries + llm_review_entries_streaming: route via REVIEW_ENGINE
  env var ("spell" default, "llm" to restore previous behaviour)
- docker-compose.yml: REVIEW_ENGINE=${REVIEW_ENGINE:-spell}
- LLM code preserved for fallback (set REVIEW_ENGINE=llm in .env)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 15:04:27 +01:00
parent b1f7fee284
commit 21ea458fcf
3 changed files with 185 additions and 2 deletions

View File

@@ -235,6 +235,7 @@ services:
OLLAMA_CORRECTION_MODEL: ${OLLAMA_CORRECTION_MODEL:-llama3.2}
OLLAMA_REVIEW_MODEL: ${OLLAMA_REVIEW_MODEL:-qwen3:0.6b}
OLLAMA_REVIEW_BATCH_SIZE: ${OLLAMA_REVIEW_BATCH_SIZE:-20}
REVIEW_ENGINE: ${REVIEW_ENGINE:-spell}
OCR_ENGINE: ${OCR_ENGINE:-auto}
OLLAMA_HTR_MODEL: ${OLLAMA_HTR_MODEL:-qwen2.5vl:32b}
HTR_FALLBACK_MODEL: ${HTR_FALLBACK_MODEL:-trocr-large}

View File

@@ -5540,11 +5540,183 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict
return changes, entries_out
# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────
# Review engine selector read once at import time.
# The routing code compares this value with the exact string "spell", so the
# raw environment value is normalised here: surrounding whitespace is removed
# and the text is lower-cased, which makes " Spell" or "LLM" in an .env file
# behave like their canonical forms. An empty value falls back to the default,
# mirroring the `${REVIEW_ENGINE:-spell}` expansion used in docker-compose.
REVIEW_ENGINE = (
    os.getenv("REVIEW_ENGINE", "spell").strip().lower() or "spell"
)  # "spell" (default) | "llm"

# pyspellchecker is optional: when the import fails the module still loads and
# _SPELL_AVAILABLE tells the callers to use the LLM path instead.
try:
    from spellchecker import SpellChecker as _SpellChecker

    # distance=1 is passed through to the SpellChecker constructor for both
    # the English and the German dictionary.
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
except ImportError:
    _SPELL_AVAILABLE = False
    logger.warning("pyspellchecker not installed — falling back to LLM review")
# Suspicious OCR chars → ordered list of most-likely correct replacements
_SPELL_SUBS: Dict[str, List[str]] = {
'0': ['O', 'o'],
'1': ['l', 'I'],
'5': ['S', 's'],
'6': ['G', 'g'],
'8': ['B', 'b'],
'|': ['I', 'l', '1'],
}
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())
# Tokenizer: word tokens (letters + pipe) alternating with separators
_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')
def _spell_dict_knows(word: str) -> bool:
    """Report whether *word* is found in the English or the German dictionary.

    The lookup is done on the lower-cased word. When pyspellchecker could not
    be imported the answer is always False.
    """
    if _SPELL_AVAILABLE:
        lowered = word.lower()
        # English is consulted first; German only when English has no hit.
        if _en_spell.known([lowered]):
            return True
        if _de_spell.known([lowered]):
            return True
    return False
def _spell_fix_token(token: str) -> Optional[str]:
    """Propose a correction for a single token.

    Returns the corrected token, or None when the token contains no
    suspicious character, is already a dictionary word, or no rule applies.
    """
    # Nothing to do unless at least one OCR-suspicious character is present.
    if _SPELL_SUSPICIOUS.isdisjoint(token):
        return None

    # A lone pipe is read as the capital letter I.
    if token == '|':
        return 'I'

    # A token the dictionaries already accept is left alone.
    if _spell_dict_knows(token):
        return None

    # Try every single-character substitution, left to right and in the
    # preference order of _SPELL_SUBS; the first dictionary hit wins.
    for idx, char in enumerate(token):
        for repl in _SPELL_SUBS.get(char, ()):
            guess = token[:idx] + repl + token[idx + 1:]
            if _spell_dict_knows(guess):
                return guess

    # Structural fallback: suspicious first character followed only by
    # lower-case letters, e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See",
    # "6eld"→"Geld". The preferred replacement is used without a lookup.
    head, tail = token[0], token[1:]
    if head not in _SPELL_SUBS or len(token) < 2:
        return None
    if not (tail.isalpha() and tail.islower()):
        return None
    guess = _SPELL_SUBS[head][0] + tail
    if guess[0].isdigit():
        return None
    return guess
def _spell_fix_field(text: str) -> Tuple[str, bool]:
    """Apply OCR corrections to one text field.

    Args:
        text: The field content to inspect.

    Returns:
        A ``(fixed_text, was_changed)`` tuple. ``was_changed`` is True exactly
        when ``fixed_text`` differs from ``text``.

    Every character that the tokenizer does not match (text before the first
    token, or anything between two matches) is copied to the output verbatim,
    so a correction in one word can never drop surrounding content such as a
    leading "1. " or an opening bracket.
    """
    # Fast path: empty input or no suspicious character anywhere.
    if not text or not any(ch in text for ch in _SPELL_SUSPICIOUS):
        return text, False

    # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
    fixed = _re.sub(r'(?<!\w)\|(?=[.,])', '1', text)

    # Tokenize and fix word by word, preserving everything in between.
    parts: List[str] = []
    pos = 0
    for m in _SPELL_TOKEN_RE.finditer(fixed):
        if m.start() > pos:
            # Unmatched gap before this token: keep it untouched.
            parts.append(fixed[pos:m.start()])
        token, sep = m.group(1), m.group(2)
        correction = _spell_fix_token(token)
        parts.append(token if correction is None else correction)
        parts.append(sep)
        pos = m.end()
    if pos < len(fixed):
        # Trailing remainder after the last match.
        parts.append(fixed[pos:])

    result = ''.join(parts)
    return result, result != text
def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Run the rule-based OCR correction over all entries.

    Each entry is shallow-copied. Entries rejected by ``_entry_needs_review``
    are passed through unchanged; for the others the "english" and "german"
    fields are stripped and sent through ``_spell_fix_field``. Every real
    change is recorded and the copy is flagged with ``llm_corrected``.

    Returns a dict with the original entries, the corrected copies, the list
    of changes, a skipped count of 0, the model name "spell-checker" and the
    elapsed time in milliseconds.
    """
    started = time.time()
    change_log: List[Dict] = []
    reviewed: List[Dict] = []

    for idx, original in enumerate(entries):
        row = dict(original)
        if _entry_needs_review(row):
            for key in ("english", "german"):
                before = (row.get(key) or "").strip()
                if not before:
                    continue
                after, touched = _spell_fix_field(before)
                if not touched or after == before:
                    continue
                change_log.append({
                    "row_index": row.get("row_index", idx),
                    "field": key,
                    "old": before,
                    "new": after,
                })
                row[key] = after
                row["llm_corrected"] = True
        reviewed.append(row)

    elapsed_ms = int((time.time() - started) * 1000)
    return {
        "entries_original": entries,
        "entries_corrected": reviewed,
        "changes": change_log,
        "skipped_count": 0,
        "model_used": "spell-checker",
        "duration_ms": elapsed_ms,
    }
async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Yield the spell-checker review as three events: meta, batch, complete.

    ``batch_size`` is only echoed in the meta event; all entries are handled
    in a single call to ``spell_review_entries_sync`` and reported as one
    batch with index 0.
    """
    engine = "spell-checker"
    count = len(entries)

    yield {
        "type": "meta",
        "total_entries": count,
        "to_review": count,
        "skipped": 0,
        "model": engine,
        "batch_size": batch_size,
    }

    outcome = spell_review_entries_sync(entries)
    found = outcome["changes"]
    elapsed_ms = outcome["duration_ms"]

    # Row identifiers of every entry, falling back to the list position.
    row_ids = []
    for idx, item in enumerate(entries):
        row_ids.append(item.get("row_index", idx))

    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": row_ids,
        "changes": found,
        "duration_ms": elapsed_ms,
        "progress": {"current": count, "total": count},
    }

    yield {
        "type": "complete",
        "changes": found,
        "model_used": engine,
        "duration_ms": elapsed_ms,
        "total_entries": count,
        "reviewed": count,
        "skipped": 0,
        "corrections_found": len(found),
        "entries_corrected": outcome["entries_corrected"],
    }
# ─── End Spell-Checker ────────────────────────────────────────────────────────
async def llm_review_entries(
entries: List[Dict],
model: str = None,
) -> Dict:
"""Send vocab entries to a local LLM for OCR error correction (single batch)."""
"""OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
return spell_review_entries_sync(entries)
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
model = model or OLLAMA_REVIEW_MODEL
# Filter: only entries that need review
@@ -5616,7 +5788,14 @@ async def llm_review_entries_streaming(
model: str = None,
batch_size: int = _REVIEW_BATCH_SIZE,
):
"""Async generator: yield SSE events while reviewing entries in batches."""
"""Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE."""
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
async for event in spell_review_entries_streaming(entries, batch_size):
yield event
return
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
model = model or OLLAMA_REVIEW_MODEL
# Separate reviewable from skipped entries

View File

@@ -35,6 +35,9 @@ onnxruntime
# IPA pronunciation dictionary lookup (MIT license, bundled CMU dict ~134k words)
eng-to-ipa
# Spell-checker for rule-based OCR correction (MIT license)
pyspellchecker>=0.8.1
# PostgreSQL (for metrics storage)
psycopg2-binary>=2.9.0
asyncpg>=0.29.0