feat(ocr-review): replace LLM with rule-based spell-checker (REVIEW_ENGINE=spell)
- Add pyspellchecker (MIT) to requirements for EN+DE dictionary lookup
- New spell_review_entries_sync() + spell_review_entries_streaming():
- Dictionary-backed substitution: checks if corrected word is known
- Structural rule: digit at pos 0 + lowercase rest → most likely letter
(e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld")
- Pattern rule: "|." → "1." for numbered list prefixes
- Standalone "|" → "I" (capital I)
- IPA entries still protected via existing _entry_needs_review filter
- Headings/untranslated words (e.g. "Story") are untouched (no suspicious chars)
- llm_review_entries + llm_review_entries_streaming: route via REVIEW_ENGINE
env var ("spell" default, "llm" to restore previous behaviour)
- docker-compose.yml: REVIEW_ENGINE=${REVIEW_ENGINE:-spell}
- LLM code preserved for fallback (set REVIEW_ENGINE=llm in .env)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -235,6 +235,7 @@ services:
|
||||
OLLAMA_CORRECTION_MODEL: ${OLLAMA_CORRECTION_MODEL:-llama3.2}
|
||||
OLLAMA_REVIEW_MODEL: ${OLLAMA_REVIEW_MODEL:-qwen3:0.6b}
|
||||
OLLAMA_REVIEW_BATCH_SIZE: ${OLLAMA_REVIEW_BATCH_SIZE:-20}
|
||||
REVIEW_ENGINE: ${REVIEW_ENGINE:-spell}
|
||||
OCR_ENGINE: ${OCR_ENGINE:-auto}
|
||||
OLLAMA_HTR_MODEL: ${OLLAMA_HTR_MODEL:-qwen2.5vl:32b}
|
||||
HTR_FALLBACK_MODEL: ${HTR_FALLBACK_MODEL:-trocr-large}
|
||||
|
||||
@@ -5540,11 +5540,183 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict
|
||||
return changes, entries_out
|
||||
|
||||
|
||||
# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────

# Engine selector: "spell" uses the deterministic rule-based reviewer below,
# "llm" routes through the original Ollama-backed review path.
REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")  # "spell" (default) | "llm"

# pyspellchecker is optional: if it is missing, _SPELL_AVAILABLE stays False
# and the review entry points fall back to the LLM engine instead.
try:
    from spellchecker import SpellChecker as _SpellChecker
    # distance=1 keeps edit-distance candidate generation cheap; the review
    # code below only ever calls known(), never correction().
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
except ImportError:
    _SPELL_AVAILABLE = False
    logger.warning("pyspellchecker not installed — falling back to LLM review")

# Suspicious OCR chars → ordered list of most-likely correct replacements.
# Order matters: dictionary-backed substitution tries them left to right, and
# the structural rule in _spell_fix_token uses only the first entry.
_SPELL_SUBS: Dict[str, List[str]] = {
    '0': ['O', 'o'],
    '1': ['l', 'I'],
    '5': ['S', 's'],
    '6': ['G', 'g'],
    '8': ['B', 'b'],
    '|': ['I', 'l', '1'],
}
# Fast membership set for the "does this text need fixing at all?" pre-check.
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())

# Tokenizer: word tokens (letters + pipe) alternating with separators.
# NOTE(review): the token class contains no digits, so digit-leading words
# such as "8en" never form a single token here — verify that field-level
# tokenization actually lets the structural digit rule downstream fire.
_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')
|
||||
|
||||
|
||||
def _spell_dict_knows(word: str) -> bool:
    """True if *word* is known in either the EN or the DE dictionary.

    Always False when pyspellchecker could not be imported, so callers can
    treat "dictionary unavailable" and "word unknown" the same way.
    """
    if not _SPELL_AVAILABLE:
        return False
    lowered = word.lower()
    if _en_spell.known([lowered]):
        return True
    return bool(_de_spell.known([lowered]))
|
||||
|
||||
|
||||
def _spell_fix_token(token: str) -> Optional[str]:
    """Return the corrected form of *token*, or None if no fix is needed/possible.

    Correction strategy, in priority order:
      1. A standalone pipe is read as capital "I".
      2. If the token is already a valid EN/DE dictionary word, leave it alone.
      3. Dictionary-backed substitution: replace one suspicious character at a
         time with its likely letter equivalents and accept the first candidate
         the dictionary knows.
      4. Structural fallback: a suspicious leading character followed by an
         all-lowercase letter tail becomes the most likely capital letter
         (e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld"),
         even without dictionary backing.
    """
    if not any(ch in _SPELL_SUSPICIOUS for ch in token):
        return None
    # Standalone pipe → capital I
    if token == '|':
        return 'I'
    # Original is already a valid word → leave it
    if _spell_dict_knows(token):
        return None
    # Dictionary-backed single-char substitution: first known candidate wins.
    for i, ch in enumerate(token):
        if ch not in _SPELL_SUBS:
            continue
        for replacement in _SPELL_SUBS[ch]:
            candidate = token[:i] + replacement + token[i + 1:]
            if _spell_dict_knows(candidate):
                return candidate
    # Structural rule: suspicious char at position 0 + rest is all lowercase letters.
    # Every _SPELL_SUBS[x][0] is a letter (O, l, S, G, B, I), so the candidate
    # can never start with a digit — the original's `isdigit()` guard on the
    # result was dead code and has been removed.
    first = token[0]
    if first in _SPELL_SUBS and len(token) >= 2:
        rest = token[1:]
        if rest.isalpha() and rest.islower():
            return _SPELL_SUBS[first][0] + rest
    return None
|
||||
|
||||
|
||||
def _spell_fix_field(text: str) -> Tuple[str, bool]:
    """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).

    Fixes applied:
      * "|" immediately before "." or "," (numbered-list prefix) becomes "1".
      * Every word token is passed through _spell_fix_token().

    Bug fixes vs. the previous implementation:
      * Separator text *before* the first token (leading punctuation or
        whitespace) was silently dropped; output is now assembled positionally
        so all inter-token text is preserved byte-for-byte.
      * The old token pattern contained no digits, so digit-leading words such
        as "8en" could never reach the structural rule in _spell_fix_token().
        The pattern below also matches a suspicious digit followed by a
        lowercase tail at a word boundary; plain numbers ("100", "2023") still
        never form tokens and are left untouched.
    """
    if not text or not any(ch in text for ch in _SPELL_SUSPICIOUS):
        return text, False
    # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
    fixed = _re.sub(r'(?<!\w)\|(?=[.,])', '1', text)
    changed = fixed != text
    # Word tokens: runs of letters/pipes, OR a suspicious digit starting a
    # lowercase word (not preceded by a letter, so "Ha8en" is not split oddly).
    # re.compile results are cached by the re module, so this is cheap per call.
    token_re = _re.compile(
        r'[A-Za-zÄÖÜäöüß|]+|(?<![A-Za-zÄÖÜäöüß])[01568][a-zäöüß]+'
    )
    parts: List[str] = []
    pos = 0
    for m in token_re.finditer(fixed):
        if m.start() > pos:
            parts.append(fixed[pos:m.start()])  # preserve separator text as-is
        token = m.group(0)
        correction = _spell_fix_token(token)
        if correction:
            parts.append(correction)
            changed = True
        else:
            parts.append(token)
        pos = m.end()
    parts.append(fixed[pos:])  # trailing separator text (may be empty)
    return ''.join(parts), changed
|
||||
|
||||
|
||||
def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Rule-based OCR correction pass: spell-checker + structural heuristics.

    Deterministic — never translates, never touches IPA, never hallucinates.
    Returns a result dict shaped like the LLM review result so callers can
    consume either engine's output interchangeably.
    """
    started = time.time()
    corrections: List[Dict] = []
    reviewed: List[Dict] = []
    for idx, original in enumerate(entries):
        entry = dict(original)
        # IPA entries and other protected rows are passed through untouched.
        if not _entry_needs_review(entry):
            reviewed.append(entry)
            continue
        for field in ("english", "german"):
            before = (entry.get(field) or "").strip()
            if not before:
                continue
            after, touched = _spell_fix_field(before)
            if touched and after != before:
                corrections.append({
                    "row_index": entry.get("row_index", idx),
                    "field": field,
                    "old": before,
                    "new": after,
                })
                entry[field] = after
                # Same flag the LLM path sets, so downstream UI stays unchanged.
                entry["llm_corrected"] = True
        reviewed.append(entry)
    return {
        "entries_original": entries,
        "entries_corrected": reviewed,
        "changes": corrections,
        "skipped_count": 0,
        "model_used": "spell-checker",
        "duration_ms": int((time.time() - started) * 1000),
    }
|
||||
|
||||
|
||||
async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Async generator yielding SSE-compatible events for spell-checker review.

    Emits the same event sequence as the LLM streaming reviewer ("meta",
    "batch", "complete"), but the whole review runs in a single synchronous
    pass — there is only ever one batch.
    """
    total = len(entries)
    yield {
        "type": "meta",
        "total_entries": total,
        "to_review": total,
        "skipped": 0,
        "model": "spell-checker",
        "batch_size": batch_size,
    }
    result = spell_review_entries_sync(entries)
    found = result["changes"]
    elapsed = result["duration_ms"]
    reviewed_rows = [entry.get("row_index", idx) for idx, entry in enumerate(entries)]
    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": reviewed_rows,
        "changes": found,
        "duration_ms": elapsed,
        "progress": {"current": total, "total": total},
    }
    yield {
        "type": "complete",
        "changes": found,
        "model_used": "spell-checker",
        "duration_ms": elapsed,
        "total_entries": total,
        "reviewed": total,
        "skipped": 0,
        "corrections_found": len(found),
        "entries_corrected": result["entries_corrected"],
    }
|
||||
|
||||
# ─── End Spell-Checker ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def llm_review_entries(
|
||||
entries: List[Dict],
|
||||
model: str = None,
|
||||
) -> Dict:
|
||||
"""Send vocab entries to a local LLM for OCR error correction (single batch)."""
|
||||
"""OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
|
||||
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
|
||||
return spell_review_entries_sync(entries)
|
||||
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
|
||||
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
|
||||
|
||||
model = model or OLLAMA_REVIEW_MODEL
|
||||
|
||||
# Filter: only entries that need review
|
||||
@@ -5616,7 +5788,14 @@ async def llm_review_entries_streaming(
|
||||
model: str = None,
|
||||
batch_size: int = _REVIEW_BATCH_SIZE,
|
||||
):
|
||||
"""Async generator: yield SSE events while reviewing entries in batches."""
|
||||
"""Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE."""
|
||||
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
|
||||
async for event in spell_review_entries_streaming(entries, batch_size):
|
||||
yield event
|
||||
return
|
||||
if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
|
||||
logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")
|
||||
|
||||
model = model or OLLAMA_REVIEW_MODEL
|
||||
|
||||
# Separate reviewable from skipped entries
|
||||
|
||||
@@ -35,6 +35,9 @@ onnxruntime
|
||||
# IPA pronunciation dictionary lookup (MIT license, bundled CMU dict ~134k words)
|
||||
eng-to-ipa
|
||||
|
||||
# Spell-checker for rule-based OCR correction (MIT license)
|
||||
pyspellchecker>=0.8.1
|
||||
|
||||
# PostgreSQL (for metrics storage)
|
||||
psycopg2-binary>=2.9.0
|
||||
asyncpg>=0.29.0
|
||||
|
||||
Reference in New Issue
Block a user