fix: move char-confusion fix to correction step, add spell + page-ref corrections
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 30s
CI / test-nodejs-website (push) Successful in 20s
CI / nodejs-lint (push) Failing after 10m5s

- Remove _fix_character_confusion() from words endpoint (now only in Phase 0)
- Extend spell checker to find real OCR errors via spell.correction()
- Add field-aware dictionary selection (EN/DE) for spell corrections
- Add _normalize_page_ref() for page_ref column (p-60 → p.60)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-05 00:26:13 +01:00
parent fd99d4f875
commit a58dfca1d8
2 changed files with 83 additions and 33 deletions

View File

@@ -6891,6 +6891,18 @@ except ImportError:
_SPELL_AVAILABLE = False _SPELL_AVAILABLE = False
logger.warning("pyspellchecker not installed — falling back to LLM review") logger.warning("pyspellchecker not installed — falling back to LLM review")
# ─── Page-Ref Normalization ───────────────────────────────────────────────────
# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60"
# Matches a stand-alone "p" page marker followed by a page number, with an
# optional separator in between: any mix of whitespace and at most one '.' or
# '-'.  The leading \b keeps the pattern from firing on a "p" that ends a
# longer word (e.g. "step 5") or on the second "p" of "pp.".
_PAGE_REF_RE = _re.compile(r'\bp\s*[.\-]?\s*(\d+)', _re.IGNORECASE)


def _normalize_page_ref(text: str) -> str:
    """Normalize page references to the canonical ``p.<number>`` form.

    OCR output renders the same reference in several ways; all of the
    following become ``p.60``::

        'p-60'  'p 60'  'p60'  'p. 60'  'p - 60'  'P.60'

    The marker is always emitted as a lowercase ``p``.  Text that is already
    canonical is returned unchanged, so the function is safe to apply
    repeatedly.  Every reference found in *text* is rewritten, not just the
    first one.

    Args:
        text: Raw cell content of a page-reference field.  Falsy values
            (``""`` or ``None``) are returned as-is.

    Returns:
        *text* with every recognised page reference rewritten as
        ``p.<number>``.
    """
    if not text:
        return text
    # \1 is the captured page number; the '.' in the template is literal.
    return _PAGE_REF_RE.sub(r'p.\1', text)
# Suspicious OCR chars → ordered list of most-likely correct replacements # Suspicious OCR chars → ordered list of most-likely correct replacements
_SPELL_SUBS: Dict[str, List[str]] = { _SPELL_SUBS: Dict[str, List[str]] = {
'0': ['O', 'o'], '0': ['O', 'o'],
@@ -6914,49 +6926,76 @@ def _spell_dict_knows(word: str) -> bool:
return bool(_en_spell.known([w])) or bool(_de_spell.known([w])) return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))
def _spell_fix_token(token: str) -> Optional[str]: def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
"""Return corrected form of token, or None if no fix needed/possible.""" """Return corrected form of token, or None if no fix needed/possible.
if not any(ch in _SPELL_SUSPICIOUS for ch in token):
return None *field* is 'english' or 'german' — used to pick the right dictionary
# Standalone pipe → capital I for general spell correction (step 3 below).
if token == '|': """
return 'I' has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)
# Original is already a valid word → leave it
# 1. Already known word → no fix needed
if _spell_dict_knows(token): if _spell_dict_knows(token):
return None return None
# Dictionary-backed single-char substitution
for i, ch in enumerate(token): # 2. Digit/pipe substitution (existing logic)
if ch not in _SPELL_SUBS: if has_suspicious:
continue # Standalone pipe → capital I
for replacement in _SPELL_SUBS[ch]: if token == '|':
candidate = token[:i] + replacement + token[i + 1:] return 'I'
if _spell_dict_knows(candidate): # Dictionary-backed single-char substitution
return candidate for i, ch in enumerate(token):
# Structural rule: suspicious char at position 0 + rest is all lowercase letters if ch not in _SPELL_SUBS:
# e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld" continue
first = token[0] for replacement in _SPELL_SUBS[ch]:
if first in _SPELL_SUBS and len(token) >= 2: candidate = token[:i] + replacement + token[i + 1:]
rest = token[1:] if _spell_dict_knows(candidate):
if rest.isalpha() and rest.islower(): return candidate
candidate = _SPELL_SUBS[first][0] + rest # Structural rule: suspicious char at position 0 + rest is all lowercase letters
if not candidate[0].isdigit(): first = token[0]
return candidate if first in _SPELL_SUBS and len(token) >= 2:
rest = token[1:]
if rest.isalpha() and rest.islower():
candidate = _SPELL_SUBS[first][0] + rest
if not candidate[0].isdigit():
return candidate
# 3. General spell correction for unknown words (no digits/pipes)
# e.g. "iberqueren" → "ueberqueren", "beautful" → "beautiful"
if not has_suspicious and len(token) >= 3 and token.isalpha():
spell = _en_spell if field == "english" else _de_spell if field == "german" else None
if spell is not None:
correction = spell.correction(token.lower())
if correction and correction != token.lower():
# Preserve original capitalisation pattern
if token[0].isupper():
correction = correction[0].upper() + correction[1:]
if _spell_dict_knows(correction):
return correction
return None return None
def _spell_fix_field(text: str) -> Tuple[str, bool]: def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
"""Apply OCR corrections to a text field. Returns (fixed_text, was_changed).""" """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).
if not text or not any(ch in text for ch in _SPELL_SUSPICIOUS):
*field* is 'english' or 'german' — forwarded to _spell_fix_token for
dictionary selection.
"""
if not text:
return text, False
has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
# If no suspicious chars AND no alpha chars that could be misspelled, skip
if not has_suspicious and not any(c.isalpha() for c in text):
return text, False return text, False
# Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ") # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
fixed = _re.sub(r'(?<!\w)\|(?=[.,])', '1', text) fixed = _re.sub(r'(?<!\w)\|(?=[.,])', '1', text) if has_suspicious else text
changed = fixed != text changed = fixed != text
# Tokenize and fix word by word # Tokenize and fix word by word
parts: List[str] = [] parts: List[str] = []
pos = 0 pos = 0
for m in _SPELL_TOKEN_RE.finditer(fixed): for m in _SPELL_TOKEN_RE.finditer(fixed):
token, sep = m.group(1), m.group(2) token, sep = m.group(1), m.group(2)
correction = _spell_fix_token(token) correction = _spell_fix_token(token, field=field)
if correction: if correction:
parts.append(correction) parts.append(correction)
changed = True changed = True
@@ -6979,6 +7018,19 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
all_corrected: List[Dict] = [] all_corrected: List[Dict] = []
for i, entry in enumerate(entries): for i, entry in enumerate(entries):
e = dict(entry) e = dict(entry)
# Page-ref normalization (always, regardless of review status)
old_ref = (e.get("source_page") or "").strip()
if old_ref:
new_ref = _normalize_page_ref(old_ref)
if new_ref != old_ref:
changes.append({
"row_index": e.get("row_index", i),
"field": "source_page",
"old": old_ref,
"new": new_ref,
})
e["source_page"] = new_ref
e["llm_corrected"] = True
if not _entry_needs_review(e): if not _entry_needs_review(e):
all_corrected.append(e) all_corrected.append(e)
continue continue
@@ -6986,7 +7038,7 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
old_val = (e.get(field_name) or "").strip() old_val = (e.get(field_name) or "").strip()
if not old_val: if not old_val:
continue continue
new_val, was_changed = _spell_fix_field(old_val) new_val, was_changed = _spell_fix_field(old_val, field=field_name)
if was_changed and new_val != old_val: if was_changed and new_val != old_val:
changes.append({ changes.append({
"row_index": e.get("row_index", i), "row_index": e.get("row_index", i),

View File

@@ -1348,7 +1348,6 @@ async def detect_words(
# No content shuffling — each cell stays at its detected position. # No content shuffling — each cell stays at its detected position.
if is_vocab: if is_vocab:
entries = _cells_to_vocab_entries(cells, columns_meta) entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_character_confusion(entries)
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
word_result["vocab_entries"] = entries word_result["vocab_entries"] = entries
word_result["entries"] = entries word_result["entries"] = entries
@@ -1487,7 +1486,6 @@ async def _word_batch_stream_generator(
vocab_entries = None vocab_entries = None
if is_vocab: if is_vocab:
entries = _cells_to_vocab_entries(cells, columns_meta) entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_character_confusion(entries)
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
word_result["vocab_entries"] = entries word_result["vocab_entries"] = entries
word_result["entries"] = entries word_result["entries"] = entries