From a58dfca1d8408ed9dfdf495f711c2fd76debdfc7 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 5 Mar 2026 00:26:13 +0100 Subject: [PATCH] fix: move char-confusion fix to correction step, add spell + page-ref corrections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove _fix_character_confusion() from words endpoint (now only in Phase 0) - Extend spell checker to find real OCR errors via spell.correction() - Add field-aware dictionary selection (EN/DE) for spell corrections - Add _normalize_page_ref() for page_ref column (p-60 → p.60) Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 114 ++++++++++++++----- klausur-service/backend/ocr_pipeline_api.py | 2 - 2 files changed, 83 insertions(+), 33 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 1ead642..0e182f7 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -6891,6 +6891,18 @@ except ImportError: _SPELL_AVAILABLE = False logger.warning("pyspellchecker not installed — falling back to LLM review") +# ─── Page-Ref Normalization ─────────────────────────────────────────────────── +# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60" +_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE) + + +def _normalize_page_ref(text: str) -> str: + """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'.""" + if not text: + return text + return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text) + + # Suspicious OCR chars → ordered list of most-likely correct replacements _SPELL_SUBS: Dict[str, List[str]] = { '0': ['O', 'o'], @@ -6914,49 +6926,76 @@ def _spell_dict_knows(word: str) -> bool: return bool(_en_spell.known([w])) or bool(_de_spell.known([w])) -def _spell_fix_token(token: str) -> Optional[str]: - """Return corrected form of token, or None if no fix needed/possible.""" - if not any(ch in _SPELL_SUSPICIOUS for ch in token): - return None - # Standalone pipe → capital I - if token == '|': - return 'I' - # Original is already a valid word → leave it +def _spell_fix_token(token: str, field: str = "") -> Optional[str]: + """Return corrected form of token, or None if no fix needed/possible. + + *field* is 'english' or 'german' — used to pick the right dictionary + for general spell correction (step 3 below). + """ + has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token) + + # 1. Already known word → no fix needed if _spell_dict_knows(token): return None - # Dictionary-backed single-char substitution - for i, ch in enumerate(token): - if ch not in _SPELL_SUBS: - continue - for replacement in _SPELL_SUBS[ch]: - candidate = token[:i] + replacement + token[i + 1:] - if _spell_dict_knows(candidate): - return candidate - # Structural rule: suspicious char at position 0 + rest is all lowercase letters - # e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld" - first = token[0] - if first in _SPELL_SUBS and len(token) >= 2: - rest = token[1:] - if rest.isalpha() and rest.islower(): - candidate = _SPELL_SUBS[first][0] + rest - if not candidate[0].isdigit(): - return candidate + + # 2. Digit/pipe substitution (existing logic) + if has_suspicious: + # Standalone pipe → capital I + if token == '|': + return 'I' + # Dictionary-backed single-char substitution + for i, ch in enumerate(token): + if ch not in _SPELL_SUBS: + continue + for replacement in _SPELL_SUBS[ch]: + candidate = token[:i] + replacement + token[i + 1:] + if _spell_dict_knows(candidate): + return candidate + # Structural rule: suspicious char at position 0 + rest is all lowercase letters + first = token[0] + if first in _SPELL_SUBS and len(token) >= 2: + rest = token[1:] + if rest.isalpha() and rest.islower(): + candidate = _SPELL_SUBS[first][0] + rest + if not candidate[0].isdigit(): + return candidate + + # 3. General spell correction for unknown words (no digits/pipes) + # e.g. "iberqueren" → "ueberqueren", "beautful" → "beautiful" + if not has_suspicious and len(token) >= 3 and token.isalpha(): + spell = _en_spell if field == "english" else _de_spell if field == "german" else None + if spell is not None: + correction = spell.correction(token.lower()) + if correction and correction != token.lower(): + # Preserve original capitalisation pattern + if token[0].isupper(): + correction = correction[0].upper() + correction[1:] + if _spell_dict_knows(correction): + return correction return None -def _spell_fix_field(text: str) -> Tuple[str, bool]: - """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).""" - if not text or not any(ch in text for ch in _SPELL_SUSPICIOUS): +def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]: + """Apply OCR corrections to a text field. Returns (fixed_text, was_changed). + + *field* is 'english' or 'german' — forwarded to _spell_fix_token for + dictionary selection. + """ + if not text: + return text, False + has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS) + # If no suspicious chars AND no alpha chars that could be misspelled, skip + if not has_suspicious and not any(c.isalpha() for c in text): return text, False # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ") - fixed = _re.sub(r'(? Dict: all_corrected: List[Dict] = [] for i, entry in enumerate(entries): e = dict(entry) + # Page-ref normalization (always, regardless of review status) + old_ref = (e.get("source_page") or "").strip() + if old_ref: + new_ref = _normalize_page_ref(old_ref) + if new_ref != old_ref: + changes.append({ + "row_index": e.get("row_index", i), + "field": "source_page", + "old": old_ref, + "new": new_ref, + }) + e["source_page"] = new_ref + e["llm_corrected"] = True if not _entry_needs_review(e): all_corrected.append(e) continue @@ -6986,7 +7038,7 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict: old_val = (e.get(field_name) or "").strip() if not old_val: continue - new_val, was_changed = _spell_fix_field(old_val) + new_val, was_changed = _spell_fix_field(old_val, field=field_name) if was_changed and new_val != old_val: changes.append({ "row_index": e.get("row_index", i), diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 7265acb..3e14872 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1348,7 +1348,6 @@ async def detect_words( # No content shuffling — each cell stays at its detected position. if is_vocab: entries = _cells_to_vocab_entries(cells, columns_meta) - entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) word_result["vocab_entries"] = entries word_result["entries"] = entries @@ -1487,7 +1486,6 @@ async def _word_batch_stream_generator( vocab_entries = None if is_vocab: entries = _cells_to_vocab_entries(cells, columns_meta) - entries = _fix_character_confusion(entries) entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) word_result["vocab_entries"] = entries word_result["entries"] = entries