From a58dfca1d8408ed9dfdf495f711c2fd76debdfc7 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Thu, 5 Mar 2026 00:26:13 +0100
Subject: [PATCH] fix: move char-confusion fix to correction step, add spell +
 page-ref corrections
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove _fix_character_confusion() from words endpoint (now only in Phase 0)
- Extend spell checker to find real OCR errors via spell.correction()
- Add field-aware dictionary selection (EN/DE) for spell corrections
- Add _normalize_page_ref() for the page_ref column, stored in the source_page entry field (p-60 → p.60)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 114 ++++++++++++++-----
 klausur-service/backend/ocr_pipeline_api.py  |   2 -
 2 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 1ead642..0e182f7 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -6891,6 +6891,18 @@ except ImportError:
     _SPELL_AVAILABLE = False
     logger.warning("pyspellchecker not installed — falling back to LLM review")
 
+# ─── Page-Ref Normalization ───────────────────────────────────────────────────
+# Normalizes OCR variants like "p-60", "p 60", "p60" → "p.60" (the digits are kept unchanged)
+_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE)
+
+
+def _normalize_page_ref(text: str) -> str:
+    """Normalize page references: 'p-60' / 'p 60' / 'p60' → 'p.60' (a capital 'P' becomes 'p')."""
+    if not text:
+        return text
+    return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)
+
+
 # Suspicious OCR chars → ordered list of most-likely correct replacements
 _SPELL_SUBS: Dict[str, List[str]] = {
     '0': ['O', 'o'],
@@ -6914,49 +6926,76 @@ def _spell_dict_knows(word: str) -> bool:
     return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))
 
 
-def _spell_fix_token(token: str) -> Optional[str]:
-    """Return corrected form of token, or None if no fix needed/possible."""
-    if not any(ch in _SPELL_SUSPICIOUS for ch in token):
-        return None
-    # Standalone pipe → capital I
-    if token == '|':
-        return 'I'
-    # Original is already a valid word → leave it
+def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
+    """Return corrected form of token, or None if no fix needed/possible.
+
+    *field* selects the dictionary for general spell correction (step 3 below):
+    'english' or 'german'. Any other value, including the default "", skips step 3.
+    """
+    has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)
+
+    # 1. Already known word → no fix needed
     if _spell_dict_knows(token):
         return None
-    # Dictionary-backed single-char substitution
-    for i, ch in enumerate(token):
-        if ch not in _SPELL_SUBS:
-            continue
-        for replacement in _SPELL_SUBS[ch]:
-            candidate = token[:i] + replacement + token[i + 1:]
-            if _spell_dict_knows(candidate):
-                return candidate
-    # Structural rule: suspicious char at position 0 + rest is all lowercase letters
-    # e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld"
-    first = token[0]
-    if first in _SPELL_SUBS and len(token) >= 2:
-        rest = token[1:]
-        if rest.isalpha() and rest.islower():
-            candidate = _SPELL_SUBS[first][0] + rest
-            if not candidate[0].isdigit():
-                return candidate
+
+    # 2. Digit/pipe substitution (only for tokens containing a suspicious OCR char)
+    if has_suspicious:
+        # Standalone pipe → capital I
+        if token == '|':
+            return 'I'
+        # Dictionary-backed single-char substitution
+        for i, ch in enumerate(token):
+            if ch not in _SPELL_SUBS:
+                continue
+            for replacement in _SPELL_SUBS[ch]:
+                candidate = token[:i] + replacement + token[i + 1:]
+                if _spell_dict_knows(candidate):
+                    return candidate
+        # Structural rule: suspicious char at position 0 + rest is all lowercase letters
+        first = token[0]
+        if first in _SPELL_SUBS and len(token) >= 2:
+            rest = token[1:]
+            if rest.isalpha() and rest.islower():
+                candidate = _SPELL_SUBS[first][0] + rest
+                if not candidate[0].isdigit():
+                    return candidate
+
+    # 3. General spell correction for unknown words (no digits/pipes)
+    #    e.g. "iberqueren" → "ueberqueren", "beautful" → "beautiful"
+    if not has_suspicious and len(token) >= 3 and token.isalpha():
+        spell = _en_spell if field == "english" else _de_spell if field == "german" else None
+        if spell is not None:
+            correction = spell.correction(token.lower())
+            if correction and correction != token.lower():
+                # Preserve original capitalisation pattern
+                if token[0].isupper():
+                    correction = correction[0].upper() + correction[1:]
+                if _spell_dict_knows(correction):
+                    return correction
     return None
 
 
-def _spell_fix_field(text: str) -> Tuple[str, bool]:
-    """Apply OCR corrections to a text field. Returns (fixed_text, was_changed)."""
-    if not text or not any(ch in text for ch in _SPELL_SUSPICIOUS):
+def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
+    """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).
+
+    *field* is 'english' or 'german' — forwarded to _spell_fix_token for
+    dictionary selection.
+    """
+    if not text:
+        return text, False
+    has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
+    # If no suspicious chars AND no alpha chars that could be misspelled, skip
+    if not has_suspicious and not any(c.isalpha() for c in text):
         return text, False
     # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
-    fixed = _re.sub(r'(?<!\w)\|(?=[.,])', '1', text)
+    fixed = _re.sub(r'(?<!\w)\|(?=[.,])', '1', text) if has_suspicious else text
     changed = fixed != text
     # Tokenize and fix word by word
     parts: List[str] = []
     pos = 0
     for m in _SPELL_TOKEN_RE.finditer(fixed):
         token, sep = m.group(1), m.group(2)
-        correction = _spell_fix_token(token)
+        correction = _spell_fix_token(token, field=field)
         if correction:
             parts.append(correction)
             changed = True
@@ -6979,6 +7018,19 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
     all_corrected: List[Dict] = []
     for i, entry in enumerate(entries):
         e = dict(entry)
+        # Page-ref normalization (always, regardless of review status)
+        old_ref = (e.get("source_page") or "").strip()
+        if old_ref:
+            new_ref = _normalize_page_ref(old_ref)
+            if new_ref != old_ref:
+                changes.append({
+                    "row_index": e.get("row_index", i),
+                    "field": "source_page",
+                    "old": old_ref,
+                    "new": new_ref,
+                })
+                e["source_page"] = new_ref
+                e["llm_corrected"] = True
         if not _entry_needs_review(e):
             all_corrected.append(e)
             continue
@@ -6986,7 +7038,7 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
             old_val = (e.get(field_name) or "").strip()
             if not old_val:
                 continue
-            new_val, was_changed = _spell_fix_field(old_val)
+            new_val, was_changed = _spell_fix_field(old_val, field=field_name)
             if was_changed and new_val != old_val:
                 changes.append({
                     "row_index": e.get("row_index", i),
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index 7265acb..3e14872 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -1348,7 +1348,6 @@ async def detect_words(
     # No content shuffling — each cell stays at its detected position.
     if is_vocab:
         entries = _cells_to_vocab_entries(cells, columns_meta)
-        entries = _fix_character_confusion(entries)
         entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
         word_result["vocab_entries"] = entries
         word_result["entries"] = entries
@@ -1487,7 +1486,6 @@ async def _word_batch_stream_generator(
     vocab_entries = None
     if is_vocab:
         entries = _cells_to_vocab_entries(cells, columns_meta)
-        entries = _fix_character_confusion(entries)
         entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
         word_result["vocab_entries"] = entries
         word_result["entries"] = entries