From 29b1d95acc2380e8e9d111dfec4950f280e00e77 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Mon, 2 Mar 2026 12:40:26 +0100
Subject: [PATCH] fix(ocr-pipeline): improve word-column assignment and LLM
 review accuracy

Word assignment: Replace nearest-center-distance with containment-first
strategy. Words whose center falls within a column's bounds (+ 15% pad)
are assigned to that column before falling back to nearest-center. This
fixes long example sentences losing their rightmost words to adjacent
columns.

LLM review: Strengthen prompt to explicitly forbid changing proper nouns,
place names, and correctly-spelled words. Add _is_spurious_change()
post-filter that rejects case-only changes and hallucinated word
replacements (< 50% character overlap).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_vocab_pipeline.py | 90 ++++++++++++++++----
 1 file changed, 75 insertions(+), 15 deletions(-)

diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 8417c8c..883aaff 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3068,11 +3068,16 @@ def _assign_row_words_to_columns(
     row: RowGeometry,
     columns: List[PageRegion],
 ) -> Dict[int, List[Dict]]:
-    """Assign each word in a row to exactly one column (nearest center).
+    """Assign each word in a row to exactly one column.
 
-    This prevents the same word from appearing in multiple cells when column
-    boundaries are close together.  Each word is assigned to the column whose
-    horizontal center is closest to the word's horizontal center.
+    Uses a two-pass strategy:
+    1. Containment: if a word's center falls within a column's horizontal
+       bounds (with padding), assign it to that column.
+    2. Nearest center: for words not contained by any column, fall back to
+       nearest column center distance.
+
+    This prevents long sentences in wide columns (e.g. example) from having
+    their rightmost words stolen by an adjacent column.
 
     Args:
         row: Row with words (relative coordinates).
@@ -3088,21 +3093,38 @@ def _assign_row_words_to_columns(
 
     left_x = row.x  # content ROI left (absolute)
 
-    # Pre-compute column centers in relative coordinates
-    col_centers_rel = []
+    # Pre-compute column bounds and centers in relative coordinates
+    col_bounds_rel = []  # (left, right, center) per column
     for col in columns:
         col_left_rel = col.x - left_x
+        col_right_rel = col_left_rel + col.width
         col_center_rel = col_left_rel + col.width / 2
-        col_centers_rel.append(col_center_rel)
+        col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel))
+
+    # Padding: allow words slightly outside column bounds (e.g. due to
+    # imprecise column detection).  Use 15% of average column width.
+    avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100
+    pad = avg_col_w * 0.15
 
     for w in row.words:
         w_center_x = w['left'] + w['width'] / 2
 
-        # Find nearest column by center distance
+        # Pass 1: containment check (word center within column bounds + pad)
+        contained_col = -1
+        for ci, (cl, cr, _) in enumerate(col_bounds_rel):
+            if (cl - pad) <= w_center_x <= (cr + pad):
+                contained_col = ci
+                break
+
+        if contained_col >= 0:
+            result[contained_col].append(w)
+            continue
+
+        # Pass 2: nearest center fallback
         best_col = 0
-        best_dist = abs(w_center_x - col_centers_rel[0])
+        best_dist = abs(w_center_x - col_bounds_rel[0][2])
         for ci in range(1, len(columns)):
-            dist = abs(w_center_x - col_centers_rel[ci])
+            dist = abs(w_center_x - col_bounds_rel[ci][2])
             if dist < best_dist:
                 best_dist = dist
                 best_col = ci
@@ -4349,15 +4371,18 @@ Haeufige OCR-Fehler die du korrigieren sollst:
 - Fehlende oder falsche Satzzeichen
 - Offensichtliche Tippfehler die durch OCR entstanden sind
 
-WICHTIG:
-- Aendere NICHTS was korrekt aussieht
-- Erfinde KEINE neuen Woerter oder Uebersetzungen
-- Behalte Abkuerzungen wie sth., sb., etc. bei
-- Behalte die exakte Struktur (gleiche Anzahl Eintraege)
+WICHTIG — Aendere NICHTS in diesen Faellen:
+- Woerter die korrekt geschrieben sind (auch wenn sie ungewoehnlich aussehen)
+- Eigennamen, Laendernamen, Staedtenamen (z.B. China, Japan, London, Africa)
+- Abkuerzungen wie sth., sb., etc., e.g., i.e.
+- Lautschrift und phonetische Zeichen in eckigen Klammern [...]
+- Fachbegriffe und Fremdwoerter die korrekt sind
+- Im Zweifel: NICHT aendern!
 
 Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerener Text.
 Fuer jeden Eintrag den du aenderst, setze "corrected": true.
 Fuer unveraenderte Eintraege setze "corrected": false.
+Behalte die exakte Struktur (gleiche Anzahl Eintraege).
 
 /no_think
 
@@ -4365,6 +4390,38 @@ Eingabe:
 {_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
 
 
+def _is_spurious_change(old_val: str, new_val: str) -> bool:
+    """Detect LLM changes that are likely wrong and should be discarded.
+
+    Filters out:
+    - Case-only changes (OCR doesn't typically swap case)
+    - Completely different words (LLM hallucinating a replacement)
+    (There is no dedicated proper-noun check; only the two rules above apply.)
+    """
+    if not old_val or not new_val:
+        return False
+
+    # Case-only change — almost never a real OCR error
+    if old_val.lower() == new_val.lower():
+        return True
+
+    # A single word replaced by a largely different single word is most likely
+    # an LLM hallucination (e.g. a proper noun the LLM "corrected")
+    old_words = old_val.split()
+    new_words = new_val.split()
+    if len(old_words) == 1 and len(new_words) == 1:
+        ow, nw = old_words[0], new_words[0]
+        # Both are single words but share very few characters — likely hallucination
+        if len(ow) > 2 and len(nw) > 2:
+            # Char-overlap ratio (not an edit distance): below 50% => reject
+            common = sum(1 for c in ow.lower() if c in nw.lower())
+            max_len = max(len(ow), len(nw))
+            if common / max_len < 0.5:
+                return True
+
+    return False
+
+
 def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
     """Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
     changes = []
@@ -4377,6 +4434,9 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict
                 new_val = c.get(key, "").strip()
                 old_val = (orig.get(field_name, "") or "").strip()
                 if new_val and new_val != old_val:
+                    # Filter spurious LLM changes
+                    if _is_spurious_change(old_val, new_val):
+                        continue
                     changes.append({
                         "row_index": orig.get("row_index", i),
                         "field": field_name,