fix(ocr-pipeline): improve word-column assignment and LLM review accuracy

Word assignment: Replace nearest-center-distance with containment-first
strategy. Words whose center falls within a column's bounds (+ 15% pad)
are assigned to that column before falling back to nearest-center. This
fixes long example sentences losing their rightmost words to adjacent
columns.

LLM review: Strengthen prompt to explicitly forbid changing proper nouns,
place names, and correctly-spelled words. Add _is_spurious_change()
post-filter that rejects case-only changes and hallucinated word
replacements (< 50% character overlap).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 12:40:26 +01:00
parent dbf0db0c13
commit 29b1d95acc

View File

@@ -3068,11 +3068,16 @@ def _assign_row_words_to_columns(
row: RowGeometry,
columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
"""Assign each word in a row to exactly one column (nearest center).
"""Assign each word in a row to exactly one column.
This prevents the same word from appearing in multiple cells when column
boundaries are close together. Each word is assigned to the column whose
horizontal center is closest to the word's horizontal center.
Uses a two-pass strategy:
1. Containment: if a word's center falls within a column's horizontal
bounds (with padding), assign it to that column.
2. Nearest center: for words not contained by any column, fall back to
nearest column center distance.
This prevents long sentences in wide columns (e.g. example) from having
their rightmost words stolen by an adjacent column.
Args:
row: Row with words (relative coordinates).
@@ -3088,21 +3093,38 @@ def _assign_row_words_to_columns(
left_x = row.x # content ROI left (absolute)
# Pre-compute column centers in relative coordinates
col_centers_rel = []
# Pre-compute column bounds and centers in relative coordinates
col_bounds_rel = [] # (left, right, center) per column
for col in columns:
col_left_rel = col.x - left_x
col_right_rel = col_left_rel + col.width
col_center_rel = col_left_rel + col.width / 2
col_centers_rel.append(col_center_rel)
col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel))
# Padding: allow words slightly outside column bounds (e.g. due to
# imprecise column detection). Use 15% of average column width.
avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100
pad = avg_col_w * 0.15
for w in row.words:
w_center_x = w['left'] + w['width'] / 2
# Find nearest column by center distance
# Pass 1: containment check (word center within column bounds + pad)
contained_col = -1
for ci, (cl, cr, _) in enumerate(col_bounds_rel):
if (cl - pad) <= w_center_x <= (cr + pad):
contained_col = ci
break
if contained_col >= 0:
result[contained_col].append(w)
continue
# Pass 2: nearest center fallback
best_col = 0
best_dist = abs(w_center_x - col_centers_rel[0])
best_dist = abs(w_center_x - col_bounds_rel[0][2])
for ci in range(1, len(columns)):
dist = abs(w_center_x - col_centers_rel[ci])
dist = abs(w_center_x - col_bounds_rel[ci][2])
if dist < best_dist:
best_dist = dist
best_col = ci
@@ -4349,15 +4371,18 @@ Haeufige OCR-Fehler die du korrigieren sollst:
- Fehlende oder falsche Satzzeichen
- Offensichtliche Tippfehler die durch OCR entstanden sind
WICHTIG:
- Aendere NICHTS was korrekt aussieht
- Erfinde KEINE neuen Woerter oder Uebersetzungen
- Behalte Abkuerzungen wie sth., sb., etc. bei
- Behalte die exakte Struktur (gleiche Anzahl Eintraege)
WICHTIG — Aendere NICHTS in diesen Faellen:
- Woerter die korrekt geschrieben sind (auch wenn sie ungewoehnlich aussehen)
- Eigennamen, Laendernamen, Staedtenamen (z.B. China, Japan, London, Africa)
- Abkuerzungen wie sth., sb., etc., e.g., i.e.
- Lautschrift und phonetische Zeichen in eckigen Klammern [...]
- Fachbegriffe und Fremdwoerter die korrekt sind
- Im Zweifel: NICHT aendern!
Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerender Text.
Fuer jeden Eintrag den du aenderst, setze "corrected": true.
Fuer unveraenderte Eintraege setze "corrected": false.
Behalte die exakte Struktur (gleiche Anzahl Eintraege).
/no_think
@@ -4365,6 +4390,38 @@ Eingabe:
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
def _is_spurious_change(old_val: str, new_val: str) -> bool:
"""Detect LLM changes that are likely wrong and should be discarded.
Filters out:
- Case-only changes (OCR doesn't typically swap case)
- Completely different words (LLM hallucinating a replacement)
- Changes where the old value is a valid proper noun / place name
"""
if not old_val or not new_val:
return False
# Case-only change — almost never a real OCR error
if old_val.lower() == new_val.lower():
return True
# If old value starts with uppercase and new is totally different word,
# it's likely a proper noun the LLM "corrected"
old_words = old_val.split()
new_words = new_val.split()
if len(old_words) == 1 and len(new_words) == 1:
ow, nw = old_words[0], new_words[0]
# Both are single words but share very few characters — likely hallucination
if len(ow) > 2 and len(nw) > 2:
# Levenshtein-like quick check: if < 50% chars overlap, reject
common = sum(1 for c in ow.lower() if c in nw.lower())
max_len = max(len(ow), len(nw))
if common / max_len < 0.5:
return True
return False
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
"""Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
changes = []
@@ -4377,6 +4434,9 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict
new_val = c.get(key, "").strip()
old_val = (orig.get(field_name, "") or "").strip()
if new_val and new_val != old_val:
# Filter spurious LLM changes
if _is_spurious_change(old_val, new_val):
continue
changes.append({
"row_index": orig.get("row_index", i),
"field": field_name,