fix(ocr-pipeline): improve word-column assignment and LLM review accuracy

Word assignment: Replace nearest-center-distance with containment-first strategy. Words whose center falls within a column's bounds (+ 15% pad) are assigned to that column before falling back to nearest-center. This fixes long example sentences losing their rightmost words to adjacent columns. LLM review: Strengthen prompt to explicitly forbid changing proper nouns, place names, and correctly-spelled words. Add _is_spurious_change() post-filter that rejects case-only changes and hallucinated word replacements (< 50% character overlap). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 12:40:26 +01:00
parent dbf0db0c13
commit 29b1d95acc
1 changed files with 75 additions and 15 deletions
@@ -3068,11 +3068,16 @@ def _assign_row_words_to_columns(
    row: RowGeometry,
    columns: List[PageRegion],
 ) -> Dict[int, List[Dict]]:
-    """Assign each word in a row to exactly one column (nearest center).
+    """Assign each word in a row to exactly one column.
-    This prevents the same word from appearing in multiple cells when column
+    Uses a two-pass strategy:
-    boundaries are close together.  Each word is assigned to the column whose
+    1. Containment: if a word's center falls within a column's horizontal
-    horizontal center is closest to the word's horizontal center.
+       bounds (with padding), assign it to that column.
    2. Nearest center: for words not contained by any column, fall back to
       nearest column center distance.
    This prevents long sentences in wide columns (e.g. example) from having
    their rightmost words stolen by an adjacent column.
    Args:
        row: Row with words (relative coordinates).
@@ -3088,21 +3093,38 @@ def _assign_row_words_to_columns(
    left_x = row.x  # content ROI left (absolute)
-    # Pre-compute column centers in relative coordinates
+    # Pre-compute column bounds and centers in relative coordinates
-    col_centers_rel = []
+    col_bounds_rel = []  # (left, right, center) per column
    for col in columns:
        col_left_rel = col.x - left_x
        col_right_rel = col_left_rel + col.width
        col_center_rel = col_left_rel + col.width / 2
-        col_centers_rel.append(col_center_rel)
+        col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel))
    # Padding: allow words slightly outside column bounds (e.g. due to
    # imprecise column detection).  Use 15% of average column width.
    avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100
    pad = avg_col_w * 0.15
    for w in row.words:
        w_center_x = w['left'] + w['width'] / 2
-        # Find nearest column by center distance
+        # Pass 1: containment check (word center within column bounds + pad)
        contained_col = -1
        for ci, (cl, cr, _) in enumerate(col_bounds_rel):
            if (cl - pad) <= w_center_x <= (cr + pad):
                contained_col = ci
                break
        if contained_col >= 0:
            result[contained_col].append(w)
            continue
        # Pass 2: nearest center fallback
        best_col = 0
-        best_dist = abs(w_center_x - col_centers_rel[0])
+        best_dist = abs(w_center_x - col_bounds_rel[0][2])
        for ci in range(1, len(columns)):
-            dist = abs(w_center_x - col_centers_rel[ci])
+            dist = abs(w_center_x - col_bounds_rel[ci][2])
            if dist < best_dist:
                best_dist = dist
                best_col = ci
@@ -4349,15 +4371,18 @@ Haeufige OCR-Fehler die du korrigieren sollst:
 - Fehlende oder falsche Satzzeichen
 - Offensichtliche Tippfehler die durch OCR entstanden sind
-WICHTIG:
+WICHTIG — Aendere NICHTS in diesen Faellen:
- Aendere NICHTS was korrekt aussieht
+- Woerter die korrekt geschrieben sind (auch wenn sie ungewoehnlich aussehen)
- Erfinde KEINE neuen Woerter oder Uebersetzungen
+- Eigennamen, Laendernamen, Staedtenamen (z.B. China, Japan, London, Africa)
- Behalte Abkuerzungen wie sth., sb., etc. bei
+- Abkuerzungen wie sth., sb., etc., e.g., i.e.
- Behalte die exakte Struktur (gleiche Anzahl Eintraege)
+- Lautschrift und phonetische Zeichen in eckigen Klammern [...]
 - Fachbegriffe und Fremdwoerter die korrekt sind
 - Im Zweifel: NICHT aendern!
 Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerener Text.
 Fuer jeden Eintrag den du aenderst, setze "corrected": true.
 Fuer unveraenderte Eintraege setze "corrected": false.
 Behalte die exakte Struktur (gleiche Anzahl Eintraege).
 /no_think
@@ -4365,6 +4390,38 @@ Eingabe:
 {_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
 def _is_spurious_change(old_val: str, new_val: str) -> bool:
    """Detect LLM changes that are likely wrong and should be discarded.
    Filters out:
    - Case-only changes (OCR doesn't typically swap case)
    - Completely different words (LLM hallucinating a replacement)
    - Changes where the old value is a valid proper noun / place name
    """
    if not old_val or not new_val:
        return False
    # Case-only change — almost never a real OCR error
    if old_val.lower() == new_val.lower():
        return True
    # If old value starts with uppercase and new is totally different word,
    # it's likely a proper noun the LLM "corrected"
    old_words = old_val.split()
    new_words = new_val.split()
    if len(old_words) == 1 and len(new_words) == 1:
        ow, nw = old_words[0], new_words[0]
        # Both are single words but share very few characters — likely hallucination
        if len(ow) > 2 and len(nw) > 2:
            # Levenshtein-like quick check: if < 50% chars overlap, reject
            common = sum(1 for c in ow.lower() if c in nw.lower())
            max_len = max(len(ow), len(nw))
            if common / max_len < 0.5:
                return True
    return False
 def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
    changes = []
@@ -4377,6 +4434,9 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict
                new_val = c.get(key, "").strip()
                old_val = (orig.get(field_name, "") or "").strip()
                if new_val and new_val != old_val:
                    # Filter spurious LLM changes
                    if _is_spurious_change(old_val, new_val):
                        continue
                    changes.append({
                        "row_index": orig.get("row_index", i),
                        "field": field_name,