fix(ocr-pipeline): improve word-column assignment and LLM review accuracy

Word assignment: Replace nearest-center-distance with containment-first
strategy. Words whose center falls within a column's bounds (+ 15% pad)
are assigned to that column before falling back to nearest-center. This
fixes long example sentences losing their rightmost words to adjacent
columns.

LLM review: Strengthen prompt to explicitly forbid changing proper nouns,
place names, and correctly-spelled words. Add _is_spurious_change()
post-filter that rejects case-only changes and hallucinated word
replacements (< 50% character overlap).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 12:40:26 +01:00
parent dbf0db0c13
commit 29b1d95acc

View File

@@ -3068,11 +3068,16 @@ def _assign_row_words_to_columns(
row: RowGeometry, row: RowGeometry,
columns: List[PageRegion], columns: List[PageRegion],
) -> Dict[int, List[Dict]]: ) -> Dict[int, List[Dict]]:
"""Assign each word in a row to exactly one column (nearest center). """Assign each word in a row to exactly one column.
This prevents the same word from appearing in multiple cells when column Uses a two-pass strategy:
boundaries are close together. Each word is assigned to the column whose 1. Containment: if a word's center falls within a column's horizontal
horizontal center is closest to the word's horizontal center. bounds (with padding), assign it to that column.
2. Nearest center: for words not contained by any column, fall back to
nearest column center distance.
This prevents long sentences in wide columns (e.g. example) from having
their rightmost words stolen by an adjacent column.
Args: Args:
row: Row with words (relative coordinates). row: Row with words (relative coordinates).
@@ -3088,21 +3093,38 @@ def _assign_row_words_to_columns(
left_x = row.x # content ROI left (absolute) left_x = row.x # content ROI left (absolute)
# Pre-compute column centers in relative coordinates # Pre-compute column bounds and centers in relative coordinates
col_centers_rel = [] col_bounds_rel = [] # (left, right, center) per column
for col in columns: for col in columns:
col_left_rel = col.x - left_x col_left_rel = col.x - left_x
col_right_rel = col_left_rel + col.width
col_center_rel = col_left_rel + col.width / 2 col_center_rel = col_left_rel + col.width / 2
col_centers_rel.append(col_center_rel) col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel))
# Padding: allow words slightly outside column bounds (e.g. due to
# imprecise column detection). Use 15% of average column width.
avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100
pad = avg_col_w * 0.15
for w in row.words: for w in row.words:
w_center_x = w['left'] + w['width'] / 2 w_center_x = w['left'] + w['width'] / 2
# Find nearest column by center distance # Pass 1: containment check (word center within column bounds + pad)
contained_col = -1
for ci, (cl, cr, _) in enumerate(col_bounds_rel):
if (cl - pad) <= w_center_x <= (cr + pad):
contained_col = ci
break
if contained_col >= 0:
result[contained_col].append(w)
continue
# Pass 2: nearest center fallback
best_col = 0 best_col = 0
best_dist = abs(w_center_x - col_centers_rel[0]) best_dist = abs(w_center_x - col_bounds_rel[0][2])
for ci in range(1, len(columns)): for ci in range(1, len(columns)):
dist = abs(w_center_x - col_centers_rel[ci]) dist = abs(w_center_x - col_bounds_rel[ci][2])
if dist < best_dist: if dist < best_dist:
best_dist = dist best_dist = dist
best_col = ci best_col = ci
@@ -4349,15 +4371,18 @@ Haeufige OCR-Fehler die du korrigieren sollst:
- Fehlende oder falsche Satzzeichen - Fehlende oder falsche Satzzeichen
- Offensichtliche Tippfehler die durch OCR entstanden sind - Offensichtliche Tippfehler die durch OCR entstanden sind
WICHTIG: WICHTIG — Aendere NICHTS in diesen Faellen:
- Aendere NICHTS was korrekt aussieht - Woerter die korrekt geschrieben sind (auch wenn sie ungewoehnlich aussehen)
- Erfinde KEINE neuen Woerter oder Uebersetzungen - Eigennamen, Laendernamen, Staedtenamen (z.B. China, Japan, London, Africa)
- Behalte Abkuerzungen wie sth., sb., etc. bei - Abkuerzungen wie sth., sb., etc., e.g., i.e.
- Behalte die exakte Struktur (gleiche Anzahl Eintraege) - Lautschrift und phonetische Zeichen in eckigen Klammern [...]
- Fachbegriffe und Fremdwoerter die korrekt sind
- Im Zweifel: NICHT aendern!
Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerener Text. Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerener Text.
Fuer jeden Eintrag den du aenderst, setze "corrected": true. Fuer jeden Eintrag den du aenderst, setze "corrected": true.
Fuer unveraenderte Eintraege setze "corrected": false. Fuer unveraenderte Eintraege setze "corrected": false.
Behalte die exakte Struktur (gleiche Anzahl Eintraege).
/no_think /no_think
@@ -4365,6 +4390,38 @@ Eingabe:
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}""" {_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
def _is_spurious_change(old_val: str, new_val: str) -> bool:
"""Detect LLM changes that are likely wrong and should be discarded.
Filters out:
- Case-only changes (OCR doesn't typically swap case)
- Completely different words (LLM hallucinating a replacement)
- Changes where the old value is a valid proper noun / place name
"""
if not old_val or not new_val:
return False
# Case-only change — almost never a real OCR error
if old_val.lower() == new_val.lower():
return True
# If old value starts with uppercase and new is totally different word,
# it's likely a proper noun the LLM "corrected"
old_words = old_val.split()
new_words = new_val.split()
if len(old_words) == 1 and len(new_words) == 1:
ow, nw = old_words[0], new_words[0]
# Both are single words but share very few characters — likely hallucination
if len(ow) > 2 and len(nw) > 2:
# Levenshtein-like quick check: if < 50% chars overlap, reject
common = sum(1 for c in ow.lower() if c in nw.lower())
max_len = max(len(ow), len(nw))
if common / max_len < 0.5:
return True
return False
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]: def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
"""Compare original entries with LLM-corrected ones, return (changes, corrected_entries).""" """Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
changes = [] changes = []
@@ -4377,6 +4434,9 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict
new_val = c.get(key, "").strip() new_val = c.get(key, "").strip()
old_val = (orig.get(field_name, "") or "").strip() old_val = (orig.get(field_name, "") or "").strip()
if new_val and new_val != old_val: if new_val and new_val != old_val:
# Filter spurious LLM changes
if _is_spurious_change(old_val, new_val):
continue
changes.append({ changes.append({
"row_index": orig.get("row_index", i), "row_index": orig.get("row_index", i),
"field": field_name, "field": field_name,