fix(ocr-pipeline): improve word-column assignment and LLM review accuracy

Word assignment: Replace nearest-center-distance with containment-first
strategy. Words whose center falls within a column's bounds (+ 15% pad)
are assigned to that column before falling back to nearest-center. This
fixes long example sentences losing their rightmost words to adjacent
columns.

LLM review: Strengthen prompt to explicitly forbid changing proper nouns,
place names, and correctly-spelled words. Add _is_spurious_change()
post-filter that rejects case-only changes and hallucinated word
replacements (< 50% character overlap).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 12:40:26 +01:00
parent dbf0db0c13
commit 29b1d95acc

View File

@@ -3068,11 +3068,16 @@ def _assign_row_words_to_columns(
row: RowGeometry,
columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
"""Assign each word in a row to exactly one column (nearest center).
"""Assign each word in a row to exactly one column.
This prevents the same word from appearing in multiple cells when column
boundaries are close together. Each word is assigned to the column whose
horizontal center is closest to the word's horizontal center.
Uses a two-pass strategy:
1. Containment: if a word's center falls within a column's horizontal
bounds (with padding), assign it to that column.
2. Nearest center: for words not contained by any column, fall back to
nearest column center distance.
This prevents long sentences in wide columns (e.g. example) from having
their rightmost words stolen by an adjacent column.
Args:
row: Row with words (relative coordinates).
@@ -3088,21 +3093,38 @@ def _assign_row_words_to_columns(
left_x = row.x # content ROI left (absolute)
# Pre-compute column centers in relative coordinates
col_centers_rel = []
# Pre-compute column bounds and centers in relative coordinates
col_bounds_rel = [] # (left, right, center) per column
for col in columns:
col_left_rel = col.x - left_x
col_right_rel = col_left_rel + col.width
col_center_rel = col_left_rel + col.width / 2
col_centers_rel.append(col_center_rel)
col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel))
# Padding: allow words slightly outside column bounds (e.g. due to
# imprecise column detection). Use 15% of average column width.
avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100
pad = avg_col_w * 0.15
for w in row.words:
w_center_x = w['left'] + w['width'] / 2
# Find nearest column by center distance
# Pass 1: containment check (word center within column bounds + pad)
contained_col = -1
for ci, (cl, cr, _) in enumerate(col_bounds_rel):
if (cl - pad) <= w_center_x <= (cr + pad):
contained_col = ci
break
if contained_col >= 0:
result[contained_col].append(w)
continue
# Pass 2: nearest center fallback
best_col = 0
best_dist = abs(w_center_x - col_centers_rel[0])
best_dist = abs(w_center_x - col_bounds_rel[0][2])
for ci in range(1, len(columns)):
dist = abs(w_center_x - col_centers_rel[ci])
dist = abs(w_center_x - col_bounds_rel[ci][2])
if dist < best_dist:
best_dist = dist
best_col = ci
@@ -4349,15 +4371,18 @@ Haeufige OCR-Fehler die du korrigieren sollst:
- Fehlende oder falsche Satzzeichen
- Offensichtliche Tippfehler die durch OCR entstanden sind
WICHTIG:
- Aendere NICHTS was korrekt aussieht
- Erfinde KEINE neuen Woerter oder Uebersetzungen
- Behalte Abkuerzungen wie sth., sb., etc. bei
- Behalte die exakte Struktur (gleiche Anzahl Eintraege)
WICHTIG — Aendere NICHTS in diesen Faellen:
- Woerter die korrekt geschrieben sind (auch wenn sie ungewoehnlich aussehen)
- Eigennamen, Laendernamen, Staedtenamen (z.B. China, Japan, London, Africa)
- Abkuerzungen wie sth., sb., etc., e.g., i.e.
- Lautschrift und phonetische Zeichen in eckigen Klammern [...]
- Fachbegriffe und Fremdwoerter die korrekt sind
- Im Zweifel: NICHT aendern!
Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerender Text.
Fuer jeden Eintrag den du aenderst, setze "corrected": true.
Fuer unveraenderte Eintraege setze "corrected": false.
Behalte die exakte Struktur (gleiche Anzahl Eintraege).
/no_think
@@ -4365,6 +4390,38 @@ Eingabe:
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
def _is_spurious_change(old_val: str, new_val: str) -> bool:
"""Detect LLM changes that are likely wrong and should be discarded.
Filters out:
- Case-only changes (OCR doesn't typically swap case)
- Completely different words (LLM hallucinating a replacement)
- Changes where the old value is a valid proper noun / place name
"""
if not old_val or not new_val:
return False
# Case-only change — almost never a real OCR error
if old_val.lower() == new_val.lower():
return True
# If old value starts with uppercase and new is totally different word,
# it's likely a proper noun the LLM "corrected"
old_words = old_val.split()
new_words = new_val.split()
if len(old_words) == 1 and len(new_words) == 1:
ow, nw = old_words[0], new_words[0]
# Both are single words but share very few characters — likely hallucination
if len(ow) > 2 and len(nw) > 2:
# Levenshtein-like quick check: if < 50% chars overlap, reject
common = sum(1 for c in ow.lower() if c in nw.lower())
max_len = max(len(ow), len(nw))
if common / max_len < 0.5:
return True
return False
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
"""Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
changes = []
@@ -4377,6 +4434,9 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict
new_val = c.get(key, "").strip()
old_val = (orig.get(field_name, "") or "").strip()
if new_val and new_val != old_val:
# Filter spurious LLM changes
if _is_spurious_change(old_val, new_val):
continue
changes.append({
"row_index": orig.get("row_index", i),
"field": field_name,