fix(ocr-pipeline): improve word-column assignment and LLM review accuracy
Word assignment: Replace nearest-center-distance with a containment-first strategy. Words whose center falls within a column's horizontal bounds (padded on each side by 15% of the average column width) are assigned to that column before falling back to nearest-center. This fixes long example sentences losing their rightmost words to adjacent columns. LLM review: Strengthen prompt to explicitly forbid changing proper nouns, place names, and correctly-spelled words. Add _is_spurious_change() post-filter that rejects case-only changes and hallucinated single-word replacements (< 50% character overlap). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3068,11 +3068,16 @@ def _assign_row_words_to_columns(
|
||||
row: RowGeometry,
|
||||
columns: List[PageRegion],
|
||||
) -> Dict[int, List[Dict]]:
|
||||
"""Assign each word in a row to exactly one column (nearest center).
|
||||
"""Assign each word in a row to exactly one column.
|
||||
|
||||
This prevents the same word from appearing in multiple cells when column
|
||||
boundaries are close together. Each word is assigned to the column whose
|
||||
horizontal center is closest to the word's horizontal center.
|
||||
Uses a two-pass strategy:
|
||||
1. Containment: if a word's center falls within a column's horizontal
|
||||
bounds (with padding), assign it to that column.
|
||||
2. Nearest center: for words not contained by any column, fall back to
|
||||
nearest column center distance.
|
||||
|
||||
This prevents long sentences in wide columns (e.g. example) from having
|
||||
their rightmost words stolen by an adjacent column.
|
||||
|
||||
Args:
|
||||
row: Row with words (relative coordinates).
|
||||
@@ -3088,21 +3093,38 @@ def _assign_row_words_to_columns(
|
||||
|
||||
left_x = row.x # content ROI left (absolute)
|
||||
|
||||
# Pre-compute column centers in relative coordinates
|
||||
col_centers_rel = []
|
||||
# Pre-compute column bounds and centers in relative coordinates
|
||||
col_bounds_rel = [] # (left, right, center) per column
|
||||
for col in columns:
|
||||
col_left_rel = col.x - left_x
|
||||
col_right_rel = col_left_rel + col.width
|
||||
col_center_rel = col_left_rel + col.width / 2
|
||||
col_centers_rel.append(col_center_rel)
|
||||
col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel))
|
||||
|
||||
# Padding: allow words slightly outside column bounds (e.g. due to
|
||||
# imprecise column detection). Use 15% of average column width.
|
||||
avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100
|
||||
pad = avg_col_w * 0.15
|
||||
|
||||
for w in row.words:
|
||||
w_center_x = w['left'] + w['width'] / 2
|
||||
|
||||
# Find nearest column by center distance
|
||||
# Pass 1: containment check (word center within column bounds + pad)
|
||||
contained_col = -1
|
||||
for ci, (cl, cr, _) in enumerate(col_bounds_rel):
|
||||
if (cl - pad) <= w_center_x <= (cr + pad):
|
||||
contained_col = ci
|
||||
break
|
||||
|
||||
if contained_col >= 0:
|
||||
result[contained_col].append(w)
|
||||
continue
|
||||
|
||||
# Pass 2: nearest center fallback
|
||||
best_col = 0
|
||||
best_dist = abs(w_center_x - col_centers_rel[0])
|
||||
best_dist = abs(w_center_x - col_bounds_rel[0][2])
|
||||
for ci in range(1, len(columns)):
|
||||
dist = abs(w_center_x - col_centers_rel[ci])
|
||||
dist = abs(w_center_x - col_bounds_rel[ci][2])
|
||||
if dist < best_dist:
|
||||
best_dist = dist
|
||||
best_col = ci
|
||||
@@ -4349,15 +4371,18 @@ Haeufige OCR-Fehler die du korrigieren sollst:
|
||||
- Fehlende oder falsche Satzzeichen
|
||||
- Offensichtliche Tippfehler die durch OCR entstanden sind
|
||||
|
||||
WICHTIG:
|
||||
- Aendere NICHTS was korrekt aussieht
|
||||
- Erfinde KEINE neuen Woerter oder Uebersetzungen
|
||||
- Behalte Abkuerzungen wie sth., sb., etc. bei
|
||||
- Behalte die exakte Struktur (gleiche Anzahl Eintraege)
|
||||
WICHTIG — Aendere NICHTS in diesen Faellen:
|
||||
- Woerter die korrekt geschrieben sind (auch wenn sie ungewoehnlich aussehen)
|
||||
- Eigennamen, Laendernamen, Staedtenamen (z.B. China, Japan, London, Africa)
|
||||
- Abkuerzungen wie sth., sb., etc., e.g., i.e.
|
||||
- Lautschrift und phonetische Zeichen in eckigen Klammern [...]
|
||||
- Fachbegriffe und Fremdwoerter die korrekt sind
|
||||
- Im Zweifel: NICHT aendern!
|
||||
|
||||
Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerener Text.
|
||||
Fuer jeden Eintrag den du aenderst, setze "corrected": true.
|
||||
Fuer unveraenderte Eintraege setze "corrected": false.
|
||||
Behalte die exakte Struktur (gleiche Anzahl Eintraege).
|
||||
|
||||
/no_think
|
||||
|
||||
@@ -4365,6 +4390,38 @@ Eingabe:
|
||||
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
|
||||
|
||||
|
||||
def _is_spurious_change(old_val: str, new_val: str) -> bool:
|
||||
"""Detect LLM changes that are likely wrong and should be discarded.
|
||||
|
||||
Filters out:
|
||||
- Case-only changes (OCR doesn't typically swap case)
|
||||
- Completely different words (LLM hallucinating a replacement)
|
||||
- Changes where the old value is a valid proper noun / place name
|
||||
"""
|
||||
if not old_val or not new_val:
|
||||
return False
|
||||
|
||||
# Case-only change — almost never a real OCR error
|
||||
if old_val.lower() == new_val.lower():
|
||||
return True
|
||||
|
||||
# If old value starts with uppercase and new is totally different word,
|
||||
# it's likely a proper noun the LLM "corrected"
|
||||
old_words = old_val.split()
|
||||
new_words = new_val.split()
|
||||
if len(old_words) == 1 and len(new_words) == 1:
|
||||
ow, nw = old_words[0], new_words[0]
|
||||
# Both are single words but share very few characters — likely hallucination
|
||||
if len(ow) > 2 and len(nw) > 2:
|
||||
# Levenshtein-like quick check: if < 50% chars overlap, reject
|
||||
common = sum(1 for c in ow.lower() if c in nw.lower())
|
||||
max_len = max(len(ow), len(nw))
|
||||
if common / max_len < 0.5:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
|
||||
"""Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
|
||||
changes = []
|
||||
@@ -4377,6 +4434,9 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict
|
||||
new_val = c.get(key, "").strip()
|
||||
old_val = (orig.get(field_name, "") or "").strip()
|
||||
if new_val and new_val != old_val:
|
||||
# Filter spurious LLM changes
|
||||
if _is_spurious_change(old_val, new_val):
|
||||
continue
|
||||
changes.append({
|
||||
"row_index": orig.get("row_index", i),
|
||||
"field": field_name,
|
||||
|
||||
Reference in New Issue
Block a user