From 29b1d95acc2380e8e9d111dfec4950f280e00e77 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 2 Mar 2026 12:40:26 +0100 Subject: [PATCH] fix(ocr-pipeline): improve word-column assignment and LLM review accuracy Word assignment: Replace nearest-center-distance with containment-first strategy. Words whose center falls within a column's bounds (+ 15% pad) are assigned to that column before falling back to nearest-center. This fixes long example sentences losing their rightmost words to adjacent columns. LLM review: Strengthen prompt to explicitly forbid changing proper nouns, place names, and correctly-spelled words. Add _is_spurious_change() post-filter that rejects case-only changes and hallucinated word replacements (< 50% character overlap). Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_vocab_pipeline.py | 90 ++++++++++++++++---- 1 file changed, 75 insertions(+), 15 deletions(-) diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 8417c8c..883aaff 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -3068,11 +3068,16 @@ def _assign_row_words_to_columns( row: RowGeometry, columns: List[PageRegion], ) -> Dict[int, List[Dict]]: - """Assign each word in a row to exactly one column (nearest center). + """Assign each word in a row to exactly one column. - This prevents the same word from appearing in multiple cells when column - boundaries are close together. Each word is assigned to the column whose - horizontal center is closest to the word's horizontal center. + Uses a two-pass strategy: + 1. Containment: if a word's center falls within a column's horizontal + bounds (with padding), assign it to that column. + 2. Nearest center: for words not contained by any column, fall back to + nearest column center distance. + + This prevents long sentences in wide columns (e.g. 
example) from having + their rightmost words stolen by an adjacent column. Args: row: Row with words (relative coordinates). @@ -3088,21 +3093,38 @@ def _assign_row_words_to_columns( left_x = row.x # content ROI left (absolute) - # Pre-compute column centers in relative coordinates - col_centers_rel = [] + # Pre-compute column bounds and centers in relative coordinates + col_bounds_rel = [] # (left, right, center) per column for col in columns: col_left_rel = col.x - left_x + col_right_rel = col_left_rel + col.width col_center_rel = col_left_rel + col.width / 2 - col_centers_rel.append(col_center_rel) + col_bounds_rel.append((col_left_rel, col_right_rel, col_center_rel)) + + # Padding: allow words slightly outside column bounds (e.g. due to + # imprecise column detection). Use 15% of average column width. + avg_col_w = sum(c.width for c in columns) / len(columns) if columns else 100 + pad = avg_col_w * 0.15 for w in row.words: w_center_x = w['left'] + w['width'] / 2 - # Find nearest column by center distance + # Pass 1: containment check (word center within column bounds + pad) + contained_col = -1 + for ci, (cl, cr, _) in enumerate(col_bounds_rel): + if (cl - pad) <= w_center_x <= (cr + pad): + contained_col = ci + break + + if contained_col >= 0: + result[contained_col].append(w) + continue + + # Pass 2: nearest center fallback best_col = 0 - best_dist = abs(w_center_x - col_centers_rel[0]) + best_dist = abs(w_center_x - col_bounds_rel[0][2]) for ci in range(1, len(columns)): - dist = abs(w_center_x - col_centers_rel[ci]) + dist = abs(w_center_x - col_bounds_rel[ci][2]) if dist < best_dist: best_dist = dist best_col = ci @@ -4349,15 +4371,18 @@ Haeufige OCR-Fehler die du korrigieren sollst: - Fehlende oder falsche Satzzeichen - Offensichtliche Tippfehler die durch OCR entstanden sind -WICHTIG: -- Aendere NICHTS was korrekt aussieht -- Erfinde KEINE neuen Woerter oder Uebersetzungen -- Behalte Abkuerzungen wie sth., sb., etc. 
bei -- Behalte die exakte Struktur (gleiche Anzahl Eintraege) +WICHTIG — Aendere NICHTS in diesen Faellen: +- Woerter die korrekt geschrieben sind (auch wenn sie ungewoehnlich aussehen) +- Eigennamen, Laendernamen, Staedtenamen (z.B. China, Japan, London, Africa) +- Abkuerzungen wie sth., sb., etc., e.g., i.e. +- Lautschrift und phonetische Zeichen in eckigen Klammern [...] +- Fachbegriffe und Fremdwoerter die korrekt sind +- Im Zweifel: NICHT aendern! Antworte NUR mit dem korrigierten JSON-Array. Kein erklaerener Text. Fuer jeden Eintrag den du aenderst, setze "corrected": true. Fuer unveraenderte Eintraege setze "corrected": false. +Behalte die exakte Struktur (gleiche Anzahl Eintraege). /no_think @@ -4365,6 +4390,38 @@ Eingabe: {_json.dumps(table_lines, ensure_ascii=False, indent=2)}""" +def _is_spurious_change(old_val: str, new_val: str) -> bool: + """Detect LLM changes that are likely wrong and should be discarded. + + Filters out: + - Case-only changes (OCR doesn't typically swap case) + - Completely different words (LLM hallucinating a replacement) + Note: proper nouns / place names are not detected explicitly; only the two rules above protect them + """ + if not old_val or not new_val: + return False + + # Case-only change — almost never a real OCR error + if old_val.lower() == new_val.lower(): + return True + + # Single-word value replaced by a totally different word (casing is not inspected): + # likely a proper noun or other valid word the LLM "corrected" + old_words = old_val.split() + new_words = new_val.split() + if len(old_words) == 1 and len(new_words) == 1: + ow, nw = old_words[0], new_words[0] + # Both are single words but share very few characters — likely hallucination + if len(ow) > 2 and len(nw) > 2: + # Quick character-overlap check (not an edit distance): if < 50% chars overlap, reject + common = sum(1 for c in ow.lower() if c in nw.lower()) + max_len = max(len(ow), len(nw)) + if common / max_len < 0.5: + return True + + return False + + def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], 
List[Dict]]: """Compare original entries with LLM-corrected ones, return (changes, corrected_entries).""" changes = [] @@ -4377,6 +4434,9 @@ def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict new_val = c.get(key, "").strip() old_val = (orig.get(field_name, "") or "").strip() if new_val and new_val != old_val: + # Filter spurious LLM changes + if _is_spurious_change(old_val, new_val): + continue changes.append({ "row_index": orig.get("row_index", i), "field": field_name,