From f139d0903e5cb599f06ebf1d481df8020f48c5d8 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 11:08:23 +0100 Subject: [PATCH] Preserve alphabetic marker columns, broaden junk filter, enable IPA in grid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - _merge_inline_marker_columns: skip merge when ≥50% of words are alphabetic (preserves "to", "in", "der" columns) - Rule 2 (oversized stub): widen to ≤3 words / ≤5 chars (catches "SEA &") - IPA phonetics: map longest-avg-text column to column_en so fix_cell_phonetics runs in the grid editor - ocr_pipeline_overlays: add missing split_page_into_zones import Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 81 +++++++++++++++---- .../backend/ocr_pipeline_overlays.py | 2 +- 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 6b2be75..792ec4d 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -326,6 +326,9 @@ def _filter_border_ghosts( return filtered, len(words) - len(filtered) +_MARKER_CHARS = set("•*·-–—|~=+#>→►▸▪◆○●□■✓✗✔✘") + + def _merge_inline_marker_columns( columns: List[Dict], words: List[Dict], @@ -335,6 +338,9 @@ def _merge_inline_marker_columns( Bullet points (•, *, -) and numbering (1., 2.) create narrow columns at the left edge of a zone. These are inline markers that indent text, not real separate columns. Merge them with their right neighbour. + + Does NOT merge columns containing alphabetic words like "to", "in", + "der", "die", "das" — those are legitimate content columns. """ if len(columns) < 2: return columns @@ -353,21 +359,38 @@ def _merge_inline_marker_columns( ] col_width = col["x_max"] - col["x_min"] - # Narrow column with mostly short words → likely inline markers + # Narrow column with mostly short words → MIGHT be inline markers if col_words and col_width < 80: avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words) if avg_len <= 2 and i + 1 < len(columns): - # Merge into next column - next_col = columns[i + 1].copy() - next_col["x_min"] = col["x_min"] - merged.append(next_col) - skip.add(i + 1) - logger.info( - " merged inline marker column %d (w=%d, avg_len=%.1f) " - "into column %d", - i, col_width, avg_len, i + 1, + # Check if words are actual markers (symbols/numbers) vs + # real alphabetic words like "to", "in", "der", "die" + texts = [(w.get("text") or "").strip() for w in col_words] + alpha_count = sum( + 1 for t in texts + if t and t[0].isalpha() and t not in _MARKER_CHARS ) - continue + alpha_ratio = alpha_count / len(texts) if texts else 0 + + # If ≥50% of words are alphabetic, this is a real column + if alpha_ratio >= 0.5: + logger.info( + " kept narrow column %d (w=%d, avg_len=%.1f, " + "alpha=%.0f%%) — contains real words", + i, col_width, avg_len, alpha_ratio * 100, + ) + else: + # Merge into next column + next_col = columns[i + 1].copy() + next_col["x_min"] = col["x_min"] + merged.append(next_col) + skip.add(i + 1) + logger.info( + " merged inline marker column %d (w=%d, avg_len=%.1f) " + "into column %d", + i, col_width, avg_len, i + 1, + ) + continue merged.append(col) @@ -1096,12 +1119,13 @@ async def build_grid(session_id: str): junk_row_indices.add(ri) continue - # Rule 2: oversized stub — ≤2 words, all short text (≤2 chars), - # and word height > 1.8× median (page numbers, stray marks) - if len(row_wbs) <= 2: + # Rule 2: oversized stub — ≤3 words, short total text, + # and word height > 1.8× median (page numbers, stray marks, + # OCR from illustration labels like "SEA &") + if len(row_wbs) <= 3: total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs) max_h = max((wb.get("height", 0) for wb in row_wbs), default=0) - if len(total_text) <= 3 and max_h > median_wb_h * 1.8: + if len(total_text) <= 5 and max_h > median_wb_h * 1.8: junk_row_indices.add(ri) continue @@ -1141,8 +1165,35 @@ async def build_grid(session_id: str): # 5c. IPA phonetic correction — replace garbled OCR phonetics with # correct IPA from the dictionary (same as in the OCR pipeline). + # The grid uses generic col_types (column_1, column_2, ...) but + # fix_cell_phonetics expects column_en / column_text. Identify + # the English headword column (longest average text) and mark it. all_cells = [cell for z in zones_data for cell in z.get("cells", [])] + # Find which col_type has the longest average text → English headwords + col_avg_len: Dict[str, List[int]] = {} + for cell in all_cells: + ct = cell.get("col_type", "") + txt = cell.get("text", "") + col_avg_len.setdefault(ct, []).append(len(txt)) + en_col_type = None + best_avg = 0 + for ct, lengths in col_avg_len.items(): + if not ct.startswith("column_"): + continue + avg = sum(lengths) / len(lengths) if lengths else 0 + if avg > best_avg: + best_avg = avg + en_col_type = ct + if en_col_type: + for cell in all_cells: + if cell.get("col_type") == en_col_type: + cell["_orig_col_type"] = en_col_type + cell["col_type"] = "column_en" fix_cell_phonetics(all_cells, pronunciation="british") + for cell in all_cells: + orig = cell.pop("_orig_col_type", None) + if orig: + cell["col_type"] = orig duration = time.time() - t0 diff --git a/klausur-service/backend/ocr_pipeline_overlays.py b/klausur-service/backend/ocr_pipeline_overlays.py index e63ead7..2789557 100644 --- a/klausur-service/backend/ocr_pipeline_overlays.py +++ b/klausur-service/backend/ocr_pipeline_overlays.py @@ -25,7 +25,7 @@ from ocr_pipeline_common import ( ) from ocr_pipeline_session_store import get_session_db, get_session_image from cv_color_detect import _COLOR_HEX, _COLOR_RANGES -from cv_box_detect import detect_boxes +from cv_box_detect import detect_boxes, split_page_into_zones from ocr_pipeline_rows import _draw_box_exclusion_overlay logger = logging.getLogger(__name__)