From be86a7d14d1bdec3fe39065dc2a122d474d6ba41 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 24 Mar 2026 13:52:11 +0100 Subject: [PATCH] fix: preserve pipe syllable dividers + detect alphabet sidebar columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Pipe divider fix: Changed OCR char-confusion regex so | between letters (Ka|me|rad) is NOT converted to I. Only standalone/ word-boundary pipes are converted (|ch → Ich, | want → I want). 2. Alphabet sidebar detection improvements: - _filter_decorative_margin() now considers 2-char words (OCR reads "Aa", "Bb" from sidebars), lowered min strip from 8→6 - _filter_border_strip_words() lowered decorative threshold from 50%→45% - New step 4f: grid-level thin-edge-column filter as safety net — removes edge columns with <35% fill rate and >60% short text Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_ocr_engines.py | 5 +- klausur-service/backend/grid_editor_api.py | 64 ++++++++++++++++++++-- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 3a3efa2..8adacc6 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -481,8 +481,9 @@ _CHAR_CONFUSION_RULES = [ (re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number) (re.compile(r'(? Tuple[List[Dict], int]: break # Validate candidate strip: real border decorations are mostly short - # single-character words (alphabet letters, stray marks). Multi-word + # words (alphabet letters like "A", "Bb", stray marks). Multi-word # content like "der Ranzen" or "die Schals" (continuation of German # translations) must NOT be removed. def _is_decorative_strip(candidates: List[Dict]) -> bool: if not candidates: return False short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2) - return short / len(candidates) >= 0.5 + return short / len(candidates) >= 0.45 strip_ids: set = set() if left_count > 0 and left_count / total < 0.20: @@ -1243,20 +1243,22 @@ def _filter_decorative_margin( return no_strip margin_cutoff = img_w * 0.30 - # Phase 1: find candidate strips using single-char words + # Phase 1: find candidate strips using short words (1-2 chars). + # OCR often reads alphabet sidebar letters as pairs ("Aa", "Bb") + # rather than singles, so accept ≤2-char words as strip candidates. left_strip = [ w for w in words - if len((w.get("text") or "").strip()) == 1 + if len((w.get("text") or "").strip()) <= 2 and w["left"] + w.get("width", 0) / 2 < margin_cutoff ] right_strip = [ w for w in words - if len((w.get("text") or "").strip()) == 1 + if len((w.get("text") or "").strip()) <= 2 and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff ] for strip, side in [(left_strip, "left"), (right_strip, "right")]: - if len(strip) < 8: + if len(strip) < 6: continue # Check vertical distribution: should have many distinct Y positions y_centers = sorted(set( @@ -2128,6 +2130,56 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: strip_gap, strip_count, total, ) + # 4f. Remove thin decorative edge columns (alphabet sidebar safety net). + # If the leftmost or rightmost column has very few filled cells AND + # most of its text is short (≤2 chars), it's likely an alphabet sidebar + # that slipped through word-level pre-filters. + for z in zones_data: + columns = z.get("columns", []) + cells = z.get("cells", []) + if len(columns) < 3 or not cells: + continue + # Group cells by col_type + col_cells: Dict[str, List[Dict]] = {} + for cell in cells: + ct = cell.get("col_type", "") + col_cells.setdefault(ct, []).append(cell) + # Find edge column types (first and last) + col_types_ordered = sorted(col_cells.keys()) + if not col_types_ordered: + continue + # Median cell count across columns (excluding heading rows) + col_counts = [len(v) for v in col_cells.values()] + median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0 + if median_count < 3: + continue + for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]: + edge_cells_list = col_cells.get(edge_ct, []) + if not edge_cells_list: + continue + fill_ratio = len(edge_cells_list) / median_count + if fill_ratio > 0.35: + continue # well-filled column → not decorative + short_count = sum( + 1 for c in edge_cells_list + if len((c.get("text") or "").strip()) <= 2 + ) + short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0 + if short_ratio < 0.6: + continue # too much real content → not decorative + # Remove this edge column + removed_count = len(edge_cells_list) + edge_ids = {id(c) for c in edge_cells_list} + z["cells"] = [c for c in cells if id(c) not in edge_ids] + z["columns"] = [col for col in columns if col.get("col_type") != edge_ct] + logger.info( + "Step 4f: removed thin decorative edge column '%s' from zone %d " + "(%d cells, fill=%.0f%%, short=%.0f%%)", + edge_ct, z.get("zone_index", 0), removed_count, + fill_ratio * 100, short_ratio * 100, + ) + break # only remove one edge per zone + # 5. Color annotation on final word_boxes in cells if img_bgr is not None: all_wb: List[Dict] = []