From be86a7d14d1bdec3fe39065dc2a122d474d6ba41 Mon Sep 17 00:00:00 2001
From: Benjamin Admin
 <benjaminadmin@37bf1d39-1dc6-4c68-807f-54c9737f55e1.fritz.box>
Date: Tue, 24 Mar 2026 13:52:11 +0100
Subject: [PATCH] fix: preserve pipe syllable dividers + detect alphabet
 sidebar columns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Pipe divider fix: Changed the OCR char-confusion regex so that a |
   directly preceded by a letter (syllable divider, e.g. Ka|me|rad) is
   NOT converted to I. Pipes not preceded by a letter are still
   converted (|ch → Ich, | want → I want), except before ".", "," or
   another "|".

2. Alphabet sidebar detection improvements:
   - _filter_decorative_margin() now considers 2-char words (OCR reads
     "Aa", "Bb" from sidebars), lowered min strip from 8→6
   - _filter_border_strip_words() lowered decorative threshold from 50%→45%
   - New step 4f: grid-level thin-edge-column filter as safety net —
     in zones with ≥3 columns, removes one edge column whose cell count
     is ≤35% of the median column's and whose cells are ≥60% short
     (≤2-char) text

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/cv_ocr_engines.py  |  5 +-
 klausur-service/backend/grid_editor_api.py | 64 ++++++++++++++++++++--
 2 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
index 3a3efa2..8adacc6 100644
--- a/klausur-service/backend/cv_ocr_engines.py
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -481,8 +481,9 @@ _CHAR_CONFUSION_RULES = [
     (re.compile(r'\b1([a-z])'), r'I\1'),           # 1ch → Ich, 1want → Iwant
     # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
     (re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'),  # "1 want" → "I want"
-    # "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
-    (re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'),    # |ch → Ich, | want → I want
+    # "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
+    # and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
+    (re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'),  # |ch → Ich, | want → I want
 ]
 
 # Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 21aa157..fb92dd8 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -84,14 +84,14 @@ def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
             break
 
     # Validate candidate strip: real border decorations are mostly short
-    # single-character words (alphabet letters, stray marks).  Multi-word
+    # words (alphabet letters like "A", "Bb", stray marks).  Multi-word
     # content like "der Ranzen" or "die Schals" (continuation of German
     # translations) must NOT be removed.
     def _is_decorative_strip(candidates: List[Dict]) -> bool:
         if not candidates:
             return False
         short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2)
-        return short / len(candidates) >= 0.5
+        return short / len(candidates) >= 0.45
 
     strip_ids: set = set()
     if left_count > 0 and left_count / total < 0.20:
@@ -1243,20 +1243,22 @@ def _filter_decorative_margin(
         return no_strip
 
     margin_cutoff = img_w * 0.30
-    # Phase 1: find candidate strips using single-char words
+    # Phase 1: find candidate strips using short words (1-2 chars).
+    # OCR often reads alphabet sidebar letters as pairs ("Aa", "Bb")
+    # rather than singles, so accept ≤2-char words as strip candidates.
     left_strip = [
         w for w in words
-        if len((w.get("text") or "").strip()) == 1
+        if len((w.get("text") or "").strip()) <= 2
         and w["left"] + w.get("width", 0) / 2 < margin_cutoff
     ]
     right_strip = [
         w for w in words
-        if len((w.get("text") or "").strip()) == 1
+        if len((w.get("text") or "").strip()) <= 2
         and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff
     ]
 
     for strip, side in [(left_strip, "left"), (right_strip, "right")]:
-        if len(strip) < 8:
+        if len(strip) < 6:
             continue
         # Check vertical distribution: should have many distinct Y positions
         y_centers = sorted(set(
@@ -2128,6 +2130,56 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                 strip_gap, strip_count, total,
             )
 
+    # 4f. Remove thin decorative edge columns (alphabet sidebar safety net).
+    # If the leftmost or rightmost column has very few filled cells AND
+    # most of its text is short (≤2 chars), it's likely an alphabet sidebar
+    # that slipped through word-level pre-filters.
+    for z in zones_data:
+        columns = z.get("columns", [])
+        cells = z.get("cells", [])
+        if len(columns) < 3 or not cells:
+            continue
+        # Group cells by col_type
+        col_cells: Dict[str, List[Dict]] = {}
+        for cell in cells:
+            ct = cell.get("col_type", "")
+            col_cells.setdefault(ct, []).append(cell)
+        # Find edge column types (first and last)
+        col_types_ordered = sorted(col_cells.keys())
+        if not col_types_ordered:
+            continue
+        # Median cell count across columns (upper median; every cell is counted, no rows are filtered out here)
+        col_counts = [len(v) for v in col_cells.values()]
+        median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0
+        if median_count < 3:
+            continue
+        for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
+            edge_cells_list = col_cells.get(edge_ct, [])
+            if not edge_cells_list:
+                continue
+            fill_ratio = len(edge_cells_list) / median_count
+            if fill_ratio > 0.35:
+                continue  # well-filled column → not decorative
+            short_count = sum(
+                1 for c in edge_cells_list
+                if len((c.get("text") or "").strip()) <= 2
+            )
+            short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0
+            if short_ratio < 0.6:
+                continue  # too much real content → not decorative
+            # Remove this edge column
+            removed_count = len(edge_cells_list)
+            edge_ids = {id(c) for c in edge_cells_list}
+            z["cells"] = [c for c in cells if id(c) not in edge_ids]
+            z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
+            logger.info(
+                "Step 4f: removed thin decorative edge column '%s' from zone %d "
+                "(%d cells, fill=%.0f%%, short=%.0f%%)",
+                edge_ct, z.get("zone_index", 0), removed_count,
+                fill_ratio * 100, short_ratio * 100,
+            )
+            break  # only remove one edge per zone
+
     # 5. Color annotation on final word_boxes in cells
     if img_bgr is not None:
         all_wb: List[Dict] = []