From 19b93f7762e466c3bf16c8961fe769cf4045dbc1 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Tue, 17 Mar 2026 18:19:25 +0100
Subject: [PATCH] fix: conservative column detection + smart graphic word
 filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Column detection:
- Raise MIN_COVERAGE_PRIMARY 20%→35% (prevents false columns in
  flowing text, where random gaps line up in fewer than 35% of rows)
- Raise MIN_COVERAGE_SECONDARY 12%→20%, MIN_WORDS_SECONDARY 3→4,
  MIN_DISTINCT_ROWS 2→3
- Vocabulary worksheets unaffected (columns appear in >80% of rows)

Graphic word filter:
- Only remove words with OCR confidence < 50 inside graphic regions
- High-confidence words are real text, not image artifacts
- Prevents legitimate colored text from being discarded

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 23 +++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index fdf189b..9f0137d 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -147,10 +147,13 @@ def _cluster_columns_by_alignment(
     })
 
     # --- Filter by row coverage ---
-    MIN_COVERAGE_PRIMARY = 0.20
-    MIN_COVERAGE_SECONDARY = 0.12
-    MIN_WORDS_SECONDARY = 3
-    MIN_DISTINCT_ROWS = 2
+    # These thresholds must be high enough to avoid false columns in flowing
+    # text (random inter-word gaps) while still detecting real columns in
+    # vocabulary worksheets (which typically have >80% row coverage).
+    MIN_COVERAGE_PRIMARY = 0.35
+    MIN_COVERAGE_SECONDARY = 0.20
+    MIN_WORDS_SECONDARY = 4
+    MIN_DISTINCT_ROWS = 3
 
     # Content boundary for left-margin detection
     content_x_min = min(w["left"] for w in words)
@@ -694,6 +697,11 @@ async def build_grid(session_id: str):
                 session_id, len(all_words), len(word_result["cells"]))
 
     # 2b. Filter words inside detected graphic/image regions
+    # Only remove LOW-CONFIDENCE words (likely OCR artifacts from images).
+    # High-confidence words are real text even if they overlap a detected
+    # graphic region (e.g. colored text that graphic detection couldn't
+    # fully distinguish from an image).
+    _GRAPHIC_CONF_THRESHOLD = 50  # keep words with conf >= 50
     structure_result = session.get("structure_result")
     graphic_rects = []
     if structure_result:
@@ -713,13 +721,14 @@ async def build_grid(session_id: str):
                 and gr["y"] <= w_cy <= gr["y"] + gr["h"]
                 for gr in graphic_rects
             )
-            if not inside:
-                filtered.append(w)
+            if inside and w.get("conf", 0) < _GRAPHIC_CONF_THRESHOLD:
+                continue  # remove low-confidence artifact
+            filtered.append(w)
         removed = before - len(filtered)
         if removed:
             all_words = filtered
             logger.info(
-                "build-grid session %s: removed %d words inside %d graphic region(s)",
+                "build-grid session %s: removed %d low-conf words inside %d graphic region(s)",
                 session_id, removed, len(graphic_rects),
             )