From 19b93f7762e466c3bf16c8961fe769cf4045dbc1 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 17 Mar 2026 18:19:25 +0100 Subject: [PATCH] fix: conservative column detection + smart graphic word filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Column detection: - Raise MIN_COVERAGE_PRIMARY 20%→35% (prevents false columns in flowing text, where random gaps cover < 35% of rows) - Raise MIN_COVERAGE_SECONDARY 12%→20%, MIN_WORDS_SECONDARY 3→4, MIN_DISTINCT_ROWS 2→3 - Vocabulary worksheets unaffected (columns appear in >80% of rows) Graphic word filter: - Only remove words with OCR confidence < 50 inside graphic regions - High-confidence words are real text, not image artifacts - Prevents legitimate colored text from being discarded Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 23 +++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index fdf189b..9f0137d 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -147,10 +147,13 @@ def _cluster_columns_by_alignment( }) # --- Filter by row coverage --- - MIN_COVERAGE_PRIMARY = 0.20 - MIN_COVERAGE_SECONDARY = 0.12 - MIN_WORDS_SECONDARY = 3 - MIN_DISTINCT_ROWS = 2 + # These thresholds must be high enough to avoid false columns in flowing + # text (random inter-word gaps) while still detecting real columns in + # vocabulary worksheets (which typically have >80% row coverage). + MIN_COVERAGE_PRIMARY = 0.35 + MIN_COVERAGE_SECONDARY = 0.20 + MIN_WORDS_SECONDARY = 4 + MIN_DISTINCT_ROWS = 3 # Content boundary for left-margin detection content_x_min = min(w["left"] for w in words) @@ -694,6 +697,11 @@ async def build_grid(session_id: str): session_id, len(all_words), len(word_result["cells"])) # 2b.
Filter words inside detected graphic/image regions + # Only remove LOW-CONFIDENCE words (likely OCR artifacts from images). + # High-confidence words are real text even if they overlap a detected + # graphic region (e.g. colored text that graphic detection couldn't + # fully distinguish from an image). + _GRAPHIC_CONF_THRESHOLD = 50 # keep words with conf >= 50 structure_result = session.get("structure_result") graphic_rects = [] if structure_result: @@ -713,13 +721,14 @@ async def build_grid(session_id: str): and gr["y"] <= w_cy <= gr["y"] + gr["h"] for gr in graphic_rects ) - if not inside: - filtered.append(w) + if inside and w.get("conf", 0) < _GRAPHIC_CONF_THRESHOLD: + continue # remove low-confidence artifact + filtered.append(w) removed = before - len(filtered) if removed: all_words = filtered logger.info( - "build-grid session %s: removed %d words inside %d graphic region(s)", + "build-grid session %s: removed %d low-conf words inside %d graphic region(s)", session_id, removed, len(graphic_rects), )