diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index fdf189b..9f0137d 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -147,10 +147,13 @@ def _cluster_columns_by_alignment(
         })

     # --- Filter by row coverage ---
-    MIN_COVERAGE_PRIMARY = 0.20
-    MIN_COVERAGE_SECONDARY = 0.12
-    MIN_WORDS_SECONDARY = 3
-    MIN_DISTINCT_ROWS = 2
+    # These thresholds must be high enough to avoid false columns in flowing
+    # text (random inter-word gaps) while still detecting real columns in
+    # vocabulary worksheets (which typically have >80% row coverage).
+    MIN_COVERAGE_PRIMARY = 0.35
+    MIN_COVERAGE_SECONDARY = 0.20
+    MIN_WORDS_SECONDARY = 4
+    MIN_DISTINCT_ROWS = 3

     # Content boundary for left-margin detection
     content_x_min = min(w["left"] for w in words)
@@ -694,6 +697,11 @@ async def build_grid(session_id: str):
         session_id, len(all_words), len(word_result["cells"]))

     # 2b. Filter words inside detected graphic/image regions
+    # Only remove LOW-CONFIDENCE words (likely OCR artifacts from images).
+    # High-confidence words are real text even if they overlap a detected
+    # graphic region (e.g. colored text that graphic detection couldn't
+    # fully distinguish from an image).
+    _GRAPHIC_CONF_THRESHOLD = 50  # keep words with conf >= 50
     structure_result = session.get("structure_result")
     graphic_rects = []
     if structure_result:
@@ -713,13 +721,14 @@ async def build_grid(session_id: str):
             and gr["y"] <= w_cy <= gr["y"] + gr["h"]
             for gr in graphic_rects
         )
-        if not inside:
-            filtered.append(w)
+        if inside and w.get("conf", 0) < _GRAPHIC_CONF_THRESHOLD:
+            continue  # remove low-confidence artifact
+        filtered.append(w)

     removed = before - len(filtered)
     if removed:
         all_words = filtered
         logger.info(
-            "build-grid session %s: removed %d words inside %d graphic region(s)",
+            "build-grid session %s: removed %d low-conf words inside %d graphic region(s)",
             session_id, removed, len(graphic_rects),
         )