diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 3f1e6bd..03926c6 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -266,7 +266,7 @@ def _cluster_columns_by_alignment( # Characters that are typically OCR artefacts from box border lines. # Intentionally excludes ! (red markers) and . , ; (real punctuation). -_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~") +_GRID_GHOST_CHARS = set("|1lI[](){}/\\-—–_~=+") def _filter_border_ghosts( @@ -303,10 +303,14 @@ def _filter_border_ghosts( text = (w.get("text") or "").strip() if not text: return False - cx = w["left"] + w["width"] / 2 - cy = w["top"] + w["height"] / 2 - on_border = any(lo <= cx <= hi for lo, hi in x_bands) or any( - lo <= cy <= hi for lo, hi in y_bands + # Check if any word edge (not just center) touches a border band + w_left = w["left"] + w_right = w["left"] + w["width"] + w_top = w["top"] + w_bottom = w["top"] + w["height"] + on_border = ( + any(lo <= w_left <= hi or lo <= w_right <= hi for lo, hi in x_bands) + or any(lo <= w_top <= hi or lo <= w_bottom <= hi for lo, hi in y_bands) ) if not on_border: return False @@ -318,6 +322,59 @@ def _filter_border_ghosts( return filtered, len(words) - len(filtered) +def _merge_inline_marker_columns( + columns: List[Dict], + words: List[Dict], +) -> List[Dict]: + """Merge narrow marker columns (bullets, numbering) into adjacent text. + + Bullet points (•, *, -) and numbering (1., 2.) create narrow columns + at the left edge of a zone. These are inline markers that indent text, + not real separate columns. Merge them with their right neighbour. + """ + if len(columns) < 2: + return columns + + merged: List[Dict] = [] + skip: set = set() + + for i, col in enumerate(columns): + if i in skip: + continue + + # Find words in this column + col_words = [ + w for w in words + if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"] + ] + col_width = col["x_max"] - col["x_min"] + + # Narrow column with mostly short words → likely inline markers + if col_words and col_width < 80: + avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words) + if avg_len <= 2 and i + 1 < len(columns): + # Merge into next column + next_col = columns[i + 1].copy() + next_col["x_min"] = col["x_min"] + merged.append(next_col) + skip.add(i + 1) + logger.info( + " merged inline marker column %d (w=%d, avg_len=%.1f) " + "into column %d", + i, col_width, avg_len, i + 1, + ) + continue + + merged.append(col) + + # Re-index + for i, col in enumerate(merged): + col["index"] = i + col["type"] = f"column_{i + 1}" if len(merged) > 1 else "column_text" + + return merged + + def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]: """Extract all word_boxes from cells into a flat list of word dicts.""" words: List[Dict] = [] @@ -445,6 +502,10 @@ def _build_zone_grid( # Use global columns if provided, otherwise detect per zone columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows) + # Merge inline marker columns (bullets, numbering) into adjacent text + if not global_columns: + columns = _merge_inline_marker_columns(columns, zone_words) + if not columns or not rows: return { "columns": [],