From fe754398c031d30e2111216df6699814f9b48a8c Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 24 Mar 2026 14:10:43 +0100 Subject: [PATCH] fix: Step 4f sidebar detection uses avg text length instead of fill ratio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Column_1 data showed avg_len=1.0 with 13 single-char cells (alphabet letters from sidebar). Old fill_ratio check (76% > 35%) missed it. New criteria: avg_len ≤ 1.5 AND ≥ 70% single chars → removes column. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 50 ++++++++++------------ 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index fb92dd8..895c172 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -2130,53 +2130,49 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: strip_gap, strip_count, total, ) - # 4f. Remove thin decorative edge columns (alphabet sidebar safety net). - # If the leftmost or rightmost column has very few filled cells AND - # most of its text is short (≤2 chars), it's likely an alphabet sidebar - # that slipped through word-level pre-filters. + # 4f. Remove decorative edge columns (alphabet sidebar safety net). + # Dictionary pages have A-Z letter sidebars that OCR reads as single- + # character word_boxes. These form narrow columns with very short text. + # Detection: edge column where almost ALL cells are single characters. for z in zones_data: columns = z.get("columns", []) cells = z.get("cells", []) if len(columns) < 3 or not cells: continue - # Group cells by col_type + # Group cells by col_type (skip spanning_header) col_cells: Dict[str, List[Dict]] = {} for cell in cells: ct = cell.get("col_type", "") - col_cells.setdefault(ct, []).append(cell) - # Find edge column types (first and last) + if ct.startswith("column_"): + col_cells.setdefault(ct, []).append(cell) col_types_ordered = sorted(col_cells.keys()) - if not col_types_ordered: - continue - # Median cell count across columns (excluding heading rows) - col_counts = [len(v) for v in col_cells.values()] - median_count = sorted(col_counts)[len(col_counts) // 2] if col_counts else 0 - if median_count < 3: + if len(col_types_ordered) < 3: continue for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]: edge_cells_list = col_cells.get(edge_ct, []) - if not edge_cells_list: + if len(edge_cells_list) < 3: continue - fill_ratio = len(edge_cells_list) / median_count - if fill_ratio > 0.35: - continue # well-filled column → not decorative - short_count = sum( - 1 for c in edge_cells_list - if len((c.get("text") or "").strip()) <= 2 - ) - short_ratio = short_count / len(edge_cells_list) if edge_cells_list else 0 - if short_ratio < 0.6: - continue # too much real content → not decorative + # Key criterion: average text length and single-char ratio. + # Alphabet sidebars have avg_len ≈ 1.0 and nearly all cells + # are single characters. + texts = [(c.get("text") or "").strip() for c in edge_cells_list] + avg_len = sum(len(t) for t in texts) / len(texts) + single_char = sum(1 for t in texts if len(t) <= 1) + single_ratio = single_char / len(texts) + if avg_len > 1.5: + continue # real content has longer text + if single_ratio < 0.7: + continue # not dominated by single chars # Remove this edge column removed_count = len(edge_cells_list) edge_ids = {id(c) for c in edge_cells_list} z["cells"] = [c for c in cells if id(c) not in edge_ids] z["columns"] = [col for col in columns if col.get("col_type") != edge_ct] logger.info( - "Step 4f: removed thin decorative edge column '%s' from zone %d " - "(%d cells, fill=%.0f%%, short=%.0f%%)", + "Step 4f: removed decorative edge column '%s' from zone %d " + "(%d cells, avg_len=%.1f, single_char=%.0f%%)", edge_ct, z.get("zone_index", 0), removed_count, - fill_ratio * 100, short_ratio * 100, + avg_len, single_ratio * 100, ) break # only remove one edge per zone