diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index fc13a6f..1aa12a9 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -631,6 +631,34 @@ async def _build_grid_core( sorted(junk_row_indices), ) + # 4b2. Remove individual cells that consist of a single very-short, + # low-confidence word (OCR artifacts like "as", "b" from stray marks). + # These survive row-level junk removal when the row has valid cells + # in other columns. + _ARTIFACT_MAX_LEN = 2 + _ARTIFACT_CONF_THRESHOLD = 65 + for z in zones_data: + cells = z.get("cells", []) + if not cells: + continue + artifact_ids = set() + for cell in cells: + wbs = cell.get("word_boxes") or [] + if len(wbs) != 1: + continue + wb = wbs[0] + text = (wb.get("text") or "").strip() + conf = wb.get("conf", 100) + if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD: + artifact_ids.add(cell.get("cell_id")) + if artifact_ids: + z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids] + logger.info( + "build-grid: removed %d artifact cells from zone %d: %s", + len(artifact_ids), z.get("zone_index", 0), + [c.get("text") for c in cells if c.get("cell_id") in artifact_ids], + ) + # 4c. Remove oversized word_boxes from individual cells. # OCR artifacts from graphics/images (e.g. a huge "N" from a map image) # have word heights 3-5x the median. Remove them per-word so they don't @@ -707,6 +735,52 @@ async def _build_grid_core( if cleaned != text.strip(): cell["text"] = cleaned + # 4d2. Normalize narrow connector columns. + # In synonym dictionaries a narrow column repeats the same word + # (e.g. "oder") in every row. OCR sometimes appends noise chars + # (e.g. "oderb" instead of "oder"). If ≥60% of cells in a column + # share the same short text, normalize near-match outliers. + for z in zones_data: + cols = z.get("columns", []) + cells = z.get("cells", []) + if not cols or not cells: + continue + for col in cols: + ci = col.get("index") + col_cells = [c for c in cells if c.get("col_index") == ci] + if len(col_cells) < 3: + continue + # Count text occurrences + text_counts: Dict[str, int] = {} + for c in col_cells: + t = (c.get("text") or "").strip() + if t: + text_counts[t] = text_counts.get(t, 0) + 1 + if not text_counts: + continue + dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type] + dominant_count = text_counts[dominant_text] + # Only normalize if dominant word is short and appears in ≥60% + if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6: + continue + # Fix outliers that start with the dominant text + fixed = 0 + for c in col_cells: + t = (c.get("text") or "").strip() + if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2: + c["text"] = dominant_text + # Also fix word_boxes + wbs = c.get("word_boxes") or [] + if len(wbs) == 1: + wbs[0]["text"] = dominant_text + fixed += 1 + if fixed: + logger.info( + "build-grid: normalized %d outlier cells in connector column %d " + "(dominant='%s') zone %d", + fixed, ci, dominant_text, z.get("zone_index", 0), + ) + # 4e. Detect and remove page-border decoration strips. # Skipped when the pre-filter already removed border words BEFORE # column detection — re-running would incorrectly detect the @@ -1095,8 +1169,9 @@ async def _build_grid_core( if c.get("cell_id") not in page_ref_cell_ids] # Detect footer: last non-header row if it has only 1 cell - # and the text is NOT IPA (no real IPA Unicode symbols). - # This catches page numbers like "two hundred and twelve". + # with short, non-content text (page numbers like "233" or + # "two hundred and twelve"). Comma-separated lists and long + # text are content continuations, not page numbers. footer_rows = [] non_header_rows = [r for r in rows if not r.get("is_header")] if non_header_rows: @@ -1108,7 +1183,13 @@ async def _build_grid_core( text = (last_cells[0].get("text") or "").strip() # Not IPA (no real IPA symbols) and not a heading has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text) - if text and not has_real_ipa and last_cells[0].get("col_type") != "heading": + # Comma-separated text is a content continuation, not a footer + has_commas = ',' in text + # Long text (>20 chars) is unlikely a page number + is_short = len(text) <= 20 + if (text and not has_real_ipa and not has_commas + and is_short + and last_cells[0].get("col_type") != "heading"): footer_rows.append({ "row_index": last_ri, "text": text,