From 962bbbe9f6c0b8560e1c6b0f895e3bf7fff9eb3a Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 10:47:17 +0100 Subject: [PATCH] Remove scattered debris rows and disable spanning header detection - Add Rule 3 to junk-row filter: rows where no word is longer than 2 chars are removed as scattered OCR debris from illustrations - Fully disable spanning-header detection which falsely flagged IPA transcriptions and vocabulary entries as spanning headers - First-row heuristic remains for genuine header detection Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 37 ++++++++-------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 18252c3..6b2be75 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -452,30 +452,11 @@ def _detect_header_rows( if 0 not in headers: headers.append(0) - # Spanning header detection: rows with very few words that span - # across many columns (e.g. "Unit 4: Bonnie Scotland" centred - # across all columns). Only trigger for clear cases (≥3 cols, - # ≤3 words) to avoid false positives on vocabulary worksheets - # where colored entries naturally span 2 columns. - if columns and len(columns) >= 3: - for row in rows: - ri = row["index"] - if ri in headers: - continue - row_words = [ - w for w in zone_words - if row["y_min"] <= w["top"] + w["height"] / 2 <= row["y_max"] - ] - if not row_words or len(row_words) > 3: - continue - word_x_min = min(w["left"] for w in row_words) - word_x_max = max(w["left"] + w["width"] for w in row_words) - cols_spanned = sum( - 1 for c in columns - if word_x_min < c["x_max"] and word_x_max > c["x_min"] - ) - if cols_spanned >= 3 and len(row_words) <= 3: - headers.append(ri) + # Note: Spanning-header detection (rows spanning all columns) has been + # disabled because it produces too many false positives on vocabulary + # worksheets where IPA transcriptions or short entries naturally span + # multiple columns with few words. The first-row heuristic above is + # sufficient for detecting real headers. return headers @@ -1124,6 +1105,14 @@ async def build_grid(session_id: str): junk_row_indices.add(ri) continue + # Rule 3: scattered debris — rows with only tiny fragments + # (e.g. OCR artifacts from illustrations/graphics). + # If the row has no word longer than 2 chars, it's noise. + longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs) + if longest <= 2: + junk_row_indices.add(ri) + continue + if junk_row_indices: z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices] z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]