From 9da45c2a59679985dc9808a1aa444c5eb500ed80 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Wed, 18 Mar 2026 10:38:20 +0100 Subject: [PATCH] Fix false header detection and add decorative margin/footer filters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove all_colored spanning header heuristic that falsely flagged colored vocabulary entries (Scotland, secondary school) as headers - Add _filter_decorative_margin: removes vertical A-Z alphabet strips along page margins (single-char words in a compact vertical strip) - Add _filter_footer_words: removes page numbers in bottom 5% of page - Tighten spanning header rule: require ≥3 columns spanned + ≤3 words Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 135 ++++++++++++++++++--- 1 file changed, 115 insertions(+), 20 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index e153b31..18252c3 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -452,11 +452,12 @@ def _detect_header_rows( if 0 not in headers: headers.append(0) - # Spanning header detection: rows with few words that cross column - # boundaries and don't fit the normal multi-column pattern. - if columns and len(columns) >= 2: - # Typical data row has words in 2+ columns; a spanning header has - # words that sit in the middle columns without matching the pattern. + # Spanning header detection: rows with very few words that span + # across many columns (e.g. "Unit 4: Bonnie Scotland" centred + # across all columns). Only trigger for clear cases (≥3 cols, + # ≤3 words) to avoid false positives on vocabulary worksheets + # where colored entries naturally span 2 columns. 
+ if columns and len(columns) >= 3: for row in rows: ri = row["index"] if ri in headers: @@ -465,26 +466,15 @@ def _detect_header_rows( w for w in zone_words if row["y_min"] <= w["top"] + w["height"] / 2 <= row["y_max"] ] - if not row_words or len(row_words) > 6: - continue # too many words to be a header - # Check if all row words are colored (common for section headers) - all_colored = all( - w.get("color_name") and w.get("color_name") != "black" - for w in row_words - ) - # Check if words span across the middle columns (not in col 0) + if not row_words or len(row_words) > 3: + continue word_x_min = min(w["left"] for w in row_words) word_x_max = max(w["left"] + w["width"] for w in row_words) - first_col_end = columns[0]["x_max"] if columns else 0 - # Header if: colored text that starts after the first column - # or spans more than 2 columns cols_spanned = sum( 1 for c in columns if word_x_min < c["x_max"] and word_x_max > c["x_min"] ) - if all_colored and cols_spanned >= 2: - headers.append(ri) - elif cols_spanned >= 3 and len(row_words) <= 4: + if cols_spanned >= 3 and len(row_words) <= 3: headers.append(ri) return headers @@ -655,6 +645,100 @@ def _get_content_bounds(words: List[Dict]) -> tuple: return x_min, y_min, x_max - x_min, y_max - y_min +def _filter_decorative_margin( + words: List[Dict], + img_w: int, + log: Any, + session_id: str, +) -> None: + """Remove words that belong to a decorative alphabet strip on a margin. + + Some vocabulary worksheets have a vertical A–Z alphabet graphic along + the left or right edge. OCR reads each letter as an isolated single- + character word. These decorative elements are not content and confuse + column/row detection. 
+ + Detection criteria: + - Word centres lie in the outer 30% of the page width (left or right) + - Only single-character words are considered (any character) + - At least 8 such words on one side, covering ≥6 distinct 20px Y buckets + - Horizontal spread of the words' left edges is small (≤ 80px) + + Modifies *words* in place. + """ + if not words or img_w <= 0: + return + + margin_cutoff = img_w * 0.30 + # Candidate margin words: single char, in left or right 30% + left_strip = [ + w for w in words + if len((w.get("text") or "").strip()) == 1 + and w["left"] + w.get("width", 0) / 2 < margin_cutoff + ] + right_strip = [ + w for w in words + if len((w.get("text") or "").strip()) == 1 + and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff + ] + + for strip, side in [(left_strip, "left"), (right_strip, "right")]: + if len(strip) < 8: + continue + # Check vertical distribution: should have many distinct Y positions + y_centers = sorted(set( + int(w["top"] + w.get("height", 0) / 2) // 20 * 20 # bucket + for w in strip + )) + if len(y_centers) < 6: + continue + # Check horizontal compactness + x_positions = [w["left"] for w in strip] + x_spread = max(x_positions) - min(x_positions) + if x_spread > 80: + continue + # This looks like a decorative alphabet strip — remove these words + strip_set = set(id(w) for w in strip) + before = len(words) + words[:] = [w for w in words if id(w) not in strip_set] + removed = before - len(words) + if removed: + log.info( + "build-grid session %s: removed %d decorative %s-margin chars", + session_id, removed, side, + ) + + +def _filter_footer_words( + words: List[Dict], + img_h: int, + log: Any, + session_id: str, +) -> None: + """Remove isolated words in the bottom 5% of the page (page numbers). + + Modifies *words* in place. 
+ """ + if not words or img_h <= 0: + return + footer_y = img_h * 0.95 + footer_words = [ + w for w in words + if w["top"] + w.get("height", 0) / 2 > footer_y + ] + if not footer_words: + return + # Only remove if footer has very few words (≤ 3) with short text + total_text = "".join((w.get("text") or "").strip() for w in footer_words) + if len(footer_words) <= 3 and len(total_text) <= 10: + footer_set = set(id(w) for w in footer_words) + words[:] = [w for w in words if id(w) not in footer_set] + log.info( + "build-grid session %s: removed %d footer words ('%s')", + session_id, len(footer_words), total_text, + ) + + # --------------------------------------------------------------------------- # Endpoints # --------------------------------------------------------------------------- @@ -696,7 +780,18 @@ async def build_grid(session_id: str): logger.info("build-grid session %s: %d words from %d cells", session_id, len(all_words), len(word_result["cells"])) - # 2b. Filter words inside detected graphic/image regions + # 2b. Filter decorative margin columns (alphabet graphics). + # Some worksheets have a decorative alphabet strip along one margin + # (A-Z in a graphic). OCR reads these as single-char words aligned + # vertically. Detect and remove them before grid building. + _filter_decorative_margin(all_words, img_w, logger, session_id) + + # 2c. Filter footer rows (page numbers at the very bottom). + # Isolated short text in the bottom 5% of the page is typically a + # page number ("64", "S. 12") and not real content. + _filter_footer_words(all_words, img_h, logger, session_id) + + # 2d. Filter words inside detected graphic/image regions # Only remove LOW-CONFIDENCE words (likely OCR artifacts from images). # High-confidence words are real text even if they overlap a detected # graphic region (e.g. colored text that graphic detection couldn't