From 432eee3694ce6903d046cdec1ab1fb29ef910211 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 19 Mar 2026 09:38:24 +0100 Subject: [PATCH] Auto-filter decorative margin strips and header junk - _filter_decorative_margin: Phase 2 now also removes short words (<=3 chars) in the same narrow x-range as the detected single-char strip, catching multi-char OCR artifacts like "Vv" from alphabet graphics. - _filter_header_junk: New filter detects the content start (first row with 3+ high-confidence words) and removes low-conf short fragments above it that are OCR artifacts from header illustrations. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 116 +++++++++++++++++++-- 1 file changed, 108 insertions(+), 8 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 06a85e8..e510aaf 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -663,11 +663,15 @@ def _filter_decorative_margin( character word. These decorative elements are not content and confuse column/row detection. - Detection criteria: + Detection criteria (phase 1 — find the strip using single-char words): - Words are in the outer 30% of the page (left or right) - Nearly all words are single characters (letters or digits) - At least 8 such words form a vertical strip (≥8 unique Y positions) - - Average horizontal spread of the strip is small (< 60px) + - Horizontal spread of the strip (max - min x) is small (≤ 80px) + + Phase 2 — once a strip is confirmed, also remove any short word (≤3 + chars) in the same narrow x-range. This catches multi-char OCR + artifacts like "Vv" that belong to the same decorative element. Modifies *words* in place. 
""" @@ -675,7 +679,7 @@ def _filter_decorative_margin( return margin_cutoff = img_w * 0.30 - # Candidate margin words: single char, in left or right 30% + # Phase 1: find candidate strips using single-char words left_strip = [ w for w in words if len((w.get("text") or "").strip()) == 1 @@ -699,18 +703,34 @@ def _filter_decorative_margin( continue # Check horizontal compactness x_positions = [w["left"] for w in strip] - x_spread = max(x_positions) - min(x_positions) + x_min = min(x_positions) + x_max = max(x_positions) + x_spread = x_max - x_min if x_spread > 80: continue - # This looks like a decorative alphabet strip — remove these words - strip_set = set(id(w) for w in strip) + + # Phase 2: strip confirmed — also collect short words in same x-range + # Expand x-range slightly to catch neighbors (e.g. "Vv" next to "U") + strip_x_lo = x_min - 20 + strip_x_hi = x_max + 60 # word width + tolerance + all_strip_words = [ + w for w in words + if len((w.get("text") or "").strip()) <= 3 + and strip_x_lo <= w["left"] <= strip_x_hi + and (w["left"] + w.get("width", 0) / 2 < margin_cutoff + if side == "left" + else w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff) + ] + + strip_set = set(id(w) for w in all_strip_words) before = len(words) words[:] = [w for w in words if id(w) not in strip_set] removed = before - len(words) if removed: log.info( - "build-grid session %s: removed %d decorative %s-margin chars", - session_id, removed, side, + "build-grid session %s: removed %d decorative %s-margin words " + "(strip x=%d-%d)", + session_id, removed, side, strip_x_lo, strip_x_hi, ) @@ -744,6 +764,82 @@ def _filter_footer_words( ) +def _filter_header_junk( + words: List[Dict], + img_h: int, + log: Any, + session_id: str, +) -> None: + """Remove OCR junk from header illustrations above the real content. + + Textbook pages often have decorative header graphics (illustrations, + icons) that OCR reads as low-confidence junk characters. 
Real content + typically starts further down the page. + + Algorithm: + 1. Find the "content start" — the first Y position where a dense + horizontal row of 3+ high-confidence words begins. + 2. Above that line, remove words with conf < 75 and text ≤ 3 chars. + These are almost certainly OCR artifacts from illustrations. + + Modifies *words* in place. + """ + if not words or img_h <= 0: + return + + # --- Find content start: first horizontal row with ≥3 high-conf words --- + # Sort words by Y + sorted_by_y = sorted(words, key=lambda w: w["top"]) + content_start_y = 0 + _ROW_TOLERANCE = img_h * 0.02 # words within 2% of page height = same row + _MIN_ROW_WORDS = 3 + _MIN_CONF = 80 + + i = 0 + while i < len(sorted_by_y): + row_y = sorted_by_y[i]["top"] + # Collect words in this row band + row_words = [] + j = i + while j < len(sorted_by_y) and sorted_by_y[j]["top"] - row_y < _ROW_TOLERANCE: + row_words.append(sorted_by_y[j]) + j += 1 + # Count high-confidence words with real text (> 1 char) + high_conf = [ + w for w in row_words + if w.get("conf", 0) >= _MIN_CONF + and len((w.get("text") or "").strip()) > 1 + ] + if len(high_conf) >= _MIN_ROW_WORDS: + content_start_y = row_y + break + i = j if j > i else i + 1 + + if content_start_y <= 0: + return # no clear content start found + + # --- Remove low-conf short junk above content start --- + junk = [ + w for w in words + if w["top"] + w.get("height", 0) < content_start_y + and w.get("conf", 0) < 75 + and len((w.get("text") or "").strip()) <= 3 + ] + if not junk: + return + + junk_set = set(id(w) for w in junk) + before = len(words) + words[:] = [w for w in words if id(w) not in junk_set] + removed = before - len(words) + if removed: + log.info( + "build-grid session %s: removed %d header junk words above y=%d " + "(content start)", + session_id, removed, content_start_y, + ) + + # --------------------------------------------------------------------------- # Core computation (used by build-grid endpoint and regression 
tests) # --------------------------------------------------------------------------- @@ -792,6 +888,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # page number ("64", "S. 12") and not real content. _filter_footer_words(all_words, img_h, logger, session_id) + # 2c2. Filter OCR junk from header illustrations. + # Low-confidence short fragments above the first real content row. + _filter_header_junk(all_words, img_h, logger, session_id) + # 2d. Filter words inside user-defined exclude regions (from Structure step). # These are explicitly marked by the user, so ALL words inside are removed # regardless of confidence.