From 432eee3694ce6903d046cdec1ab1fb29ef910211 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 19 Mar 2026 09:38:24 +0100 Subject: [PATCH] Auto-filter decorative margin strips and header junk - _filter_decorative_margin: Phase 2 now also removes short words (<=3 chars) in the same narrow x-range as the detected single-char strip, catching multi-char OCR artifacts like "Vv" from alphabet graphics. - _filter_header_junk: New filter detects the content start (first row with 3+ high-confidence words) and removes low-conf short fragments above it that are OCR artifacts from header illustrations. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/grid_editor_api.py | 116 +++++++++++++++++++-- 1 file changed, 108 insertions(+), 8 deletions(-) diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 06a85e8..e510aaf 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -663,11 +663,15 @@ def _filter_decorative_margin( character word. These decorative elements are not content and confuse column/row detection. - Detection criteria: + Detection criteria (phase 1 — find the strip using single-char words): - Words are in the outer 30% of the page (left or right) - Nearly all words are single characters (letters or digits) - At least 8 such words form a vertical strip (≥8 unique Y positions) - - Average horizontal spread of the strip is small (< 60px) + - Horizontal spread of the strip (max - min x) is small (≤ 80px) + + Phase 2 — once a strip is confirmed, also remove any short word (≤3 + chars) in the same narrow x-range. This catches multi-char OCR + artifacts like "Vv" that belong to the same decorative element. Modifies *words* in place. 
""" @@ -675,7 +679,7 @@ def _filter_decorative_margin( return margin_cutoff = img_w * 0.30 - # Candidate margin words: single char, in left or right 30% + # Phase 1: find candidate strips using single-char words left_strip = [ w for w in words if len((w.get("text") or "").strip()) == 1 @@ -699,18 +703,34 @@ def _filter_decorative_margin( continue # Check horizontal compactness x_positions = [w["left"] for w in strip] - x_spread = max(x_positions) - min(x_positions) + x_min = min(x_positions) + x_max = max(x_positions) + x_spread = x_max - x_min if x_spread > 80: continue - # This looks like a decorative alphabet strip — remove these words - strip_set = set(id(w) for w in strip) + + # Phase 2: strip confirmed — also collect short words in same x-range + # Expand x-range slightly to catch neighbors (e.g. "Vv" next to "U") + strip_x_lo = x_min - 20 + strip_x_hi = x_max + 60 # word width + tolerance + all_strip_words = [ + w for w in words + if len((w.get("text") or "").strip()) <= 3 + and strip_x_lo <= w["left"] <= strip_x_hi + and (w["left"] + w.get("width", 0) / 2 < margin_cutoff + if side == "left" + else w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff) + ] + + strip_set = set(id(w) for w in all_strip_words) before = len(words) words[:] = [w for w in words if id(w) not in strip_set] removed = before - len(words) if removed: log.info( - "build-grid session %s: removed %d decorative %s-margin chars", - session_id, removed, side, + "build-grid session %s: removed %d decorative %s-margin words " + "(strip x=%d-%d)", + session_id, removed, side, strip_x_lo, strip_x_hi, ) @@ -744,6 +764,82 @@ def _filter_footer_words( ) +def _filter_header_junk( + words: List[Dict], + img_h: int, + log: Any, + session_id: str, +) -> None: + """Remove OCR junk from header illustrations above the real content. + + Textbook pages often have decorative header graphics (illustrations, + icons) that OCR reads as low-confidence junk characters. 
Real content + typically starts further down the page. + + Algorithm: + 1. Find the "content start" — the first Y position where a dense + horizontal row of 3+ high-confidence words begins. + 2. Above that line, remove words with conf < 75 and text ≤ 3 chars. + These are almost certainly OCR artifacts from illustrations. + + Modifies *words* in place. + """ + if not words or img_h <= 0: + return + + # --- Find content start: first horizontal row with ≥3 high-conf words --- + # Sort words by Y + sorted_by_y = sorted(words, key=lambda w: w["top"]) + content_start_y = 0 + _ROW_TOLERANCE = img_h * 0.02 # words within 2% of page height = same row + _MIN_ROW_WORDS = 3 + _MIN_CONF = 80 + + i = 0 + while i < len(sorted_by_y): + row_y = sorted_by_y[i]["top"] + # Collect words in this row band + row_words = [] + j = i + while j < len(sorted_by_y) and sorted_by_y[j]["top"] - row_y < _ROW_TOLERANCE: + row_words.append(sorted_by_y[j]) + j += 1 + # Count high-confidence words with real text (> 1 char) + high_conf = [ + w for w in row_words + if w.get("conf", 0) >= _MIN_CONF + and len((w.get("text") or "").strip()) > 1 + ] + if len(high_conf) >= _MIN_ROW_WORDS: + content_start_y = row_y + break + i = j if j > i else i + 1 + + if content_start_y <= 0: + return # no clear content start found + + # --- Remove low-conf short junk above content start --- + junk = [ + w for w in words + if w["top"] + w.get("height", 0) < content_start_y + and w.get("conf", 0) < 75 + and len((w.get("text") or "").strip()) <= 3 + ] + if not junk: + return + + junk_set = set(id(w) for w in junk) + before = len(words) + words[:] = [w for w in words if id(w) not in junk_set] + removed = before - len(words) + if removed: + log.info( + "build-grid session %s: removed %d header junk words above y=%d " + "(content start)", + session_id, removed, content_start_y, + ) + + # --------------------------------------------------------------------------- # Core computation (used by build-grid endpoint and regression 
tests) # --------------------------------------------------------------------------- @@ -792,6 +888,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: # page number ("64", "S. 12") and not real content. _filter_footer_words(all_words, img_h, logger, session_id) + # 2c2. Filter OCR junk from header illustrations. + # Low-confidence short fragments above the first real content row. + _filter_header_junk(all_words, img_h, logger, session_id) + # 2d. Filter words inside user-defined exclude regions (from Structure step). # These are explicitly marked by the user, so ALL words inside are removed # regardless of confidence.