From 962bbbe9f6c0b8560e1c6b0f895e3bf7fff9eb3a Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Wed, 18 Mar 2026 10:47:17 +0100
Subject: [PATCH] Remove scattered debris rows and disable spanning header
 detection

- Add Rule 3 to junk-row filter: rows where no word is longer than
  2 chars are removed as scattered OCR debris from illustrations
- Remove the spanning-header detection code, which falsely flagged IPA
  transcriptions and vocabulary entries as spanning headers
- First-row heuristic remains for genuine header detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 37 ++++++++--------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 18252c3..6b2be75 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -452,30 +452,11 @@ def _detect_header_rows(
             if 0 not in headers:
                 headers.append(0)
 
-    # Spanning header detection: rows with very few words that span
-    # across many columns (e.g. "Unit 4: Bonnie Scotland" centred
-    # across all columns).  Only trigger for clear cases (≥3 cols,
-    # ≤3 words) to avoid false positives on vocabulary worksheets
-    # where colored entries naturally span 2 columns.
-    if columns and len(columns) >= 3:
-        for row in rows:
-            ri = row["index"]
-            if ri in headers:
-                continue
-            row_words = [
-                w for w in zone_words
-                if row["y_min"] <= w["top"] + w["height"] / 2 <= row["y_max"]
-            ]
-            if not row_words or len(row_words) > 3:
-                continue
-            word_x_min = min(w["left"] for w in row_words)
-            word_x_max = max(w["left"] + w["width"] for w in row_words)
-            cols_spanned = sum(
-                1 for c in columns
-                if word_x_min < c["x_max"] and word_x_max > c["x_min"]
-            )
-            if cols_spanned >= 3 and len(row_words) <= 3:
-                headers.append(ri)
+    # Note: Spanning-header detection (rows of <=3 words spanning >=3
+    # columns) was removed: it produced too many false positives on
+    # vocabulary worksheets, where IPA transcriptions or short entries
+    # naturally span multiple columns with few words.  The first-row
+    # heuristic above is sufficient for detecting real headers.
 
     return headers
 
@@ -1124,6 +1105,14 @@ async def build_grid(session_id: str):
                     junk_row_indices.add(ri)
                     continue
 
+            # Rule 3: scattered debris — rows with only tiny fragments
+            # (e.g. OCR artifacts from illustrations/graphics).
+            # If the row has no word longer than 2 chars, it's noise.
+            longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
+            if longest <= 2:
+                junk_row_indices.add(ri)
+                continue
+
         if junk_row_indices:
             z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
             z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]