From b0e1fbc8d601f0c5b3e67c59ae9761b8e92dab91 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBookPro.fritz.box>
Date: Tue, 17 Mar 2026 11:31:55 +0100
Subject: [PATCH] feat: box zone artifact filter, spanning headers, parenthesis
 fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Filter recovered single-char artifacts (!, ?, •, ·) from box zones,
   where they are decorative noise rather than real text markers.

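2. Detect spanning header rows (e.g. "Unit4: Bonnie Scotland") that
   stretch across multiple columns. A row of at most 6 words counts as
   a spanning header when all its words are colored and cover 2+
   columns, or when it has at most 4 words covering 3+ columns. Merge
   the cells of every detected header row into a single spanning cell
   in column 0.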

3. Fix missing opening parentheses: when cell text contains ")" but no
   "(" at all, prepend "(" to the text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 121 +++++++++++++++++++--
 1 file changed, 114 insertions(+), 7 deletions(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 181a72f..e34843e 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -415,9 +415,13 @@ def _detect_header_rows(
     rows: List[Dict],
     zone_words: List[Dict],
     zone_y: int,
+    columns: Optional[List[Dict]] = None,
 ) -> List[int]:
-    """Heuristic: the first row is a header if it has bold/large text or
-    there's a significant gap after it."""
+    """Detect header rows: first-row heuristic + spanning header detection.
+
+    A "spanning header" is a row whose words stretch across multiple column
+    boundaries (e.g. "Unit4: Bonnie Scotland" centred across 4 columns).
+    """
     if len(rows) < 2:
         return []
 
@@ -425,25 +429,60 @@ def _detect_header_rows(
     first_row = rows[0]
     second_row = rows[1]
 
-    # Gap between first and second row > 1.5x average row height
+    # Gap between first and second row > 0.5x average row height
     avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
     gap = second_row["y_min"] - first_row["y_max"]
     if gap > avg_h * 0.5:
         headers.append(0)
 
     # Also check if first row words are taller than average (bold/header text)
+    all_heights = [w["height"] for w in zone_words]
+    median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
     first_row_words = [
         w for w in zone_words
         if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
     ]
     if first_row_words:
         first_h = max(w["height"] for w in first_row_words)
-        all_heights = [w["height"] for w in zone_words]
-        median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else first_h
         if first_h > median_h * 1.3:
             if 0 not in headers:
                 headers.append(0)
 
+    # Spanning header detection: rows with few words that cross column
+    # boundaries and don't fit the normal multi-column pattern.
+    if columns and len(columns) >= 2:
+        # Typical data row has words in 2+ columns; a spanning header has
+        # words that sit in the middle columns without matching the pattern.
+        for row in rows:
+            ri = row["index"]
+            if ri in headers:
+                continue
+            row_words = [
+                w for w in zone_words
+                if row["y_min"] <= w["top"] + w["height"] / 2 <= row["y_max"]
+            ]
+            if not row_words or len(row_words) > 6:
+                continue  # too many words to be a header
+            # Check if all row words are colored (common for section headers)
+            all_colored = all(
+                w.get("color_name") and w.get("color_name") != "black"
+                for w in row_words
+            )
+            # Horizontal extent of the row's words, used to count overlapped columns
+            word_x_min = min(w["left"] for w in row_words)
+            word_x_max = max(w["left"] + w["width"] for w in row_words)
+            first_col_end = columns[0]["x_max"] if columns else 0
+            # Header if: all words are colored and span 2+ columns, or
+            # (regardless of color) at most 4 words span 3+ columns
+            cols_spanned = sum(
+                1 for c in columns
+                if word_x_min < c["x_max"] and word_x_max > c["x_min"]
+            )
+            if all_colored and cols_spanned >= 2:
+                headers.append(ri)
+            elif cols_spanned >= 3 and len(row_words) <= 4:
+                headers.append(ri)
+
     return headers
 
 
@@ -522,8 +561,48 @@ def _build_zone_grid(
         cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
         cell["zone_index"] = zone_index
 
-    # Detect header rows
-    header_rows = _detect_header_rows(rows, zone_words, zone_y)
+    # Detect header rows (pass columns for spanning header detection)
+    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns)
+
+    # Merge the cells of every detected header row into a single col-0 cell
+    if header_rows and len(columns) >= 2:
+        for hri in header_rows:
+            header_cells = [c for c in cells if c["row_index"] == hri]
+            if len(header_cells) <= 1:
+                continue
+            # Collect all word_boxes and text from all columns
+            all_wb = []
+            all_text_parts = []
+            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
+                all_wb.extend(hc.get("word_boxes", []))
+                if hc.get("text", "").strip():
+                    all_text_parts.append(hc["text"].strip())
+            # Remove all header cells, replace with one spanning cell
+            cells = [c for c in cells if c["row_index"] != hri]
+            if all_wb:
+                x_min = min(wb["left"] for wb in all_wb)
+                y_min = min(wb["top"] for wb in all_wb)
+                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
+                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
+                cells.append({
+                    "cell_id": f"R{hri:02d}_C0",
+                    "row_index": hri,
+                    "col_index": 0,
+                    "col_type": "spanning_header",
+                    "text": " ".join(all_text_parts),
+                    "confidence": 0.0,
+                    "bbox_px": {"x": x_min, "y": y_min,
+                                "w": x_max - x_min, "h": y_max - y_min},
+                    "bbox_pct": {
+                        "x": round(x_min / img_w * 100, 2) if img_w else 0,
+                        "y": round(y_min / img_h * 100, 2) if img_h else 0,
+                        "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
+                        "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
+                    },
+                    "word_boxes": all_wb,
+                    "ocr_engine": "words_first",
+                    "is_bold": True,
+                })
 
     # Convert columns to output format with percentages
     out_columns = []
@@ -716,10 +795,29 @@ async def build_grid(session_id: str):
                 # First pass: build grids per zone independently
                 zone_grids: List[Dict] = []
 
+                _RECOVERED_NOISE = {"!", "?", "•", "·"}
+
                 for pz in page_zones:
                     zone_words = _words_in_zone(
                         all_words, pz.y, pz.height, pz.x, pz.width
                     )
+                    # In box zones, filter out recovered single-char artifacts
+                    # (decorative elements like !, ?, • from color recovery)
+                    if pz.zone_type == "box":
+                        before = len(zone_words)
+                        zone_words = [
+                            w for w in zone_words
+                            if not (
+                                w.get("recovered")
+                                and w.get("text", "").strip() in _RECOVERED_NOISE
+                            )
+                        ]
+                        removed = before - len(zone_words)
+                        if removed:
+                            logger.info(
+                                "build-grid: filtered %d recovered artifacts from box zone %d",
+                                removed, pz.index,
+                            )
                     grid = _build_zone_grid(
                         zone_words, pz.x, pz.y, pz.width, pz.height,
                         pz.index, img_w, img_h,
@@ -863,6 +961,15 @@ async def build_grid(session_id: str):
                 all_wb.extend(cell.get("word_boxes", []))
         detect_word_colors(img_bgr, all_wb)
 
+    # 5b. Fix unmatched parentheses in cell text
+    # OCR often misses opening "(" while detecting closing ")".
+    # If a cell's text contains ")" but no "(" at all, prepend "(".
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            text = cell.get("text", "")
+            if ")" in text and "(" not in text:
+                cell["text"] = "(" + text
+
     duration = time.time() - t0
 
     # 6. Build result