Fix overlay word leak, ghost filter false positive, merged zone header

1. Filter out words located inside image_overlays (removes OCR text picked up from images).
2. Ghost filter: only remove single-character border artifacts, not multi-character tokens such as "(=", which are real content.
3. Skip first-row header detection for zones with image_overlays (merged geometry creates artificial gaps).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 13:56:04 +01:00
parent df30d4eae3
commit e3395ae8cf
2 changed files with 129 additions and 22 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -320,7 +320,7 @@ def _filter_border_ghosts(
        )
        if not on_border:
            return False
-        if all(c in _GRID_GHOST_CHARS for c in text):
+        if len(text) == 1 and text in _GRID_GHOST_CHARS:
            return True
        return False

@@ -656,6 +656,7 @@ def _detect_header_rows(
    zone_words: List[Dict],
    zone_y: int,
    columns: Optional[List[Dict]] = None,
+    skip_first_row_header: bool = False,
 ) -> List[int]:
    """Detect header rows: first-row heuristic + spanning header detection.

@@ -666,27 +667,29 @@ def _detect_header_rows(
        return []

    headers = []
-    first_row = rows[0]
-    second_row = rows[1]

-    # Gap between first and second row > 0.5x average row height
-    avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
-    gap = second_row["y_min"] - first_row["y_max"]
-    if gap > avg_h * 0.5:
-        headers.append(0)
+    if not skip_first_row_header:
+        first_row = rows[0]
+        second_row = rows[1]

-    # Also check if first row words are taller than average (bold/header text)
-    all_heights = [w["height"] for w in zone_words]
-    median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
-    first_row_words = [
-        w for w in zone_words
-        if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
-    ]
-    if first_row_words:
-        first_h = max(w["height"] for w in first_row_words)
-        if first_h > median_h * 1.3:
-            if 0 not in headers:
-                headers.append(0)
+        # Gap between first and second row > 0.5x average row height
+        avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
+        gap = second_row["y_min"] - first_row["y_max"]
+        if gap > avg_h * 0.5:
+            headers.append(0)
+
+        # Also check if first row words are taller than average (bold/header text)
+        all_heights = [w["height"] for w in zone_words]
+        median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
+        first_row_words = [
+            w for w in zone_words
+            if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
+        ]
+        if first_row_words:
+            first_h = max(w["height"] for w in first_row_words)
+            if first_h > median_h * 1.3:
+                if 0 not in headers:
+                    headers.append(0)

    # Note: Spanning-header detection (rows spanning all columns) has been
    # disabled because it produces too many false positives on vocabulary
@@ -707,6 +710,7 @@ def _build_zone_grid(
    img_w: int,
    img_h: int,
    global_columns: Optional[List[Dict]] = None,
+    skip_first_row_header: bool = False,
 ) -> Dict[str, Any]:
    """Build columns, rows, cells for a single zone from its words.

@@ -773,7 +777,8 @@ def _build_zone_grid(
        cell["zone_index"] = zone_index

    # Detect header rows (pass columns for spanning header detection)
-    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns)
+    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
+                                      skip_first_row_header=skip_first_row_header)

    # Merge cells in spanning header rows into a single col-0 cell
    if header_rows and len(columns) >= 2:
@@ -1270,9 +1275,27 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                            "build-grid: filtered %d recovered artifacts from %s zone %d",
                            removed, pz.zone_type, pz.index,
                        )
+                    # Filter words inside image overlay regions (merged box zones)
+                    if pz.image_overlays:
+                        before_ov = len(zone_words)
+                        zone_words = [
+                            w for w in zone_words
+                            if not any(
+                                ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
+                                and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
+                                for ov in pz.image_overlays
+                            )
+                        ]
+                        ov_removed = before_ov - len(zone_words)
+                        if ov_removed:
+                            logger.info(
+                                "build-grid: filtered %d words inside image overlays from zone %d",
+                                ov_removed, pz.index,
+                            )
                    grid = _build_zone_grid(
                        zone_words, pz.x, pz.y, pz.width, pz.height,
                        pz.index, img_w, img_h,
+                        skip_first_row_header=bool(pz.image_overlays),
                    )
                    zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})

@@ -1339,6 +1362,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                                        pz.width, pz.height,
                                        pz.index, img_w, img_h,
                                        global_columns=merged_columns,
+                                        skip_first_row_header=bool(pz.image_overlays),
                                    )
                                    zg["grid"] = grid
                            logger.info(