From 91625a2646935e0e75a90b10d9b276d86943964c Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.fritz.box>
Date: Mon, 16 Mar 2026 22:40:40 +0100
Subject: [PATCH] fix: add tertiary tier for narrow margin columns (page refs,
 markers)

Page references (p.55, p.57) and marker columns (!) appear in very few
rows (< 12% coverage) but sit at the far left/right margin with a clear
gap to the main content.  Add a third detection tier that catches these
narrow margin columns when they span >= 2 distinct rows, lie within 15%
of the left or right content edge, and are separated from the nearest
primary/secondary column by at least max(40px, 5% of the content width).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 klausur-service/backend/grid_editor_api.py | 38 +++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 06f9b86..35b37f2 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -151,6 +151,11 @@ def _cluster_columns_by_alignment(
     MIN_WORDS_SECONDARY = 3
     MIN_DISTINCT_ROWS = 2
 
+    # Content boundaries for left- and right-margin detection
+    content_x_min = min(w["left"] for w in words)
+    content_x_max = max(w["left"] + w["width"] for w in words)
+    content_span = content_x_max - content_x_min
+
     primary = [
         c for c in clusters
         if c["row_coverage"] >= MIN_COVERAGE_PRIMARY
@@ -164,7 +169,38 @@ def _cluster_columns_by_alignment(
         and c["count"] >= MIN_WORDS_SECONDARY
         and c["distinct_rows"] >= MIN_DISTINCT_ROWS
     ]
-    significant = sorted(primary + secondary, key=lambda c: c["mean_x"])
+
+    # Tertiary: narrow margin columns (page refs, markers) that fall short
+    # of the secondary thresholds but sit close to the left or right content
+    # edge and are separated from the main content by a large gap to the
+    # nearest primary/secondary cluster.
+    used_ids = {id(c) for c in primary} | {id(c) for c in secondary}
+    sig_xs = [c["mean_x"] for c in primary + secondary]
+
+    tertiary = []
+    for c in clusters:
+        if id(c) in used_ids or c["distinct_rows"] < MIN_DISTINCT_ROWS:
+            continue
+        # Must be near left or right content margin (within 15%)
+        rel_pos = (c["mean_x"] - content_x_min) / content_span if content_span else 0.5
+        if not (rel_pos < 0.15 or rel_pos > 0.85):
+            continue
+        # Must have significant gap to nearest significant cluster
+        if sig_xs:
+            min_dist = min(abs(c["mean_x"] - sx) for sx in sig_xs)
+            if min_dist < max(40, content_span * 0.05):
+                continue
+        tertiary.append(c)
+
+    if tertiary:
+        for c in tertiary:
+            logger.info(
+                "  tertiary (margin) cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)",
+                c["mean_x"], c["min_edge"], c["max_edge"],
+                c["count"], c["distinct_rows"], c["row_coverage"] * 100,
+            )
+
+    significant = sorted(primary + secondary + tertiary, key=lambda c: c["mean_x"])
 
     for c in significant:
         logger.info(