diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 06f9b86..35b37f2 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -151,6 +151,11 @@ def _cluster_columns_by_alignment( MIN_WORDS_SECONDARY = 3 MIN_DISTINCT_ROWS = 2 + # Content boundary for left-margin detection + content_x_min = min(w["left"] for w in words) + content_x_max = max(w["left"] + w["width"] for w in words) + content_span = content_x_max - content_x_min + primary = [ c for c in clusters if c["row_coverage"] >= MIN_COVERAGE_PRIMARY @@ -164,7 +169,38 @@ def _cluster_columns_by_alignment( and c["count"] >= MIN_WORDS_SECONDARY and c["distinct_rows"] >= MIN_DISTINCT_ROWS ] - significant = sorted(primary + secondary, key=lambda c: c["mean_x"]) + + # Tertiary: narrow left-margin columns (page refs, markers) that have + # too few rows for secondary but are clearly left-aligned and separated + # from the main content. These appear at the far left or far right and + # have a large gap to the nearest significant cluster. + used_ids = {id(c) for c in primary} | {id(c) for c in secondary} + sig_xs = [c["mean_x"] for c in primary + secondary] + + tertiary = [] + for c in clusters: + if id(c) in used_ids or c["distinct_rows"] < MIN_DISTINCT_ROWS: + continue + # Must be near left or right content margin (within 15%) + rel_pos = (c["mean_x"] - content_x_min) / content_span if content_span else 0.5 + if not (rel_pos < 0.15 or rel_pos > 0.85): + continue + # Must have significant gap to nearest significant cluster + if sig_xs: + min_dist = min(abs(c["mean_x"] - sx) for sx in sig_xs) + if min_dist < max(40, content_span * 0.05): + continue + tertiary.append(c) + + if tertiary: + for c in tertiary: + logger.info( + " tertiary (margin) cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)", + c["mean_x"], c["min_edge"], c["max_edge"], + c["count"], c["distinct_rows"], c["row_coverage"] * 100, + ) + + significant = sorted(primary + secondary + tertiary, key=lambda c: c["mean_x"]) for c in significant: logger.info(