Tighten tertiary column detection: require 4+ rows and 5% coverage

Prevents false narrow columns from text overflow at page edges.
In session 355f3c84, a tertiary cluster spanning only 3 rows (4% row
coverage) produced a spurious third column from right-column text overflow.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-20 12:50:03 +01:00
parent 04092a0a66
commit 76ba83eecb

View File

@@ -183,9 +183,15 @@ def _cluster_columns_by_alignment(
used_ids = {id(c) for c in primary} | {id(c) for c in secondary}
sig_xs = [c["mean_x"] for c in primary + secondary]
MIN_DISTINCT_ROWS_TERTIARY = max(MIN_DISTINCT_ROWS + 1, 4)
MIN_COVERAGE_TERTIARY = 0.05 # at least 5% of rows
tertiary = []
for c in clusters:
if id(c) in used_ids or c["distinct_rows"] < MIN_DISTINCT_ROWS:
if id(c) in used_ids:
continue
if c["distinct_rows"] < MIN_DISTINCT_ROWS_TERTIARY:
continue
if c["row_coverage"] < MIN_COVERAGE_TERTIARY:
continue
# Must be near left or right content margin (within 15%)
rel_pos = (c["mean_x"] - content_x_min) / content_span if content_span else 0.5