fix: use group-start positions for column detection, not all word left-edges
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s

Only cluster left-edges of words that begin a new group within their row
(first word or preceded by a large gap). This filters out mid-phrase
word positions (IPA transcriptions, second words in multi-word entries)
that were causing too many false columns.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-15 00:10:29 +01:00
parent 28352f5bab
commit 1162eac7b4

View File

@@ -43,16 +43,17 @@ def _cluster_columns_by_alignment(
) -> List[Dict[str, Any]]: ) -> List[Dict[str, Any]]:
"""Detect columns by clustering left-edge alignment across rows. """Detect columns by clustering left-edge alignment across rows.
Algorithm (adapted from cv_layout._detect_columns_by_clustering): Hybrid approach:
1. Tag each word with its row index 1. Group words by row, find "group start" positions within each row
2. Cluster word left-edges by X-proximity (words preceded by a large gap or first word in row)
3. Count distinct rows per cluster (Y-coverage) 2. Cluster group-start left-edges by X-proximity across rows
4. Keep clusters with sufficient row coverage 3. Filter by row coverage (how many rows have a group start here)
5. Merge nearby clusters 4. Merge nearby clusters
6. Build column boundaries 5. Build column boundaries
With real OCR words (from Kombi mode) this is more reliable than the This filters out mid-phrase word positions (e.g. IPA transcriptions,
original ink-based version because left-edge positions are precise. second words in multi-word entries) by only considering positions
where a new word group begins within a row.
""" """
if not words or not rows: if not words or not rows:
return [] return []
@@ -61,26 +62,65 @@ def _cluster_columns_by_alignment(
if total_rows == 0: if total_rows == 0:
return [] return []
# --- Tag each word with its row index --- # --- Group words by row ---
row_of: Dict[int, int] = {} row_words: Dict[int, List[Dict]] = {}
for w in words: for w in words:
y_center = w["top"] + w["height"] / 2 y_center = w["top"] + w["height"] / 2
best = min(rows, key=lambda r: abs(r["y_center"] - y_center)) best = min(rows, key=lambda r: abs(r["y_center"] - y_center))
row_of[id(w)] = best["index"] row_words.setdefault(best["index"], []).append(w)
# --- Collect and sort left-edges --- # --- Compute adaptive gap threshold for group-start detection ---
edge_data = sorted( all_gaps: List[float] = []
((w["left"], row_of[id(w)]) for w in words), for ri, rw_list in row_words.items():
key=lambda x: x[0], sorted_rw = sorted(rw_list, key=lambda w: w["left"])
for i in range(len(sorted_rw) - 1):
right = sorted_rw[i]["left"] + sorted_rw[i]["width"]
gap = sorted_rw[i + 1]["left"] - right
if gap > 0:
all_gaps.append(gap)
if all_gaps:
sorted_gaps = sorted(all_gaps)
median_gap = sorted_gaps[len(sorted_gaps) // 2]
heights = [w["height"] for w in words if w.get("height", 0) > 0]
median_h = sorted(heights)[len(heights) // 2] if heights else 25
# Column boundary: gap must exceed the largest of 3× median gap, 1.5× median word height, and 30px
gap_threshold = max(median_gap * 3, median_h * 1.5, 30)
else:
gap_threshold = 50
# --- Find group-start positions (left-edges that begin a new column) ---
start_positions: List[tuple] = [] # (left_edge, row_index)
for ri, rw_list in row_words.items():
sorted_rw = sorted(rw_list, key=lambda w: w["left"])
# First word in row is always a group start
start_positions.append((sorted_rw[0]["left"], ri))
for i in range(1, len(sorted_rw)):
right_prev = sorted_rw[i - 1]["left"] + sorted_rw[i - 1]["width"]
gap = sorted_rw[i]["left"] - right_prev
if gap >= gap_threshold:
start_positions.append((sorted_rw[i]["left"], ri))
start_positions.sort(key=lambda x: x[0])
logger.info(
"alignment columns: %d group-start positions from %d words "
"(gap_threshold=%.0f, %d rows)",
len(start_positions), len(words), gap_threshold, total_rows,
) )
# --- Cluster by X-proximity --- if not start_positions:
x_min = min(w["left"] for w in words)
x_max = max(w["left"] + w["width"] for w in words)
return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]
# --- Cluster group-start positions by X-proximity ---
tolerance = max(10, int(zone_w * 0.01)) tolerance = max(10, int(zone_w * 0.01))
clusters: List[Dict[str, Any]] = [] clusters: List[Dict[str, Any]] = []
cur_edges = [edge_data[0][0]] cur_edges = [start_positions[0][0]]
cur_rows = {edge_data[0][1]} cur_rows = {start_positions[0][1]}
for left, row_idx in edge_data[1:]: for left, row_idx in start_positions[1:]:
if left - cur_edges[-1] <= tolerance: if left - cur_edges[-1] <= tolerance:
cur_edges.append(left) cur_edges.append(left)
cur_rows.add(row_idx) cur_rows.add(row_idx)
@@ -105,8 +145,8 @@ def _cluster_columns_by_alignment(
}) })
# --- Filter by row coverage --- # --- Filter by row coverage ---
MIN_COVERAGE_PRIMARY = 0.15 MIN_COVERAGE_PRIMARY = 0.20
MIN_COVERAGE_SECONDARY = 0.08 MIN_COVERAGE_SECONDARY = 0.12
MIN_WORDS_SECONDARY = 3 MIN_WORDS_SECONDARY = 3
MIN_DISTINCT_ROWS = 2 MIN_DISTINCT_ROWS = 2
@@ -126,7 +166,7 @@ def _cluster_columns_by_alignment(
significant = sorted(primary + secondary, key=lambda c: c["mean_x"]) significant = sorted(primary + secondary, key=lambda c: c["mean_x"])
logger.info( logger.info(
"alignment columns: %d clusters total, %d primary, %d secondary → %d significant", "alignment columns: %d clusters, %d primary, %d secondary → %d significant",
len(clusters), len(primary), len(secondary), len(significant), len(clusters), len(primary), len(secondary), len(significant),
) )