fix: use group-start positions for column detection, not all word left-edges
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s
Only cluster left-edges of words that begin a new group within their row (first word or preceded by a large gap). This filters out mid-phrase word positions (IPA transcriptions, second words in multi-word entries) that were causing too many false columns.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -43,16 +43,17 @@ def _cluster_columns_by_alignment(
|
|||||||
) -> List[Dict[str, Any]]:
|
) -> List[Dict[str, Any]]:
|
||||||
"""Detect columns by clustering left-edge alignment across rows.
|
"""Detect columns by clustering left-edge alignment across rows.
|
||||||
|
|
||||||
Algorithm (adapted from cv_layout._detect_columns_by_clustering):
|
Hybrid approach:
|
||||||
1. Tag each word with its row index
|
1. Group words by row, find "group start" positions within each row
|
||||||
2. Cluster word left-edges by X-proximity
|
(words preceded by a large gap or first word in row)
|
||||||
3. Count distinct rows per cluster (Y-coverage)
|
2. Cluster group-start left-edges by X-proximity across rows
|
||||||
4. Keep clusters with sufficient row coverage
|
3. Filter by row coverage (how many rows have a group start here)
|
||||||
5. Merge nearby clusters
|
4. Merge nearby clusters
|
||||||
6. Build column boundaries
|
5. Build column boundaries
|
||||||
|
|
||||||
With real OCR words (from Kombi mode) this is more reliable than the
|
This filters out mid-phrase word positions (e.g. IPA transcriptions,
|
||||||
original ink-based version because left-edge positions are precise.
|
second words in multi-word entries) by only considering positions
|
||||||
|
where a new word group begins within a row.
|
||||||
"""
|
"""
|
||||||
if not words or not rows:
|
if not words or not rows:
|
||||||
return []
|
return []
|
||||||
@@ -61,26 +62,65 @@ def _cluster_columns_by_alignment(
|
|||||||
if total_rows == 0:
|
if total_rows == 0:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# --- Tag each word with its row index ---
|
# --- Group words by row ---
|
||||||
row_of: Dict[int, int] = {}
|
row_words: Dict[int, List[Dict]] = {}
|
||||||
for w in words:
|
for w in words:
|
||||||
y_center = w["top"] + w["height"] / 2
|
y_center = w["top"] + w["height"] / 2
|
||||||
best = min(rows, key=lambda r: abs(r["y_center"] - y_center))
|
best = min(rows, key=lambda r: abs(r["y_center"] - y_center))
|
||||||
row_of[id(w)] = best["index"]
|
row_words.setdefault(best["index"], []).append(w)
|
||||||
|
|
||||||
# --- Collect and sort left-edges ---
|
# --- Compute adaptive gap threshold for group-start detection ---
|
||||||
edge_data = sorted(
|
all_gaps: List[float] = []
|
||||||
((w["left"], row_of[id(w)]) for w in words),
|
for ri, rw_list in row_words.items():
|
||||||
key=lambda x: x[0],
|
sorted_rw = sorted(rw_list, key=lambda w: w["left"])
|
||||||
|
for i in range(len(sorted_rw) - 1):
|
||||||
|
right = sorted_rw[i]["left"] + sorted_rw[i]["width"]
|
||||||
|
gap = sorted_rw[i + 1]["left"] - right
|
||||||
|
if gap > 0:
|
||||||
|
all_gaps.append(gap)
|
||||||
|
|
||||||
|
if all_gaps:
|
||||||
|
sorted_gaps = sorted(all_gaps)
|
||||||
|
median_gap = sorted_gaps[len(sorted_gaps) // 2]
|
||||||
|
heights = [w["height"] for w in words if w.get("height", 0) > 0]
|
||||||
|
median_h = sorted(heights)[len(heights) // 2] if heights else 25
|
||||||
|
# Column boundary: gap > 3× median gap or > 1.5× median word height
|
||||||
|
gap_threshold = max(median_gap * 3, median_h * 1.5, 30)
|
||||||
|
else:
|
||||||
|
gap_threshold = 50
|
||||||
|
|
||||||
|
# --- Find group-start positions (left-edges that begin a new column) ---
|
||||||
|
start_positions: List[tuple] = [] # (left_edge, row_index)
|
||||||
|
for ri, rw_list in row_words.items():
|
||||||
|
sorted_rw = sorted(rw_list, key=lambda w: w["left"])
|
||||||
|
# First word in row is always a group start
|
||||||
|
start_positions.append((sorted_rw[0]["left"], ri))
|
||||||
|
for i in range(1, len(sorted_rw)):
|
||||||
|
right_prev = sorted_rw[i - 1]["left"] + sorted_rw[i - 1]["width"]
|
||||||
|
gap = sorted_rw[i]["left"] - right_prev
|
||||||
|
if gap >= gap_threshold:
|
||||||
|
start_positions.append((sorted_rw[i]["left"], ri))
|
||||||
|
|
||||||
|
start_positions.sort(key=lambda x: x[0])
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"alignment columns: %d group-start positions from %d words "
|
||||||
|
"(gap_threshold=%.0f, %d rows)",
|
||||||
|
len(start_positions), len(words), gap_threshold, total_rows,
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- Cluster by X-proximity ---
|
if not start_positions:
|
||||||
|
x_min = min(w["left"] for w in words)
|
||||||
|
x_max = max(w["left"] + w["width"] for w in words)
|
||||||
|
return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}]
|
||||||
|
|
||||||
|
# --- Cluster group-start positions by X-proximity ---
|
||||||
tolerance = max(10, int(zone_w * 0.01))
|
tolerance = max(10, int(zone_w * 0.01))
|
||||||
clusters: List[Dict[str, Any]] = []
|
clusters: List[Dict[str, Any]] = []
|
||||||
cur_edges = [edge_data[0][0]]
|
cur_edges = [start_positions[0][0]]
|
||||||
cur_rows = {edge_data[0][1]}
|
cur_rows = {start_positions[0][1]}
|
||||||
|
|
||||||
for left, row_idx in edge_data[1:]:
|
for left, row_idx in start_positions[1:]:
|
||||||
if left - cur_edges[-1] <= tolerance:
|
if left - cur_edges[-1] <= tolerance:
|
||||||
cur_edges.append(left)
|
cur_edges.append(left)
|
||||||
cur_rows.add(row_idx)
|
cur_rows.add(row_idx)
|
||||||
@@ -105,8 +145,8 @@ def _cluster_columns_by_alignment(
|
|||||||
})
|
})
|
||||||
|
|
||||||
# --- Filter by row coverage ---
|
# --- Filter by row coverage ---
|
||||||
MIN_COVERAGE_PRIMARY = 0.15
|
MIN_COVERAGE_PRIMARY = 0.20
|
||||||
MIN_COVERAGE_SECONDARY = 0.08
|
MIN_COVERAGE_SECONDARY = 0.12
|
||||||
MIN_WORDS_SECONDARY = 3
|
MIN_WORDS_SECONDARY = 3
|
||||||
MIN_DISTINCT_ROWS = 2
|
MIN_DISTINCT_ROWS = 2
|
||||||
|
|
||||||
@@ -126,7 +166,7 @@ def _cluster_columns_by_alignment(
|
|||||||
significant = sorted(primary + secondary, key=lambda c: c["mean_x"])
|
significant = sorted(primary + secondary, key=lambda c: c["mean_x"])
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"alignment columns: %d clusters total, %d primary, %d secondary → %d significant",
|
"alignment columns: %d clusters, %d primary, %d secondary → %d significant",
|
||||||
len(clusters), len(primary), len(secondary), len(significant),
|
len(clusters), len(primary), len(secondary), len(significant),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user