fix: Alignment-Validierung nur fuer verdaechtige Gaps (>2x Median-Breite)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 20s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 20s
Vorher wurden alle internen Gaps geprueft, was echte Spaltentrennungen (EN→DE) faelschlicherweise entfernte. Jetzt werden nur Gaps geprueft, die eine unverhaeltnismaessig breite rechte Spalte erzeugen wuerden (>2x Median-Spaltenbreite). Schwelle fuer den Anteil buendig ausgerichteter Woerter (min_aligned_ratio) von 25% auf 15% gesenkt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1265,72 +1265,95 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
if len(wc_gaps) >= 2:
|
||||
validated_gaps = wc_gaps
|
||||
|
||||
# --- Step 5c: Left-edge alignment validation ---
|
||||
# A real column gap must have words to its right whose left-edges are
|
||||
# consistently aligned (i.e. many words start at nearly the same x).
|
||||
# If words to the right of a gap have scattered left-edges, the gap is
|
||||
# just a natural gap within a wide column (e.g. short words ending
|
||||
# before longer example sentences in the same column).
|
||||
# --- Step 5c: Left-edge alignment validation (suspicious gaps only) ---
|
||||
# Only check gaps that would create an unusually wide column to the right.
|
||||
# These are likely false splits within a single wide column (e.g. short EN
|
||||
# words followed by longer DE example sentences in the same column).
|
||||
# Gaps that produce columns of similar width to their neighbors are trusted.
|
||||
if len(validated_gaps) > 2:
|
||||
edge_tolerance_align = max(8, content_w // 150)
|
||||
min_aligned_ratio = 0.25 # at least 25% of words must share a left-edge bin
|
||||
min_aligned_ratio = 0.15 # at least 15% of words must share a left-edge bin
|
||||
margin_thresh = max(10, int(content_w * 0.02))
|
||||
|
||||
alignment_validated = []
|
||||
for gap_start_rel, gap_end_rel in validated_gaps:
|
||||
# Skip margin gaps — they don't need alignment validation
|
||||
if gap_start_rel <= margin_thresh:
|
||||
alignment_validated.append((gap_start_rel, gap_end_rel))
|
||||
continue
|
||||
if gap_end_rel >= content_w - margin_thresh:
|
||||
alignment_validated.append((gap_start_rel, gap_end_rel))
|
||||
continue
|
||||
# Compute tentative column widths from all gaps
|
||||
sorted_gaps = sorted(validated_gaps, key=lambda g: g[0])
|
||||
# Interior gaps only (exclude margins)
|
||||
interior_indices = []
|
||||
for gi, (gs, ge) in enumerate(sorted_gaps):
|
||||
if gs > margin_thresh and ge < content_w - margin_thresh:
|
||||
interior_indices.append(gi)
|
||||
|
||||
# Find the next gap after this one (or content end)
|
||||
next_gap_start = content_w
|
||||
for gs, ge in validated_gaps:
|
||||
if gs > gap_end_rel:
|
||||
next_gap_start = gs
|
||||
break
|
||||
|
||||
# Collect words to the right of this gap (up to the next gap)
|
||||
right_words = [w for w in segment_words
|
||||
if gap_end_rel <= w['left'] < next_gap_start]
|
||||
|
||||
if len(right_words) < 3:
|
||||
# Too few words — keep the gap (benefit of the doubt)
|
||||
alignment_validated.append((gap_start_rel, gap_end_rel))
|
||||
continue
|
||||
|
||||
# Cluster left-edges of right-side words
|
||||
right_lefts = sorted(w['left'] for w in right_words)
|
||||
bins = []
|
||||
cur_bin = [right_lefts[0]]
|
||||
for le in right_lefts[1:]:
|
||||
if le - cur_bin[-1] <= edge_tolerance_align:
|
||||
cur_bin.append(le)
|
||||
if interior_indices:
|
||||
# For each interior gap, compute the width of the column it starts
|
||||
gap_suspicion: dict = {} # gap_index → right_col_width
|
||||
for gi in interior_indices:
|
||||
gap_end = sorted_gaps[gi][1]
|
||||
# Next gap start (or content right edge)
|
||||
if gi + 1 < len(sorted_gaps):
|
||||
next_gs = sorted_gaps[gi + 1][0]
|
||||
else:
|
||||
bins.append(len(cur_bin))
|
||||
cur_bin = [le]
|
||||
bins.append(len(cur_bin))
|
||||
next_gs = content_w
|
||||
right_col_w = next_gs - gap_end
|
||||
gap_suspicion[gi] = right_col_w
|
||||
|
||||
# The largest bin must contain a significant fraction of words
|
||||
max_bin = max(bins)
|
||||
ratio = max_bin / len(right_words)
|
||||
# Median column width (from all gaps, including margins)
|
||||
all_col_widths = []
|
||||
prev_end = 0
|
||||
for gs, ge in sorted_gaps:
|
||||
cw = gs - prev_end
|
||||
if cw > 0:
|
||||
all_col_widths.append(cw)
|
||||
prev_end = ge
|
||||
trailing = content_w - prev_end
|
||||
if trailing > 0:
|
||||
all_col_widths.append(trailing)
|
||||
median_col_w = sorted(all_col_widths)[len(all_col_widths) // 2] if all_col_widths else content_w
|
||||
|
||||
if ratio >= min_aligned_ratio:
|
||||
alignment_validated.append((gap_start_rel, gap_end_rel))
|
||||
logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
|
||||
f"passed alignment check (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
|
||||
else:
|
||||
logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
|
||||
f"REJECTED — words to the right have no consistent left-edge alignment "
|
||||
f"(best_bin={max_bin}/{len(right_words)}={ratio:.2f} < {min_aligned_ratio})")
|
||||
# A gap is suspicious if the column to its right is > 2x median width
|
||||
suspicious_threshold = median_col_w * 2.0
|
||||
|
||||
if len(alignment_validated) >= 2:
|
||||
validated_gaps = alignment_validated
|
||||
else:
|
||||
logger.info("ColumnGeometry: alignment filter removed too many gaps, keeping originals")
|
||||
alignment_validated = list(validated_gaps) # start with all
|
||||
for gi in interior_indices:
|
||||
right_col_w = gap_suspicion[gi]
|
||||
if right_col_w <= suspicious_threshold:
|
||||
continue # normal gap, keep it
|
||||
|
||||
# Suspicious — check left-edge alignment
|
||||
gap_start_rel, gap_end_rel = sorted_gaps[gi]
|
||||
next_gs = sorted_gaps[gi + 1][0] if gi + 1 < len(sorted_gaps) else content_w
|
||||
right_words = [w for w in segment_words
|
||||
if gap_end_rel <= w['left'] < next_gs]
|
||||
|
||||
if len(right_words) < 3:
|
||||
continue # too few words, keep gap
|
||||
|
||||
# Cluster left-edges
|
||||
right_lefts = sorted(w['left'] for w in right_words)
|
||||
bins = []
|
||||
cur_bin = [right_lefts[0]]
|
||||
for le in right_lefts[1:]:
|
||||
if le - cur_bin[-1] <= edge_tolerance_align:
|
||||
cur_bin.append(le)
|
||||
else:
|
||||
bins.append(len(cur_bin))
|
||||
cur_bin = [le]
|
||||
bins.append(len(cur_bin))
|
||||
|
||||
max_bin = max(bins)
|
||||
ratio = max_bin / len(right_words)
|
||||
|
||||
if ratio < min_aligned_ratio:
|
||||
# Remove this gap
|
||||
alignment_validated.remove((gap_start_rel, gap_end_rel))
|
||||
logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
|
||||
f"REJECTED — suspicious (right_col={right_col_w}px > 2x median={median_col_w:.0f}px) "
|
||||
f"and poor left-edge alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
|
||||
else:
|
||||
logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
|
||||
f"suspicious but passed alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
|
||||
|
||||
if len(alignment_validated) >= 2:
|
||||
validated_gaps = alignment_validated
|
||||
|
||||
# --- Step 6: Fallback to clustering if too few gaps ---
|
||||
if len(validated_gaps) < 2:
|
||||
|
||||
Reference in New Issue
Block a user