fix: Box-Bereiche aus Bild entfernen statt pro Zone separat Spalten erkennen
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s

Content-Streifen oberhalb/unterhalb von Boxen werden zu einem Bild zusammengefügt,
Spaltenerkennung läuft einmal auf dem kombinierten Bild. Entfernt Step 5c
(suspicion-based gap alignment), da der neue Ansatz das Problem an der Wurzel löst.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-09 17:03:05 +01:00
parent fb46450802
commit 4610137ecc

View File

@@ -1265,96 +1265,6 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
if len(wc_gaps) >= 2:
validated_gaps = wc_gaps
# --- Step 5c: Left-edge alignment validation (suspicious gaps only) ---
# Only check gaps that would create an unusually wide column to the right.
# These are likely false splits within a single wide column (e.g. short EN
# words followed by longer DE example sentences in the same column).
# Gaps that produce columns of similar width to their neighbors are trusted.
if len(validated_gaps) > 2:
edge_tolerance_align = max(8, content_w // 150)
min_aligned_ratio = 0.15 # at least 15% of words must share a left-edge bin
margin_thresh = max(10, int(content_w * 0.02))
# Compute tentative column widths from all gaps
sorted_gaps = sorted(validated_gaps, key=lambda g: g[0])
# Interior gaps only (exclude margins)
interior_indices = []
for gi, (gs, ge) in enumerate(sorted_gaps):
if gs > margin_thresh and ge < content_w - margin_thresh:
interior_indices.append(gi)
if interior_indices:
# For each interior gap, compute the width of the column it starts
gap_suspicion: dict = {} # gap_index → right_col_width
for gi in interior_indices:
gap_end = sorted_gaps[gi][1]
# Next gap start (or content right edge)
if gi + 1 < len(sorted_gaps):
next_gs = sorted_gaps[gi + 1][0]
else:
next_gs = content_w
right_col_w = next_gs - gap_end
gap_suspicion[gi] = right_col_w
# Median column width (from all gaps, including margins)
all_col_widths = []
prev_end = 0
for gs, ge in sorted_gaps:
cw = gs - prev_end
if cw > 0:
all_col_widths.append(cw)
prev_end = ge
trailing = content_w - prev_end
if trailing > 0:
all_col_widths.append(trailing)
median_col_w = sorted(all_col_widths)[len(all_col_widths) // 2] if all_col_widths else content_w
# A gap is suspicious if the column to its right is > 2x median width
suspicious_threshold = median_col_w * 2.0
alignment_validated = list(validated_gaps) # start with all
for gi in interior_indices:
right_col_w = gap_suspicion[gi]
if right_col_w <= suspicious_threshold:
continue # normal gap, keep it
# Suspicious — check left-edge alignment
gap_start_rel, gap_end_rel = sorted_gaps[gi]
next_gs = sorted_gaps[gi + 1][0] if gi + 1 < len(sorted_gaps) else content_w
right_words = [w for w in segment_words
if gap_end_rel <= w['left'] < next_gs]
if len(right_words) < 3:
continue # too few words, keep gap
# Cluster left-edges
right_lefts = sorted(w['left'] for w in right_words)
bins = []
cur_bin = [right_lefts[0]]
for le in right_lefts[1:]:
if le - cur_bin[-1] <= edge_tolerance_align:
cur_bin.append(le)
else:
bins.append(len(cur_bin))
cur_bin = [le]
bins.append(len(cur_bin))
max_bin = max(bins)
ratio = max_bin / len(right_words)
if ratio < min_aligned_ratio:
# Remove this gap
alignment_validated.remove((gap_start_rel, gap_end_rel))
logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
f"REJECTED — suspicious (right_col={right_col_w}px > 2x median={median_col_w:.0f}px) "
f"and poor left-edge alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
else:
logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
f"suspicious but passed alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
if len(alignment_validated) >= 2:
validated_gaps = alignment_validated
# --- Step 6: Fallback to clustering if too few gaps ---
if len(validated_gaps) < 2:
logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
@@ -3187,13 +3097,75 @@ def detect_column_geometry_zoned(
return (geometries, left_x, right_x, top_y, bottom_y,
word_dicts, inv, zone_data, boxes)
# Split into zones
# --- New approach: concatenate content regions (skip boxes), run column
# detection ONCE on the combined image, then map coordinates back. ---
# Split into zones (for metadata / overlay purposes)
zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes)
# Run column detection per content zone
all_geometries: List[ColumnGeometry] = []
zones_data: List[Dict] = []
# Collect content strips (above/between/below boxes)
content_strips: List[Tuple[int, int]] = [] # (y_start, y_end) in absolute coords
for zone in zones:
if zone.zone_type == 'content' and zone.height >= 40:
content_strips.append((zone.y, zone.y + zone.height))
if not content_strips:
# Only box zones — fall back to original detection
logger.info("ZonedColumns: no content zones with height >= 40, using original result")
zone_data = [{"index": 0, "zone_type": "content", "y": top_y,
"height": content_h, "x": left_x, "width": content_w, "columns": []}]
return (geometries, left_x, right_x, top_y, bottom_y,
word_dicts, inv, zone_data, boxes)
# Build combined image by vertically stacking content strips
ocr_strips = [ocr_img[ys:ye, :] for ys, ye in content_strips]
bgr_strips = [dewarped_bgr[ys:ye, :] for ys, ye in content_strips]
combined_ocr = np.vstack(ocr_strips)
combined_bgr = np.vstack(bgr_strips)
logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} "
f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}")
# Run column detection on the combined (box-free) image
combined_result = detect_column_geometry(combined_ocr, combined_bgr)
if combined_result is not None:
combined_geoms, c_lx, c_rx, c_ty, c_by, combined_words, combined_inv = combined_result
else:
# Fallback to original full-page result
logger.info("ZonedColumns: combined image column detection failed, using original")
combined_geoms = geometries
# Map combined-image y-coordinates back to absolute page coordinates.
# In the combined image, strip i starts at cumulative_y = sum of heights
# of strips 0..i-1. We need to add the offset between the strip's
# original y-position and its position in the combined image.
# Build a mapping: combined_y → absolute_y
strip_offsets: List[Tuple[int, int, int]] = [] # (combined_y_start, strip_height, abs_y_start)
cum_y = 0
for ys, ye in content_strips:
h = ye - ys
strip_offsets.append((cum_y, h, ys))
cum_y += h
def _combined_y_to_abs(cy: int) -> int:
    """Translate a y-coordinate of the stacked image into absolute page coords.

    Walks ``strip_offsets`` (tuples of combined-image start, strip height and
    absolute start) in order and resolves ``cy`` against the first strip
    whose combined-image end lies beyond it.

    Args:
        cy: y-coordinate measured in the combined (stacked) image.

    Returns:
        The corresponding absolute y-coordinate. Values at or past the end of
        the final strip are clamped to that strip's bottom edge.
    """
    for combined_start, strip_height, page_start in strip_offsets:
        strip_end = combined_start + strip_height
        if cy >= strip_end:
            # Not inside this strip; try the next one.
            continue
        return page_start + (cy - combined_start)
    # No strip contains cy: clamp to the bottom edge of the last strip.
    _, final_height, final_page_start = strip_offsets[-1]
    return final_page_start + final_height
# Adjust geometries: y and height need remapping
if combined_result is not None:
for g in combined_geoms:
abs_y = _combined_y_to_abs(g.y)
abs_y_end = _combined_y_to_abs(g.y + g.height)
g.y = abs_y
g.height = abs_y_end - abs_y
# Build zones_data for the response
zones_data: List[Dict] = []
for zone in zones:
zone_dict: Dict = {
"index": zone.index,
@@ -3215,45 +3187,12 @@ def detect_column_geometry_zoned(
"border_thickness": zone.box.border_thickness,
}
if zone.zone_type == 'content' and zone.height >= 40:
# Extract sub-image for this zone
zone_y_end = zone.y + zone.height
sub_ocr = ocr_img[zone.y:zone_y_end, :]
sub_bgr = dewarped_bgr[zone.y:zone_y_end, :]
sub_result = detect_column_geometry(sub_ocr, sub_bgr)
if sub_result is not None:
sub_geoms, sub_lx, sub_rx, sub_ty, sub_by, _sub_words, _sub_inv = sub_result
# Offset column y-coordinates back to absolute page coords
for g in sub_geoms:
g.y += zone.y
zone_cols = []
for g in sub_geoms:
zone_cols.append({
"index": g.index,
"x": g.x,
"y": g.y,
"width": g.width,
"height": g.height,
"word_count": g.word_count,
"width_ratio": g.width_ratio,
"zone_index": zone.index,
})
zone_dict["columns"] = zone_cols
all_geometries.extend(sub_geoms)
else:
logger.debug(f"ZonedColumns: zone {zone.index} column detection returned None")
zones_data.append(zone_dict)
# If per-zone detection produced no columns, fall back to the original
if not all_geometries:
all_geometries = geometries
all_geometries = combined_geoms if combined_geoms else geometries
logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), "
f"{len(all_geometries)} total columns")
f"{len(all_geometries)} total columns (combined-image approach)")
return (all_geometries, left_x, right_x, top_y, bottom_y,
word_dicts, inv, zones_data, boxes)