fix: Box-Bereiche aus Bild entfernen statt pro Zone separat Spalten erkennen
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Content-Streifen oberhalb/unterhalb von Boxen werden zu einem Bild zusammengefügt, Spaltenerkennung läuft einmal auf dem kombinierten Bild. Entfernt Step 5c (suspicion-based gap alignment), da der neue Ansatz das Problem an der Wurzel löst.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1265,96 +1265,6 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
if len(wc_gaps) >= 2:
|
||||
validated_gaps = wc_gaps
|
||||
|
||||
# --- Step 5c: Left-edge alignment validation (suspicious gaps only) ---
|
||||
# Only check gaps that would create an unusually wide column to the right.
|
||||
# These are likely false splits within a single wide column (e.g. short EN
|
||||
# words followed by longer DE example sentences in the same column).
|
||||
# Gaps that produce columns of similar width to their neighbors are trusted.
|
||||
if len(validated_gaps) > 2:
|
||||
edge_tolerance_align = max(8, content_w // 150)
|
||||
min_aligned_ratio = 0.15 # at least 15% of words must share a left-edge bin
|
||||
margin_thresh = max(10, int(content_w * 0.02))
|
||||
|
||||
# Compute tentative column widths from all gaps
|
||||
sorted_gaps = sorted(validated_gaps, key=lambda g: g[0])
|
||||
# Interior gaps only (exclude margins)
|
||||
interior_indices = []
|
||||
for gi, (gs, ge) in enumerate(sorted_gaps):
|
||||
if gs > margin_thresh and ge < content_w - margin_thresh:
|
||||
interior_indices.append(gi)
|
||||
|
||||
if interior_indices:
|
||||
# For each interior gap, compute the width of the column it starts
|
||||
gap_suspicion: dict = {} # gap_index → right_col_width
|
||||
for gi in interior_indices:
|
||||
gap_end = sorted_gaps[gi][1]
|
||||
# Next gap start (or content right edge)
|
||||
if gi + 1 < len(sorted_gaps):
|
||||
next_gs = sorted_gaps[gi + 1][0]
|
||||
else:
|
||||
next_gs = content_w
|
||||
right_col_w = next_gs - gap_end
|
||||
gap_suspicion[gi] = right_col_w
|
||||
|
||||
# Median column width (from all gaps, including margins)
|
||||
all_col_widths = []
|
||||
prev_end = 0
|
||||
for gs, ge in sorted_gaps:
|
||||
cw = gs - prev_end
|
||||
if cw > 0:
|
||||
all_col_widths.append(cw)
|
||||
prev_end = ge
|
||||
trailing = content_w - prev_end
|
||||
if trailing > 0:
|
||||
all_col_widths.append(trailing)
|
||||
median_col_w = sorted(all_col_widths)[len(all_col_widths) // 2] if all_col_widths else content_w
|
||||
|
||||
# A gap is suspicious if the column to its right is > 2x median width
|
||||
suspicious_threshold = median_col_w * 2.0
|
||||
|
||||
alignment_validated = list(validated_gaps) # start with all
|
||||
for gi in interior_indices:
|
||||
right_col_w = gap_suspicion[gi]
|
||||
if right_col_w <= suspicious_threshold:
|
||||
continue # normal gap, keep it
|
||||
|
||||
# Suspicious — check left-edge alignment
|
||||
gap_start_rel, gap_end_rel = sorted_gaps[gi]
|
||||
next_gs = sorted_gaps[gi + 1][0] if gi + 1 < len(sorted_gaps) else content_w
|
||||
right_words = [w for w in segment_words
|
||||
if gap_end_rel <= w['left'] < next_gs]
|
||||
|
||||
if len(right_words) < 3:
|
||||
continue # too few words, keep gap
|
||||
|
||||
# Cluster left-edges
|
||||
right_lefts = sorted(w['left'] for w in right_words)
|
||||
bins = []
|
||||
cur_bin = [right_lefts[0]]
|
||||
for le in right_lefts[1:]:
|
||||
if le - cur_bin[-1] <= edge_tolerance_align:
|
||||
cur_bin.append(le)
|
||||
else:
|
||||
bins.append(len(cur_bin))
|
||||
cur_bin = [le]
|
||||
bins.append(len(cur_bin))
|
||||
|
||||
max_bin = max(bins)
|
||||
ratio = max_bin / len(right_words)
|
||||
|
||||
if ratio < min_aligned_ratio:
|
||||
# Remove this gap
|
||||
alignment_validated.remove((gap_start_rel, gap_end_rel))
|
||||
logger.info(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
|
||||
f"REJECTED — suspicious (right_col={right_col_w}px > 2x median={median_col_w:.0f}px) "
|
||||
f"and poor left-edge alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
|
||||
else:
|
||||
logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
|
||||
f"suspicious but passed alignment (best_bin={max_bin}/{len(right_words)}={ratio:.2f})")
|
||||
|
||||
if len(alignment_validated) >= 2:
|
||||
validated_gaps = alignment_validated
|
||||
|
||||
# --- Step 6: Fallback to clustering if too few gaps ---
|
||||
if len(validated_gaps) < 2:
|
||||
logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
|
||||
@@ -3187,13 +3097,75 @@ def detect_column_geometry_zoned(
|
||||
return (geometries, left_x, right_x, top_y, bottom_y,
|
||||
word_dicts, inv, zone_data, boxes)
|
||||
|
||||
# Split into zones
|
||||
# --- New approach: concatenate content regions (skip boxes), run column
|
||||
# detection ONCE on the combined image, then map coordinates back. ---
|
||||
|
||||
# Split into zones (for metadata / overlay purposes)
|
||||
zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes)
|
||||
|
||||
# Run column detection per content zone
|
||||
all_geometries: List[ColumnGeometry] = []
|
||||
zones_data: List[Dict] = []
|
||||
# Collect content strips (above/between/below boxes)
|
||||
content_strips: List[Tuple[int, int]] = [] # (y_start, y_end) in absolute coords
|
||||
for zone in zones:
|
||||
if zone.zone_type == 'content' and zone.height >= 40:
|
||||
content_strips.append((zone.y, zone.y + zone.height))
|
||||
|
||||
if not content_strips:
|
||||
# Only box zones — fall back to original detection
|
||||
logger.info("ZonedColumns: no content zones with height >= 40, using original result")
|
||||
zone_data = [{"index": 0, "zone_type": "content", "y": top_y,
|
||||
"height": content_h, "x": left_x, "width": content_w, "columns": []}]
|
||||
return (geometries, left_x, right_x, top_y, bottom_y,
|
||||
word_dicts, inv, zone_data, boxes)
|
||||
|
||||
# Build combined image by vertically stacking content strips
|
||||
ocr_strips = [ocr_img[ys:ye, :] for ys, ye in content_strips]
|
||||
bgr_strips = [dewarped_bgr[ys:ye, :] for ys, ye in content_strips]
|
||||
combined_ocr = np.vstack(ocr_strips)
|
||||
combined_bgr = np.vstack(bgr_strips)
|
||||
|
||||
logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} "
|
||||
f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}")
|
||||
|
||||
# Run column detection on the combined (box-free) image
|
||||
combined_result = detect_column_geometry(combined_ocr, combined_bgr)
|
||||
if combined_result is not None:
|
||||
combined_geoms, c_lx, c_rx, c_ty, c_by, combined_words, combined_inv = combined_result
|
||||
else:
|
||||
# Fallback to original full-page result
|
||||
logger.info("ZonedColumns: combined image column detection failed, using original")
|
||||
combined_geoms = geometries
|
||||
|
||||
# Map combined-image y-coordinates back to absolute page coordinates.
|
||||
# In the combined image, strip i starts at cumulative_y = sum of heights
|
||||
# of strips 0..i-1. We need to add the offset between the strip's
|
||||
# original y-position and its position in the combined image.
|
||||
# Build a mapping: combined_y → absolute_y
|
||||
strip_offsets: List[Tuple[int, int, int]] = [] # (combined_y_start, strip_height, abs_y_start)
|
||||
cum_y = 0
|
||||
for ys, ye in content_strips:
|
||||
h = ye - ys
|
||||
strip_offsets.append((cum_y, h, ys))
|
||||
cum_y += h
|
||||
|
||||
def _combined_y_to_abs(cy: int) -> int:
|
||||
"""Map a y-coordinate in combined image back to absolute page coords."""
|
||||
for c_start, s_h, abs_start in strip_offsets:
|
||||
if cy < c_start + s_h:
|
||||
return abs_start + (cy - c_start)
|
||||
# Past last strip — clamp to end of last strip
|
||||
last_c, last_h, last_abs = strip_offsets[-1]
|
||||
return last_abs + last_h
|
||||
|
||||
# Adjust geometries: y and height need remapping
|
||||
if combined_result is not None:
|
||||
for g in combined_geoms:
|
||||
abs_y = _combined_y_to_abs(g.y)
|
||||
abs_y_end = _combined_y_to_abs(g.y + g.height)
|
||||
g.y = abs_y
|
||||
g.height = abs_y_end - abs_y
|
||||
|
||||
# Build zones_data for the response
|
||||
zones_data: List[Dict] = []
|
||||
for zone in zones:
|
||||
zone_dict: Dict = {
|
||||
"index": zone.index,
|
||||
@@ -3215,45 +3187,12 @@ def detect_column_geometry_zoned(
|
||||
"border_thickness": zone.box.border_thickness,
|
||||
}
|
||||
|
||||
if zone.zone_type == 'content' and zone.height >= 40:
|
||||
# Extract sub-image for this zone
|
||||
zone_y_end = zone.y + zone.height
|
||||
sub_ocr = ocr_img[zone.y:zone_y_end, :]
|
||||
sub_bgr = dewarped_bgr[zone.y:zone_y_end, :]
|
||||
|
||||
sub_result = detect_column_geometry(sub_ocr, sub_bgr)
|
||||
if sub_result is not None:
|
||||
sub_geoms, sub_lx, sub_rx, sub_ty, sub_by, _sub_words, _sub_inv = sub_result
|
||||
|
||||
# Offset column y-coordinates back to absolute page coords
|
||||
for g in sub_geoms:
|
||||
g.y += zone.y
|
||||
|
||||
zone_cols = []
|
||||
for g in sub_geoms:
|
||||
zone_cols.append({
|
||||
"index": g.index,
|
||||
"x": g.x,
|
||||
"y": g.y,
|
||||
"width": g.width,
|
||||
"height": g.height,
|
||||
"word_count": g.word_count,
|
||||
"width_ratio": g.width_ratio,
|
||||
"zone_index": zone.index,
|
||||
})
|
||||
zone_dict["columns"] = zone_cols
|
||||
all_geometries.extend(sub_geoms)
|
||||
else:
|
||||
logger.debug(f"ZonedColumns: zone {zone.index} column detection returned None")
|
||||
|
||||
zones_data.append(zone_dict)
|
||||
|
||||
# If per-zone detection produced no columns, fall back to the original
|
||||
if not all_geometries:
|
||||
all_geometries = geometries
|
||||
all_geometries = combined_geoms if combined_geoms else geometries
|
||||
|
||||
logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), "
|
||||
f"{len(all_geometries)} total columns")
|
||||
f"{len(all_geometries)} total columns (combined-image approach)")
|
||||
|
||||
return (all_geometries, left_x, right_x, top_y, bottom_y,
|
||||
word_dicts, inv, zones_data, boxes)
|
||||
|
||||
Reference in New Issue
Block a user