feat: breite Spalten per Word-Gap splitten + gedrehte Scans im Frontend anzeigen
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s

_split_broad_columns() erkennt EN/DE-Gemisch in breiten Spalten via
Word-Coverage-Analyse und trennt sie an der breitesten Luecke.
Thumbnails und Page-Images werden serverseitig per fitz rotiert,
Frontend laedt Thumbnails nach OCR-Processing neu.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-07 18:16:32 +01:00
parent a5635e0c43
commit 02631dc4e0
3 changed files with 202 additions and 7 deletions

View File

@@ -2067,6 +2067,148 @@ def _detect_sub_columns(
return result
def _interior_low_coverage_runs(low_mask: np.ndarray) -> List[Tuple[int, int, int]]:
    """Return interior runs of True values in *low_mask*, left to right.

    Each run is reported as ``(start, end, width)`` with ``end`` exclusive.
    Runs that touch the first or the last index are dropped: they are the
    outer margins of the column, not a separator between two text blocks.
    """
    size = int(low_mask.size)
    if size == 0:
        return []
    # Zero-pad both ends so that every run has a rising and a falling edge.
    padded = np.concatenate(([0], low_mask.astype(np.int8), [0]))
    edges = np.diff(padded)
    starts = np.flatnonzero(edges == 1)   # index of the first True of a run
    ends = np.flatnonzero(edges == -1)    # index one past the last True
    return [
        (int(start), int(end), int(end - start))
        for start, end in zip(starts, ends)
        if start > 0 and end < size
    ]


def _split_broad_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    _broad_threshold: float = 0.35,
    _min_gap_px: int = 15,
    _min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
    """Split overly broad columns that contain two language blocks (EN+DE).

    Uses word-coverage gap analysis: builds a per-pixel coverage array from
    the words inside each broad column, finds the widest *interior*
    low-coverage run, and splits the column at the centre of that run.

    Low-coverage runs touching the left or right edge of the column are
    ignored. They are margins (e.g. the ragged right edge of the text) and
    would otherwise win over the real separator, either blocking the split
    or placing it inside a text block.

    At most one split is performed per input column.

    Args:
        geometries: Column geometries from _detect_sub_columns.
        content_w: Width of the content area in pixels.
        left_x: Left edge of content ROI in absolute image coordinates.
        _broad_threshold: Minimum width_ratio to consider a column "broad".
        _min_gap_px: Minimum gap width (pixels) to trigger a split.
        _min_words_per_split: Both halves must have at least this many words.

    Returns:
        Updated list of ColumnGeometry (possibly with more columns), sorted
        by ``x``. ``index`` is rewritten in place on every returned object,
        including the input geometries that were passed through unchanged.
    """
    result: List[ColumnGeometry] = []

    for geo in geometries:
        # Narrow columns, columns with fewer than 10 words, and degenerate
        # zero-width columns (whose empty coverage array cannot be
        # normalised) are passed through untouched.
        if (
            geo.width_ratio <= _broad_threshold
            or len(geo.words) < 10
            or geo.width <= 0
        ):
            result.append(geo)
            continue

        # Build word-coverage array (per pixel within column).
        col_left_rel = geo.x - left_x  # column left in content-relative coords
        coverage = np.zeros(geo.width, dtype=np.float32)
        for wd in geo.words:
            # wd['left'] is relative to left_x (content ROI)
            wl = wd['left'] - col_left_rel
            wr = wl + wd.get('width', 0)
            wl = max(0, int(wl))
            wr = min(geo.width, int(wr))
            if wr > wl:
                coverage[wl:wr] += 1.0

        # Light smoothing (kernel=3px) to avoid noise.
        if len(coverage) > 3:
            kernel = np.ones(3, dtype=np.float32) / 3.0
            coverage = np.convolve(coverage, kernel, mode='same')

        # Normalise to [0, 1].
        cmax = coverage.max()
        if cmax > 0:
            coverage /= cmax

        # Candidate gaps: interior runs where coverage < 0.5. max() returns
        # the first of several equally wide runs, i.e. the left-most one.
        gaps = _interior_low_coverage_runs(coverage < 0.5)
        best_gap = max(gaps, key=lambda run: run[2], default=None)

        if best_gap is None or best_gap[2] < _min_gap_px:
            result.append(geo)
            continue

        gap_center = (best_gap[0] + best_gap[1]) // 2

        # Assign each word to a side by its horizontal midpoint.
        left_words = []
        right_words = []
        for wd in geo.words:
            mid = (wd['left'] - col_left_rel) + wd.get('width', 0) / 2.0
            if mid < gap_center:
                left_words.append(wd)
            else:
                right_words.append(wd)

        if (
            len(left_words) < _min_words_per_split
            or len(right_words) < _min_words_per_split
        ):
            result.append(geo)
            continue

        # Build two new ColumnGeometry objects; the final index is assigned
        # by the re-indexing pass below.
        split_x_abs = geo.x + gap_center
        left_w = gap_center
        right_w = geo.width - gap_center

        left_geo = ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=left_w,
            height=geo.height,
            word_count=len(left_words),
            words=left_words,
            width_ratio=left_w / content_w if content_w else 0,
            is_sub_column=True,
        )
        right_geo = ColumnGeometry(
            index=0,
            x=split_x_abs,
            y=geo.y,
            width=right_w,
            height=geo.height,
            word_count=len(right_words),
            words=right_words,
            width_ratio=right_w / content_w if content_w else 0,
            is_sub_column=True,
        )

        logger.info(
            f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
            f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
            f"left={len(left_words)} words (w={left_w}), "
            f"right={len(right_words)} words (w={right_w})"
        )

        result.append(left_geo)
        result.append(right_geo)

    # Re-index left-to-right.
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result
def _build_geometries_from_starts(
col_starts: List[Tuple[int, int]],
word_dicts: List[Dict],
@@ -4128,6 +4270,9 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
top_y=top_y, header_y=header_y, footer_y=footer_y)
# Split broad columns that contain EN+DE mixed via word-coverage gaps
geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
# Phase B: Positional classification (no language scoring)
content_h = bottom_y - top_y
regions = positional_column_regions(geometries, content_w, content_h, left_x)