feat: breite Spalten per Word-Gap splitten + gedrehte Scans im Frontend anzeigen
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 15s
_split_broad_columns() erkennt EN/DE-Gemisch in breiten Spalten via Word-Coverage-Analyse und trennt sie am groessten Luecken-Gap. Thumbnails und Page-Images werden serverseitig per fitz rotiert, Frontend laedt Thumbnails nach OCR-Processing neu. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2067,6 +2067,148 @@ def _detect_sub_columns(
|
||||
return result
|
||||
|
||||
|
||||
def _word_coverage_profile(words: List[Dict], col_left_rel: int, width: int) -> np.ndarray:
    """Build a per-pixel word-coverage profile for one column.

    Each word's horizontal extent increments the coverage count; the profile
    is then lightly smoothed (3 px box kernel) to suppress noise and
    normalised to [0, 1].

    Args:
        words: OCR word dicts with 'left' (relative to the content ROI) and
            optional 'width' keys.
        col_left_rel: Column left edge in content-relative coordinates.
        width: Column width in pixels (length of the returned array).

    Returns:
        float32 array of length ``width`` with values in [0, 1].
    """
    coverage = np.zeros(width, dtype=np.float32)
    for wd in words:
        # wd['left'] is relative to left_x (content ROI)
        wl = wd['left'] - col_left_rel
        wr = wl + wd.get('width', 0)
        wl = max(0, int(wl))
        wr = min(width, int(wr))
        if wr > wl:
            coverage[wl:wr] += 1.0

    # Light smoothing (kernel=3px) to avoid noise
    if len(coverage) > 3:
        kernel = np.ones(3, dtype=np.float32) / 3.0
        coverage = np.convolve(coverage, kernel, mode='same')

    # Normalise to [0, 1]; guard the empty case (max() raises on size 0)
    if coverage.size:
        cmax = coverage.max()
        if cmax > 0:
            coverage /= cmax
    return coverage


def _low_coverage_gaps(coverage: np.ndarray, threshold: float = 0.5) -> List[Tuple[int, int, int]]:
    """Find all maximal runs where coverage < threshold, widest first.

    Args:
        coverage: Normalised per-pixel coverage profile.
        threshold: Coverage values strictly below this count as "gap".

    Returns:
        List of (start, end, width) tuples sorted by descending width
        (ties keep left-to-right scan order); end is exclusive. Leading and
        trailing runs are included.
    """
    low_mask = coverage < threshold
    gaps: List[Tuple[int, int, int]] = []
    gap_start = None
    for px, is_low in enumerate(low_mask):
        if is_low:
            if gap_start is None:
                gap_start = px
        elif gap_start is not None:
            gaps.append((gap_start, px, px - gap_start))
            gap_start = None
    # Handle trailing gap
    if gap_start is not None:
        gaps.append((gap_start, len(low_mask), len(low_mask) - gap_start))
    # Stable sort: equal widths keep the original scan order, matching the
    # strict ">" preference of a single-pass "keep first widest" scan.
    gaps.sort(key=lambda g: g[2], reverse=True)
    return gaps


def _partition_words_at(words: List[Dict], col_left_rel: int,
                        gap_center: int) -> Tuple[List[Dict], List[Dict]]:
    """Split words into (left, right) by each word's midpoint vs. gap_center.

    A word whose horizontal midpoint (in column-local pixels) lies strictly
    left of ``gap_center`` goes left; everything else goes right.
    """
    left_words: List[Dict] = []
    right_words: List[Dict] = []
    for wd in words:
        wl = wd['left'] - col_left_rel
        mid = wl + wd.get('width', 0) / 2.0
        if mid < gap_center:
            left_words.append(wd)
        else:
            right_words.append(wd)
    return left_words, right_words


def _split_broad_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    _broad_threshold: float = 0.35,
    _min_gap_px: int = 15,
    _min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
    """Split overly broad columns that contain two language blocks (EN+DE).

    Uses word-coverage gap analysis: builds a per-pixel coverage array from
    the words inside each broad column and splits the column at a
    low-coverage gap. Candidate gaps are tried widest-first, so a sparse
    column edge (a leading/trailing whitespace run) that fails the
    min-words check no longer shadows a valid interior gap.

    Args:
        geometries: Column geometries from _detect_sub_columns.
        content_w: Width of the content area in pixels.
        left_x: Left edge of content ROI in absolute image coordinates.
        _broad_threshold: Minimum width_ratio to consider a column "broad".
        _min_gap_px: Minimum gap width (pixels) to trigger a split.
        _min_words_per_split: Both halves must have at least this many words.

    Returns:
        Updated list of ColumnGeometry (possibly with more columns),
        re-indexed left-to-right.
    """
    result: List[ColumnGeometry] = []

    for geo in geometries:
        # Only broad, well-populated, non-degenerate columns are candidates.
        if (geo.width_ratio <= _broad_threshold or len(geo.words) < 10
                or geo.width <= 0):
            result.append(geo)
            continue

        col_left_rel = geo.x - left_x  # column left in content-relative coords
        coverage = _word_coverage_profile(geo.words, col_left_rel, geo.width)

        # Try gaps widest-first until one produces a valid split. Gaps are
        # sorted by descending width, so once one is below _min_gap_px none
        # of the remaining candidates can qualify.
        chosen = None  # (gap, gap_center, left_words, right_words)
        for gap in _low_coverage_gaps(coverage):
            if gap[2] < _min_gap_px:
                break
            gap_center = (gap[0] + gap[1]) // 2
            lw, rw = _partition_words_at(geo.words, col_left_rel, gap_center)
            if len(lw) >= _min_words_per_split and len(rw) >= _min_words_per_split:
                chosen = (gap, gap_center, lw, rw)
                break

        if chosen is None:
            result.append(geo)
            continue

        best_gap, gap_center, left_words, right_words = chosen

        # Build two new ColumnGeometry objects
        split_x_abs = geo.x + gap_center
        left_w = gap_center
        right_w = geo.width - gap_center

        left_geo = ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=left_w,
            height=geo.height,
            word_count=len(left_words),
            words=left_words,
            width_ratio=left_w / content_w if content_w else 0,
            is_sub_column=True,
        )
        right_geo = ColumnGeometry(
            index=0,
            x=split_x_abs,
            y=geo.y,
            width=right_w,
            height=geo.height,
            word_count=len(right_words),
            words=right_words,
            width_ratio=right_w / content_w if content_w else 0,
            is_sub_column=True,
        )

        logger.info(
            f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
            f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
            f"left={len(left_words)} words (w={left_w}), "
            f"right={len(right_words)} words (w={right_w})"
        )

        result.append(left_geo)
        result.append(right_geo)

    # Re-index left-to-right
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result
|
||||
|
||||
|
||||
def _build_geometries_from_starts(
|
||||
col_starts: List[Tuple[int, int]],
|
||||
word_dicts: List[Dict],
|
||||
@@ -4128,6 +4270,9 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
||||
geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
|
||||
top_y=top_y, header_y=header_y, footer_y=footer_y)
|
||||
|
||||
# Split broad columns that contain EN+DE mixed via word-coverage gaps
|
||||
geometries = _split_broad_columns(geometries, content_w, left_x=left_x)
|
||||
|
||||
# Phase B: Positional classification (no language scoring)
|
||||
content_h = bottom_y - top_y
|
||||
regions = positional_column_regions(geometries, content_w, content_h, left_x)
|
||||
|
||||
Reference in New Issue
Block a user