fix: Edge-Gaps in _split_broad_columns ignorieren + return-Tuple bei leerem Ergebnis
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 16s
Gaps, die den Spaltenrand beruehren oder bis auf 10 px (_edge_margin) an ihn heranreichen (Margins), werden jetzt ausgeschlossen; nur interne Gaps werden als Split-Kandidaten betrachtet. Behebt das Problem, dass trailing Whitespace faelschlich als groesster Gap gewaehlt wurde. Der Early-Return in _run_ocr_pipeline_for_page gibt jetzt korrekt ([], rotation) statt [] zurueck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2125,27 +2125,9 @@ def _split_broad_columns(
|
||||
if cmax > 0:
|
||||
coverage /= cmax
|
||||
|
||||
# Find gaps where coverage < 0.5
|
||||
# Find INTERNAL gaps where coverage < 0.5
|
||||
# Exclude edge gaps (touching pixel 0 or geo.width) — those are margins.
|
||||
low_mask = coverage < 0.5
|
||||
gap_start = None
|
||||
best_gap = None # (start, end, width)
|
||||
for px in range(len(low_mask)):
|
||||
if low_mask[px]:
|
||||
if gap_start is None:
|
||||
gap_start = px
|
||||
else:
|
||||
if gap_start is not None:
|
||||
gw = px - gap_start
|
||||
if best_gap is None or gw > best_gap[2]:
|
||||
best_gap = (gap_start, px, gw)
|
||||
gap_start = None
|
||||
# Handle trailing gap
|
||||
if gap_start is not None:
|
||||
gw = len(low_mask) - gap_start
|
||||
if best_gap is None or gw > best_gap[2]:
|
||||
best_gap = (gap_start, len(low_mask), gw)
|
||||
|
||||
# Log all gaps found for debugging
|
||||
all_gaps = []
|
||||
_gs = None
|
||||
for px in range(len(low_mask)):
|
||||
@@ -2158,8 +2140,17 @@ def _split_broad_columns(
|
||||
_gs = None
|
||||
if _gs is not None:
|
||||
all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))
|
||||
logger.info(f"SplitBroadCols: col {geo.index} coverage gaps (>=5px): "
|
||||
f"{[g for g in all_gaps if g[2] >= 5]}, best={best_gap}")
|
||||
|
||||
# Filter: only internal gaps (not touching column edges)
|
||||
_edge_margin = 10 # pixels from edge to ignore
|
||||
internal_gaps = [g for g in all_gaps
|
||||
if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
|
||||
best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None
|
||||
|
||||
logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
|
||||
f"{[g for g in all_gaps if g[2] >= 5]}, "
|
||||
f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
|
||||
f"best={best_gap}")
|
||||
|
||||
if best_gap is None or best_gap[2] < _min_gap_px:
|
||||
result.append(geo)
|
||||
|
||||
@@ -1510,7 +1510,7 @@ async def _run_ocr_pipeline_for_page(
|
||||
if not is_vocab:
|
||||
logger.warning(f" Page {page_number + 1}: layout is not vocab table "
|
||||
f"(types: {col_types}), returning empty")
|
||||
return []
|
||||
return [], rotation
|
||||
|
||||
# 8. Map cells → vocab entries
|
||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||
|
||||
Reference in New Issue
Block a user