fix: Edge-Gaps in _split_broad_columns ignorieren + return-Tuple bei leerem Ergebnis
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 14s
CI / test-nodejs-website (push) Successful in 16s
Gaps, die den Spaltenrand beruehren (Margins), werden jetzt ausgeschlossen; nur interne Gaps werden als Split-Kandidaten betrachtet. Behebt das Problem, dass trailing whitespace faelschlich als groesster Gap gewaehlt wurde. Der Early-Return in _run_ocr_pipeline_for_page gibt jetzt korrekt ([], rotation) statt [] zurueck. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2125,27 +2125,9 @@ def _split_broad_columns(
|
|||||||
if cmax > 0:
|
if cmax > 0:
|
||||||
coverage /= cmax
|
coverage /= cmax
|
||||||
|
|
||||||
# Find gaps where coverage < 0.5
|
# Find INTERNAL gaps where coverage < 0.5
|
||||||
|
# Exclude edge gaps (touching pixel 0 or geo.width) — those are margins.
|
||||||
low_mask = coverage < 0.5
|
low_mask = coverage < 0.5
|
||||||
gap_start = None
|
|
||||||
best_gap = None # (start, end, width)
|
|
||||||
for px in range(len(low_mask)):
|
|
||||||
if low_mask[px]:
|
|
||||||
if gap_start is None:
|
|
||||||
gap_start = px
|
|
||||||
else:
|
|
||||||
if gap_start is not None:
|
|
||||||
gw = px - gap_start
|
|
||||||
if best_gap is None or gw > best_gap[2]:
|
|
||||||
best_gap = (gap_start, px, gw)
|
|
||||||
gap_start = None
|
|
||||||
# Handle trailing gap
|
|
||||||
if gap_start is not None:
|
|
||||||
gw = len(low_mask) - gap_start
|
|
||||||
if best_gap is None or gw > best_gap[2]:
|
|
||||||
best_gap = (gap_start, len(low_mask), gw)
|
|
||||||
|
|
||||||
# Log all gaps found for debugging
|
|
||||||
all_gaps = []
|
all_gaps = []
|
||||||
_gs = None
|
_gs = None
|
||||||
for px in range(len(low_mask)):
|
for px in range(len(low_mask)):
|
||||||
@@ -2158,8 +2140,17 @@ def _split_broad_columns(
|
|||||||
_gs = None
|
_gs = None
|
||||||
if _gs is not None:
|
if _gs is not None:
|
||||||
all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))
|
all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))
|
||||||
logger.info(f"SplitBroadCols: col {geo.index} coverage gaps (>=5px): "
|
|
||||||
f"{[g for g in all_gaps if g[2] >= 5]}, best={best_gap}")
|
# Filter: only internal gaps (not touching column edges)
|
||||||
|
_edge_margin = 10 # pixels from edge to ignore
|
||||||
|
internal_gaps = [g for g in all_gaps
|
||||||
|
if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
|
||||||
|
best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None
|
||||||
|
|
||||||
|
logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
|
||||||
|
f"{[g for g in all_gaps if g[2] >= 5]}, "
|
||||||
|
f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
|
||||||
|
f"best={best_gap}")
|
||||||
|
|
||||||
if best_gap is None or best_gap[2] < _min_gap_px:
|
if best_gap is None or best_gap[2] < _min_gap_px:
|
||||||
result.append(geo)
|
result.append(geo)
|
||||||
|
|||||||
@@ -1510,7 +1510,7 @@ async def _run_ocr_pipeline_for_page(
|
|||||||
if not is_vocab:
|
if not is_vocab:
|
||||||
logger.warning(f" Page {page_number + 1}: layout is not vocab table "
|
logger.warning(f" Page {page_number + 1}: layout is not vocab table "
|
||||||
f"(types: {col_types}), returning empty")
|
f"(types: {col_types}), returning empty")
|
||||||
return []
|
return [], rotation
|
||||||
|
|
||||||
# 8. Map cells → vocab entries
|
# 8. Map cells → vocab entries
|
||||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||||
|
|||||||
Reference in New Issue
Block a user