fix: leere Spalten als strukturell behandeln + 2-Spalten-Layout korrekt labeln
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Spalten mit <=2 Wörtern und <15% Breite werden jetzt als column_marker statt als Content-Spalte klassifiziert. Bei 2 breiten Content-Spalten wird die rechte als column_example statt column_de gelabelt, da die linke Spalte EN+DE kombiniert enthält. Der OSD-Zoom wurde von 1.0 auf 2.0 erhöht, um die Orientierung zuverlässiger zu erkennen.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2145,6 +2145,22 @@ def _split_broad_columns(
|
||||
if best_gap is None or gw > best_gap[2]:
|
||||
best_gap = (gap_start, len(low_mask), gw)
|
||||
|
||||
# Log all gaps found for debugging
|
||||
all_gaps = []
|
||||
_gs = None
|
||||
for px in range(len(low_mask)):
|
||||
if low_mask[px]:
|
||||
if _gs is None:
|
||||
_gs = px
|
||||
else:
|
||||
if _gs is not None:
|
||||
all_gaps.append((_gs, px, px - _gs))
|
||||
_gs = None
|
||||
if _gs is not None:
|
||||
all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))
|
||||
logger.info(f"SplitBroadCols: col {geo.index} coverage gaps (>=5px): "
|
||||
f"{[g for g in all_gaps if g[2] >= 5]}, best={best_gap}")
|
||||
|
||||
if best_gap is None or best_gap[2] < _min_gap_px:
|
||||
result.append(geo)
|
||||
continue
|
||||
@@ -3547,6 +3563,14 @@ def positional_column_regions(
|
||||
classification_confidence=0.95,
|
||||
classification_method='positional',
|
||||
))
|
||||
# empty or near-empty narrow column → treat as margin/structural
|
||||
elif g.word_count <= 2 and g.width_ratio < 0.15:
|
||||
structural.append(PageRegion(
|
||||
type='column_marker', x=g.x, y=g.y,
|
||||
width=g.width, height=content_h,
|
||||
classification_confidence=0.85,
|
||||
classification_method='positional',
|
||||
))
|
||||
else:
|
||||
content_cols.append(g)
|
||||
|
||||
@@ -3566,7 +3590,16 @@ def positional_column_regions(
|
||||
|
||||
# Sort content columns left→right and assign positional labels
|
||||
content_cols.sort(key=lambda g: g.x)
|
||||
labels = ['column_en', 'column_de', 'column_example']
|
||||
|
||||
# With exactly 2 content columns: if the left one is very wide (>35%),
|
||||
# it likely contains EN+DE combined, so the right one is examples.
|
||||
if (len(content_cols) == 2
|
||||
and content_cols[0].width_ratio > 0.35
|
||||
and content_cols[1].width_ratio > 0.20):
|
||||
labels = ['column_en', 'column_example']
|
||||
else:
|
||||
labels = ['column_en', 'column_de', 'column_example']
|
||||
|
||||
regions = list(structural)
|
||||
for i, g in enumerate(content_cols):
|
||||
label = labels[i] if i < len(labels) else 'column_example'
|
||||
|
||||
@@ -1177,7 +1177,7 @@ async def upload_pdf_get_info(
|
||||
if OCR_PIPELINE_AVAILABLE:
|
||||
for pg in range(page_count):
|
||||
try:
|
||||
img_bgr = render_pdf_high_res(content, pg, zoom=1.0)
|
||||
img_bgr = render_pdf_high_res(content, pg, zoom=2.0)
|
||||
_, rotation = detect_and_fix_orientation(img_bgr)
|
||||
if rotation:
|
||||
page_rotations[pg] = rotation
|
||||
|
||||
Reference in New Issue
Block a user