feat: box-aware column detection — exclude box content from global columns
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
- Enrich column geometries with original full-page words (box-filtered) so _detect_sub_columns() finds narrow sub-columns across box boundaries
- Add inline marker guard: bullet points (1., 2., •) are not split into sub-columns (minimum gap check: 1.2× word height or 20px)
- Add box_rects parameter to build_grid_from_words() — words inside boxes are excluded from X-gap column clustering
- Pass box rects from zones to words_first grid builder
- Add 9 tests for box-aware column detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
|
||||
import logging
|
||||
import re
|
||||
import statistics
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
@@ -737,6 +738,24 @@ def _detect_sub_columns(
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
# --- Guard against inline markers (bullet points, numbering) ---
|
||||
# Bullet points like "1.", "2.", "•", "-" sit close to the main
|
||||
# column text and are part of the cell, not a separate column.
|
||||
# Only split if the horizontal gap between the rightmost sub-word
|
||||
# and the main column start is large enough.
|
||||
max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
|
||||
gap_to_main = col_start_bin[2] - max_sub_right # px gap
|
||||
median_heights = [w.get('height', 20) for w in confident]
|
||||
med_h = statistics.median(median_heights) if median_heights else 20
|
||||
min_gap = max(med_h * 1.2, 20) # at least 1.2× word height or 20px
|
||||
if gap_to_main < min_gap:
|
||||
logger.debug(
|
||||
"SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
|
||||
"(likely inline markers, not a sub-column)",
|
||||
geo.index, gap_to_main, min_gap)
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
# --- Build two sub-column geometries ---
|
||||
# Word 'left' values are relative to left_x; geo.x is absolute.
|
||||
# Convert the split position from relative to absolute coordinates.
|
||||
@@ -3221,6 +3240,46 @@ def detect_column_geometry_zoned(
|
||||
g.y = abs_y
|
||||
g.height = abs_y_end - abs_y
|
||||
|
||||
# --- Enrich column geometries with box-filtered original words ---
|
||||
# The combined-image Tesseract may miss words in small content strips
|
||||
# (e.g. a single row above a box). Use the original full-page word_dicts
|
||||
# filtered to exclude box interiors, so that _detect_sub_columns()
|
||||
# downstream has ALL content-zone words for left-edge clustering.
|
||||
# This ensures narrow sub-columns (page_ref, marker) are detectable
|
||||
# even when only a few entries exist above/below a box.
|
||||
if word_dicts:
|
||||
content_words = []
|
||||
for w in word_dicts:
|
||||
# word positions are relative to left_x / top_y
|
||||
w_abs_cx = w['left'] + left_x + w['width'] / 2
|
||||
w_abs_cy = w['top'] + top_y + w['height'] / 2
|
||||
inside_box = any(
|
||||
box.x <= w_abs_cx <= box.x + box.width
|
||||
and box.y <= w_abs_cy <= box.y + box.height
|
||||
for box in boxes
|
||||
)
|
||||
if not inside_box:
|
||||
content_words.append(w)
|
||||
|
||||
target_geoms = combined_geoms if combined_result is not None else geometries
|
||||
for g in target_geoms:
|
||||
# Word 'left' is relative to left_x; geometry 'x' is absolute
|
||||
g_left_rel = g.x - left_x
|
||||
g_right_rel = g_left_rel + g.width
|
||||
g.words = [
|
||||
w for w in content_words
|
||||
if g_left_rel <= w['left'] + w['width'] / 2 < g_right_rel
|
||||
]
|
||||
g.word_count = len(g.words)
|
||||
|
||||
excluded_count = len(word_dicts) - len(content_words)
|
||||
if excluded_count:
|
||||
logger.info(
|
||||
"ZonedColumns: enriched geometries with %d content words "
|
||||
"(excluded %d box-interior words)",
|
||||
len(content_words), excluded_count,
|
||||
)
|
||||
|
||||
# Build zones_data for the response
|
||||
zones_data: List[Dict] = []
|
||||
for zone in zones:
|
||||
|
||||
Reference in New Issue
Block a user