fix(sub-columns): protect sub-columns from column_ignore pre-filter
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 23s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 23s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m52s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s
Add is_sub_column flag to ColumnGeometry. Sub-columns created by _detect_sub_columns() are now exempt from the edge-column word_count < 8 rule that converts them to column_ignore. The flag is set on both geometries produced by a split (the sub-column and the remaining main column).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -118,6 +118,7 @@ class ColumnGeometry:
|
|||||||
word_count: int
|
word_count: int
|
||||||
words: List[Dict] # Wort-Dicts aus Tesseract (text, conf, left, top, ...)
|
words: List[Dict] # Wort-Dicts aus Tesseract (text, conf, left, top, ...)
|
||||||
width_ratio: float # width / content_width (0.0-1.0)
|
width_ratio: float # width / content_width (0.0-1.0)
|
||||||
|
is_sub_column: bool = False # True if created by _detect_sub_columns() split
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -1150,6 +1151,7 @@ def _detect_sub_columns(
|
|||||||
word_count=len(sub_words),
|
word_count=len(sub_words),
|
||||||
words=sub_words,
|
words=sub_words,
|
||||||
width_ratio=sub_width / content_w if content_w > 0 else 0.0,
|
width_ratio=sub_width / content_w if content_w > 0 else 0.0,
|
||||||
|
is_sub_column=True,
|
||||||
)
|
)
|
||||||
main_geo = ColumnGeometry(
|
main_geo = ColumnGeometry(
|
||||||
index=0,
|
index=0,
|
||||||
@@ -1160,6 +1162,7 @@ def _detect_sub_columns(
|
|||||||
word_count=len(main_words),
|
word_count=len(main_words),
|
||||||
words=main_words,
|
words=main_words,
|
||||||
width_ratio=main_width / content_w if content_w > 0 else 0.0,
|
width_ratio=main_width / content_w if content_w > 0 else 0.0,
|
||||||
|
is_sub_column=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
result.append(sub_geo)
|
result.append(sub_geo)
|
||||||
@@ -2254,10 +2257,12 @@ def classify_column_types(geometries: List[ColumnGeometry],
|
|||||||
)])
|
)])
|
||||||
|
|
||||||
# --- Pre-filter: first/last columns with very few words → column_ignore ---
|
# --- Pre-filter: first/last columns with very few words → column_ignore ---
|
||||||
|
# Sub-columns from _detect_sub_columns() are exempt: they intentionally
|
||||||
|
# have few words (page refs, markers) and should not be discarded.
|
||||||
ignore_regions = []
|
ignore_regions = []
|
||||||
active_geometries = []
|
active_geometries = []
|
||||||
for idx, g in enumerate(geometries):
|
for idx, g in enumerate(geometries):
|
||||||
if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8:
|
if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column:
|
||||||
ignore_regions.append(PageRegion(
|
ignore_regions.append(PageRegion(
|
||||||
type='column_ignore', x=g.x, y=g.y,
|
type='column_ignore', x=g.x, y=g.y,
|
||||||
width=g.width, height=content_h,
|
width=g.width, height=content_h,
|
||||||
|
|||||||
Reference in New Issue
Block a user