fix(sub-columns): protect sub-columns from column_ignore pre-filter

Add an is_sub_column flag to ColumnGeometry. Geometries produced by a _detect_sub_columns() split (both the sub-column and the remaining main column carry the flag) are now exempt from the edge-column word_count < 8 pre-filter that would otherwise convert them to column_ignore.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 07:55:53 +01:00
parent d6a8c1d821
commit 0d72f2c836
1 changed file with 6 additions and 1 deletion
@@ -118,6 +118,7 @@ class ColumnGeometry:
    word_count: int
    words: List[Dict]       # Wort-Dicts aus Tesseract (text, conf, left, top, ...)
    width_ratio: float      # width / content_width (0.0-1.0)
+    is_sub_column: bool = False  # True if created by _detect_sub_columns() split


@dataclass
@@ -1150,6 +1151,7 @@ def _detect_sub_columns(
            word_count=len(sub_words),
            words=sub_words,
            width_ratio=sub_width / content_w if content_w > 0 else 0.0,
+            is_sub_column=True,
        )
        main_geo = ColumnGeometry(
            index=0,
@@ -1160,6 +1162,7 @@ def _detect_sub_columns(
            word_count=len(main_words),
            words=main_words,
            width_ratio=main_width / content_w if content_w > 0 else 0.0,
+            is_sub_column=True,
        )

        result.append(sub_geo)
@@ -2254,10 +2257,12 @@ def classify_column_types(geometries: List[ColumnGeometry],
        )])

    # --- Pre-filter: first/last columns with very few words → column_ignore ---
+    # Sub-columns from _detect_sub_columns() are exempt: they intentionally
+    # have few words (page refs, markers) and should not be discarded.
    ignore_regions = []
    active_geometries = []
    for idx, g in enumerate(geometries):
-        if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8:
+        if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column:
            ignore_regions.append(PageRegion(
                type='column_ignore', x=g.x, y=g.y,
                width=g.width, height=content_h,