Bullet indentation detection: group continuation lines into bullets

Flowing/bullet_list layout now analyzes left-edge indentation:

- Lines at the minimum indent = bullet start / main level
- Lines indented more than 15px beyond the minimum = continuation (belongs to the previous bullet)
- Continuation lines are merged with \n into the parent bullet's cell
- Missing bullet markers (•) are auto-added when the pattern is clear

Example: 7 OCR lines → 3 items (1 header + 2 bullets × 3 lines each):
a "German leihen" header, then two bullet groups with indented examples.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 16:57:16 +02:00
parent baac98f837
commit b5900f1aff
1 changed files with 73 additions and 15 deletions
@@ -201,9 +201,10 @@ def build_box_zone_grid(
        }

    if layout_type in ("flowing", "bullet_list"):
-        # Force single column — each line becomes one row with one cell
+        # Force single column — each line becomes one row with one cell.
+        # Detect bullet structure from indentation and merge continuation
+        # lines into the bullet they belong to.
        lines = _group_into_lines(zone_words)
-        # Column needs x_min_px/x_max_px for GridTable width calculation
        column = {
            "col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
            "x_min_px": box_x, "x_max_px": box_x + box_w,
@@ -211,14 +212,61 @@ def build_box_zone_grid(
            "x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
            "bold": False,
        }
+
+        # --- Detect indentation levels ---
+        line_indents = []
+        for line_words in lines:
+            if not line_words:
+                line_indents.append(0)
+                continue
+            min_left = min(w["left"] for w in line_words)
+            line_indents.append(min_left - box_x)
+
+        # Find the minimum indent (= bullet/main level)
+        valid_indents = [ind for ind in line_indents if ind >= 0]
+        min_indent = min(valid_indents) if valid_indents else 0
+
+        # Indentation threshold: lines indented > 15px more than minimum
+        # are continuation lines belonging to the previous bullet
+        INDENT_THRESHOLD = 15
+
+        # --- Group lines into logical items (bullet + continuations) ---
+        # Each item is a list of line indices
+        items: List[List[int]] = []
+        for li, indent in enumerate(line_indents):
+            is_continuation = (indent > min_indent + INDENT_THRESHOLD) and len(items) > 0
+            if is_continuation:
+                items[-1].append(li)
+            else:
+                items.append([li])
+
+        logger.info(
+            "Box zone %d flowing: %d lines → %d items (indents=%s, min=%d, threshold=%d)",
+            zone_index, len(lines), len(items),
+            [int(i) for i in line_indents], int(min_indent), INDENT_THRESHOLD,
+        )
+
+        # --- Build rows and cells from grouped items ---
        rows = []
        cells = []
+        header_rows = []

-        for row_idx, line_words in enumerate(lines):
-            if not line_words:
+        for row_idx, item_line_indices in enumerate(items):
+            # Collect all words from all lines in this item
+            item_words = []
+            item_texts = []
+            for li in item_line_indices:
+                if li < len(lines):
+                    item_words.extend(lines[li])
+                    line_text = " ".join(w.get("text", "") for w in lines[li]).strip()
+                    if line_text:
+                        item_texts.append(line_text)
+
+            if not item_words:
                continue
-            y_min = min(w["top"] for w in line_words)
-            y_max = max(w["top"] + w["height"] for w in line_words)
+
+            y_min = min(w["top"] for w in item_words)
+            y_max = max(w["top"] + w["height"] for w in item_words)
            y_center = (y_min + y_max) / 2

            row = {
@@ -235,23 +283,33 @@ def build_box_zone_grid(
            }
            rows.append(row)

-            line_text = " ".join(w.get("text", "") for w in line_words).strip()
+            # Join multi-line text with newline for display
+            merged_text = "\n".join(item_texts)
+
+            # Add bullet marker if this is a bullet item without one
+            first_text = item_texts[0] if item_texts else ""
+            is_bullet = len(item_line_indices) > 1 or _BULLET_RE.match(first_text)
+            if is_bullet and not _BULLET_RE.match(first_text) and row_idx > 0:
+                # Continuation item without bullet — add one
+                merged_text = "• " + merged_text
+
            cell = {
                "cell_id": f"Z{zone_index}_R{row_idx}C0",
                "row_index": row_idx,
                "col_index": 0,
                "col_type": "column_1",
-                "text": line_text,
-                "word_boxes": line_words,
+                "text": merged_text,
+                "word_boxes": item_words,
            }
            cells.append(cell)

-        # Detect header: first row if it's notably different (bold, larger, or short)
-        header_rows = []
-        if len(lines) >= 2:
-            first_line = lines[0]
-            first_text = " ".join(w.get("text", "") for w in first_line).strip()
-            # Header heuristic: short text, or all-caps, or ends with ':'
+        # Detect header: first item if it has no continuation lines and is short
+        if len(items) >= 2:
+            first_item_texts = []
+            for li in items[0]:
+                if li < len(lines):
+                    first_item_texts.append(" ".join(w.get("text", "") for w in lines[li]).strip())
+            first_text = " ".join(first_item_texts)
            if (len(first_text) < 40
                    or first_text.isupper()
                    or first_text.rstrip().endswith(':')):