From b5900f1affc4958aa9b135b7c45deab9c490efad Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 13 Apr 2026 16:57:16 +0200 Subject: [PATCH] Bullet indentation detection: group continuation lines into bullets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flowing/bullet_list layout now analyzes left-edge indentation: - Lines at minimum indent = bullet start / main level - Lines indented >15px more than the minimum indent = continuation (belongs to previous bullet) - Continuation lines merged with \n into parent bullet cell - Missing bullet markers (•) auto-added to non-first items that have continuation lines but no leading marker Example: 7 OCR lines → 3 items (1 header + 2 bullets × 3 lines each) "German leihen" header, then two bullet groups with indented examples. Co-Authored-By: Claude Opus 4.6 (1M context) --- klausur-service/backend/cv_box_layout.py | 88 ++++++++++++++++++++---- 1 file changed, 73 insertions(+), 15 deletions(-) diff --git a/klausur-service/backend/cv_box_layout.py b/klausur-service/backend/cv_box_layout.py index 9b72daa..49201c0 100644 --- a/klausur-service/backend/cv_box_layout.py +++ b/klausur-service/backend/cv_box_layout.py @@ -201,9 +201,10 @@ def build_box_zone_grid( } if layout_type in ("flowing", "bullet_list"): - # Force single column — each line becomes one row with one cell + # Force single column — each logical item becomes one row with one cell. + # Detect bullet structure from indentation and merge continuation + # lines into the bullet they belong to. 
lines = _group_into_lines(zone_words) - # Column needs x_min_px/x_max_px for GridTable width calculation column = { "col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1", "x_min_px": box_x, "x_max_px": box_x + box_w, @@ -211,14 +212,61 @@ def build_box_zone_grid( "x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0, "bold": False, } + + # --- Detect indentation levels --- + line_indents = [] + for line_words in lines: + if not line_words: + line_indents.append(0) + continue + min_left = min(w["left"] for w in line_words) + line_indents.append(min_left - box_x) + + # Find the minimum indent (= bullet/main level) + valid_indents = [ind for ind in line_indents if ind >= 0] + min_indent = min(valid_indents) if valid_indents else 0 + + # Indentation threshold: lines indented > 15px more than minimum + # are continuation lines belonging to the previous bullet + INDENT_THRESHOLD = 15 + + # --- Group lines into logical items (bullet + continuations) --- + # Each item is a list of line indices + items: List[List[int]] = [] + for li, indent in enumerate(line_indents): + is_continuation = (indent > min_indent + INDENT_THRESHOLD) and len(items) > 0 + if is_continuation: + items[-1].append(li) + else: + items.append([li]) + + logger.info( + "Box zone %d flowing: %d lines → %d items (indents=%s, min=%d, threshold=%d)", + zone_index, len(lines), len(items), + [int(i) for i in line_indents], int(min_indent), INDENT_THRESHOLD, + ) + + # --- Build rows and cells from grouped items --- rows = [] cells = [] + header_rows = [] - for row_idx, line_words in enumerate(lines): - if not line_words: + for row_idx, item_line_indices in enumerate(items): + # Collect all words from all lines in this item + item_words = [] + item_texts = [] + for li in item_line_indices: + if li < len(lines): + item_words.extend(lines[li]) + line_text = " ".join(w.get("text", "") for w in lines[li]).strip() + if line_text: + item_texts.append(line_text) + + if not item_words: 
continue - y_min = min(w["top"] for w in line_words) - y_max = max(w["top"] + w["height"] for w in line_words) + + y_min = min(w["top"] for w in item_words) + y_max = max(w["top"] + w["height"] for w in item_words) y_center = (y_min + y_max) / 2 row = { @@ -235,23 +283,33 @@ def build_box_zone_grid( } rows.append(row) - line_text = " ".join(w.get("text", "") for w in line_words).strip() + # Join multi-line text with newline for display + merged_text = "\n".join(item_texts) + + # Add bullet marker if this is a bullet item without one + first_text = item_texts[0] if item_texts else "" + is_bullet = len(item_line_indices) > 1 or _BULLET_RE.match(first_text) + if is_bullet and not _BULLET_RE.match(first_text) and row_idx > 0: + # Continuation item without bullet — add one + merged_text = "• " + merged_text + cell = { "cell_id": f"Z{zone_index}_R{row_idx}C0", "row_index": row_idx, "col_index": 0, "col_type": "column_1", - "text": line_text, - "word_boxes": line_words, + "text": merged_text, + "word_boxes": item_words, } cells.append(cell) - # Detect header: first row if it's notably different (bold, larger, or short) - header_rows = [] - if len(lines) >= 2: - first_line = lines[0] - first_text = " ".join(w.get("text", "") for w in first_line).strip() - # Header heuristic: short text, or all-caps, or ends with ':' + # Detect header: first item if it has no continuation lines and is short + if len(items) >= 2: + first_item_texts = [] + for li in items[0]: + if li < len(lines): + first_item_texts.append(" ".join(w.get("text", "") for w in lines[li]).strip()) + first_text = " ".join(first_item_texts) if (len(first_text) < 40 or first_text.isupper() or first_text.rstrip().endswith(':')):