Bullet indentation detection: group continuation lines into bullets
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m49s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 34s

Flowing/bullet_list layout now analyzes left-edge indentation:
- Lines at minimum indent = bullet start / main level
- Lines indented >15px more = continuation (belongs to previous bullet)
- Continuation lines merged with \n into parent bullet cell
- Missing bullet markers (•) auto-added when pattern is clear

Example: 7 OCR lines → 3 items (1 header + 2 bullets × 3 lines each)
"German leihen" header, then two bullet groups with indented examples.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-13 16:57:16 +02:00
parent baac98f837
commit b5900f1aff

View File

@@ -201,9 +201,10 @@ def build_box_zone_grid(
}
if layout_type in ("flowing", "bullet_list"):
# Force single column — each line becomes one row with one cell
# Force single column — each line becomes one row with one cell.
# Detect bullet structure from indentation and merge continuation
# lines into the bullet they belong to.
lines = _group_into_lines(zone_words)
# Column needs x_min_px/x_max_px for GridTable width calculation
column = {
"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
"x_min_px": box_x, "x_max_px": box_x + box_w,
@@ -211,14 +212,61 @@ def build_box_zone_grid(
"x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
"bold": False,
}
# --- Detect indentation levels ---
line_indents = []
for line_words in lines:
if not line_words:
line_indents.append(0)
continue
min_left = min(w["left"] for w in line_words)
line_indents.append(min_left - box_x)
# Find the minimum indent (= bullet/main level)
valid_indents = [ind for ind in line_indents if ind >= 0]
min_indent = min(valid_indents) if valid_indents else 0
# Indentation threshold: lines indented > 15px more than minimum
# are continuation lines belonging to the previous bullet
INDENT_THRESHOLD = 15
# --- Group lines into logical items (bullet + continuations) ---
# Each item is a list of line indices
items: List[List[int]] = []
for li, indent in enumerate(line_indents):
is_continuation = (indent > min_indent + INDENT_THRESHOLD) and len(items) > 0
if is_continuation:
items[-1].append(li)
else:
items.append([li])
logger.info(
"Box zone %d flowing: %d lines → %d items (indents=%s, min=%d, threshold=%d)",
zone_index, len(lines), len(items),
[int(i) for i in line_indents], int(min_indent), INDENT_THRESHOLD,
)
# --- Build rows and cells from grouped items ---
rows = []
cells = []
header_rows = []
for row_idx, line_words in enumerate(lines):
if not line_words:
for row_idx, item_line_indices in enumerate(items):
# Collect all words from all lines in this item
item_words = []
item_texts = []
for li in item_line_indices:
if li < len(lines):
item_words.extend(lines[li])
line_text = " ".join(w.get("text", "") for w in lines[li]).strip()
if line_text:
item_texts.append(line_text)
if not item_words:
continue
y_min = min(w["top"] for w in line_words)
y_max = max(w["top"] + w["height"] for w in line_words)
y_min = min(w["top"] for w in item_words)
y_max = max(w["top"] + w["height"] for w in item_words)
y_center = (y_min + y_max) / 2
row = {
@@ -235,23 +283,33 @@ def build_box_zone_grid(
}
rows.append(row)
line_text = " ".join(w.get("text", "") for w in line_words).strip()
# Join multi-line text with newline for display
merged_text = "\n".join(item_texts)
# Add bullet marker if this is a bullet item without one
first_text = item_texts[0] if item_texts else ""
is_bullet = len(item_line_indices) > 1 or _BULLET_RE.match(first_text)
if is_bullet and not _BULLET_RE.match(first_text) and row_idx > 0:
# Multi-line item (not the first row) whose first line lacks a bullet marker — add one
merged_text = "" + merged_text
cell = {
"cell_id": f"Z{zone_index}_R{row_idx}C0",
"row_index": row_idx,
"col_index": 0,
"col_type": "column_1",
"text": line_text,
"word_boxes": line_words,
"text": merged_text,
"word_boxes": item_words,
}
cells.append(cell)
# Detect header: first row if it's notably different (bold, larger, or short)
header_rows = []
if len(lines) >= 2:
first_line = lines[0]
first_text = " ".join(w.get("text", "") for w in first_line).strip()
# Header heuristic: short text, or all-caps, or ends with ':'
# Detect header: first item if its text is short (< 40 chars), all-caps, or ends with ':'
if len(items) >= 2:
first_item_texts = []
for li in items[0]:
if li < len(lines):
first_item_texts.append(" ".join(w.get("text", "") for w in lines[li]).strip())
first_text = " ".join(first_item_texts)
if (len(first_text) < 40
or first_text.isupper()
or first_text.rstrip().endswith(':')):