Bullet indentation detection: group continuation lines into bullets
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m49s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 34s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m49s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 34s
Flowing/bullet_list layout now analyzes left-edge indentation:
- Lines at minimum indent = bullet start / main level
- Lines indented >15px more = continuation (belongs to previous bullet)
- Continuation lines merged with \n into parent bullet cell
- Missing bullet markers (•) auto-added when pattern is clear

Example: 7 OCR lines → 3 items (1 header + 2 bullets × 3 lines each)
"German leihen" header, then two bullet groups with indented examples.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -201,9 +201,10 @@ def build_box_zone_grid(
|
||||
}
|
||||
|
||||
if layout_type in ("flowing", "bullet_list"):
|
||||
# Force single column — each line becomes one row with one cell
|
||||
# Force single column — each line becomes one row with one cell.
|
||||
# Detect bullet structure from indentation and merge continuation
|
||||
# lines into the bullet they belong to.
|
||||
lines = _group_into_lines(zone_words)
|
||||
# Column needs x_min_px/x_max_px for GridTable width calculation
|
||||
column = {
|
||||
"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
|
||||
"x_min_px": box_x, "x_max_px": box_x + box_w,
|
||||
@@ -211,14 +212,61 @@ def build_box_zone_grid(
|
||||
"x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
|
||||
"bold": False,
|
||||
}
|
||||
|
||||
# --- Detect indentation levels ---
|
||||
line_indents = []
|
||||
for line_words in lines:
|
||||
if not line_words:
|
||||
line_indents.append(0)
|
||||
continue
|
||||
min_left = min(w["left"] for w in line_words)
|
||||
line_indents.append(min_left - box_x)
|
||||
|
||||
# Find the minimum indent (= bullet/main level)
|
||||
valid_indents = [ind for ind in line_indents if ind >= 0]
|
||||
min_indent = min(valid_indents) if valid_indents else 0
|
||||
|
||||
# Indentation threshold: lines indented > 15px more than minimum
|
||||
# are continuation lines belonging to the previous bullet
|
||||
INDENT_THRESHOLD = 15
|
||||
|
||||
# --- Group lines into logical items (bullet + continuations) ---
|
||||
# Each item is a list of line indices
|
||||
items: List[List[int]] = []
|
||||
for li, indent in enumerate(line_indents):
|
||||
is_continuation = (indent > min_indent + INDENT_THRESHOLD) and len(items) > 0
|
||||
if is_continuation:
|
||||
items[-1].append(li)
|
||||
else:
|
||||
items.append([li])
|
||||
|
||||
logger.info(
|
||||
"Box zone %d flowing: %d lines → %d items (indents=%s, min=%d, threshold=%d)",
|
||||
zone_index, len(lines), len(items),
|
||||
[int(i) for i in line_indents], int(min_indent), INDENT_THRESHOLD,
|
||||
)
|
||||
|
||||
# --- Build rows and cells from grouped items ---
|
||||
rows = []
|
||||
cells = []
|
||||
header_rows = []
|
||||
|
||||
for row_idx, line_words in enumerate(lines):
|
||||
if not line_words:
|
||||
for row_idx, item_line_indices in enumerate(items):
|
||||
# Collect all words from all lines in this item
|
||||
item_words = []
|
||||
item_texts = []
|
||||
for li in item_line_indices:
|
||||
if li < len(lines):
|
||||
item_words.extend(lines[li])
|
||||
line_text = " ".join(w.get("text", "") for w in lines[li]).strip()
|
||||
if line_text:
|
||||
item_texts.append(line_text)
|
||||
|
||||
if not item_words:
|
||||
continue
|
||||
y_min = min(w["top"] for w in line_words)
|
||||
y_max = max(w["top"] + w["height"] for w in line_words)
|
||||
|
||||
y_min = min(w["top"] for w in item_words)
|
||||
y_max = max(w["top"] + w["height"] for w in item_words)
|
||||
y_center = (y_min + y_max) / 2
|
||||
|
||||
row = {
|
||||
@@ -235,23 +283,33 @@ def build_box_zone_grid(
|
||||
}
|
||||
rows.append(row)
|
||||
|
||||
line_text = " ".join(w.get("text", "") for w in line_words).strip()
|
||||
# Join multi-line text with newline for display
|
||||
merged_text = "\n".join(item_texts)
|
||||
|
||||
# Add bullet marker if this is a bullet item without one
|
||||
first_text = item_texts[0] if item_texts else ""
|
||||
is_bullet = len(item_line_indices) > 1 or _BULLET_RE.match(first_text)
|
||||
if is_bullet and not _BULLET_RE.match(first_text) and row_idx > 0:
|
||||
# Continuation item without bullet — add one
|
||||
merged_text = "• " + merged_text
|
||||
|
||||
cell = {
|
||||
"cell_id": f"Z{zone_index}_R{row_idx}C0",
|
||||
"row_index": row_idx,
|
||||
"col_index": 0,
|
||||
"col_type": "column_1",
|
||||
"text": line_text,
|
||||
"word_boxes": line_words,
|
||||
"text": merged_text,
|
||||
"word_boxes": item_words,
|
||||
}
|
||||
cells.append(cell)
|
||||
|
||||
# Detect header: first row if it's notably different (bold, larger, or short)
|
||||
header_rows = []
|
||||
if len(lines) >= 2:
|
||||
first_line = lines[0]
|
||||
first_text = " ".join(w.get("text", "") for w in first_line).strip()
|
||||
# Header heuristic: short text, or all-caps, or ends with ':'
|
||||
# Detect header: first item if it has no continuation lines and is short
|
||||
if len(items) >= 2:
|
||||
first_item_texts = []
|
||||
for li in items[0]:
|
||||
if li < len(lines):
|
||||
first_item_texts.append(" ".join(w.get("text", "") for w in lines[li]).strip())
|
||||
first_text = " ".join(first_item_texts)
|
||||
if (len(first_text) < 40
|
||||
or first_text.isupper()
|
||||
or first_text.rstrip().endswith(':')):
|
||||
|
||||
Reference in New Issue
Block a user