Bullet indentation detection: group continuation lines into bullets
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m49s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 34s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 45s
CI / test-go-edu-search (push) Successful in 41s
CI / test-python-klausur (push) Failing after 2m49s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 34s
Flowing/bullet_list layout now analyzes left-edge indentation:
- Lines at minimum indent = bullet start / main level
- Lines indented >15px more = continuation (belongs to previous bullet)
- Continuation lines merged with \n into parent bullet cell
- Missing bullet markers (•) auto-added when pattern is clear

Example: 7 OCR lines → 3 items (1 header + 2 bullets × 3 lines each):
"German leihen" header, then two bullet groups with indented examples.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -201,9 +201,10 @@ def build_box_zone_grid(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if layout_type in ("flowing", "bullet_list"):
|
if layout_type in ("flowing", "bullet_list"):
|
||||||
# Force single column — each line becomes one row with one cell
|
# Force single column — each line becomes one row with one cell.
|
||||||
|
# Detect bullet structure from indentation and merge continuation
|
||||||
|
# lines into the bullet they belong to.
|
||||||
lines = _group_into_lines(zone_words)
|
lines = _group_into_lines(zone_words)
|
||||||
# Column needs x_min_px/x_max_px for GridTable width calculation
|
|
||||||
column = {
|
column = {
|
||||||
"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
|
"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
|
||||||
"x_min_px": box_x, "x_max_px": box_x + box_w,
|
"x_min_px": box_x, "x_max_px": box_x + box_w,
|
||||||
@@ -211,14 +212,61 @@ def build_box_zone_grid(
|
|||||||
"x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
|
"x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
|
||||||
"bold": False,
|
"bold": False,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# --- Detect indentation levels ---
|
||||||
|
line_indents = []
|
||||||
|
for line_words in lines:
|
||||||
|
if not line_words:
|
||||||
|
line_indents.append(0)
|
||||||
|
continue
|
||||||
|
min_left = min(w["left"] for w in line_words)
|
||||||
|
line_indents.append(min_left - box_x)
|
||||||
|
|
||||||
|
# Find the minimum indent (= bullet/main level)
|
||||||
|
valid_indents = [ind for ind in line_indents if ind >= 0]
|
||||||
|
min_indent = min(valid_indents) if valid_indents else 0
|
||||||
|
|
||||||
|
# Indentation threshold: lines indented > 15px more than minimum
|
||||||
|
# are continuation lines belonging to the previous bullet
|
||||||
|
INDENT_THRESHOLD = 15
|
||||||
|
|
||||||
|
# --- Group lines into logical items (bullet + continuations) ---
|
||||||
|
# Each item is a list of line indices
|
||||||
|
items: List[List[int]] = []
|
||||||
|
for li, indent in enumerate(line_indents):
|
||||||
|
is_continuation = (indent > min_indent + INDENT_THRESHOLD) and len(items) > 0
|
||||||
|
if is_continuation:
|
||||||
|
items[-1].append(li)
|
||||||
|
else:
|
||||||
|
items.append([li])
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Box zone %d flowing: %d lines → %d items (indents=%s, min=%d, threshold=%d)",
|
||||||
|
zone_index, len(lines), len(items),
|
||||||
|
[int(i) for i in line_indents], int(min_indent), INDENT_THRESHOLD,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Build rows and cells from grouped items ---
|
||||||
rows = []
|
rows = []
|
||||||
cells = []
|
cells = []
|
||||||
|
header_rows = []
|
||||||
|
|
||||||
for row_idx, line_words in enumerate(lines):
|
for row_idx, item_line_indices in enumerate(items):
|
||||||
if not line_words:
|
# Collect all words from all lines in this item
|
||||||
|
item_words = []
|
||||||
|
item_texts = []
|
||||||
|
for li in item_line_indices:
|
||||||
|
if li < len(lines):
|
||||||
|
item_words.extend(lines[li])
|
||||||
|
line_text = " ".join(w.get("text", "") for w in lines[li]).strip()
|
||||||
|
if line_text:
|
||||||
|
item_texts.append(line_text)
|
||||||
|
|
||||||
|
if not item_words:
|
||||||
continue
|
continue
|
||||||
y_min = min(w["top"] for w in line_words)
|
|
||||||
y_max = max(w["top"] + w["height"] for w in line_words)
|
y_min = min(w["top"] for w in item_words)
|
||||||
|
y_max = max(w["top"] + w["height"] for w in item_words)
|
||||||
y_center = (y_min + y_max) / 2
|
y_center = (y_min + y_max) / 2
|
||||||
|
|
||||||
row = {
|
row = {
|
||||||
@@ -235,23 +283,33 @@ def build_box_zone_grid(
|
|||||||
}
|
}
|
||||||
rows.append(row)
|
rows.append(row)
|
||||||
|
|
||||||
line_text = " ".join(w.get("text", "") for w in line_words).strip()
|
# Join multi-line text with newline for display
|
||||||
|
merged_text = "\n".join(item_texts)
|
||||||
|
|
||||||
|
# Add bullet marker if this is a bullet item without one
|
||||||
|
first_text = item_texts[0] if item_texts else ""
|
||||||
|
is_bullet = len(item_line_indices) > 1 or _BULLET_RE.match(first_text)
|
||||||
|
if is_bullet and not _BULLET_RE.match(first_text) and row_idx > 0:
|
||||||
|
# Continuation item without bullet — add one
|
||||||
|
merged_text = "• " + merged_text
|
||||||
|
|
||||||
cell = {
|
cell = {
|
||||||
"cell_id": f"Z{zone_index}_R{row_idx}C0",
|
"cell_id": f"Z{zone_index}_R{row_idx}C0",
|
||||||
"row_index": row_idx,
|
"row_index": row_idx,
|
||||||
"col_index": 0,
|
"col_index": 0,
|
||||||
"col_type": "column_1",
|
"col_type": "column_1",
|
||||||
"text": line_text,
|
"text": merged_text,
|
||||||
"word_boxes": line_words,
|
"word_boxes": item_words,
|
||||||
}
|
}
|
||||||
cells.append(cell)
|
cells.append(cell)
|
||||||
|
|
||||||
# Detect header: first row if it's notably different (bold, larger, or short)
|
# Detect header: first item if it has no continuation lines and is short
|
||||||
header_rows = []
|
if len(items) >= 2:
|
||||||
if len(lines) >= 2:
|
first_item_texts = []
|
||||||
first_line = lines[0]
|
for li in items[0]:
|
||||||
first_text = " ".join(w.get("text", "") for w in first_line).strip()
|
if li < len(lines):
|
||||||
# Header heuristic: short text, or all-caps, or ends with ':'
|
first_item_texts.append(" ".join(w.get("text", "") for w in lines[li]).strip())
|
||||||
|
first_text = " ".join(first_item_texts)
|
||||||
if (len(first_text) < 40
|
if (len(first_text) < 40
|
||||||
or first_text.isupper()
|
or first_text.isupper()
|
||||||
or first_text.rstrip().endswith(':')):
|
or first_text.rstrip().endswith(':')):
|
||||||
|
|||||||
Reference in New Issue
Block a user