Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s
Words on the same visual line can have slightly different top values (1-6px). Sorting by (top, left) produced wrong word order in the frontend display. Now uses _group_words_into_lines to group by Y proximity first, then sort by X within each line. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
356 lines
13 KiB
Python
356 lines
13 KiB
Python
"""
|
||
Words-First Grid Builder (Bottom-Up).
|
||
|
||
Builds a cell grid from Tesseract word_boxes directly, without requiring
|
||
pre-detected columns or rows. Algorithm:
|
||
|
||
1. Cluster words into columns by X-gap analysis
|
||
2. Cluster words into rows by Y-proximity
|
||
3. Build cells at (column, row) intersections
|
||
|
||
Returns the same (cells, columns_meta) format as build_cell_grid_v2().
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
import re
|
||
import statistics
|
||
from typing import Any, Dict, List, Optional, Tuple
|
||
|
||
from cv_ocr_engines import (
|
||
_group_words_into_lines,
|
||
_words_to_reading_order_text,
|
||
)
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 1. Column clustering
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _cluster_columns(
|
||
words: List[Dict],
|
||
img_w: int,
|
||
min_gap_pct: float = 3.0,
|
||
) -> List[Dict[str, Any]]:
|
||
"""Cluster words into columns by finding large horizontal gaps.
|
||
|
||
Returns a list of column dicts:
|
||
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
|
||
sorted left-to-right.
|
||
"""
|
||
if not words:
|
||
return []
|
||
|
||
# Sort by X center
|
||
sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
|
||
|
||
# Collect word heights to compute adaptive threshold
|
||
heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
|
||
median_h = statistics.median(heights) if heights else 30
|
||
|
||
# Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
|
||
min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
|
||
|
||
# Find X-gap boundaries between consecutive words (sorted by X-center)
|
||
# For each word, compute right edge; for next word, compute left edge
|
||
boundaries: List[float] = [] # X positions where columns split
|
||
for i in range(len(sorted_w) - 1):
|
||
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
|
||
left_edge = sorted_w[i + 1]['left']
|
||
gap = left_edge - right_edge
|
||
if gap > min_gap_px:
|
||
# Split point is midway through the gap
|
||
boundaries.append((right_edge + left_edge) / 2)
|
||
|
||
# Build column ranges from boundaries
|
||
# Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
|
||
col_edges = [0.0] + boundaries + [float(img_w)]
|
||
columns = []
|
||
for ci in range(len(col_edges) - 1):
|
||
columns.append({
|
||
'index': ci,
|
||
'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
|
||
'x_min': col_edges[ci],
|
||
'x_max': col_edges[ci + 1],
|
||
})
|
||
|
||
return columns
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 2. Row clustering
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _cluster_rows(
    words: List[Dict],
) -> List[Dict[str, Any]]:
    """Cluster words into visual rows by Y-proximity.

    The Y-tolerance handed to _group_words_into_lines is half the median
    word height, floored at 5 px.

    Returns:
        Row dicts sorted top-to-bottom:
        [{'index': 0, 'y_min': ..., 'y_max': ..., 'y_center': ...}, ...]
    """
    if not words:
        return []

    positive_heights = [w['height'] for w in words if w.get('height', 0) > 0]
    typical_h = statistics.median(positive_heights) if positive_heights else 20
    tolerance = max(typical_h * 0.5, 5)

    grouped = _group_words_into_lines(words, y_tolerance_px=int(tolerance))

    rows: List[Dict[str, Any]] = []
    for idx, members in enumerate(grouped):
        top = min(w['top'] for w in members)
        bottom = max(w['top'] + w['height'] for w in members)
        rows.append({
            'index': idx,
            'y_min': top,
            'y_max': bottom,
            'y_center': (top + bottom) / 2,
        })
    return rows
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 3. Build cells
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
|
||
"""Return column index for a word based on its X-center."""
|
||
x_center = word['left'] + word['width'] / 2
|
||
for col in columns:
|
||
if col['x_min'] <= x_center < col['x_max']:
|
||
return col['index']
|
||
# Fallback: nearest column
|
||
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index']
|
||
|
||
|
||
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
|
||
"""Return row index for a word based on its Y-center.
|
||
|
||
When rows overlap (e.g. due to tall border-ghost characters inflating
|
||
a row's y_max), prefer the row whose y_center is closest.
|
||
"""
|
||
y_center = word['top'] + word['height'] / 2
|
||
# Find all rows whose y_range contains this word's center
|
||
matching = [r for r in rows if r['y_min'] <= y_center <= r['y_max']]
|
||
if matching:
|
||
return min(matching, key=lambda r: abs(r['y_center'] - y_center))['index']
|
||
# Fallback: nearest row by Y-center
|
||
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
|
||
|
||
|
||
def _split_word_boxes(w: Dict) -> List[Dict[str, Any]]:
    """Split one OCR word/phrase box into per-word overlay boxes.

    PaddleOCR returns phrase-level boxes (e.g. "competition [kompa'tifn]"),
    but the overlay slide mechanism expects one box per word. Multi-word
    boxes are split into individual word positions proportional to character
    length. Splits also happen at "[" boundaries (IPA patterns like
    "badge[bxd3]") and after a leading "!".

    Returns a list of word-box dicts (text/left/top/width/height/conf) in
    absolute pixel coordinates; empty if the box has no splittable content.
    """
    raw_text = w.get('text', '').strip()
    # Split by whitespace, at "[" boundaries (IPA), and after leading "!"
    # e.g. "badge[bxd3]" -> ["badge", "[bxd3]"]
    # e.g. "profit['proft]" -> ["profit", "['proft]"]
    # e.g. "!Betonung" -> ["!", "Betonung"]
    tokens = re.split(r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text)
    tokens = [t for t in tokens if t]  # remove empty strings
    if len(tokens) <= 1:
        # Single word — keep as-is
        return [{
            'text': raw_text,
            'left': w['left'],
            'top': w['top'],
            'width': w['width'],
            'height': w['height'],
            'conf': w.get('conf', 0),
        }]

    # Multi-word phrase — split proportionally by character count
    total_chars = sum(len(t) for t in tokens)
    if total_chars == 0:
        return []
    # Small gap between words (2% of box width per gap)
    n_gaps = len(tokens) - 1
    gap_px = w['width'] * 0.02
    usable_w = w['width'] - gap_px * n_gaps
    cursor = w['left']
    boxes: List[Dict[str, Any]] = []
    for t in tokens:
        token_w = max(1, usable_w * len(t) / total_chars)
        boxes.append({
            'text': t,
            'left': round(cursor),
            'top': w['top'],
            'width': round(token_w),
            'height': w['height'],
            'conf': w.get('conf', 0),
        })
        cursor += token_w + gap_px
    return boxes


def _build_cells(
    words: List[Dict],
    columns: List[Dict],
    rows: List[Dict],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Build cell dicts from word assignments to (column, row) pairs.

    Args:
        words: Word dicts (absolute pixel coordinates).
        columns: Column dicts from _cluster_columns.
        rows: Row dicts from _cluster_rows.
        img_w: Image width in pixels (for percentage bboxes).
        img_h: Image height in pixels (for percentage bboxes).

    Returns:
        Cell dicts in row-major order, each with cell_id, text, confidence,
        bbox_px, bbox_pct, word_boxes, ocr_engine and is_bold keys.
    """
    if not columns or not rows:
        return []

    # Bucket words into (col_idx, row_idx)
    buckets: Dict[Tuple[int, int], List[Dict]] = {}
    for w in words:
        ci = _assign_word_to_column(w, columns)
        ri = _assign_word_to_row(w, rows)
        buckets.setdefault((ci, ri), []).append(w)

    cells = []
    # Row-major iteration so cells come out top-to-bottom, left-to-right.
    for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
        col = columns[ci]

        # Compute tight bbox from actual word positions
        x_min = min(w['left'] for w in cell_words)
        y_min = min(w['top'] for w in cell_words)
        x_max = max(w['left'] + w['width'] for w in cell_words)
        y_max = max(w['top'] + w['height'] for w in cell_words)
        bw = x_max - x_min
        bh = y_max - y_min

        # Text from words in reading order
        text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))

        # Average confidence (zero/negative confidences are excluded)
        confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
        avg_conf = sum(confs) / len(confs) if confs else 0.0

        # Word boxes with absolute pixel coordinates (consistent with cv_cell_grid.py).
        # Sort in reading order: group by Y (same visual line), then sort by X.
        # Simple (top, left) sort fails when words on the same line have slightly
        # different top values (1-6px), causing wrong word order.
        y_tol_wb = max(10, int(bh * 0.4))
        reading_lines = _group_words_into_lines(cell_words, y_tolerance_px=y_tol_wb)
        ordered_cell_words = [w for line in reading_lines for w in line]

        word_boxes: List[Dict[str, Any]] = []
        for w in ordered_cell_words:
            word_boxes.extend(_split_word_boxes(w))

        cells.append({
            'cell_id': f"R{ri:02d}_C{ci}",
            'row_index': ri,
            'col_index': ci,
            'col_type': col['type'],
            'text': text,
            'confidence': round(avg_conf, 1),
            'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
            'bbox_pct': {
                'x': round(x_min / img_w * 100, 2) if img_w else 0,
                'y': round(y_min / img_h * 100, 2) if img_h else 0,
                'w': round(bw / img_w * 100, 2) if img_w else 0,
                'h': round(bh / img_h * 100, 2) if img_h else 0,
            },
            'word_boxes': word_boxes,
            'ocr_engine': 'words_first',
            'is_bold': False,
        })

    return cells
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 4. Public API
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def build_grid_from_words(
    word_dicts: List[Dict],
    img_w: int,
    img_h: int,
    min_confidence: int = 30,
    box_rects: Optional[List[Dict]] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build a cell grid bottom-up from Tesseract word boxes.

    Args:
        word_dicts: Flat list of word dicts with keys:
            text, left, top, width, height, conf
            (absolute pixel coordinates).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        min_confidence: Minimum OCR confidence to keep a word.
        box_rects: Optional list of box dicts with keys x, y, width, height.
            Words inside these boxes are excluded from column clustering
            (box-internal columns are detected separately in sub-sessions).

    Returns:
        (cells, columns_meta) — same format as build_cell_grid_v2().
        cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
        columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
    """
    if not word_dicts:
        logger.info("build_grid_from_words: no words — returning empty grid")
        return [], []

    # Drop low-confidence and empty-text words up front.
    words = [
        w for w in word_dicts
        if w.get('conf', 0) >= min_confidence and w.get('text', '').strip()
    ]
    if not words:
        logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
        return [], []

    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))

    # Exclude words inside detected boxes — box columns are detected separately
    if box_rects:
        def _inside_any_box(w: Dict) -> bool:
            cx = w['left'] + w['width'] / 2
            cy = w['top'] + w['height'] / 2
            return any(
                b['x'] <= cx <= b['x'] + b['width']
                and b['y'] <= cy <= b['y'] + b['height']
                for b in box_rects
            )

        content_words = [w for w in words if not _inside_any_box(w)]
        excluded = len(words) - len(content_words)
        if excluded:
            logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
                        excluded, len(box_rects))
        words = content_words
        if not words:
            logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
            return [], []

    # Step 1: cluster columns
    columns = _cluster_columns(words, img_w)
    logger.info("build_grid_from_words: %d column(s) detected", len(columns))

    # Step 2: cluster rows
    rows = _cluster_rows(words)
    logger.info("build_grid_from_words: %d row(s) detected", len(rows))

    # Step 3: build cells
    cells = _build_cells(words, columns, rows, img_w, img_h)
    logger.info("build_grid_from_words: %d cells built", len(cells))

    # Build columns_meta in the same format as build_cell_grid_v2.
    columns_meta = [
        {
            'index': col['index'],
            'type': col['type'],
            'x': int(col['x_min']),
            'width': int(col['x_max'] - col['x_min']),
        }
        for col in columns
    ]

    return cells, columns_meta
|