Files
breakpilot-lehrer/klausur-service/backend/cv_words_first.py
Benjamin Admin ea69239e06
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 39s
CI / test-go-edu-search (push) Successful in 33s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 22s
CI / test-nodejs-website (push) Successful in 33s
fix: word_boxes in words_first use absolute pixels (consistent with v2 grid)
words_first was storing word_boxes in percent coordinates while
cv_cell_grid.py uses absolute pixel coordinates. The overlay slide
mechanism divides by imgW to get percentages, so percent-in-percent
caused positions near zero. Now both grid builders use the same format.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 15:04:04 +01:00

283 lines
9.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Words-First Grid Builder (Bottom-Up).
Builds a cell grid from Tesseract word_boxes directly, without requiring
pre-detected columns or rows. Algorithm:
1. Cluster words into columns by X-gap analysis
2. Cluster words into rows by Y-proximity
3. Build cells at (column, row) intersections
Returns the same (cells, columns_meta) format as build_cell_grid_v2().
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import statistics
from typing import Any, Dict, List, Tuple
from cv_ocr_engines import (
_group_words_into_lines,
_words_to_reading_order_text,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# 1. Column clustering
# ---------------------------------------------------------------------------
def _cluster_columns(
words: List[Dict],
img_w: int,
min_gap_pct: float = 3.0,
) -> List[Dict[str, Any]]:
"""Cluster words into columns by finding large horizontal gaps.
Returns a list of column dicts:
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
sorted left-to-right.
"""
if not words:
return []
# Sort by X center
sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
# Collect word heights to compute adaptive threshold
heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
median_h = statistics.median(heights) if heights else 30
# Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
# Find X-gap boundaries between consecutive words (sorted by X-center)
# For each word, compute right edge; for next word, compute left edge
boundaries: List[float] = [] # X positions where columns split
for i in range(len(sorted_w) - 1):
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
left_edge = sorted_w[i + 1]['left']
gap = left_edge - right_edge
if gap > min_gap_px:
# Split point is midway through the gap
boundaries.append((right_edge + left_edge) / 2)
# Build column ranges from boundaries
# Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
col_edges = [0.0] + boundaries + [float(img_w)]
columns = []
for ci in range(len(col_edges) - 1):
columns.append({
'index': ci,
'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
'x_min': col_edges[ci],
'x_max': col_edges[ci + 1],
})
return columns
# ---------------------------------------------------------------------------
# 2. Row clustering
# ---------------------------------------------------------------------------
def _cluster_rows(
words: List[Dict],
) -> List[Dict[str, Any]]:
"""Cluster words into visual rows by Y-proximity.
Uses half the median word height as Y-tolerance.
Returns a list of row dicts:
[{'index': 0, 'y_min': ..., 'y_max': ..., 'y_center': ...}, ...]
sorted top-to-bottom.
"""
if not words:
return []
heights = [w['height'] for w in words if w.get('height', 0) > 0]
median_h = statistics.median(heights) if heights else 20
y_tol = max(median_h * 0.5, 5)
lines = _group_words_into_lines(words, y_tolerance_px=int(y_tol))
rows = []
for ri, line_words in enumerate(lines):
y_min = min(w['top'] for w in line_words)
y_max = max(w['top'] + w['height'] for w in line_words)
rows.append({
'index': ri,
'y_min': y_min,
'y_max': y_max,
'y_center': (y_min + y_max) / 2,
})
return rows
# ---------------------------------------------------------------------------
# 3. Build cells
# ---------------------------------------------------------------------------
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
"""Return column index for a word based on its X-center."""
x_center = word['left'] + word['width'] / 2
for col in columns:
if col['x_min'] <= x_center < col['x_max']:
return col['index']
# Fallback: nearest column
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index']
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
"""Return row index for a word based on its Y-center."""
y_center = word['top'] + word['height'] / 2
# Find the row whose y_range contains this word's center
for row in rows:
if row['y_min'] <= y_center <= row['y_max']:
return row['index']
# Fallback: nearest row by Y-center
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
def _build_cells(
words: List[Dict],
columns: List[Dict],
rows: List[Dict],
img_w: int,
img_h: int,
) -> List[Dict[str, Any]]:
"""Build cell dicts from word assignments to (column, row) pairs."""
if not columns or not rows:
return []
# Bucket words into (col_idx, row_idx)
buckets: Dict[Tuple[int, int], List[Dict]] = {}
for w in words:
ci = _assign_word_to_column(w, columns)
ri = _assign_word_to_row(w, rows)
buckets.setdefault((ci, ri), []).append(w)
cells = []
for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
col = columns[ci]
row = rows[ri]
# Compute tight bbox from actual word positions
x_min = min(w['left'] for w in cell_words)
y_min = min(w['top'] for w in cell_words)
x_max = max(w['left'] + w['width'] for w in cell_words)
y_max = max(w['top'] + w['height'] for w in cell_words)
bw = x_max - x_min
bh = y_max - y_min
# Text from words in reading order
text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))
# Average confidence
confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
avg_conf = sum(confs) / len(confs) if confs else 0.0
# Word boxes with absolute pixel coordinates (consistent with cv_cell_grid.py)
word_boxes = []
for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
word_boxes.append({
'text': w.get('text', ''),
'left': w['left'],
'top': w['top'],
'width': w['width'],
'height': w['height'],
'conf': w.get('conf', 0),
})
cells.append({
'cell_id': f"R{ri:02d}_C{ci}",
'row_index': ri,
'col_index': ci,
'col_type': col['type'],
'text': text,
'confidence': round(avg_conf, 1),
'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
'bbox_pct': {
'x': round(x_min / img_w * 100, 2) if img_w else 0,
'y': round(y_min / img_h * 100, 2) if img_h else 0,
'w': round(bw / img_w * 100, 2) if img_w else 0,
'h': round(bh / img_h * 100, 2) if img_h else 0,
},
'word_boxes': word_boxes,
'ocr_engine': 'words_first',
'is_bold': False,
})
return cells
# ---------------------------------------------------------------------------
# 4. Public API
# ---------------------------------------------------------------------------
def build_grid_from_words(
    word_dicts: List[Dict],
    img_w: int,
    img_h: int,
    min_confidence: int = 30,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build a cell grid bottom-up from Tesseract word boxes.

    Pipeline: drop low-confidence / blank words, cluster the remainder into
    columns and rows, then build one cell per occupied (column, row) pair.

    Args:
        word_dicts: Flat list of word dicts with keys:
            text, left, top, width, height, conf
            (absolute pixel coordinates).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        min_confidence: Minimum OCR confidence to keep a word.

    Returns:
        (cells, columns_meta) — same format as build_cell_grid_v2().
        cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
        columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
        Both lists are empty when there is no input or every word is
        filtered out.
    """
    if not word_dicts:
        logger.info("build_grid_from_words: no words — returning empty grid")
        return [], []

    def _is_usable(entry: Dict) -> Any:
        # Keep words that are confident enough and have non-blank text.
        return entry.get('conf', 0) >= min_confidence and entry.get('text', '').strip()

    words = list(filter(_is_usable, word_dicts))
    if not words:
        logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
        return [], []
    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))

    # Step 1: cluster columns
    columns = _cluster_columns(words, img_w)
    logger.info("build_grid_from_words: %d column(s) detected", len(columns))

    # Step 2: cluster rows
    rows = _cluster_rows(words)
    logger.info("build_grid_from_words: %d row(s) detected", len(rows))

    # Step 3: build cells
    cells = _build_cells(words, columns, rows, img_w, img_h)
    logger.info("build_grid_from_words: %d cells built", len(cells))

    # Column metadata in the same format as build_cell_grid_v2:
    # integer origin and width derived from the column's X range.
    columns_meta = [
        {
            'index': column['index'],
            'type': column['type'],
            'x': int(column['x_min']),
            'width': int(column['x_max'] - column['x_min']),
        }
        for column in columns
    ]
    return cells, columns_meta