Files
breakpilot-lehrer/klausur-service/backend/cv_words_first.py
Benjamin Admin 2f34ee9ede
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s
Add scan quality scoring, column limit, image enhancement (Steps 1-3)
Step 1: scan_quality.py — Laplacian blur + contrast scoring, adjusts
OCR confidence threshold (40 for good scans, 30 for degraded).
Quality report included in API response + shown in frontend.

Step 2: max_columns parameter in cv_words_first.py — limits column
detection to 3 for vocab tables, preventing phantom columns D/E
from degraded OCR fragments.

Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral filter
denoising + unsharp mask, only for degraded scans (gated by
quality score). Pattern from handwriting_htr_api.py.

Frontend: quality info shown in extraction status after processing.
Reprocess button now derives pages from vocabulary data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-23 14:58:39 +02:00

405 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Words-First Grid Builder (Bottom-Up).
Builds a cell grid from Tesseract word_boxes directly, without requiring
pre-detected columns or rows. Algorithm:
1. Cluster words into columns by X-gap analysis
2. Cluster words into rows by Y-proximity
3. Build cells at (column, row) intersections
Returns the same (cells, columns_meta) format as build_cell_grid_v2().
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
import statistics
from typing import Any, Dict, List, Optional, Tuple
from cv_ocr_engines import (
_group_words_into_lines,
_words_to_reading_order_text,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# 1. Column clustering
# ---------------------------------------------------------------------------
def _cluster_columns(
words: List[Dict],
img_w: int,
min_gap_pct: float = 3.0,
max_columns: Optional[int] = None,
) -> List[Dict[str, Any]]:
"""Cluster words into columns by finding large horizontal gaps.
Args:
max_columns: If set, limits the number of columns by merging
the closest adjacent pairs until the count matches.
Prevents phantom columns from degraded OCR.
Returns a list of column dicts:
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
sorted left-to-right.
"""
if not words:
return []
# Sort by X center
sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
# Collect word heights to compute adaptive threshold
heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
median_h = statistics.median(heights) if heights else 30
# Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
# Find X-gap boundaries between consecutive words (sorted by X-center)
# For each word, compute right edge; for next word, compute left edge
# Collect gaps with their sizes for max_columns enforcement
gaps: List[Tuple[float, float]] = [] # (gap_size, split_x)
for i in range(len(sorted_w) - 1):
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
left_edge = sorted_w[i + 1]['left']
gap = left_edge - right_edge
if gap > min_gap_px:
split_x = (right_edge + left_edge) / 2
gaps.append((gap, split_x))
# If max_columns is set, keep only the (max_columns - 1) largest gaps
if max_columns and len(gaps) >= max_columns:
gaps.sort(key=lambda g: g[0], reverse=True)
gaps = gaps[:max_columns - 1]
logger.info(
f"_cluster_columns: limited to {max_columns} columns "
f"(removed {len(gaps) + max_columns - 1 - (max_columns - 1)} smallest gaps)"
)
boundaries = sorted(g[1] for g in gaps)
# Build column ranges from boundaries
col_edges = [0.0] + boundaries + [float(img_w)]
columns = []
for ci in range(len(col_edges) - 1):
columns.append({
'index': ci,
'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
'x_min': col_edges[ci],
'x_max': col_edges[ci + 1],
})
return columns
# ---------------------------------------------------------------------------
# 2. Row clustering
# ---------------------------------------------------------------------------
def _cluster_rows(
words: List[Dict],
) -> List[Dict[str, Any]]:
"""Cluster words into visual rows by Y-proximity.
Uses half the median word height as Y-tolerance.
Returns a list of row dicts:
[{'index': 0, 'y_min': ..., 'y_max': ..., 'y_center': ...}, ...]
sorted top-to-bottom.
"""
if not words:
return []
heights = [w['height'] for w in words if w.get('height', 0) > 0]
median_h = statistics.median(heights) if heights else 20
y_tol = max(median_h * 0.5, 5)
lines = _group_words_into_lines(words, y_tolerance_px=int(y_tol))
rows = []
for ri, line_words in enumerate(lines):
y_min = min(w['top'] for w in line_words)
y_max = max(w['top'] + w['height'] for w in line_words)
rows.append({
'index': ri,
'y_min': y_min,
'y_max': y_max,
'y_center': (y_min + y_max) / 2,
})
return rows
# ---------------------------------------------------------------------------
# 3. Build cells
# ---------------------------------------------------------------------------
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
"""Return column index for a word based on overlap, then center, then nearest.
Three-pass strategy (consistent with _assign_row_words_to_columns):
1. Overlap-based: assign to column with maximum horizontal overlap.
2. Midpoint-range: if no overlap, use midpoints between adjacent columns.
3. Nearest center: last resort fallback.
"""
w_left = word['left']
w_right = w_left + word['width']
w_center = w_left + word['width'] / 2
# Pass 1: overlap-based
best_col = -1
best_overlap = 0
for col in columns:
overlap = max(0, min(w_right, col['x_max']) - max(w_left, col['x_min']))
if overlap > best_overlap:
best_overlap = overlap
best_col = col['index']
if best_col >= 0 and best_overlap > 0:
return best_col
# Pass 2: midpoint-range (non-overlapping assignment zones)
for ci, col in enumerate(columns):
if ci == 0:
assign_left = 0
else:
assign_left = (columns[ci - 1]['x_max'] + col['x_min']) / 2
if ci == len(columns) - 1:
assign_right = float('inf')
else:
assign_right = (col['x_max'] + columns[ci + 1]['x_min']) / 2
if assign_left <= w_center < assign_right:
return col['index']
# Pass 3: nearest column center
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - w_center))['index']
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
"""Return row index for a word based on its Y-center.
When rows overlap (e.g. due to tall border-ghost characters inflating
a row's y_max), prefer the row whose y_center is closest.
"""
y_center = word['top'] + word['height'] / 2
# Find all rows whose y_range contains this word's center
matching = [r for r in rows if r['y_min'] <= y_center <= r['y_max']]
if matching:
return min(matching, key=lambda r: abs(r['y_center'] - y_center))['index']
# Fallback: nearest row by Y-center
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
def _build_cells(
words: List[Dict],
columns: List[Dict],
rows: List[Dict],
img_w: int,
img_h: int,
) -> List[Dict[str, Any]]:
"""Build cell dicts from word assignments to (column, row) pairs."""
if not columns or not rows:
return []
# Bucket words into (col_idx, row_idx)
buckets: Dict[Tuple[int, int], List[Dict]] = {}
for w in words:
ci = _assign_word_to_column(w, columns)
ri = _assign_word_to_row(w, rows)
buckets.setdefault((ci, ri), []).append(w)
cells = []
for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
col = columns[ci]
row = rows[ri]
# Compute tight bbox from actual word positions
x_min = min(w['left'] for w in cell_words)
y_min = min(w['top'] for w in cell_words)
x_max = max(w['left'] + w['width'] for w in cell_words)
y_max = max(w['top'] + w['height'] for w in cell_words)
bw = x_max - x_min
bh = y_max - y_min
# Text from words in reading order
text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))
# Average confidence
confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
avg_conf = sum(confs) / len(confs) if confs else 0.0
# Word boxes with absolute pixel coordinates (consistent with cv_cell_grid.py).
# PaddleOCR returns phrase-level boxes (e.g. "competition [kompa'tifn]"),
# but the overlay slide mechanism expects one box per word. Split multi-word
# boxes into individual word positions proportional to character length.
# Also split at "[" boundaries (IPA patterns like "badge[bxd3]").
#
# Sort in reading order: group by Y (same visual line), then sort by X.
# Simple (top, left) sort fails when words on the same line have slightly
# different top values (1-6px), causing wrong word order.
y_tol_wb = max(10, int(bh * 0.4))
reading_lines = _group_words_into_lines(cell_words, y_tolerance_px=y_tol_wb)
ordered_cell_words = [w for line in reading_lines for w in line]
word_boxes = []
for w in ordered_cell_words:
raw_text = w.get('text', '').strip()
# Split by whitespace, at "[" boundaries (IPA), and after leading "!"
# e.g. "badge[bxd3]" → ["badge", "[bxd3]"]
# e.g. "profit['proft]" → ["profit", "['proft]"]
# e.g. "!Betonung" → ["!", "Betonung"]
tokens = re.split(r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text)
tokens = [t for t in tokens if t] # remove empty strings
if len(tokens) <= 1:
# Single word — keep as-is
word_boxes.append({
'text': raw_text,
'left': w['left'],
'top': w['top'],
'width': w['width'],
'height': w['height'],
'conf': w.get('conf', 0),
})
else:
# Multi-word phrase — split proportionally by character count
total_chars = sum(len(t) for t in tokens)
if total_chars == 0:
continue
# Small gap between words (2% of box width per gap)
n_gaps = len(tokens) - 1
gap_px = w['width'] * 0.02
usable_w = w['width'] - gap_px * n_gaps
cursor = w['left']
for t in tokens:
token_w = max(1, usable_w * len(t) / total_chars)
word_boxes.append({
'text': t,
'left': round(cursor),
'top': w['top'],
'width': round(token_w),
'height': w['height'],
'conf': w.get('conf', 0),
})
cursor += token_w + gap_px
cells.append({
'cell_id': f"R{ri:02d}_C{ci}",
'row_index': ri,
'col_index': ci,
'col_type': col['type'],
'text': text,
'confidence': round(avg_conf, 1),
'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
'bbox_pct': {
'x': round(x_min / img_w * 100, 2) if img_w else 0,
'y': round(y_min / img_h * 100, 2) if img_h else 0,
'w': round(bw / img_w * 100, 2) if img_w else 0,
'h': round(bh / img_h * 100, 2) if img_h else 0,
},
'word_boxes': word_boxes,
'ocr_engine': 'words_first',
'is_bold': False,
})
return cells
# ---------------------------------------------------------------------------
# 4. Public API
# ---------------------------------------------------------------------------
def build_grid_from_words(
word_dicts: List[Dict],
img_w: int,
img_h: int,
min_confidence: int = 30,
box_rects: Optional[List[Dict]] = None,
max_columns: Optional[int] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""Build a cell grid bottom-up from Tesseract word boxes.
Args:
word_dicts: Flat list of word dicts with keys:
text, left, top, width, height, conf
(absolute pixel coordinates).
img_w: Image width in pixels.
img_h: Image height in pixels.
min_confidence: Minimum OCR confidence to keep a word.
box_rects: Optional list of box dicts with keys x, y, width, height.
Words inside these boxes are excluded from column clustering
(box-internal columns are detected separately in sub-sessions).
Returns:
(cells, columns_meta) — same format as build_cell_grid_v2().
cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
"""
if not word_dicts:
logger.info("build_grid_from_words: no words — returning empty grid")
return [], []
# Filter by confidence
words = [
w for w in word_dicts
if w.get('conf', 0) >= min_confidence and w.get('text', '').strip()
]
if not words:
logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
return [], []
logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))
# Exclude words inside detected boxes — box columns are detected separately
if box_rects:
content_words = []
for w in words:
w_cx = w['left'] + w['width'] / 2
w_cy = w['top'] + w['height'] / 2
inside = any(
b['x'] <= w_cx <= b['x'] + b['width']
and b['y'] <= w_cy <= b['y'] + b['height']
for b in box_rects
)
if not inside:
content_words.append(w)
excluded = len(words) - len(content_words)
if excluded:
logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
excluded, len(box_rects))
words = content_words
if not words:
logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
return [], []
# Step 1: cluster columns
columns = _cluster_columns(words, img_w, max_columns=max_columns)
logger.info("build_grid_from_words: %d column(s) detected%s",
len(columns), f" (max={max_columns})" if max_columns else "")
# Step 2: cluster rows
rows = _cluster_rows(words)
logger.info("build_grid_from_words: %d row(s) detected", len(rows))
# Step 3: build cells
cells = _build_cells(words, columns, rows, img_w, img_h)
logger.info("build_grid_from_words: %d cells built", len(cells))
# Build columns_meta in same format as build_cell_grid_v2
columns_meta = []
for col in columns:
x = int(col['x_min'])
w = int(col['x_max'] - col['x_min'])
columns_meta.append({
'index': col['index'],
'type': col['type'],
'x': x,
'width': w,
})
return cells, columns_meta