feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s

Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes
direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an
Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig.

- cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words
- ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint
- StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode
- OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus
- 15 Unit-Tests fuer cv_words_first

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-12 06:46:05 +01:00
parent 2fdf3ff868
commit ced5bb3dd3
6 changed files with 854 additions and 34 deletions

View File

@@ -71,6 +71,7 @@ from cv_vocab_pipeline import (
render_image_high_res,
render_pdf_high_res,
)
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
create_session_db,
delete_all_sessions_db,
@@ -1859,6 +1860,7 @@ async def detect_words(
pronunciation: str = "british",
stream: bool = False,
skip_heal_gaps: bool = False,
grid_method: str = "v2",
):
"""Build word grid from columns × rows, OCR each cell.
@@ -1868,6 +1870,9 @@ async def detect_words(
stream: false (default) for JSON response, true for SSE streaming
skip_heal_gaps: false (default). When true, cells keep exact row geometry
positions without gap-healing expansion. Better for overlay rendering.
grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
'v2' uses pre-detected columns/rows (top-down).
'words_first' clusters words bottom-up (no column/row detection needed).
"""
if session_id not in _cache:
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
@@ -1902,7 +1907,7 @@ async def detect_words(
"duration_seconds": 0,
}
logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
if not row_result or not row_result.get("rows"):
if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
raise HTTPException(status_code=400, detail="Row detection must be completed first")
# Convert column dicts back to PageRegion objects
@@ -1983,6 +1988,102 @@ async def detect_words(
if excluded:
logger.info(f"detect_words: excluded {excluded} rows inside box zones")
# --- Words-First path: bottom-up grid from word boxes ---
if grid_method == "words_first":
t0 = time.time()
img_h, img_w = dewarped_bgr.shape[:2]
# Get word_dicts from cache or run Tesseract full-page
wf_word_dicts = cached.get("_word_dicts")
if wf_word_dicts is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
if geo_result is not None:
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
cached["_word_dicts"] = wf_word_dicts
cached["_inv"] = inv
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
if not wf_word_dicts:
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
# Convert word coordinates to absolute image coordinates if needed
# (detect_column_geometry returns words relative to content ROI)
content_bounds = cached.get("_content_bounds")
if content_bounds:
lx, _rx, ty, _by = content_bounds
abs_words = []
for w in wf_word_dicts:
abs_words.append({
**w,
'left': w['left'] + lx,
'top': w['top'] + ty,
})
wf_word_dicts = abs_words
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
duration = time.time() - t0
# Apply IPA phonetic fixes
fix_cell_phonetics(cells, pronunciation=pronunciation)
# Add zone_index for backward compat
for cell in cells:
cell.setdefault("zone_index", 0)
col_types = {c['type'] for c in columns_meta}
is_vocab = bool(col_types & {'column_en', 'column_de'})
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
n_cols = len(columns_meta)
used_engine = "words_first"
word_result = {
"cells": cells,
"grid_shape": {
"rows": n_rows,
"cols": n_cols,
"total_cells": len(cells),
},
"columns_used": columns_meta,
"layout": "vocab" if is_vocab else "generic",
"image_width": img_w,
"image_height": img_h,
"duration_seconds": round(duration, 2),
"ocr_engine": used_engine,
"grid_method": "words_first",
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
},
}
if is_vocab or 'column_text' in col_types:
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
word_result["vocab_entries"] = entries
word_result["entries"] = entries
word_result["entry_count"] = len(entries)
word_result["summary"]["total_entries"] = len(entries)
word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
await update_session_db(session_id, word_result=word_result, current_step=8)
cached["word_result"] = word_result
logger.info(f"OCR Pipeline: words-first session {session_id}: "
f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")
await _append_pipeline_log(session_id, "words", {
"grid_method": "words_first",
"total_cells": len(cells),
"non_empty_cells": word_result["summary"]["non_empty_cells"],
"ocr_engine": used_engine,
"layout": word_result["layout"],
}, duration_ms=int(duration * 1000))
return {"session_id": session_id, **word_result}
if stream:
# Cell-First OCR v2: use batch-then-stream approach instead of
# per-cell streaming. The parallel ThreadPoolExecutor in
@@ -2001,7 +2102,7 @@ async def detect_words(
},
)
# --- Non-streaming path ---
# --- Non-streaming path (grid_method=v2) ---
t0 = time.time()
# Create binarized OCR image (for Tesseract)