diff --git a/klausur-service/backend/cv_cell_grid.py b/klausur-service/backend/cv_cell_grid.py
new file mode 100644
index 0000000..6e55509
--- /dev/null
+++ b/klausur-service/backend/cv_cell_grid.py
@@ -0,0 +1,1510 @@
"""
Cell-grid construction (v2 + legacy), vocab conversion, and word-grid OCR.

License: Apache 2.0 (commercially usable)
PRIVACY: All processing happens locally.
"""
+
+import logging
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, Generator, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import PageRegion, RowGeometry
# NOTE(review): ocr_region and RAPIDOCR_AVAILABLE are used throughout this
# module (_ocr_cell_crop, _ocr_single_cell, build_cell_grid*) but were
# missing from this import list — confirm both are exported by cv_ocr_engines.
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _assign_row_words_to_columns,
    _attach_example_sentences,
    _clean_cell_text,
    _clean_cell_text_lite,
    _fix_phonetic_brackets,
    _split_comma_entries,
    _words_to_reading_order_text,
    ocr_region,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
)
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+try:
+ from PIL import Image
+except ImportError:
+ Image = None # type: ignore[assignment,misc]
+
+
+# ---------------------------------------------------------------------------
+
def _ocr_cell_crop(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell by cropping the exact column×row intersection.

    No padding beyond cell boundaries → no neighbour bleeding.

    Args:
        row_idx: Row index within the content rows (embedded in cell_id).
        col_idx: Column index (embedded in cell_id).
        row: Row geometry supplying the cell's y / height.
        col: Column region supplying the cell's x / width and col_type.
        ocr_img: Binarized grayscale page (Tesseract input); may be None.
        img_bgr: BGR colour page (RapidOCR / TrOCR / LightOn input); may be None.
        img_w: Page width in pixels (bbox_pct denominator, crop clamping).
        img_h: Page height in pixels.
        engine_name: 'tesseract', 'rapid', 'trocr-printed',
            'trocr-handwritten' or 'lighton'.
        lang: Default Tesseract language string.
        lang_map: Mapping col_type → Tesseract language override.

    Returns:
        Cell dict (cell_id, indices, text, confidence, bbox_px/pct,
        ocr_engine tag, is_bold).  Empty-text cells get confidence 0.0.
    """
    # Display bbox: exact column × row intersection
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # Crop boundaries: add small internal padding (3px each side) to avoid
    # clipping characters near column/row edges (e.g. parentheses, descenders).
    # Stays within image bounds but may extend slightly beyond strict cell.
    # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
    _PAD = 3
    cx = max(0, disp_x - _PAD)
    cy = max(0, disp_y - _PAD)
    cx2 = min(img_w, disp_x + disp_w + _PAD)
    cy2 = min(img_h, disp_y + disp_h + _PAD)
    cw = cx2 - cx
    ch = cy2 - cy

    # Template for degenerate/empty cells; also copied for the final result
    # so every cell shares the same schema.
    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': '',
        'confidence': 0.0,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': 'cell_crop_v2',
        'is_bold': False,
    }

    if cw <= 0 or ch <= 0:
        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
    # Pixels darker than 180 (0-255 grayscale) count as "ink"; below 0.5%
    # coverage the cell is treated as blank and OCR is skipped entirely.
    if ocr_img is not None:
        crop = ocr_img[cy:cy + ch, cx:cx + cw]
        if crop.size > 0:
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
                             row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
    cell_lang = lang_map.get(col.type, lang)
    # NOTE: psm only affects the Tesseract paths below; the rapid/trocr/
    # lighton branches ignore it (it still appears in the debug logs).
    psm = _select_psm_for_column(col.type, col.width, row.height)
    text = ''
    avg_conf = 0.0
    used_engine = 'cell_crop_v2'

    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_trocr(img_bgr, cell_region,
                                 handwritten=(engine_name == "trocr-handwritten"))
    elif engine_name == "lighton" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
        # Upscale small BGR crops for RapidOCR.
        # Cell crops typically have height 35-55px but width >300px.
        # _ensure_minimum_crop_size only scales when EITHER dim < min_dim,
        # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78).
        # For very short heights (< 80px), force 3× upscale for better OCR
        # of small characters like periods, ellipsis, and phonetic symbols.
        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
        if bgr_crop.size == 0:
            words = []
        else:
            crop_h, crop_w = bgr_crop.shape[:2]
            if crop_h < 80:
                # Force 3× upscale for short rows — small chars need more pixels
                scale = 3.0
                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
                                    interpolation=cv2.INTER_CUBIC)
            else:
                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
            up_h, up_w = bgr_up.shape[:2]
            scale_x = up_w / max(crop_w, 1)
            scale_y = up_h / max(crop_h, 1)
            was_scaled = (up_w != crop_w or up_h != crop_h)
            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
                         row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region_rapid(bgr_up, tmp_region)
            # Remap positions back to original image coords
            if words and was_scaled:
                for w in words:
                    w['left'] = int(w['left'] / scale_x) + cx
                    w['top'] = int(w['top'] / scale_y) + cy
                    w['width'] = int(w['width'] / scale_x)
                    w['height'] = int(w['height'] / scale_y)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
            crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
            upscaled = _ensure_minimum_crop_size(crop_slice)
            up_h, up_w = upscaled.shape[:2]
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            # NOTE(review): ocr_region is not in this module's import list —
            # confirm it is exported by cv_ocr_engines.
            words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
            # Remap word positions back to original image coordinates
            if words and (up_w != cw or up_h != ch):
                sx = cw / max(up_w, 1)
                sy = ch / max(up_h, 1)
                for w in words:
                    w['left'] = int(w['left'] * sx) + cx
                    w['top'] = int(w['top'] * sy) + cy
                    w['width'] = int(w['width'] * sx)
                    w['height'] = int(w['height'] * sy)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
        else:
            words = []

    # Filter low-confidence words
    # (OCR assigns low confidence to misread borders and scan artifacts.)
    _MIN_WORD_CONF = 30
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Y-tolerance = full crop height so slight vertical offsets within
        # one row do not split the cell text across multiple lines.
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
                     row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
    else:
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
                     row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    # Single-line mode sometimes recognises short tokens the adaptive PSM missed.
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
        crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
        upscaled = _ensure_minimum_crop_size(crop_slice)
        up_h, up_w = upscaled.shape[:2]
        tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
        psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_crop_v2_psm7'

    # --- Noise filter ---
    if text.strip():
        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
                         row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine
    return result
+
+
+# Threshold: columns narrower than this (% of image width) use single-cell
+# crop OCR instead of full-page word assignment.
+#
+# Broad columns (>= threshold): Full-page Tesseract word assignment.
+# Better for multi-word content (sentences, IPA brackets, punctuation).
+# Examples: EN vocabulary, DE translation, example sentences.
+#
+# Narrow columns (< threshold): Isolated cell-crop OCR.
+# Prevents neighbour bleeding from adjacent broad columns.
+# Examples: page_ref, marker, numbering columns.
+#
+# 15% was empirically validated across vocab table scans with 3-5 columns.
+# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width.
+# The 15% boundary cleanly separates the two groups.
+_NARROW_COL_THRESHOLD_PCT = 15.0
+
+
def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.

    Drop-in replacement for build_cell_grid() — same signature & return type.

    Strategy:
    - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
      words (from row.words). Handles IPA brackets, punctuation, sentence
      continuity correctly.
    - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
      neighbour bleeding from adjacent broad columns.

    Args:
        ocr_img: Binarized full-page image (Tesseract input).
        column_regions: Classified column regions from layout analysis.
        row_geometries: Detected rows (content/header/footer).
        img_w: Page width in pixels.
        img_h: Page height in pixels.
        lang: Default Tesseract language.
        ocr_engine: 'auto', 'tesseract', 'rapid', 'trocr-printed',
            'trocr-handwritten' or 'lighton'.
        img_bgr: BGR colour image (needed for the non-Tesseract engines).

    Returns:
        (cells, columns_meta) — cells sorted by (row_index, col_index) with
        all-empty rows removed; columns_meta describes the columns used.
    """
    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"

    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    # Filter phantom rows (word_count=0) and artifact rows
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # Filter columns
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # Heal row gaps — use header/footer boundaries
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # Minimum word confidence for the broad-column word lookup below.
    # Mirrors the threshold used inside _ocr_cell_crop()/_ocr_single_cell().
    # BUGFIX: this name was previously referenced without being defined in
    # this function's scope, raising NameError on the first broad column.
    _MIN_WORD_CONF = 30

    # --- Classify columns as broad vs narrow ---
    narrow_col_indices = set()
    for ci, col in enumerate(relevant_cols):
        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
        if col_pct < _NARROW_COL_THRESHOLD_PCT:
            narrow_col_indices.add(ci)

    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
                f"{len(narrow_col_indices)} narrow columns (cell-crop)")

    # --- Phase 1: Broad columns via full-page word assignment ---
    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Assign full-page words to columns for this row
        col_words = _assign_row_words_to_columns(row, relevant_cols)

        for col_idx, col in enumerate(relevant_cols):
            if col_idx not in narrow_col_indices:
                # BROAD column: use pre-assigned full-page words
                words = col_words.get(col_idx, [])
                # Filter low-confidence words
                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

                if words:
                    # Row height as Y-tolerance keeps one row's words on one line
                    y_tol = max(15, row.height)
                    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
                    avg_conf = 0.0

                # Apply noise filter
                text = _clean_cell_text(text)

                cell = {
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
                    'col_index': col_idx,
                    'col_type': col.type,
                    'text': text,
                    'confidence': avg_conf,
                    'bbox_px': {
                        'x': col.x, 'y': row.y,
                        'w': col.width, 'h': row.height,
                    },
                    'bbox_pct': {
                        'x': round(col.x / img_w * 100, 2) if img_w else 0,
                        'y': round(row.y / img_h * 100, 2) if img_h else 0,
                        'w': round(col.width / img_w * 100, 2) if img_w else 0,
                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
                    },
                    'ocr_engine': 'word_lookup',
                    'is_bold': False,
                }
                cells.append(cell)

    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
    narrow_tasks = []
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            if col_idx in narrow_col_indices:
                narrow_tasks.append((row_idx, col_idx, row, col))

    if narrow_tasks:
        # Tesseract tolerates more parallel workers than the heavier engines
        max_workers = 4 if engine_name == "tesseract" else 2
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {
                pool.submit(
                    _ocr_cell_crop,
                    ri, ci, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ): (ri, ci)
                for ri, ci, row, col in narrow_tasks
            }
            for future in as_completed(futures):
                try:
                    cell = future.result()
                    cells.append(cell)
                except Exception as e:
                    ri, ci = futures[future]
                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")

    # Sort cells by (row_index, col_index)
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

    # Bold detection disabled: cell-level stroke-width analysis cannot
    # distinguish bold from non-bold when cells contain mixed formatting
    # (e.g. "cookie ['kuki]" — bold word + non-bold phonetics).
    # TODO: word-level bold detection would require per-word bounding boxes.

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name} (hybrid)")

    return cells, columns_meta
+
+
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 — yields each cell as OCR'd.

    Unlike build_cell_grid_v2, EVERY cell is OCR'd via _ocr_cell_crop (no
    broad/narrow hybrid split) and there is no post-hoc removal of
    all-empty rows.  Row filtering (phantom/artifact) and gap healing
    match build_cell_grid_v2; when any intermediate filter leaves nothing
    the generator simply returns without yielding.

    Yields:
        (cell_dict, columns_meta, total_cells) — total_cells is the fixed
        product rows × columns, usable for progress reporting.
    """
    # Resolve engine — default to Tesseract for cell-first OCR.
    # Tesseract excels at isolated text crops (binarized, upscaled).
    # RapidOCR is optimized for full-page scene-text and produces artifacts
    # on small cell crops (extra chars, missing punctuation, garbled IPA).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        engine_name = "tesseract"
    elif ocr_engine == "rapid":
        # NOTE(review): RAPIDOCR_AVAILABLE is not imported at the top of
        # this module — confirm it comes from cv_ocr_engines.
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    # Keep only content rows that actually contain detected words.
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    content_rows = [r for r in content_rows if r.word_count > 0]
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    # Drop rows consisting solely of single-character detections (scan noise).
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    if not content_rows:
        return

    # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2)
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    # col_type → Tesseract language override for mixed-language tables.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(content_rows) * len(relevant_cols)

    # Row-major order: callers receive cells left-to-right, top-to-bottom.
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_cell_crop(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells
+
+
+# ---------------------------------------------------------------------------
+# Narrow-column OCR helpers (Proposal B) — DEPRECATED (kept for legacy build_cell_grid)
+# ---------------------------------------------------------------------------
+
+def _compute_cell_padding(col_width: int, img_w: int) -> int:
+ """Adaptive padding for OCR crops based on column width.
+
+ Narrow columns (page_ref, marker) need more surrounding context so
+ Tesseract can segment characters correctly. Wide columns keep the
+ minimal 4 px padding to avoid pulling in neighbours.
+ """
+ col_pct = col_width / img_w * 100 if img_w > 0 else 100
+ if col_pct < 5:
+ return max(20, col_width // 2)
+ if col_pct < 10:
+ return max(12, col_width // 4)
+ if col_pct < 15:
+ return 8
+ return 4
+
+
+def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
+ max_scale: int = 3) -> np.ndarray:
+ """Upscale tiny crops so Tesseract gets enough pixel data.
+
+ If either dimension is below *min_dim*, the crop is bicubic-upscaled
+ so the smallest dimension reaches *min_dim* (capped at *max_scale* ×).
+ """
+ h, w = crop.shape[:2]
+ if h >= min_dim and w >= min_dim:
+ return crop
+ scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
+ if scale <= 1.0:
+ return crop
+ new_w = int(w * scale)
+ new_h = int(h * scale)
+ return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
+
+
+def _select_psm_for_column(col_type: str, col_width: int,
+ row_height: int) -> int:
+ """Choose the best Tesseract PSM for a given column geometry.
+
+ - page_ref columns are almost always single short tokens → PSM 8
+ - Very narrow or short cells → PSM 7 (single text line)
+ - Everything else → PSM 6 (uniform block)
+ """
+ if col_type in ('page_ref', 'marker'):
+ return 8 # single word
+ if col_width < 100 or row_height < 30:
+ return 7 # single line
+ return 6 # uniform block
+
+
def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection) via word lookup.

    Lookup cascade (first non-empty result wins):
      1. PRIMARY    — preassigned full-page Tesseract words.
      2. FALLBACK   — cell-crop OCR, gated by a dark-pixel density check.
      3. SECONDARY  — PSM 7 (single line) re-OCR, non-Rapid engines only.
      4. TERTIARY   — RapidOCR over the whole row strip, filtered by
                      X-overlap with this column (narrow columns only).
    Finally _clean_cell_text() may clear pure-noise results.

    Args:
        row_idx / col_idx: Cell coordinates (embedded in cell_id).
        row / col: Geometry of the intersecting row and column.
        ocr_img: Binarized page image (Tesseract input).
        img_bgr: BGR page image (RapidOCR / TrOCR / LightOn input).
        img_w / img_h: Page dimensions in pixels.
        use_rapid: Whether RapidOCR is the selected fallback engine.
        engine_name: Resolved engine name ('tesseract', 'rapid', ...).
        lang: Default Tesseract language.
        lang_map: col_type → language override.
        preassigned_words: Words already assigned to this cell by the
            full-page pass; None or [] triggers the fallback cascade.

    Returns:
        Cell dict with text, confidence, bboxes and ocr_engine tag.
        NOTE: unlike build_cell_grid_v2 cells, it carries no 'is_bold' key.
    """
    # Display bbox: exact column × row intersection (no padding)
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # OCR crop: adaptive padding — narrow columns get more context
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    # Same 15% boundary as _NARROW_COL_THRESHOLD_PCT, inlined here.
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False

    if disp_w <= 0 or disp_h <= 0:
        # Degenerate geometry → return an empty cell immediately.
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': '',
            'confidence': 0.0,
            'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
            'bbox_pct': {
                'x': round(col.x / img_w * 100, 2),
                'y': round(row.y / img_h * 100, 2),
                'w': round(col.width / img_w * 100, 2),
                'h': round(row.height / img_h * 100, 2),
            },
            'ocr_engine': 'word_lookup',
        }

    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

    # Filter low-confidence words (OCR noise from images/artifacts).
    # Tesseract gives low confidence to misread image edges, borders,
    # and other non-text elements.
    _MIN_WORD_CONF = 30
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Use row height as Y-tolerance so all words within a single row
        # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
        # across two lines due to slight vertical offset).
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0

    # --- FALLBACK: Cell-OCR for empty cells ---
    # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
    # Re-run OCR on the cell crop to catch what word-lookup missed.
    # To avoid wasting time on truly empty cells, check pixel density first:
    # only run Tesseract if the cell crop contains enough dark pixels to
    # plausibly contain text.
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                # Threshold: pixels darker than 180 (on 0-255 grayscale).
                # Use 0.5% to catch even small text like "Ei" (2 chars)
                # in an otherwise empty cell.
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005
    if _run_fallback:
        # For narrow columns, upscale the crop before OCR
        if is_narrow and ocr_img is not None:
            _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            _upscaled = _ensure_minimum_crop_size(_crop_slice)
            if _upscaled is not _crop_slice:
                # Build a temporary full-size image with the upscaled crop
                # placed at origin so ocr_region can crop it cleanly.
                _up_h, _up_w = _upscaled.shape[:2]
                _tmp_region = PageRegion(
                    type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                # NOTE(review): ocr_region is not in this module's import
                # list — confirm it is exported by cv_ocr_engines.
                fallback_words = ocr_region(_upscaled, _tmp_region,
                                            lang=cell_lang, psm=_cell_psm)
                # Remap word positions back to original image coordinates
                _sx = cell_w / max(_up_w, 1)
                _sy = cell_h / max(_up_h, 1)
                for _fw in (fallback_words or []):
                    _fw['left'] = int(_fw['left'] * _sx) + cell_x
                    _fw['top'] = int(_fw['top'] * _sy) + cell_y
                    _fw['width'] = int(_fw['width'] * _sx)
                    _fw['height'] = int(_fw['height'] * _sy)
            else:
                # No upscaling needed, use adaptive PSM
                cell_region = PageRegion(
                    type=col.type, x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)
        else:
            # Broad column (or no binarized image): OCR the padded cell
            # region with whichever engine was selected.
            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
            if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
            elif engine_name == "lighton" and img_bgr is not None:
                fallback_words = ocr_region_lighton(img_bgr, cell_region)
            elif use_rapid and img_bgr is not None:
                fallback_words = ocr_region_rapid(img_bgr, cell_region)
            else:
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)

        if fallback_words:
            # Apply same confidence filter to fallback words
            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if fallback_words:
            # Half the average word height as Y-tolerance for line grouping.
            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
            fb_y_tol = max(10, int(fb_avg_h * 0.5))
            fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
            if fb_text.strip():
                text = fb_text
                avg_conf = round(
                    sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                )
                used_engine = 'cell_ocr_fallback'

    # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
    if not text.strip() and _run_fallback and not use_rapid:
        _fb_region = PageRegion(
            type=col.type, x=cell_x, y=cell_y,
            width=cell_w, height=cell_h,
        )
        cell_lang = lang_map.get(col.type, lang)
        psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_ocr_psm7'

    # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
    # If a narrow cell is still empty, OCR the entire row strip with
    # RapidOCR (which handles small text better) and assign words by
    # X-position overlap with this column.
    if not text.strip() and is_narrow and img_bgr is not None:
        row_region = PageRegion(
            type='_row_strip', x=0, y=row.y,
            width=img_w, height=row.height,
        )
        strip_words = ocr_region_rapid(img_bgr, row_region)
        if strip_words:
            # Filter to words overlapping this column's X-range
            col_left = col.x
            col_right = col.x + col.width
            col_words = []
            for sw in strip_words:
                sw_left = sw.get('left', 0)
                sw_right = sw_left + sw.get('width', 0)
                # Keep a word when >30% of its width lies inside the column.
                overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                if overlap > sw.get('width', 1) * 0.3:
                    col_words.append(sw)
            if col_words:
                col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if col_words:
                rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                if rs_text.strip():
                    text = rs_text
                    avg_conf = round(
                        sum(w['conf'] for w in col_words) / len(col_words), 1
                    )
                    used_engine = 'row_strip_rapid'

    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0

    return {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': text,
        'confidence': avg_conf,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2),
            'y': round(disp_y / img_h * 100, 2),
            'w': round(disp_w / img_w * 100, 2),
            'h': round(disp_h / img_h * 100, 2),
        },
        'ocr_engine': used_engine,
    }
+
+
def _is_artifact_row(row: RowGeometry) -> bool:
    """Return True if this row contains only scan artifacts, not real text.

    Rows produced by scanner shadows or noise consist solely of
    single-character detections; a genuine content row always carries at
    least one token of two or more characters.
    """
    if row.word_count == 0:
        return True
    return not any(
        len(w.get('text', '').strip()) >= 2 for w in row.words
    )
+
+
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height to fill vertical gaps caused by removed adjacent rows.

    Filtering out phantom/artifact rows can leave vertical holes between
    the surviving content rows.  Each remaining row is stretched in place
    toward the midpoint of the gap on either side so that OCR crops cover
    the full available content area.  The first row is pinned to
    *top_bound* and the last row to *bottom_bound*.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # Freeze the pre-mutation extents: healing must be computed against the
    # original neighbour positions, not against already-adjusted ones.
    spans = [(r.y, r.y + r.height) for r in rows]

    for i, row in enumerate(rows):
        top0, bot0 = spans[i]

        # New top: midpoint between the previous row's original bottom and
        # this row's original top (first row snaps to top_bound).
        if i == 0:
            new_top = top_bound
        else:
            prev_bottom = spans[i - 1][1]
            gap_above = top0 - prev_bottom
            new_top = prev_bottom + gap_above // 2 if gap_above > 1 else top0

        # New bottom: midpoint toward the next row's original top (last row
        # snaps to bottom_bound).
        if i == n - 1:
            new_bottom = bottom_bound
        else:
            next_top = spans[i + 1][0]
            gap_below = next_top - bot0
            new_bottom = bot0 + gap_below // 2 if gap_below > 1 else bot0

        row.y = new_top
        row.height = max(5, new_bottom - new_top)

    logger.debug(
        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
+
+
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Generic Cell-Grid: Columns × Rows → cells with OCR text.

    This is the layout-agnostic foundation. Every column (except column_ignore)
    is intersected with every content row to produce numbered cells.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'.
        img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR).

    Returns:
        (cells, columns_meta) where cells is a list of cell dicts and
        columns_meta describes the columns used.
    """
    # Resolve engine choice
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        # NOTE(review): unlike the 'auto' branch, this does not require
        # img_bgr to be present before enabling RapidOCR — presumably
        # _ocr_single_cell handles a missing BGR image; confirm.
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    # Filter phantom rows: rows with no Tesseract words assigned are
    # inter-line whitespace gaps that would produce garbage OCR.
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    # Use columns only — skip ignore, header, footer, page_ref
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    # Filter artifact rows: rows whose detected words are all single characters
    # are caused by scanner shadows or noise, not real text.
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
    # to fill the space so OCR crops are not artificially narrow.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Sort columns left-to-right
    relevant_cols.sort(key=lambda c: c.x)

    # Build columns_meta
    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    # Choose OCR language per column type (Tesseract only)
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Pre-assign each word to exactly one column (nearest center)
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            cells.append(cell)

    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
    # Collect cells that are still empty but have visible pixels.
    # Instead of calling Tesseract once per cell (expensive), crop an entire
    # column strip and run OCR once, then assign words to cells by Y position.
    empty_by_col: Dict[int, List[int]] = {}  # col_idx → [cell list indices]
    for ci, cell in enumerate(cells):
        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
            bpx = cell['bbox_px']
            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
            if w > 0 and h > 0 and ocr_img is not None:
                crop = ocr_img[y:y + h, x:x + w]
                if crop.size > 0:
                    # > 0.5% dark pixels → the cell is not truly blank, so a
                    # second OCR pass over the column strip is worthwhile.
                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                    if dark_ratio > 0.005:
                        empty_by_col.setdefault(cell['col_index'], []).append(ci)

    for col_idx, cell_indices in empty_by_col.items():
        if len(cell_indices) < 3:
            continue  # Not worth batching for < 3 cells

        # Find the column strip bounding box (union of all empty cell bboxes)
        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
        col_x = cells[cell_indices[0]]['bbox_px']['x']
        col_w = cells[cell_indices[0]]['bbox_px']['w']

        strip_region = PageRegion(
            type=relevant_cols[col_idx].type,
            x=col_x, y=min_y,
            width=col_w, height=max_y_h - min_y,
        )
        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)

        # Dispatch the strip OCR to the same engine chosen above.
        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        # Discard low-confidence words (< 30) before assignment.
        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
        if not strip_words:
            continue

        # Assign words to cells by Y overlap
        for ci in cell_indices:
            cell_y = cells[ci]['bbox_px']['y']
            cell_h = cells[ci]['bbox_px']['h']
            cell_mid_y = cell_y + cell_h / 2

            # 0.8 × cell height tolerance allows slight vertical misalignment
            # between strip words and the healed row geometry.
            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if matched_words:
                matched_words.sort(key=lambda w: w['left'])
                batch_text = ' '.join(w['text'] for w in matched_words)
                batch_text = _clean_cell_text(batch_text)
                if batch_text.strip():
                    cells[ci]['text'] = batch_text
                    cells[ci]['confidence'] = round(
                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
                    )
                    cells[ci]['ocr_engine'] = 'batch_column_ocr'

    batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
    if batch_filled > 0:
        logger.info(
            f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
            f"empty cells in column {col_idx}"
        )

    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
+
+
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid(): yields each cell as it is OCR'd.

    Applies the same preprocessing pipeline (phantom-row filter, column
    selection, artifact-row filter, gap healing) before OCR'ing cells
    row by row.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    # Resolve which OCR engine to use (mirrors build_cell_grid).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    usable_rows = [r for r in row_geometries if r.row_type == 'content']
    if not usable_rows:
        return

    # Drop phantom rows: zero assigned words means inter-line whitespace
    # that would only produce garbage OCR.
    n_before = len(usable_rows)
    usable_rows = [r for r in usable_rows if r.word_count > 0]
    n_phantom = n_before - len(usable_rows)
    if n_phantom > 0:
        logger.info(f"build_cell_grid_streaming: skipped {n_phantom} phantom rows (word_count=0)")
    if not usable_rows:
        return

    excluded = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    usable_cols = [c for c in column_regions if c.type not in excluded]
    if not usable_cols:
        return

    # Drop artifact rows and heal the vertical gaps the removals leave.
    n_before = len(usable_rows)
    usable_rows = [r for r in usable_rows if not _is_artifact_row(r)]
    n_artifact = n_before - len(usable_rows)
    if n_artifact > 0:
        logger.info(f"build_cell_grid_streaming: skipped {n_artifact} artifact rows")
    if not usable_rows:
        return
    _heal_row_gaps(
        usable_rows,
        top_bound=min(c.y for c in usable_cols),
        bottom_bound=max(c.y + c.height for c in usable_cols),
    )

    # Left-to-right column order determines col_index numbering.
    usable_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': idx, 'type': c.type, 'x': c.x, 'width': c.width}
        for idx, c in enumerate(usable_cols)
    ]

    # Per-column-type Tesseract language selection.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(usable_rows) * len(usable_cols)

    for row_idx, row in enumerate(usable_rows):
        # Assign every row word to exactly one column (nearest center) once,
        # then OCR each cell with its pre-assigned words.
        per_col_words = _assign_row_words_to_columns(row, usable_cols)
        for col_idx, col in enumerate(usable_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=per_col_words[col_idx],
            )
            yield cell, columns_meta, total_cells
+
+
+def _cells_to_vocab_entries(
+ cells: List[Dict[str, Any]],
+ columns_meta: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Map generic cells to vocab entries with english/german/example fields.
+
+ Groups cells by row_index, maps col_type → field name, and produces
+ one entry per row (only rows with at least one non-empty field).
+ """
+ # Determine image dimensions from first cell (for row-level bbox)
+ col_type_to_field = {
+ 'column_en': 'english',
+ 'column_de': 'german',
+ 'column_example': 'example',
+ 'page_ref': 'source_page',
+ 'column_marker': 'marker',
+ }
+ bbox_key_map = {
+ 'column_en': 'bbox_en',
+ 'column_de': 'bbox_de',
+ 'column_example': 'bbox_ex',
+ 'page_ref': 'bbox_ref',
+ 'column_marker': 'bbox_marker',
+ }
+
+ # Group cells by row_index
+ rows: Dict[int, List[Dict]] = {}
+ for cell in cells:
+ ri = cell['row_index']
+ rows.setdefault(ri, []).append(cell)
+
+ entries: List[Dict[str, Any]] = []
+ for row_idx in sorted(rows.keys()):
+ row_cells = rows[row_idx]
+ entry: Dict[str, Any] = {
+ 'row_index': row_idx,
+ 'english': '',
+ 'german': '',
+ 'example': '',
+ 'source_page': '',
+ 'marker': '',
+ 'confidence': 0.0,
+ 'bbox': None,
+ 'bbox_en': None,
+ 'bbox_de': None,
+ 'bbox_ex': None,
+ 'bbox_ref': None,
+ 'bbox_marker': None,
+ 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
+ }
+
+ confidences = []
+ for cell in row_cells:
+ col_type = cell['col_type']
+ field = col_type_to_field.get(col_type)
+ if field:
+ entry[field] = cell['text']
+ bbox_field = bbox_key_map.get(col_type)
+ if bbox_field:
+ entry[bbox_field] = cell['bbox_pct']
+ if cell['confidence'] > 0:
+ confidences.append(cell['confidence'])
+
+ # Compute row-level bbox as union of all cell bboxes
+ all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
+ if all_bboxes:
+ min_x = min(b['x'] for b in all_bboxes)
+ min_y = min(b['y'] for b in all_bboxes)
+ max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
+ max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
+ entry['bbox'] = {
+ 'x': round(min_x, 2),
+ 'y': round(min_y, 2),
+ 'w': round(max_x2 - min_x, 2),
+ 'h': round(max_y2 - min_y, 2),
+ }
+
+ entry['confidence'] = round(
+ sum(confidences) / len(confidences), 1
+ ) if confidences else 0.0
+
+ # Only include if at least one mapped field has text
+ has_content = any(
+ entry.get(f)
+ for f in col_type_to_field.values()
+ )
+ if has_content:
+ entries.append(entry)
+
+ return entries
+
+
# Regex: line starts with phonetic bracket content only (no real word before it)
# NOTE(review): this pattern appears unused within this module —
# _is_phonetic_only_text() below performs its own substitution-based check
# instead. Kept as-is; confirm there are no external users before removing.
_PHONETIC_ONLY_RE = re.compile(
    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)
+
+
+def _is_phonetic_only_text(text: str) -> bool:
+ """Check if text consists only of phonetic transcription.
+
+ Phonetic-only patterns:
+ ['mani serva] → True
+ [dɑːns] → True
+ ["a:mand] → True
+ almond ['a:mand] → False (has real word before bracket)
+ Mandel → False
+ """
+ t = text.strip()
+ if not t:
+ return False
+ # Must contain at least one bracket
+ if '[' not in t and ']' not in t:
+ return False
+ # Remove all bracket content and surrounding punctuation/whitespace
+ without_brackets = re.sub(r"\[.*?\]", '', t)
+ without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
+ # If nothing meaningful remains, it's phonetic-only
+ alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
+ return len(alpha_remaining) < 2
+
+
+def _merge_phonetic_continuation_rows(
+ entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Merge rows that contain only phonetic transcription into previous entry.
+
+ In dictionary pages, phonetic transcription sometimes wraps to the next
+ row. E.g.:
+ Row 28: EN="it's a money-saver" DE="es spart Kosten"
+ Row 29: EN="['mani serva]" DE=""
+
+ Row 29 is phonetic-only → merge into row 28's EN field.
+ """
+ if len(entries) < 2:
+ return entries
+
+ merged: List[Dict[str, Any]] = []
+ for entry in entries:
+ en = (entry.get('english') or '').strip()
+ de = (entry.get('german') or '').strip()
+ ex = (entry.get('example') or '').strip()
+
+ # Check if this entry is phonetic-only (EN has only phonetics, DE empty)
+ if merged and _is_phonetic_only_text(en) and not de:
+ prev = merged[-1]
+ prev_en = (prev.get('english') or '').strip()
+ # Append phonetic to previous entry's EN
+ if prev_en:
+ prev['english'] = prev_en + ' ' + en
+ else:
+ prev['english'] = en
+ # If there was an example, append to previous too
+ if ex:
+ prev_ex = (prev.get('example') or '').strip()
+ prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+ logger.debug(
+ f"Merged phonetic row {entry.get('row_index')} "
+ f"into previous entry: {prev['english']!r}"
+ )
+ continue
+
+ merged.append(entry)
+
+ return merged
+
+
+def _merge_continuation_rows(
+ entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+ """Merge multi-line vocabulary entries where text wraps to the next row.
+
+ A row is a continuation of the previous entry when:
+ - EN has text, but DE is empty
+ - EN starts with a lowercase letter (not a new vocab entry)
+ - Previous entry's EN does NOT end with a sentence terminator (.!?)
+ - The continuation text has fewer than 4 words (not an example sentence)
+ - The row was not already merged as phonetic
+
+ Example:
+ Row 5: EN="to put up" DE="aufstellen"
+ Row 6: EN="with sth." DE=""
+ → Merged: EN="to put up with sth." DE="aufstellen"
+ """
+ if len(entries) < 2:
+ return entries
+
+ merged: List[Dict[str, Any]] = []
+ for entry in entries:
+ en = (entry.get('english') or '').strip()
+ de = (entry.get('german') or '').strip()
+
+ if merged and en and not de:
+ # Check: not phonetic (already handled)
+ if _is_phonetic_only_text(en):
+ merged.append(entry)
+ continue
+
+ # Check: starts with lowercase
+ first_alpha = next((c for c in en if c.isalpha()), '')
+ starts_lower = first_alpha and first_alpha.islower()
+
+ # Check: fewer than 4 words (not an example sentence)
+ word_count = len(en.split())
+ is_short = word_count < 4
+
+ # Check: previous entry doesn't end with sentence terminator
+ prev = merged[-1]
+ prev_en = (prev.get('english') or '').strip()
+ prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
+
+ if starts_lower and is_short and not prev_ends_sentence:
+ # Merge into previous entry
+ prev['english'] = (prev_en + ' ' + en).strip()
+ # Merge example if present
+ ex = (entry.get('example') or '').strip()
+ if ex:
+ prev_ex = (prev.get('example') or '').strip()
+ prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+ logger.debug(
+ f"Merged continuation row {entry.get('row_index')} "
+ f"into previous entry: {prev['english']!r}"
+ )
+ continue
+
+ merged.append(entry)
+
+ return merged
+
+
def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.

    Wrapper around build_cell_grid() that adds vocabulary-specific logic:
    - Maps cells to english/german/example entries
    - Applies character confusion fixes, IPA lookup, comma splitting, etc.
    - Falls back to returning raw cells if no vocab columns detected.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w, img_h: Image dimensions.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', or 'auto'.
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        List of entry dicts with english/german/example text and bbox info (percent).
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )

    if not cells:
        return []

    # Check if vocab layout is present
    col_types = {c['type'] for c in columns_meta}
    if not (col_types & {'column_en', 'column_de'}):
        logger.info("build_word_grid: no vocab columns — returning raw cells")
        return cells

    # Vocab mapping: cells → entries
    entries = _cells_to_vocab_entries(cells, columns_meta)

    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)

    # 0a. Merge phonetic-only continuation rows into previous entry
    entries = _merge_phonetic_continuation_rows(entries)

    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
    entries = _merge_continuation_rows(entries)

    # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
    #    llm_review_entries_streaming so changes are visible to the user in Step 6.

    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)

    # 3. Split comma-separated word forms (break, broke, broken → 3 entries)
    entries = _split_comma_entries(entries)

    # 4. Attach example sentences (rows without DE → examples for preceding entry)
    entries = _attach_example_sentences(entries)

    engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
    # BUGFIX: the old message logged len(entries) twice ("X entries from
    # Y raw → X after post-processing"); report raw vs. final counts instead.
    logger.info(f"build_word_grid: {n_raw} raw entries → {len(entries)} "
                f"after post-processing (engine={engine_name})")

    return entries
+
diff --git a/klausur-service/backend/cv_layout.py b/klausur-service/backend/cv_layout.py
new file mode 100644
index 0000000..47713a1
--- /dev/null
+++ b/klausur-service/backend/cv_layout.py
@@ -0,0 +1,3036 @@
+"""
+Document type detection, layout analysis, column/row geometry, and classification.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+ ColumnGeometry,
+ DocumentTypeResult,
+ ENGLISH_FUNCTION_WORDS,
+ GERMAN_FUNCTION_WORDS,
+ PageRegion,
+ RowGeometry,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+try:
+ import pytesseract
+ from PIL import Image
+except ImportError:
+ pytesseract = None # type: ignore[assignment]
+ Image = None # type: ignore[assignment,misc]
+
+
def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult:
    """Detect whether the page is a vocab table, generic table, or full text.

    Uses projection profiles and text density analysis — no OCR required.
    Runs in < 2 seconds.

    Args:
        ocr_img: Binarized grayscale image (for projection profiles).
        img_bgr: BGR color image.

    Returns:
        DocumentTypeResult with doc_type, confidence, pipeline, skip_steps.
    """
    if ocr_img is None or ocr_img.size == 0:
        return DocumentTypeResult(
            doc_type='full_text', confidence=0.5, pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features={'error': 'empty image'},
        )

    h, w = ocr_img.shape[:2]

    # --- 1. Vertical projection profile → detect column gaps ---
    # Sum dark pixels along each column (x-axis). Gaps = valleys in the profile.
    # Invert: dark pixels on white background → high values = text.
    vert_proj = np.sum(ocr_img < 128, axis=0).astype(float)

    # Smooth the profile to avoid noise spikes
    kernel_size = max(3, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1
    vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same')

    # Find significant vertical gaps (columns of near-zero text density)
    # A gap must be at least 1% of image width and have < 5% of max density
    max_density = max(vert_smooth.max(), 1)
    gap_threshold = max_density * 0.05
    min_gap_width = max(5, w // 100)

    in_gap = False
    gap_count = 0
    gap_start = 0
    vert_gaps = []

    for x in range(w):
        if vert_smooth[x] < gap_threshold:
            if not in_gap:
                in_gap = True
                gap_start = x
        else:
            if in_gap:
                gap_width = x - gap_start
                if gap_width >= min_gap_width:
                    gap_count += 1
                    vert_gaps.append((gap_start, x, gap_width))
                in_gap = False

    # NOTE(review): a gap still open when the loop ends (i.e. extending to the
    # right edge) is never closed or counted — harmless here, since edge gaps
    # are filtered out as margins below, but worth confirming.

    # Filter out margin gaps (within 10% of image edges)
    margin_threshold = w * 0.10
    internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold]
    internal_gap_count = len(internal_gaps)

    # --- 2. Horizontal projection profile → detect row gaps ---
    horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float)
    h_kernel = max(3, h // 200)
    if h_kernel % 2 == 0:
        h_kernel += 1
    horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same')

    h_max = max(horiz_smooth.max(), 1)
    h_gap_threshold = h_max * 0.05
    min_row_gap = max(3, h // 200)

    row_gap_count = 0
    in_gap = False
    # gap_start is reused from the vertical pass; it is reset on gap entry.
    for y in range(h):
        if horiz_smooth[y] < h_gap_threshold:
            if not in_gap:
                in_gap = True
                gap_start = y
        else:
            if in_gap:
                if y - gap_start >= min_row_gap:
                    row_gap_count += 1
                in_gap = False

    # NOTE(review): same trailing-gap behavior as above — a row gap reaching
    # the bottom edge is not counted; confirm this is acceptable.

    # --- 3. Text density distribution (4×4 grid) ---
    # density_std measures how unevenly text is spread over the page; it is
    # only used below to shade the full_text confidence.
    grid_rows, grid_cols = 4, 4
    cell_h, cell_w = h // grid_rows, w // grid_cols
    densities = []
    for gr in range(grid_rows):
        for gc in range(grid_cols):
            cell = ocr_img[gr * cell_h:(gr + 1) * cell_h,
                           gc * cell_w:(gc + 1) * cell_w]
            if cell.size > 0:
                d = float(np.count_nonzero(cell < 128)) / cell.size
                densities.append(d)

    density_std = float(np.std(densities)) if densities else 0
    density_mean = float(np.mean(densities)) if densities else 0

    features = {
        'vertical_gaps': gap_count,
        'internal_vertical_gaps': internal_gap_count,
        'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]],
        'row_gaps': row_gap_count,
        'density_mean': round(density_mean, 4),
        'density_std': round(density_std, 4),
        'image_size': (w, h),
    }

    # --- 4. Decision tree ---
    # Use internal_gap_count (excludes margin gaps) for column detection.
    if internal_gap_count >= 2 and row_gap_count >= 5:
        # Multiple internal vertical gaps + many row gaps → table
        confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count >= 1 and row_gap_count >= 3:
        # Some internal structure, likely a table
        confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01)
        return DocumentTypeResult(
            doc_type='generic_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count == 0:
        # No internal column gaps → full text (regardless of density)
        confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15)
        return DocumentTypeResult(
            doc_type='full_text',
            confidence=round(confidence, 2),
            pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features=features,
        )
    else:
        # Ambiguous — default to vocab_table (most common use case)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=0.5,
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
+
+
+# =============================================================================
+# Stage 4: Dual Image Preparation
+# =============================================================================
+
def create_ocr_image(img: np.ndarray) -> np.ndarray:
    """Produce a binarized version of *img* tuned for Tesseract OCR.

    Pipeline: grayscale → illumination flattening (divide by a heavy
    Gaussian blur of the image itself) → Gaussian adaptive threshold →
    median-blur denoise.

    Args:
        img: BGR input image.

    Returns:
        Binary uint8 image ready for OCR (with THRESH_BINARY, text darker
        than its neighborhood ends up black on a white background).
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Estimate the page background with a large blur and divide it out,
    # which evens out uneven scanner illumination.
    background = cv2.GaussianBlur(grayscale, (51, 51), 0)
    flattened = cv2.divide(grayscale, background, scale=255)

    # Local (31px window) Gaussian-weighted threshold with offset 10.
    thresholded = cv2.adaptiveThreshold(
        flattened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )

    # 3x3 median filter removes salt-and-pepper speckles.
    return cv2.medianBlur(thresholded, 3)
+
+
def create_layout_image(img: np.ndarray) -> np.ndarray:
    """Return a contrast-boosted grayscale copy of *img* for layout analysis.

    Applies CLAHE (clip limit 2.0, 8×8 tiles) to the grayscale channel so
    faint strokes and separators stand out in projection profiles.

    Args:
        img: BGR input image.

    Returns:
        CLAHE-enhanced single-channel image.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equalizer.apply(grayscale)
+
+
+# =============================================================================
+# Stage 5: Layout Analysis (Projection Profiles)
+# =============================================================================
+
+def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
+ """Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask."""
+ out = mask.copy()
+ n = len(out)
+ i = 0
+ while i < n:
+ if out[i]:
+ start = i
+ while i < n and out[i]:
+ i += 1
+ if (i - start) < min_width:
+ out[start:i] = False
+ else:
+ i += 1
+ return out
+
+
def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
    """Find the bounding box of actual text content (excluding page margins).

    Scan artefacts (thin black lines at page edges) are filtered out by
    discarding contiguous projection runs narrower than 1 % of the image
    dimension (min 5 px).

    Args:
        inv: Inverted binary page image (text pixels bright on dark).

    Returns:
        Tuple of (left_x, right_x, top_y, bottom_y).
    """
    h, w = inv.shape[:2]
    threshold = 0.005  # min fraction of a row/column that must be text

    # --- Horizontal projection for top/bottom ---
    h_proj = np.sum(inv, axis=1).astype(float) / (w * 255)
    h_mask = h_proj > threshold
    min_h_run = max(5, h // 100)
    h_mask = _filter_narrow_runs(h_mask, min_h_run)

    top_y = 0
    for y in range(h):
        if h_mask[y]:
            top_y = max(0, y - 5)  # 5px padding above the first text row
            break

    bottom_y = h
    # BUGFIX: stop at -1 so index 0 is also inspected (was range(h-1, 0, -1),
    # which skipped row 0 in the reverse scan).
    for y in range(h - 1, -1, -1):
        if h_mask[y]:
            bottom_y = min(h, y + 5)
            break

    # --- Vertical projection for left/right margins ---
    # Restricted to the detected vertical content band to ignore header/footer noise.
    v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
    v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj
    v_mask = v_proj_norm > threshold
    min_v_run = max(5, w // 100)
    v_mask = _filter_narrow_runs(v_mask, min_v_run)

    left_x = 0
    for x in range(w):
        if v_mask[x]:
            left_x = max(0, x - 2)  # 2px padding left of the first text column
            break

    right_x = w
    # BUGFIX: stop at -1 so column 0 is also inspected (was range(w-1, 0, -1)).
    for x in range(w - 1, -1, -1):
        if v_mask[x]:
            right_x = min(w, x + 2)
            break

    return left_x, right_x, top_y, bottom_y
+
+
+def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
+ """Detect columns, header, and footer using projection profiles.
+
+ Uses content-bounds detection to exclude page margins before searching
+ for column separators within the actual text area.
+
+ Args:
+ layout_img: CLAHE-enhanced grayscale image.
+ ocr_img: Binarized image for text density analysis.
+
+ Returns:
+ List of PageRegion objects describing detected regions.
+ """
+ h, w = ocr_img.shape[:2]
+
+ # Invert: black text on white → white text on black for projection
+ inv = cv2.bitwise_not(ocr_img)
+
+ # --- Find actual content bounds (exclude page margins) ---
+ left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
+ content_w = right_x - left_x
+ content_h = bottom_y - top_y
+
+ logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
+ f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")
+
+ if content_w < w * 0.3 or content_h < h * 0.3:
+ # Fallback if detection seems wrong
+ left_x, right_x = 0, w
+ top_y, bottom_y = 0, h
+ content_w, content_h = w, h
+
+ # --- Vertical projection within content area to find column separators ---
+ content_strip = inv[top_y:bottom_y, left_x:right_x]
+ v_proj = np.sum(content_strip, axis=0).astype(float)
+ v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj
+
+ # Smooth the projection profile
+ kernel_size = max(5, content_w // 50)
+ if kernel_size % 2 == 0:
+ kernel_size += 1
+ v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
+
+ # Debug: log projection profile statistics
+ p_mean = float(np.mean(v_proj_smooth))
+ p_median = float(np.median(v_proj_smooth))
+ p_min = float(np.min(v_proj_smooth))
+ p_max = float(np.max(v_proj_smooth))
+ logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
+ f"mean={p_mean:.4f}, median={p_median:.4f}")
+
+ # Find valleys using multiple threshold strategies
+ # Strategy 1: relative to median (catches clear separators)
+ # Strategy 2: local minima approach (catches subtle gaps)
+ threshold = max(p_median * 0.3, p_mean * 0.2)
+ logger.info(f"Layout: valley threshold={threshold:.4f}")
+
+ in_valley = v_proj_smooth < threshold
+
+ # Find contiguous valley regions
+ all_valleys = []
+ start = None
+ for x in range(len(v_proj_smooth)):
+ if in_valley[x] and start is None:
+ start = x
+ elif not in_valley[x] and start is not None:
+ valley_width = x - start
+ valley_depth = float(np.min(v_proj_smooth[start:x]))
+ # Valley must be at least 3px wide
+ if valley_width >= 3:
+ all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
+ start = None
+
+ logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — "
+ f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")
+
+ # Filter: valleys must be inside the content area (not at edges)
+ inner_margin = int(content_w * 0.08)
+ valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]
+
+ # If no valleys found with strict threshold, try local minima approach
+ if len(valleys) < 2:
+ logger.info("Layout: trying local minima approach for column detection")
+ # Divide content into 20 segments, find the 2 lowest
+ seg_count = 20
+ seg_width = content_w // seg_count
+ seg_scores = []
+ for i in range(seg_count):
+ sx = i * seg_width
+ ex = min((i + 1) * seg_width, content_w)
+ seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
+ seg_scores.append((i, sx, ex, seg_mean))
+
+ seg_scores.sort(key=lambda s: s[3])
+ logger.info(f"Layout: segment scores (lowest 5): "
+ f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")
+
+ # Find two lowest non-adjacent segments that create reasonable columns
+ candidate_valleys = []
+ for seg_idx, sx, ex, seg_mean in seg_scores:
+ # Must not be at the edges
+ if seg_idx <= 1 or seg_idx >= seg_count - 2:
+ continue
+ # Must be significantly lower than overall mean
+ if seg_mean < p_mean * 0.6:
+ center = (sx + ex) // 2
+ candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))
+
+ if len(candidate_valleys) >= 2:
+ # Pick the best pair: non-adjacent, creating reasonable column widths
+ candidate_valleys.sort(key=lambda v: v[2])
+ best_pair = None
+ best_score = float('inf')
+ for i in range(len(candidate_valleys)):
+ for j in range(i + 1, len(candidate_valleys)):
+ c1 = candidate_valleys[i][2]
+ c2 = candidate_valleys[j][2]
+ # Must be at least 20% apart
+ if (c2 - c1) < content_w * 0.2:
+ continue
+ col1 = c1
+ col2 = c2 - c1
+ col3 = content_w - c2
+ # Each column at least 15%
+ if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12:
+ continue
+ parts = sorted([col1, col2, col3])
+ score = parts[2] - parts[0]
+ if score < best_score:
+ best_score = score
+ best_pair = (candidate_valleys[i], candidate_valleys[j])
+
+ if best_pair:
+ valleys = list(best_pair)
+ logger.info(f"Layout: local minima found 2 valleys: "
+ f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")
+
+ logger.info(f"Layout: final {len(valleys)} valleys: "
+ f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")
+
+ regions = []
+
+ if len(valleys) >= 2:
+ # 3-column layout detected
+ valleys.sort(key=lambda v: v[2])
+
+ if len(valleys) == 2:
+ sep1_center = valleys[0][2]
+ sep2_center = valleys[1][2]
+ else:
+ # Pick the two valleys that best divide into 3 parts
+ # Prefer wider valleys (more likely true separators)
+ best_pair = None
+ best_score = float('inf')
+ for i in range(len(valleys)):
+ for j in range(i + 1, len(valleys)):
+ c1, c2 = valleys[i][2], valleys[j][2]
+ # Each column should be at least 15% of content width
+ col1 = c1
+ col2 = c2 - c1
+ col3 = content_w - c2
+ if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15:
+ continue
+ # Score: lower is better (more even distribution)
+ parts = sorted([col1, col2, col3])
+ score = parts[2] - parts[0]
+ # Bonus for wider valleys (subtract valley width)
+ score -= (valleys[i][3] + valleys[j][3]) * 0.5
+ if score < best_score:
+ best_score = score
+ best_pair = (c1, c2)
+ if best_pair:
+ sep1_center, sep2_center = best_pair
+ else:
+ sep1_center = valleys[0][2]
+ sep2_center = valleys[1][2]
+
+ # Convert from content-relative to absolute coordinates
+ abs_sep1 = sep1_center + left_x
+ abs_sep2 = sep2_center + left_x
+
+ logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
+ f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")
+
+ regions.append(PageRegion(
+ type='column_en', x=0, y=top_y,
+ width=abs_sep1, height=content_h
+ ))
+ regions.append(PageRegion(
+ type='column_de', x=abs_sep1, y=top_y,
+ width=abs_sep2 - abs_sep1, height=content_h
+ ))
+ regions.append(PageRegion(
+ type='column_example', x=abs_sep2, y=top_y,
+ width=w - abs_sep2, height=content_h
+ ))
+
+ elif len(valleys) == 1:
+ # 2-column layout
+ abs_sep = valleys[0][2] + left_x
+
+ logger.info(f"Layout: 2 columns at separator x={abs_sep}")
+
+ regions.append(PageRegion(
+ type='column_en', x=0, y=top_y,
+ width=abs_sep, height=content_h
+ ))
+ regions.append(PageRegion(
+ type='column_de', x=abs_sep, y=top_y,
+ width=w - abs_sep, height=content_h
+ ))
+
+ else:
+ # No columns detected — run full-page OCR as single column
+ logger.warning("Layout: no column separators found, using full page")
+ regions.append(PageRegion(
+ type='column_en', x=0, y=top_y,
+ width=w, height=content_h
+ ))
+
+ # Add header/footer info (gap-based detection with fallback)
+ _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
+
+ top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
+ bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
+ col_count = len([r for r in regions if r.type.startswith('column')])
+ logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")
+
+ return regions
+
+
+# =============================================================================
+# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
+# =============================================================================
+
+# --- Phase A: Geometry Detection ---
+
def _detect_columns_by_clustering(
    word_dicts: List[Dict],
    left_edges: List[int],
    edge_word_indices: List[int],
    content_w: int,
    content_h: int,
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used when the primary gap-based algorithm finds fewer than 2 gaps.
    Word left-edges lying within a small tolerance of each other are
    grouped into alignment clusters; clusters with sufficient vertical
    coverage are treated as column starts.

    Args:
        word_dicts: Word boxes (keys: text/conf/left/top/width/height),
            coordinates relative to the content ROI.
        left_edges: Left x-coordinates of accepted words (parallel list).
        edge_word_indices: For each entry of ``left_edges``, the index of
            the corresponding word in ``word_dicts``.
        content_w: Content-area width in pixels.
        content_h: Content-area height in pixels.
        left_x, right_x, top_y, bottom_y: Content bounds in absolute
            image coordinates.
        inv: Inverted binarized image, passed through to the result tuple.

    Returns:
        The same tuple shape as ``detect_column_geometry()``, or None when
        no word edges were supplied or fewer than 3 usable clusters remain.
    """
    # Robustness: an empty edge list would make sorted_pairs[0] below
    # raise IndexError; treat it as "detection failed" instead.
    if not left_edges:
        logger.info("ColumnGeometry clustering fallback: no word edges")
        return None

    tolerance = max(10, int(content_w * 0.01))
    sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])

    # Greedy 1-D clustering: consecutive edges within `tolerance` px of the
    # previous edge join the current cluster.
    clusters = []
    cluster_widxs = []
    cur_edges = [sorted_pairs[0][0]]
    cur_widxs = [sorted_pairs[0][1]]
    for edge, widx in sorted_pairs[1:]:
        if edge - cur_edges[-1] <= tolerance:
            cur_edges.append(edge)
            cur_widxs.append(widx)
        else:
            clusters.append(cur_edges)
            cluster_widxs.append(cur_widxs)
            cur_edges = [edge]
            cur_widxs = [widx]
    clusters.append(cur_edges)
    cluster_widxs.append(cur_widxs)

    # A "primary" column start must span >=30% of the content height;
    # "secondary" clusters may be shorter but need a minimum word count.
    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5

    cluster_infos = []
    for c_edges, c_widxs in zip(clusters, cluster_widxs):
        if len(c_edges) < 2:
            continue
        y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
        y_span = max(y_positions) - min(y_positions)
        y_coverage = y_span / content_h if content_h > 0 else 0.0
        cluster_infos.append({
            'mean_x': int(np.mean(c_edges)),
            'count': len(c_edges),
            'min_edge': min(c_edges),
            'max_edge': max(c_edges),
            'y_min': min(y_positions),
            'y_max': max(y_positions),
            'y_coverage': y_coverage,
        })

    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    primary_set = set(id(c) for c in primary)
    secondary = [c for c in cluster_infos
                 if id(c) not in primary_set
                 and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
                 and c['count'] >= MIN_WORDS_SECONDARY]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])

    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    # Merge clusters that sit too close together to be separate columns;
    # the merged mean_x is the count-weighted average of its members.
    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
            prev = merged[-1]
            total = prev['count'] + s['count']
            avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
            prev['mean_x'] = avg_x
            prev['count'] = total
            prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
            prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
        else:
            merged.append(s.copy())

    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")

    # Start each column slightly left of its cluster's leftmost edge so
    # glyphs at the column boundary are not clipped.
    margin_px = max(6, int(content_w * 0.003))
    return _build_geometries_from_starts(
        [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
    )
+
+
def _detect_sub_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    top_y: int = 0,
    header_y: Optional[int] = None,
    footer_y: Optional[int] = None,
    _edge_tolerance: int = 8,
    _min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
    """Split columns that contain internal sub-columns based on left-edge alignment.

    For each column, clusters word left-edges into alignment bins (within
    ``_edge_tolerance`` px). The leftmost bin whose word count reaches
    ``_min_col_start_ratio`` of the column total is treated as the true column
    start. Any words to the left of that bin form a sub-column, provided they
    number >= 2 and < 35 % of total.

    Word ``left`` values are relative to the content ROI (offset by *left_x*),
    while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
    bridges the two coordinate systems.

    If *header_y* / *footer_y* are provided (absolute y-coordinates), words
    in header/footer regions are excluded from alignment clustering to avoid
    polluting the bins with page numbers or chapter titles. Word ``top``
    values are relative to *top_y*.

    Returns a new list of ColumnGeometry — potentially longer than the input.
    """
    # Degenerate content area: no sensible split possible, return unchanged.
    if content_w <= 0:
        return geometries

    result: List[ColumnGeometry] = []
    for geo in geometries:
        # Only consider wide-enough columns with enough words
        if geo.width_ratio < 0.15 or geo.word_count < 5:
            result.append(geo)
            continue

        # Collect left-edges of confident words, excluding header/footer
        # Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y)
        min_top_rel = (header_y - top_y) if header_y is not None else None
        max_top_rel = (footer_y - top_y) if footer_y is not None else None

        confident = [w for w in geo.words
                     if w.get('conf', 0) >= 30
                     and (min_top_rel is None or w['top'] >= min_top_rel)
                     and (max_top_rel is None or w['top'] <= max_top_rel)]
        if len(confident) < 3:
            result.append(geo)
            continue

        # --- Cluster left-edges into alignment bins ---
        # Greedy 1-D clustering over the sorted edges: an edge within
        # _edge_tolerance px of the previous one joins the current bin.
        sorted_edges = sorted(w['left'] for w in confident)
        bins: List[Tuple[int, int, int, int]] = []  # (center, count, min_edge, max_edge)
        cur = [sorted_edges[0]]
        for i in range(1, len(sorted_edges)):
            if sorted_edges[i] - cur[-1] <= _edge_tolerance:
                cur.append(sorted_edges[i])
            else:
                bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
                cur = [sorted_edges[i]]
        bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))

        # --- Find the leftmost bin qualifying as a real column start ---
        # A bin qualifies once it holds _min_col_start_ratio of the body
        # words (at least 3); bins are already in left-to-right order.
        total = len(confident)
        min_count = max(3, int(total * _min_col_start_ratio))
        col_start_bin = None
        for b in bins:
            if b[1] >= min_count:
                col_start_bin = b
                break

        if col_start_bin is None:
            result.append(geo)
            continue

        # Words to the left of the column-start bin are sub-column candidates
        # (the tolerance margin keeps borderline words in the main column).
        split_threshold = col_start_bin[2] - _edge_tolerance
        sub_words = [w for w in geo.words if w['left'] < split_threshold]
        main_words = [w for w in geo.words if w['left'] >= split_threshold]

        # Count only body words (excluding header/footer) for the threshold check
        # so that header/footer words don't artificially trigger a split.
        sub_body = [w for w in sub_words
                    if (min_top_rel is None or w['top'] >= min_top_rel)
                    and (max_top_rel is None or w['top'] <= max_top_rel)]
        if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
            result.append(geo)
            continue

        # --- Build two sub-column geometries ---
        # Word 'left' values are relative to left_x; geo.x is absolute.
        # Convert the split position from relative to absolute coordinates.
        # Split midway between the rightmost sub-column word edge and the
        # main column's start bin.
        max_sub_left = max(w['left'] for w in sub_words)
        split_rel = (max_sub_left + col_start_bin[2]) // 2
        split_abs = split_rel + left_x

        sub_x = geo.x
        sub_width = split_abs - geo.x
        main_x = split_abs
        main_width = (geo.x + geo.width) - split_abs

        if sub_width <= 0 or main_width <= 0:
            result.append(geo)
            continue

        # index=0 is a placeholder — all columns are re-indexed below.
        sub_geo = ColumnGeometry(
            index=0,
            x=sub_x,
            y=geo.y,
            width=sub_width,
            height=geo.height,
            word_count=len(sub_words),
            words=sub_words,
            width_ratio=sub_width / content_w if content_w > 0 else 0.0,
            is_sub_column=True,
        )
        main_geo = ColumnGeometry(
            index=0,
            x=main_x,
            y=geo.y,
            width=main_width,
            height=geo.height,
            word_count=len(main_words),
            words=main_words,
            width_ratio=main_width / content_w if content_w > 0 else 0.0,
            is_sub_column=True,
        )

        result.append(sub_geo)
        result.append(main_geo)

        logger.info(
            f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
            f"(rel={split_rel}), sub={len(sub_words)} words, "
            f"main={len(main_words)} words, "
            f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
        )

    # Re-index by left-to-right order
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result
+
+
def _split_broad_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    _broad_threshold: float = 0.35,
    _min_gap_px: int = 15,
    _min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
    """Split overly broad columns that contain two language blocks (EN+DE).

    For every "broad" column a per-pixel word-coverage profile is built
    from its word boxes; the widest internal low-coverage run, if wide
    enough, becomes the split position between the two halves.

    Args:
        geometries: Column geometries from _detect_sub_columns.
        content_w: Width of the content area in pixels.
        left_x: Left edge of content ROI in absolute image coordinates.
        _broad_threshold: Minimum width_ratio to consider a column "broad".
        _min_gap_px: Minimum gap width (pixels) to trigger a split.
        _min_words_per_split: Both halves must have at least this many words.

    Returns:
        Updated list of ColumnGeometry (possibly with more columns).
    """
    out: List[ColumnGeometry] = []

    logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
                f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")

    for col in geometries:
        # Narrow or sparsely populated columns pass through untouched.
        if col.width_ratio <= _broad_threshold or len(col.words) < 10:
            out.append(col)
            continue

        # Per-pixel word coverage inside the column. Word 'left' values are
        # content-relative, so shift by the column's content-relative origin.
        origin_rel = col.x - left_x
        profile = np.zeros(col.width, dtype=np.float32)
        for word in col.words:
            lo = max(0, int(word['left'] - origin_rel))
            hi = min(col.width, int(word['left'] - origin_rel + word.get('width', 0)))
            if hi > lo:
                profile[lo:hi] += 1.0

        # 3px box smoothing suppresses single-pixel noise.
        if len(profile) > 3:
            profile = np.convolve(profile, np.ones(3, dtype=np.float32) / 3.0, mode='same')

        # Normalise to [0, 1] so the 0.5 threshold is coverage-independent.
        peak = profile.max()
        if peak > 0:
            profile /= peak

        # Collect contiguous runs where coverage drops below half the peak.
        low_mask = profile < 0.5
        all_gaps = []
        run_start = None
        for px, is_low in enumerate(low_mask):
            if is_low:
                if run_start is None:
                    run_start = px
            elif run_start is not None:
                all_gaps.append((run_start, px, px - run_start))
                run_start = None
        if run_start is not None:
            all_gaps.append((run_start, len(low_mask), len(low_mask) - run_start))

        # Runs touching the column edges are margins, not split points.
        _edge_margin = 10  # pixels from edge to ignore
        internal_gaps = [g for g in all_gaps
                         if g[0] > _edge_margin and g[1] < col.width - _edge_margin]
        best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None

        logger.info(f"SplitBroadCols: col {col.index} all_gaps(>=5px): "
                    f"{[g for g in all_gaps if g[2] >= 5]}, "
                    f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
                    f"best={best_gap}")

        if best_gap is None or best_gap[2] < _min_gap_px:
            out.append(col)
            continue

        gap_center = (best_gap[0] + best_gap[1]) // 2

        # Assign each word to a side by the position of its horizontal midpoint.
        left_words: List[Dict] = []
        right_words: List[Dict] = []
        for word in col.words:
            midpoint = (word['left'] - origin_rel) + word.get('width', 0) / 2.0
            (left_words if midpoint < gap_center else right_words).append(word)

        if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
            out.append(col)
            continue

        # Build two new geometries; index=0 placeholders are re-indexed below.
        split_x_abs = col.x + gap_center
        left_w = gap_center
        right_w = col.width - gap_center

        left_geo = ColumnGeometry(
            index=0,
            x=col.x,
            y=col.y,
            width=left_w,
            height=col.height,
            word_count=len(left_words),
            words=left_words,
            width_ratio=left_w / content_w if content_w else 0,
            is_sub_column=True,
        )
        right_geo = ColumnGeometry(
            index=0,
            x=split_x_abs,
            y=col.y,
            width=right_w,
            height=col.height,
            word_count=len(right_words),
            words=right_words,
            width_ratio=right_w / content_w if content_w else 0,
            is_sub_column=True,
        )

        logger.info(
            f"SplitBroadCols: col {col.index} SPLIT at gap_center={gap_center} "
            f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
            f"left={len(left_words)} words (w={left_w}), "
            f"right={len(right_words)} words (w={right_w})"
        )

        out.append(left_geo)
        out.append(right_geo)

    # Re-index left-to-right
    out.sort(key=lambda g: g.x)
    for i, g in enumerate(out):
        g.index = i

    return out
+
+
def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
    inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
    """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs.

    Each column spans from its start to the next column's start; the last
    column extends to *right_x*. A word belongs to a column when its
    content-relative left edge falls inside the column span.
    """
    geometries: List[ColumnGeometry] = []
    n_cols = len(col_starts)
    for idx, (start_x, _count) in enumerate(col_starts):
        end_x = col_starts[idx + 1][0] if idx + 1 < n_cols else right_x
        span = end_x - start_x

        # Column bounds in content-relative coordinates (word coordinate frame).
        rel_lo = start_x - left_x
        rel_hi = rel_lo + span
        members = [wd for wd in word_dicts if rel_lo <= wd['left'] < rel_hi]

        geometries.append(ColumnGeometry(
            index=idx,
            x=start_x,
            y=top_y,
            width=span,
            height=content_h,
            word_count=len(members),
            words=members,
            width_ratio=span / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
+
+
def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
    """Detect column geometry using whitespace-gap analysis with word validation.

    Phase A of the two-phase column detection. Uses vertical projection
    profiles to find whitespace gaps between columns, then validates that
    no gap cuts through a word bounding box.

    Falls back to clustering-based detection if fewer than 2 gaps are found.

    Coordinate conventions: (left_x, right_x, top_y, bottom_y) and every
    ColumnGeometry.x/y are absolute image coordinates; the dicts in
    word_dicts use coordinates relative to the content ROI cropped at
    (left_x, top_y).

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
        or None if detection fails entirely.
    """
    # h/w: full dewarped image size — the absolute coordinate frame.
    h, w = ocr_img.shape[:2]

    # --- Step 1: Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    # Implausibly small bounds (< 30% of the page in either axis) mean the
    # bound detection likely failed — fall back to the full page.
    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Step 2: Get word bounding boxes from Tesseract ---
    # Crop from left_x to full image width (not right_x) so words at the right
    # edge of the last column are included even if they extend past the detected
    # content boundary (right_x).
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
        return None

    # Keep only confident (conf >= 30), non-empty words. Coordinates are
    # relative to the content ROI cropped above.
    # NOTE(review): conf is parsed via isdigit(), so float conf strings
    # (emitted by some pytesseract/Tesseract versions) fall back to -1 and
    # the word is dropped — confirm against the Tesseract version in use.
    word_dicts = []
    left_edges = []
    edge_word_indices = []
    n_words = len(data['text'])
    for i in range(n_words):
        conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
        text = str(data['text'][i]).strip()
        if conf < 30 or not text:
            continue
        lx = int(data['left'][i])
        ty = int(data['top'][i])
        bw = int(data['width'][i])
        bh = int(data['height'][i])
        left_edges.append(lx)
        edge_word_indices.append(len(word_dicts))
        word_dicts.append({
            'text': text, 'conf': conf,
            'left': lx, 'top': ty, 'width': bw, 'height': bh,
        })

    if len(left_edges) < 5:
        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
        return None

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

    # --- Step 2b: Segment by sub-headers ---
    # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
    # text bands that pollute the vertical projection. We detect large
    # horizontal gaps (= whitespace rows separating sections) and use only
    # the tallest content segment for the projection. This makes column
    # detection immune to sub-headers, illustrations, and section dividers.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    h_proj_row = np.sum(content_strip, axis=1).astype(float)
    h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row

    # Find horizontal gaps (near-empty rows)
    H_GAP_THRESH = 0.02  # rows with <2% ink density are "empty"
    h_in_gap = h_proj_row_norm < H_GAP_THRESH
    H_MIN_GAP = max(5, content_h // 200)  # min gap height ~5-7px

    h_gaps: List[Tuple[int, int]] = []
    h_gap_start = None
    for y_idx in range(len(h_in_gap)):
        if h_in_gap[y_idx]:
            if h_gap_start is None:
                h_gap_start = y_idx
        else:
            if h_gap_start is not None:
                if y_idx - h_gap_start >= H_MIN_GAP:
                    h_gaps.append((h_gap_start, y_idx))
                h_gap_start = None
    # Close a gap that runs to the bottom edge of the content strip.
    if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
        h_gaps.append((h_gap_start, len(h_in_gap)))

    # Identify "large" gaps (significantly bigger than median) that indicate
    # section boundaries (sub-headers, chapter titles).
    if len(h_gaps) >= 3:
        gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
        median_gap_h = gap_sizes[len(gap_sizes) // 2]
        large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
        large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
    else:
        large_gaps = h_gaps

    # Build content segments between large gaps and pick the tallest
    seg_boundaries = [0]
    for gs, ge in large_gaps:
        seg_boundaries.append(gs)
        seg_boundaries.append(ge)
    seg_boundaries.append(content_h)

    # seg_boundaries alternates segment-start/segment-end, so step by 2.
    segments = []
    for i in range(0, len(seg_boundaries) - 1, 2):
        seg_top = seg_boundaries[i]
        seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
        seg_height = seg_bot - seg_top
        if seg_height > 20:  # ignore tiny fragments
            segments.append((seg_top, seg_bot, seg_height))

    if segments:
        segments.sort(key=lambda s: s[2], reverse=True)
        best_seg = segments[0]
        proj_strip = content_strip[best_seg[0]:best_seg[1], :]
        effective_h = best_seg[2]
        if len(segments) > 1:
            logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
                        f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
                        f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
    else:
        proj_strip = content_strip
        effective_h = content_h

    # --- Step 3: Vertical projection profile ---
    v_proj = np.sum(proj_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps
    kernel_size = max(5, content_w // 80)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for symmetry
    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # --- Step 4: Find whitespace gaps ---
    # Threshold: areas with very little ink density are gaps
    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.005)

    in_gap = v_smooth < gap_threshold
    MIN_GAP_WIDTH = max(8, content_w // 200)  # min ~8px or 0.5% of content width

    # Collect contiguous gap regions
    raw_gaps = []  # (start_x_rel, end_x_rel) relative to content ROI
    gap_start = None
    for x in range(len(in_gap)):
        if in_gap[x]:
            if gap_start is None:
                gap_start = x
        else:
            if gap_start is not None:
                gap_width = x - gap_start
                if gap_width >= MIN_GAP_WIDTH:
                    raw_gaps.append((gap_start, x))
                gap_start = None
    # Handle gap at the right edge
    if gap_start is not None:
        gap_width = len(in_gap) - gap_start
        if gap_width >= MIN_GAP_WIDTH:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_width={MIN_GAP_WIDTH}px): "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")

    # --- Step 5: Validate gaps against word bounding boxes ---
    # When using a segment for projection, only validate against words
    # inside that segment — words from sub-headers or other sections
    # would incorrectly overlap with real column gaps.
    if segments and len(segments) > 1:
        seg_top_abs = best_seg[0]  # relative to content strip
        seg_bot_abs = best_seg[1]
        segment_words = [wd for wd in word_dicts
                         if wd['top'] >= seg_top_abs
                         and wd['top'] + wd['height'] <= seg_bot_abs]
        logger.info(f"ColumnGeometry: filtering words to segment: "
                    f"{len(segment_words)}/{len(word_dicts)} words")
    else:
        segment_words = word_dicts

    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        # Check if any word overlaps with this gap region
        overlapping = False
        for wd in segment_words:
            word_left = wd['left']
            word_right = wd['left'] + wd['width']
            if word_left < gap_end_rel and word_right > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid the overlapping word(s)
            # Find the tightest word boundaries within the gap region
            min_word_left = content_w
            max_word_right = 0
            for wd in segment_words:
                word_left = wd['left']
                word_right = wd['left'] + wd['width']
                if word_left < gap_end_rel and word_right > gap_start_rel:
                    min_word_left = min(min_word_left, word_left)
                    max_word_right = max(max_word_right, word_right)

            # Try gap before the overlapping words
            if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
                validated_gaps.append((gap_start_rel, min_word_left))
                logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
            # Try gap after the overlapping words
            elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
                validated_gaps.append((max_word_right, gap_end_rel))
                logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
            else:
                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

    # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
    # When pixel-based projection fails (e.g. due to illustrations or colored
    # bands), use word bounding boxes to find clear vertical gaps. This is
    # immune to decorative graphics that Tesseract doesn't recognise as words.
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
        word_coverage = np.zeros(content_w, dtype=np.int32)
        for wd in segment_words:
            wl = max(0, wd['left'])
            wr = min(wd['left'] + wd['width'], content_w)
            if wr > wl:
                word_coverage[wl:wr] += 1

        # Smooth slightly to bridge tiny 1-2px noise gaps between words
        wc_kernel = max(3, content_w // 300)
        if wc_kernel % 2 == 0:
            wc_kernel += 1
        wc_smooth = np.convolve(word_coverage.astype(float),
                                np.ones(wc_kernel) / wc_kernel, mode='same')

        wc_in_gap = wc_smooth < 0.5  # effectively zero word coverage
        WC_MIN_GAP = max(4, content_w // 300)

        wc_gaps: List[Tuple[int, int]] = []
        wc_gap_start = None
        for x in range(len(wc_in_gap)):
            if wc_in_gap[x]:
                if wc_gap_start is None:
                    wc_gap_start = x
            else:
                if wc_gap_start is not None:
                    if x - wc_gap_start >= WC_MIN_GAP:
                        wc_gaps.append((wc_gap_start, x))
                    wc_gap_start = None
        if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP:
            wc_gaps.append((wc_gap_start, len(wc_in_gap)))

        logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
                    f"(min_width={WC_MIN_GAP}px): "
                    f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")

        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
        return _detect_columns_by_clustering(
            word_dicts, left_edges, edge_word_indices,
            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
        )

    # --- Step 7: Derive column boundaries from gaps ---
    # Sort gaps by position
    validated_gaps.sort(key=lambda g: g[0])

    # Identify margin gaps (first and last) vs interior gaps
    # A margin gap touches the edge of the content area (within 2% tolerance)
    edge_tolerance = max(10, int(content_w * 0.02))

    is_left_margin = validated_gaps[0][0] <= edge_tolerance
    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance

    # Interior gaps define column boundaries
    # Column starts at the end of a gap, ends at the start of the next gap
    col_starts = []

    if is_left_margin:
        # First column starts after the left margin gap
        first_gap_end = validated_gaps[0][1]
        interior_gaps = validated_gaps[1:]
    else:
        # No left margin gap — first column starts at content left edge
        first_gap_end = 0
        interior_gaps = validated_gaps[:]

    if is_right_margin:
        # Last gap is right margin — don't use it as column start
        interior_gaps_for_boundaries = interior_gaps[:-1]
        right_boundary = validated_gaps[-1][0]  # last column ends at right margin gap start
    else:
        interior_gaps_for_boundaries = interior_gaps
        right_boundary = content_w

    # First column
    col_starts.append(left_x + first_gap_end)

    # Columns between interior gaps
    for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
        col_starts.append(left_x + gap_end_rel)

    # Count words per column region (for logging)
    col_start_counts = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            next_start = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            # The page margin contains only white space — extending the OCR
            # crop to the image edge is safe and prevents text near the right
            # border from being cut off.
            next_start = w

        col_left_rel = start_x - left_x
        col_right_rel = next_start - left_x
        # 'w' inside the genexp below is comprehension-local (Python 3
        # scoping) and does not clobber the image width 'w' used above.
        n_words_in_col = sum(1 for w in word_dicts
                             if col_left_rel <= w['left'] < col_right_rel)
        col_start_counts.append((start_x, n_words_in_col))

    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
                f"{col_start_counts}")

    # --- Step 8: Build ColumnGeometry objects ---
    # Determine right edge for each column
    all_boundaries = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            end_x = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            end_x = w
        all_boundaries.append((start_x, end_x))

    geometries = []
    for i, (start_x, end_x) in enumerate(all_boundaries):
        col_width = end_x - start_x
        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [w for w in word_dicts
                     if col_left_rel <= w['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    # --- Step 9: Filter phantom narrow columns ---
    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
    # columns (< 3% of content width) with zero or no words. These are not
    # real columns — remove them and close the gap between neighbors.
    min_real_col_w = max(20, int(content_w * 0.03))
    filtered_geoms = [g for g in geometries
                      if not (g.word_count < 3 and g.width < min_real_col_w)]
    if len(filtered_geoms) < len(geometries):
        n_removed = len(geometries) - len(filtered_geoms)
        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
                    f"(width < {min_real_col_w}px and words < 3)")
        # Extend each remaining column to close gaps with its right neighbor
        for i, g in enumerate(filtered_geoms):
            if i + 1 < len(filtered_geoms):
                g.width = filtered_geoms[i + 1].x - g.x
            else:
                g.width = w - g.x
            g.index = i
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            # Re-assign words after widths changed ('w' is again local to
            # the comprehension here).
            g.words = [w for w in word_dicts
                       if col_left_rel <= w['left'] < col_right_rel]
            g.word_count = len(g.words)
        geometries = filtered_geoms
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
+
+
def expand_narrow_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int,
    word_dicts: List[Dict],
) -> List[ColumnGeometry]:
    """Expand narrow columns into adjacent whitespace gaps.

    Narrow columns (marker, page_ref, < 10% content width) often lose
    content at image edges due to residual shear. This expands them toward
    the neighbouring column, but never past 40% of the gap or past the
    nearest word in the neighbour.

    Must be called AFTER _detect_sub_columns() so that sub-column splits
    (which create the narrowest columns) have already happened.

    Args:
        geometries: Columns ordered left-to-right. Entries are mutated
            in place (x, width, words, word_count, width_ratio) — the
            returned value is the same list instance.
        content_w: Width of the content area in pixels.
        left_x: Absolute X of the content area's left edge; word_dicts
            coordinates are relative to this.
        word_dicts: All word bounding boxes, relative to the content ROI.

    Returns:
        The input list with narrow columns widened and overlapping
        neighbours shrunk so that boundaries stay consistent.
    """
    # NOTE(review): the docstring's "never past 40% of the gap" cap is not
    # implemented below — expansion is bounded only by the neighbour's
    # nearest word plus _MIN_WORD_MARGIN. Confirm which behaviour is intended.
    _NARROW_THRESHOLD_PCT = 10.0
    _MIN_WORD_MARGIN = 4

    if len(geometries) < 2:
        return geometries

    logger.info("ExpandNarrowCols: input %d cols: %s",
                len(geometries),
                [(i, g.x, g.width, round(g.width / content_w * 100, 1))
                 for i, g in enumerate(geometries)])

    for i, g in enumerate(geometries):
        # content_w == 0 yields col_pct = 100, i.e. the column is skipped.
        col_pct = g.width / content_w * 100 if content_w > 0 else 100
        if col_pct >= _NARROW_THRESHOLD_PCT:
            continue

        expanded = False
        orig_pct = col_pct

        # --- try expanding to the LEFT ---
        if i > 0:
            left_nb = geometries[i - 1]
            # Gap can be 0 if sub-column split created adjacent columns.
            # In that case, look at where the neighbor's rightmost words
            # actually are — there may be unused space we can claim.
            nb_words_right = [wd['left'] + wd.get('width', 0)
                              for wd in left_nb.words]
            if nb_words_right:
                rightmost_word_abs = left_x + max(nb_words_right)
                safe_left_abs = rightmost_word_abs + _MIN_WORD_MARGIN
            else:
                # No words in neighbor → we can take up to neighbor's start
                safe_left_abs = left_nb.x + _MIN_WORD_MARGIN

            if safe_left_abs < g.x:
                g.width += (g.x - safe_left_abs)
                g.x = safe_left_abs
                expanded = True

        # --- try expanding to the RIGHT ---
        if i + 1 < len(geometries):
            right_nb = geometries[i + 1]
            nb_words_left = [wd['left'] for wd in right_nb.words]
            if nb_words_left:
                leftmost_word_abs = left_x + min(nb_words_left)
                safe_right_abs = leftmost_word_abs - _MIN_WORD_MARGIN
            else:
                safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN

            cur_right = g.x + g.width
            if safe_right_abs > cur_right:
                g.width = safe_right_abs - g.x
                expanded = True

        if expanded:
            # Re-derive all fields computed from the new geometry.
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [wd for wd in word_dicts
                       if col_left_rel <= wd['left'] < col_right_rel]
            g.word_count = len(g.words)
            g.width_ratio = g.width / content_w if content_w > 0 else 0.0
            logger.info(
                "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d",
                i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count)

            # --- Shrink overlapping neighbors to match new boundaries ---
            # Neighbour mutations happen inside the same loop, so a later
            # iteration sees the already-adjusted geometry.
            # Left neighbor: its right edge must not exceed our new left edge
            if i > 0:
                left_nb = geometries[i - 1]
                nb_right = left_nb.x + left_nb.width
                if nb_right > g.x:
                    left_nb.width = g.x - left_nb.x
                    if left_nb.width < 0:
                        left_nb.width = 0
                    left_nb.width_ratio = left_nb.width / content_w if content_w > 0 else 0.0
                    # Re-assign words
                    nb_left_rel = left_nb.x - left_x
                    nb_right_rel = nb_left_rel + left_nb.width
                    left_nb.words = [wd for wd in word_dicts
                                     if nb_left_rel <= wd['left'] < nb_right_rel]
                    left_nb.word_count = len(left_nb.words)

            # Right neighbor: its left edge must not be before our new right edge
            if i + 1 < len(geometries):
                right_nb = geometries[i + 1]
                my_right = g.x + g.width
                if right_nb.x < my_right:
                    old_right_edge = right_nb.x + right_nb.width
                    right_nb.x = my_right
                    right_nb.width = old_right_edge - right_nb.x
                    if right_nb.width < 0:
                        right_nb.width = 0
                    right_nb.width_ratio = right_nb.width / content_w if content_w > 0 else 0.0
                    # Re-assign words
                    nb_left_rel = right_nb.x - left_x
                    nb_right_rel = nb_left_rel + right_nb.width
                    right_nb.words = [wd for wd in word_dicts
                                      if nb_left_rel <= wd['left'] < nb_right_rel]
                    right_nb.word_count = len(right_nb.words)

    return geometries
+
+
+# =============================================================================
+# Row Geometry Detection (horizontal whitespace-gap analysis)
+# =============================================================================
+
def detect_row_geometry(
    inv: np.ndarray,
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
) -> List['RowGeometry']:
    """Detect row geometry using horizontal whitespace-gap analysis.

    Mirrors the vertical gap approach used for columns, but operates on
    horizontal projection profiles to find gaps between text lines.
    Also classifies header/footer rows based on gap size.

    Args:
        inv: Inverted binarized image (white text on black bg, full page).
        word_dicts: Word bounding boxes from Tesseract (relative to content ROI).
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects sorted top to bottom. Falls back to
        word-Y grouping when fewer than two whitespace gaps are found.
    """
    # NOTE(review): assumes cv2 imported successfully — the module header
    # falls back to cv2 = None on ImportError, which would crash here.
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile (text-only, images masked out) ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]

    # Build a word-coverage mask so that image regions (high ink density but no
    # Tesseract words) are ignored. Only pixels within/near word bounding boxes
    # contribute to the projection. This prevents large illustrations from
    # merging multiple vocabulary rows into one.
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255

    masked_strip = cv2.bitwise_and(content_strip, word_mask)
    # Per-scanline ink sum, normalized to [0, 1] by the maximum possible
    # ink (every pixel white = content_w * 255).
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

    # --- Step 2: Smoothing + threshold ---
    # Box filter (odd-sized, 'same' mode keeps the profile length).
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # A scanline counts as "gap" when its density is below 15% of the
    # median non-zero line density (absolute floor 0.003).
    median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    # Runs of gap-scanlines shorter than this are treated as noise.
    MIN_GAP_HEIGHT = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = []  # (start_y_rel, end_y_rel) relative to content ROI
    gap_start = None
    for y in range(len(in_gap)):
        if in_gap[y]:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    # Close a gap that runs to the bottom edge of the content area.
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    # A projection gap that vertically intersects any word bbox is suspect:
    # either shrink it to the word-free part or drop it entirely.
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        overlapping = False
        for wd in word_dicts:
            word_top = wd['top']
            word_bottom = wd['top'] + wd['height']
            if word_top < gap_end_rel and word_bottom > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid overlapping words
            min_word_top = content_h
            max_word_bottom = 0
            for wd in word_dicts:
                word_top = wd['top']
                word_bottom = wd['top'] + wd['height']
                if word_top < gap_end_rel and word_bottom > gap_start_rel:
                    min_word_top = min(min_word_top, word_top)
                    max_word_bottom = max(max_word_bottom, word_bottom)

            if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
                validated_gaps.append((gap_start_rel, min_word_top))
            elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
                validated_gaps.append((max_word_bottom, gap_end_rel))
            else:
                logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    # A gap ≥ 2× the median gap inside the top/bottom 15% of the content
    # area is interpreted as the separation before/after header/footer.
    HEADER_FOOTER_ZONE = 0.15
    GAP_MULTIPLIER = 2.0

    gap_sizes = [g[1] - g[0] for g in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    header_boundary_rel = None  # y below which is header
    footer_boundary_rel = None  # y above which is footer

    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    # Find largest gap in header zone
    best_header_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]):
                best_header_gap = (gs, ge)

    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    # Find largest gap in footer zone
    best_footer_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]):
                best_footer_gap = (gs, ge)

    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between gaps
    row_boundaries = []  # (start_y_rel, end_y_rel)

    # Top of content to first gap
    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
        row_boundaries.append((0, validated_gaps[0][0]))

    # Between gaps
    for i in range(len(validated_gaps) - 1):
        row_start = validated_gaps[i][1]
        row_end = validated_gaps[i + 1][0]
        if row_end - row_start > 0:
            row_boundaries.append((row_start, row_end))

    # Last gap to bottom of content
    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Determine row type
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # Collect words in this row (assigned by word vertical center)
        row_words = [w for w in word_dicts
                     if w['top'] + w['height'] / 2 >= row_start_rel
                     and w['top'] + w['height'] / 2 < row_end_rel]

        # Gap before this row
        gap_before = 0
        if idx == 0 and validated_gaps[0][0] > 0:
            gap_before = validated_gaps[0][0]
        elif idx > 0:
            # Find the gap just before this row boundary
            for gs, ge in validated_gaps:
                if ge == row_start_rel:
                    gap_before = ge - gs
                    break

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    # Derive precise row boundaries from word vertical centers. Detects
    # section breaks (headings, paragraphs) and builds per-section grids.
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows
+
+
def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    1. Group words into line clusters (by Y proximity).
    2. For each cluster compute center_y (median of word vertical centers)
       and letter_height (median of word heights).
    3. Compute the pitch (distance between consecutive centers).
    4. Detect section breaks where the gap is >1.8× the median pitch
       (headings, sub-headings, paragraph breaks).
    5. Within each section, use the local pitch to place row boundaries
       at the midpoints between consecutive centers.
    6. Validate that ≥85% of words land in a grid row; otherwise fall back.

    Header/footer rows from the gap-based detection are preserved.

    Returns the original *rows* unchanged whenever the input is too small
    for reliable statistics or the derived grid fails validation.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']

    # Too few content rows → statistics below would be meaningless.
    if len(content_rows) < 5:
        return rows

    # --- Step A: Group ALL words into line clusters ---
    # Collect words that belong to content rows (deduplicated by bbox,
    # since a word can appear in more than one input row)
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
        for w in r.words:
            key = (w['left'], w['top'], w['width'], w['height'])
            if key not in seen_keys:
                seen_keys.add(key)
                content_words.append(w)

    if len(content_words) < 5:
        return rows

    # Compute median word height (excluding outliers like tall brackets/IPA)
    word_heights = sorted(w['height'] for w in content_words)
    median_wh = word_heights[len(word_heights) // 2]

    # Compute median gap-based row height — this is the actual line height
    # as detected by the horizontal projection. We use 40% of this as
    # grouping tolerance. This is much more reliable than using word height
    # alone, because words on the same line can have very different heights
    # (e.g. lowercase vs uppercase, brackets, phonetic symbols).
    gap_row_heights = sorted(r.height for r in content_rows)
    median_row_h = gap_row_heights[len(gap_row_heights) // 2]

    # Tolerance: 40% of row height. Words on the same line should have
    # centers within this range. Even if a word's bbox is taller/shorter,
    # its center should stay within half a row height of the line center.
    y_tol = max(10, int(median_row_h * 0.4))

    # Sort by center_y, then group by proximity
    words_by_center = sorted(content_words,
                             key=lambda w: (w['top'] + w['height'] / 2, w['left']))
    line_clusters: List[List[Dict]] = []
    current_line: List[Dict] = [words_by_center[0]]
    current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2

    # NOTE: the anchor center is the FIRST word of the cluster, not a
    # running mean — drift within a cluster is bounded by y_tol.
    for w in words_by_center[1:]:
        w_center = w['top'] + w['height'] / 2
        if abs(w_center - current_center) <= y_tol:
            current_line.append(w)
        else:
            current_line.sort(key=lambda w: w['left'])
            line_clusters.append(current_line)
            current_line = [w]
            current_center = w_center

    if current_line:
        current_line.sort(key=lambda w: w['left'])
        line_clusters.append(current_line)

    if len(line_clusters) < 3:
        return rows

    # --- Step B: Compute center_y per cluster ---
    # center_y = median of (word_top + word_height/2) across all words in cluster
    # letter_h = median of word heights, but excluding outlier-height words
    #            (>2× median) so that tall brackets/IPA don't skew the height
    cluster_info: List[Dict] = []
    for cl_words in line_clusters:
        centers = [w['top'] + w['height'] / 2 for w in cl_words]
        # Filter outlier heights for letter_h computation
        normal_heights = [w['height'] for w in cl_words
                          if w['height'] <= median_wh * 2.0]
        if not normal_heights:
            normal_heights = [w['height'] for w in cl_words]
        center_y = float(np.median(centers))
        letter_h = float(np.median(normal_heights))
        cluster_info.append({
            'center_y_rel': center_y,           # relative to content ROI
            'center_y_abs': center_y + top_y,   # absolute
            'letter_h': letter_h,
            'words': cl_words,
        })

    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    # Even with center-based grouping, some edge cases can produce
    # spurious clusters. Merge any pair whose centers are closer
    # than 30% of the row height (they're definitely the same text line).
    merge_threshold = max(8, median_row_h * 0.3)
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            # Merge: combine words, recompute center
            combined_words = prev['words'] + cl['words']
            centers = [w['top'] + w['height'] / 2 for w in combined_words]
            normal_heights = [w['height'] for w in combined_words
                              if w['height'] <= median_wh * 2.0]
            if not normal_heights:
                normal_heights = [w['height'] for w in combined_words]
            prev['center_y_rel'] = float(np.median(centers))
            prev['center_y_abs'] = prev['center_y_rel'] + top_y
            prev['letter_h'] = float(np.median(normal_heights))
            prev['words'] = combined_words
        else:
            merged.append(cl)

    cluster_info = merged

    if len(cluster_info) < 3:
        return rows

    # --- Step C: Compute pitches and detect section breaks ---
    pitches: List[float] = []
    for i in range(1, len(cluster_info)):
        pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        pitches.append(pitch)

    if not pitches:
        return rows

    median_pitch = float(np.median(pitches))
    # Degenerate pitch (overlapping clusters) → grid would be nonsense.
    if median_pitch <= 5:
        return rows

    # A section break is where the gap between line centers is much larger
    # than the normal pitch (sub-headings, section titles, etc.)
    BREAK_FACTOR = 1.8

    # --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
    sections: List[List[Dict]] = []
    current_section: List[Dict] = [cluster_info[0]]

    for i in range(1, len(cluster_info)):
        gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        if gap > median_pitch * BREAK_FACTOR:
            sections.append(current_section)
            current_section = [cluster_info[i]]
        else:
            current_section.append(cluster_info[i])

    if current_section:
        sections.append(current_section)

    # --- Step E: Build row boundaries per section ---
    grid_rows: List[RowGeometry] = []

    for section in sections:
        if not section:
            continue

        if len(section) == 1:
            # Single-line section (likely a heading)
            cl = section[0]
            half_h = max(cl['letter_h'], median_pitch * 0.4)
            row_top = cl['center_y_abs'] - half_h
            row_bot = cl['center_y_abs'] + half_h
            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))
            continue

        # Compute local pitch for this section
        local_pitches = []
        for i in range(1, len(section)):
            local_pitches.append(
                section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
            )
        local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch

        # Row boundaries are placed at midpoints between consecutive centers.
        # First row: top = center - local_pitch/2
        # Last row:  bottom = center + local_pitch/2
        for i, cl in enumerate(section):
            if i == 0:
                row_top = cl['center_y_abs'] - local_pitch / 2
            else:
                # Midpoint between this center and previous center
                prev_center = section[i - 1]['center_y_abs']
                row_top = (prev_center + cl['center_y_abs']) / 2

            if i == len(section) - 1:
                row_bot = cl['center_y_abs'] + local_pitch / 2
            else:
                next_center = section[i + 1]['center_y_abs']
                row_bot = (cl['center_y_abs'] + next_center) / 2

            # Clamp to reasonable bounds
            row_top = max(top_y, row_top)
            row_bot = min(top_y + content_h, row_bot)

            # Degenerate sliver rows (< 5px) are dropped.
            if row_bot - row_top < 5:
                continue

            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))

    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    # Words may have shifted slightly; assign each word to the row whose
    # center is closest to the word's vertical center.
    for gr in grid_rows:
        gr.words = []

    for w in content_words:
        w_center = w['top'] + top_y + w['height'] / 2
        best_row = None
        best_dist = float('inf')
        for gr in grid_rows:
            row_center = gr.y + gr.height / 2
            dist = abs(w_center - row_center)
            if dist < best_dist:
                best_dist = dist
                best_row = gr
        # Words farther than one median pitch from every row center stay
        # unassigned; they count against the match ratio in Step G.
        if best_row is not None and best_dist < median_pitch:
            best_row.words.append(w)

    for gr in grid_rows:
        gr.word_count = len(gr.words)

    # --- Step G: Validate ---
    # Keep the derived grid only if it accounts for ≥85% of the words.
    words_placed = sum(gr.word_count for gr in grid_rows)
    if len(content_words) > 0:
        match_ratio = words_placed / len(content_words)
        if match_ratio < 0.85:
            logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                        f"of words, keeping gap-based rows")
            return rows

    # Remove empty grid rows (no words assigned)
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge header/footer + re-index ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights) if row_heights else 0
    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")

    return result
+
+
+def _build_rows_from_word_grouping(
+ word_dicts: List[Dict],
+ left_x: int, right_x: int,
+ top_y: int, bottom_y: int,
+ content_w: int, content_h: int,
+) -> List['RowGeometry']:
+ """Fallback: build rows by grouping words by Y position.
+
+ Uses _group_words_into_lines() with a generous tolerance.
+ No header/footer detection in fallback mode.
+ """
+ if not word_dicts:
+ return []
+
+ y_tolerance = max(20, content_h // 100)
+ lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance)
+
+ rows = []
+ for idx, line_words in enumerate(lines):
+ if not line_words:
+ continue
+ min_top = min(w['top'] for w in line_words)
+ max_bottom = max(w['top'] + w['height'] for w in line_words)
+ row_height = max_bottom - min_top
+
+ rows.append(RowGeometry(
+ index=idx,
+ x=left_x,
+ y=top_y + min_top,
+ width=content_w,
+ height=row_height,
+ word_count=len(line_words),
+ words=line_words,
+ row_type='content',
+ gap_before=0,
+ ))
+
+ logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
+ return rows
+
+
+# --- Phase B: Content-Based Classification ---
+
+def _score_language(words: List[Dict]) -> Dict[str, float]:
+ """Score the language of a column's words.
+
+ Analyzes function words, umlauts, and capitalization patterns
+ to determine whether text is English or German.
+
+ Args:
+ words: List of word dicts with 'text' and 'conf' keys.
+
+ Returns:
+ Dict with 'eng' and 'deu' scores (0.0-1.0).
+ """
+ if not words:
+ return {'eng': 0.0, 'deu': 0.0}
+
+ # Only consider words with decent confidence
+ good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0]
+ if not good_words:
+ return {'eng': 0.0, 'deu': 0.0}
+
+ total = len(good_words)
+ en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS)
+ de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS)
+
+ # Check for umlauts (strong German signal)
+ raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40]
+ umlaut_count = sum(1 for t in raw_texts
+ for c in t if c in 'äöüÄÖÜß')
+
+ # German capitalization: nouns are capitalized mid-sentence
+ # Count words that start with uppercase but aren't at position 0
+ cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2)
+
+ en_score = en_hits / total if total > 0 else 0.0
+ de_score = de_hits / total if total > 0 else 0.0
+
+ # Boost German score for umlauts
+ if umlaut_count > 0:
+ de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))
+
+ # Boost German score for high capitalization ratio (typical for German nouns)
+ if total > 5:
+ cap_ratio = cap_words / total
+ if cap_ratio > 0.3:
+ de_score = min(1.0, de_score + 0.1)
+
+ return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}
+
+
def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
    """Score the role of a column based on its geometry and content patterns.

    Args:
        geom: ColumnGeometry with words and dimensions.

    Returns:
        Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'.
    """
    role_scores = dict.fromkeys(('reference', 'marker', 'sentence', 'vocabulary'), 0.0)

    if not geom.words:
        return role_scores

    confident = [w['text'] for w in geom.words if w.get('conf', 0) > 40]
    if not confident:
        return role_scores

    n = len(confident)
    mean_len = sum(len(t) for t in confident) / n
    punct_count = sum(1 for t in confident if any(ch in '.!?;:,' for ch in t))
    n_with_digits = sum(1 for t in confident if any(ch.isdigit() for ch in t))
    digit_frac = n_with_digits / n

    # Reference columns: narrow and dominated by digits (page numbers).
    if geom.width_ratio < 0.12:
        role_scores['reference'] = 0.5
        if digit_frac > 0.4:
            role_scores['reference'] = min(1.0, 0.5 + digit_frac * 0.5)

    # Marker columns: very narrow with only a handful of short entries.
    if geom.width_ratio < 0.06 and geom.word_count <= 15:
        role_scores['marker'] = 0.9 if mean_len < 4 else 0.7
    # A very narrow non-edge column is a strong marker regardless of word count.
    if geom.width_ratio < 0.04 and geom.index > 0:
        role_scores['marker'] = max(role_scores['marker'], 0.9)

    # Sentence columns: wide, punctuated, longer words.
    if geom.width_ratio > 0.15 and punct_count > 2:
        sentence_score = 0.3 + min(0.5, punct_count / n)
        if mean_len > 4:
            sentence_score = min(1.0, sentence_score + 0.2)
        role_scores['sentence'] = sentence_score

    # Vocabulary columns: medium width, medium word length.
    if 0.10 < geom.width_ratio < 0.45:
        vocab_score = 0.4
        if 3 < mean_len < 8:
            vocab_score = min(1.0, vocab_score + 0.3)
        role_scores['vocabulary'] = vocab_score

    return {role: round(score, 3) for role, score in role_scores.items()}
+
+
def _build_margin_regions(
    all_regions: List[PageRegion],
    left_x: int,
    right_x: int,
    img_w: int,
    top_y: int,
    content_h: int,
) -> List[PageRegion]:
    """Create margin_left / margin_right PageRegions from content bounds.

    Margins represent the space between the image edge and the first/last
    content column. They are used downstream for faithful page
    reconstruction but are skipped during OCR.
    """
    MIN_GAP_PX = 5  # slivers narrower than this get no margin region
    result: List[PageRegion] = []

    # Left margin: image edge up to the start of the content area.
    if left_x > MIN_GAP_PX:
        result.append(PageRegion(
            type='margin_left', x=0, y=top_y,
            width=left_x, height=content_h,
            classification_confidence=1.0,
            classification_method='content_bounds',
        ))

    # Right margin: from end of last content column to image edge
    _STRUCTURAL_TYPES = ('margin_left', 'margin_right', 'header', 'footer',
                         'margin_top', 'margin_bottom')
    content_regions = [r for r in all_regions if r.type not in _STRUCTURAL_TYPES]
    last_col_end = (max(r.x + r.width for r in content_regions)
                    if content_regions else right_x)
    if img_w - last_col_end > MIN_GAP_PX:
        result.append(PageRegion(
            type='margin_right', x=last_col_end, y=top_y,
            width=img_w - last_col_end, height=content_h,
            classification_confidence=1.0,
            classification_method='content_bounds',
        ))

    if result:
        logger.info(f"Margins: {[(m.type, m.x, m.width) for m in result]} "
                    f"(left_x={left_x}, right_x={right_x}, img_w={img_w})")

    return result
+
+
def positional_column_regions(
    geometries: List[ColumnGeometry],
    content_w: int,
    content_h: int,
    left_x: int,
) -> List[PageRegion]:
    """Classify columns by position only (no language scoring).

    Structural columns (page_ref, column_marker) are identified by geometry.
    Remaining content columns are labelled left→right as column_en, column_de,
    column_example. The names are purely positional – no language analysis.
    """
    def _mk_structural(kind: str, col: ColumnGeometry, conf: float) -> PageRegion:
        # Structural regions reuse the column footprint at full content height.
        return PageRegion(
            type=kind, x=col.x, y=col.y,
            width=col.width, height=content_h,
            classification_confidence=conf,
            classification_method='positional',
        )

    structural: List[PageRegion] = []
    content_cols: List[ColumnGeometry] = []

    for col in geometries:
        x_frac = (col.x - left_x) / content_w if content_w else 0
        if col.width_ratio < 0.12 and x_frac < 0.20:
            # page_ref: narrow column in the leftmost 20% region
            structural.append(_mk_structural('page_ref', col, 0.95))
        elif col.width_ratio < 0.06 and col.word_count <= 15:
            # column_marker: very narrow, few words
            structural.append(_mk_structural('column_marker', col, 0.95))
        elif col.word_count <= 2 and col.width_ratio < 0.15:
            # empty or near-empty narrow column → treat as margin/structural
            structural.append(_mk_structural('column_marker', col, 0.85))
        else:
            content_cols.append(col)

    # Single content column → plain text page
    if len(content_cols) == 1:
        only = content_cols[0]
        return structural + [PageRegion(
            type='column_text', x=only.x, y=only.y,
            width=only.width, height=content_h,
            classification_confidence=0.9,
            classification_method='positional',
        )]

    # No content columns
    if not content_cols:
        return structural

    # Sort content columns left→right and assign positional labels
    content_cols.sort(key=lambda c: c.x)

    # With exactly 2 content columns: if the left one is very wide (>35%),
    # it likely contains EN+DE combined, so the right one is examples.
    wide_pair = (len(content_cols) == 2
                 and content_cols[0].width_ratio > 0.35
                 and content_cols[1].width_ratio > 0.20)
    labels = (['column_en', 'column_example'] if wide_pair
              else ['column_en', 'column_de', 'column_example'])

    regions = list(structural)
    for pos, col in enumerate(content_cols):
        col_type = labels[pos] if pos < len(labels) else 'column_example'
        regions.append(PageRegion(
            type=col_type, x=col.x, y=col.y,
            width=col.width, height=content_h,
            classification_confidence=0.95,
            classification_method='positional',
        ))

    logger.info(f"PositionalColumns: {len(structural)} structural, "
                f"{len(content_cols)} content → "
                f"{[r.type for r in regions]}")
    return regions
+
+
def classify_column_types(geometries: List[ColumnGeometry],
                          content_w: int,
                          top_y: int,
                          img_w: int,
                          img_h: int,
                          bottom_y: int,
                          left_x: int = 0,
                          right_x: int = 0,
                          inv: Optional[np.ndarray] = None) -> List[PageRegion]:
    """Classify column types using a 3-level fallback chain.

    Level 1: Content-based (language + role scoring)
    Level 2: Position + language (old rules enhanced with language detection)
    Level 3: Pure position (exact old code, no regression)

    Args:
        geometries: List of ColumnGeometry from Phase A.
        content_w: Total content width.
        top_y: Top Y of content area.
        img_w: Full image width.
        img_h: Full image height.
        bottom_y: Bottom Y of content area.
        left_x: Left content bound (from _find_content_bounds).
        right_x: Right content bound (from _find_content_bounds).
        inv: Optional inverted binarized page image; when given, it enables
            gap-based header/footer detection and ink checks downstream in
            _add_header_footer.

    Returns:
        List of PageRegion with types, confidence, and method.
    """
    content_h = bottom_y - top_y

    def _with_margins(result: List[PageRegion]) -> List[PageRegion]:
        """Append margin_left / margin_right regions to *result*."""
        margins = _build_margin_regions(result, left_x, right_x, img_w, top_y, content_h)
        return result + margins

    # Special case: single column → plain text page
    # (uses the geometry's own height here, not content_h — the single
    # column keeps its measured extent)
    if len(geometries) == 1:
        geom = geometries[0]
        return _with_margins([PageRegion(
            type='column_text', x=geom.x, y=geom.y,
            width=geom.width, height=geom.height,
            classification_confidence=0.9,
            classification_method='content',
        )])

    # --- Pre-filter: first/last columns with very few words → column_ignore ---
    # Sub-columns from _detect_sub_columns() are exempt: they intentionally
    # have few words (page refs, markers) and should not be discarded.
    ignore_regions = []
    active_geometries = []
    for idx, g in enumerate(geometries):
        if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column:
            ignore_regions.append(PageRegion(
                type='column_ignore', x=g.x, y=g.y,
                width=g.width, height=content_h,
                classification_confidence=0.95,
                classification_method='content',
            ))
            logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)")
        else:
            active_geometries.append(g)

    # Re-index active geometries for classification.
    # NOTE: mutates the ColumnGeometry objects in-place so that log output
    # and scorers see contiguous indices after the pre-filter.
    for new_idx, g in enumerate(active_geometries):
        g.index = new_idx
    geometries = active_geometries

    # Handle edge case: all columns ignored or only 1 left
    if len(geometries) == 0:
        return _with_margins(ignore_regions)
    if len(geometries) == 1:
        geom = geometries[0]
        ignore_regions.append(PageRegion(
            type='column_text', x=geom.x, y=geom.y,
            width=geom.width, height=geom.height,
            classification_confidence=0.9,
            classification_method='content',
        ))
        return _with_margins(ignore_regions)

    # --- Score all columns ---
    lang_scores = [_score_language(g.words) for g in geometries]
    role_scores = [_score_role(g) for g in geometries]

    logger.info(f"ClassifyColumns: language scores: "
                f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}")
    logger.info(f"ClassifyColumns: role scores: "
                f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}")

    # --- Level 1: Content-based classification ---
    regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
        _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
        return _with_margins(ignore_regions + regions)

    # --- Level 2: Position + language enhanced ---
    regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
        _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
        return _with_margins(ignore_regions + regions)

    # --- Level 3: Pure position fallback (old code, no regression) ---
    logger.info("ClassifyColumns: Level 3 (position fallback)")
    regions = _classify_by_position_fallback(geometries, content_w, content_h)
    _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
    return _with_margins(ignore_regions + regions)
+
+
def _classify_by_content(geometries: List[ColumnGeometry],
                         lang_scores: List[Dict[str, float]],
                         role_scores: List[Dict[str, float]],
                         content_w: int,
                         content_h: int) -> Optional[List[PageRegion]]:
    """Level 1: Classify columns purely by content analysis.

    Requires clear language signals to distinguish EN/DE columns.
    Returns None if language signals are too weak.

    Args:
        geometries: Active (non-ignored) column geometries, pre-filtered
            by the caller.
        lang_scores: Per-column language scores with 'eng'/'deu' keys.
        role_scores: Per-column role scores ('reference', 'marker',
            'sentence').
        content_w: Total content width in pixels.
        content_h: Content height; applied to every emitted region.

    Returns:
        Left→right sorted list of PageRegion, or None when the signals
        are ambiguous and a lower classification level should run.
    """
    regions = []
    assigned = set()

    # Step 1: Assign structural roles first (reference, marker)
    # left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref
    left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0

    for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
        is_left_side = geom.x < left_20_threshold
        has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
        # A real page_ref column is narrow, sits on the left, and contains
        # no language content (just numbers/references).
        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
            regions.append(PageRegion(
                type='page_ref', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['reference'],
                classification_method='content',
            ))
            assigned.add(i)
        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['marker'],
                classification_method='content',
            ))
            assigned.add(i)
        elif geom.width_ratio < 0.05 and not is_left_side:
            # Narrow column on the right side → marker, not page_ref
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.8,
                classification_method='content',
            ))
            assigned.add(i)

    # Step 2: Among remaining columns, find EN and DE by language scores
    remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
                 for i in range(len(geometries)) if i not in assigned]

    if len(remaining) < 2:
        # Not enough columns for EN/DE pair
        if len(remaining) == 1:
            i, geom, ls, rs = remaining[0]
            regions.append(PageRegion(
                type='column_text', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.6,
                classification_method='content',
            ))
        regions.sort(key=lambda r: r.x)
        return regions

    # Check if we have enough language signal
    en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
    de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]

    # Position tiebreaker: when language signals are weak, use left=EN, right=DE
    if (not en_candidates or not de_candidates) and len(remaining) >= 2:
        max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
        max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
        if max_eng < 0.15 and max_deu < 0.15:
            # Both signals weak — fall back to positional: left=EN, right=DE
            sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
            best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
            best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
            logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
            # Low confidence: position-only guess
            en_conf = 0.4
            de_conf = 0.4

            regions.append(PageRegion(
                type='column_en', x=best_en[1].x, y=best_en[1].y,
                width=best_en[1].width, height=content_h,
                classification_confidence=en_conf,
                classification_method='content',
            ))
            assigned.add(best_en[0])

            regions.append(PageRegion(
                type='column_de', x=best_de[1].x, y=best_de[1].y,
                width=best_de[1].width, height=content_h,
                classification_confidence=de_conf,
                classification_method='content',
            ))
            assigned.add(best_de[0])

            # Assign remaining as example
            for i, geom, ls, rs in remaining:
                if i not in assigned:
                    regions.append(PageRegion(
                        type='column_example', x=geom.x, y=geom.y,
                        width=geom.width, height=content_h,
                        classification_confidence=0.4,
                        classification_method='content',
                    ))
            regions.sort(key=lambda r: r.x)
            return regions

    if not en_candidates or not de_candidates:
        # Language signals too weak for content-based classification
        logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
        return None

    # Pick the best EN and DE candidates
    best_en = max(en_candidates, key=lambda x: x[2]['eng'])
    best_de = max(de_candidates, key=lambda x: x[2]['deu'])

    # Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
    # Example sentences contain English function words ("the", "a", "is") which inflate
    # the eng score of the Example column. When the best EN candidate sits to the RIGHT
    # of the DE column and there is another EN candidate to the LEFT, prefer the left one
    # — it is almost certainly the real vocabulary column.
    if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
        left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
        if left_of_de:
            alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
            logger.info(
                f"ClassifyColumns: Level 1 position fix — best EN col {best_en[0]} "
                f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
                f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
            best_en = alt_en

    if best_en[0] == best_de[0]:
        # Same column scored highest for both — ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
        return None

    en_conf = best_en[2]['eng']
    de_conf = best_de[2]['deu']

    regions.append(PageRegion(
        type='column_en', x=best_en[1].x, y=best_en[1].y,
        width=best_en[1].width, height=content_h,
        classification_confidence=round(en_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_en[0])

    regions.append(PageRegion(
        type='column_de', x=best_de[1].x, y=best_de[1].y,
        width=best_de[1].width, height=content_h,
        classification_confidence=round(de_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_de[0])

    # Step 3: Remaining columns → example or text based on role scores
    # NOTE(review): both branches emit 'column_example'; the sentence score
    # only influences the confidence, never the type — confirm intended.
    for i, geom, ls, rs in remaining:
        if i in assigned:
            continue
        if rs['sentence'] > 0.4:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=round(rs['sentence'], 2),
                classification_method='content',
            ))
        else:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.5,
                classification_method='content',
            ))

    regions.sort(key=lambda r: r.x)
    return regions
+
+
def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                   lang_scores: List[Dict[str, float]],
                                   content_w: int,
                                   content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: positional heuristics with a language-based EN/DE check.

    Applies the legacy position rules, but swaps the EN/DE assignment when
    the language scores of both candidate columns clearly point the other
    way. Always returns a region list sorted left→right.
    """

    def _mk(rtype: str, geom: ColumnGeometry, conf: float) -> PageRegion:
        # Every level-2 region spans the full content height.
        return PageRegion(
            type=rtype, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        )

    out: List[PageRegion] = []
    pending = list(range(len(geometries)))
    anchor_x = geometries[0].x if geometries else 0
    left_limit = anchor_x + content_w * 0.20

    # Rule 1: leftmost narrow column inside the left 20% with no strong
    # language signal → page reference column.
    g0 = geometries[0]
    ls0 = lang_scores[0]
    if (g0.width_ratio < 0.12 and g0.x < left_limit
            and not (ls0['eng'] > 0.3 or ls0['deu'] > 0.3)):
        out.append(_mk('page_ref', g0, 0.8))
        pending.remove(0)

    # Rule 2: very narrow columns with few words → markers.
    for idx in pending[:]:
        g = geometries[idx]
        if g.width_ratio < 0.06 and g.word_count <= 15:
            out.append(_mk('column_marker', g, 0.7))
            pending.remove(idx)

    # Rule 3: with 3+ columns left, the rightmost carries example sentences.
    if len(pending) >= 3:
        out.append(_mk('column_example', geometries[pending.pop()], 0.7))

    # Rule 4: first two remaining default to EN/DE; swap when both language
    # scores clearly indicate the opposite ordering.
    if len(pending) >= 2:
        idx_en, idx_de = pending[0], pending[1]
        conf = 0.7
        if (lang_scores[idx_en]['deu'] > lang_scores[idx_en]['eng']
                and lang_scores[idx_de]['eng'] > lang_scores[idx_de]['deu']):
            idx_en, idx_de = idx_de, idx_en
            conf = 0.85
            logger.info("ClassifyColumns: Level 2 swapped EN/DE based on language scores")
        out.append(_mk('column_en', geometries[idx_en], conf))
        out.append(_mk('column_de', geometries[idx_de], conf))
        pending = pending[2:]
    elif len(pending) == 1:
        out.append(_mk('column_en', geometries[pending[0]], 0.5))
        pending = []

    # Whatever is still unassigned becomes an example column.
    for idx in pending:
        out.append(_mk('column_example', geometries[idx], 0.5))

    out.sort(key=lambda r: r.x)
    return out
+
+
def _classify_by_position_fallback(geometries: List[ColumnGeometry],
                                   content_w: int,
                                   content_h: int) -> List[PageRegion]:
    """Level 3: Pure position-based fallback (identical to old code).

    Guarantees no regression from the previous behavior.
    """

    def _mk(rtype: str, geom: ColumnGeometry) -> PageRegion:
        # Fallback regions are always full content height at confidence 1.0.
        return PageRegion(
            type=rtype, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        )

    out: List[PageRegion] = []
    pending = list(range(len(geometries)))
    anchor_x = geometries[0].x if geometries else 0
    left_limit = anchor_x + content_w * 0.20

    # Rule 1: leftmost narrow column in the left 20% → page reference.
    first = geometries[0]
    if first.width_ratio < 0.12 and first.x < left_limit:
        out.append(_mk('page_ref', first))
        pending.remove(0)

    # Rule 2: narrow columns with few words → markers.
    for idx in pending[:]:
        g = geometries[idx]
        if g.width_ratio < 0.06 and g.word_count <= 15:
            out.append(_mk('column_marker', g))
            pending.remove(idx)

    # Rule 3: with 3+ columns left, the rightmost is the example column.
    if len(pending) >= 3:
        out.append(_mk('column_example', geometries[pending.pop()]))

    # Rule 4: first remaining → EN, second → DE.
    if len(pending) >= 2:
        out.append(_mk('column_en', geometries[pending[0]]))
        out.append(_mk('column_de', geometries[pending[1]]))
        pending = pending[2:]
    elif len(pending) == 1:
        out.append(_mk('column_en', geometries[pending[0]]))
        pending = []

    # Anything still unassigned → example columns.
    for idx in pending:
        out.append(_mk('column_example', geometries[idx]))

    out.sort(key=lambda r: r.x)
    return out
+
+
+def _detect_header_footer_gaps(
+ inv: np.ndarray,
+ img_w: int,
+ img_h: int,
+) -> Tuple[Optional[int], Optional[int]]:
+ """Detect header/footer boundaries via horizontal projection gap analysis.
+
+ Scans the full-page inverted image for large horizontal gaps in the top/bottom
+ 20% that separate header/footer content from the main body.
+
+ Returns:
+ (header_y, footer_y) — absolute y-coordinates.
+ header_y = bottom edge of header region (None if no header detected).
+ footer_y = top edge of footer region (None if no footer detected).
+ """
+ HEADER_FOOTER_ZONE = 0.20
+ GAP_MULTIPLIER = 2.0
+
+ # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
+ actual_h = min(inv.shape[0], img_h)
+ roi = inv[:actual_h, :]
+ h_proj = np.sum(roi, axis=1).astype(float)
+ proj_w = roi.shape[1]
+ h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj
+
+ # Step 2: Smoothing
+ kernel_size = max(3, actual_h // 200)
+ if kernel_size % 2 == 0:
+ kernel_size += 1
+ h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
+
+ # Step 3: Gap threshold
+ positive = h_smooth[h_smooth > 0]
+ median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
+ gap_threshold = max(median_density * 0.15, 0.003)
+
+ in_gap = h_smooth < gap_threshold
+ MIN_GAP_HEIGHT = max(3, actual_h // 500)
+
+ # Step 4: Collect contiguous gaps
+ raw_gaps: List[Tuple[int, int]] = []
+ gap_start: Optional[int] = None
+ for y in range(len(in_gap)):
+ if in_gap[y]:
+ if gap_start is None:
+ gap_start = y
+ else:
+ if gap_start is not None:
+ gap_height = y - gap_start
+ if gap_height >= MIN_GAP_HEIGHT:
+ raw_gaps.append((gap_start, y))
+ gap_start = None
+ if gap_start is not None:
+ gap_height = len(in_gap) - gap_start
+ if gap_height >= MIN_GAP_HEIGHT:
+ raw_gaps.append((gap_start, len(in_gap)))
+
+ if not raw_gaps:
+ return None, None
+
+ # Step 5: Compute median gap size and large-gap threshold
+ gap_sizes = [g[1] - g[0] for g in raw_gaps]
+ median_gap = float(np.median(gap_sizes))
+ large_gap_threshold = median_gap * GAP_MULTIPLIER
+
+ # Step 6: Find largest qualifying gap in header / footer zones
+ # A separator gap must have content on BOTH sides — edge-touching gaps
+ # (e.g. dewarp padding at bottom) are not valid separators.
+ EDGE_MARGIN = max(5, actual_h // 400)
+ header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
+ footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
+
+ header_y: Optional[int] = None
+ footer_y: Optional[int] = None
+
+ best_header_size = 0
+ for gs, ge in raw_gaps:
+ if gs <= EDGE_MARGIN:
+ continue # skip gaps touching the top edge
+ gap_mid = (gs + ge) / 2
+ gap_size = ge - gs
+ if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
+ if gap_size > best_header_size:
+ best_header_size = gap_size
+ header_y = ge # bottom edge of gap
+
+ best_footer_size = 0
+ for gs, ge in raw_gaps:
+ if ge >= actual_h - EDGE_MARGIN:
+ continue # skip gaps touching the bottom edge
+ gap_mid = (gs + ge) / 2
+ gap_size = ge - gs
+ if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
+ if gap_size > best_footer_size:
+ best_footer_size = gap_size
+ footer_y = gs # top edge of gap
+
+ if header_y is not None:
+ logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
+ f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
+ if footer_y is not None:
+ logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
+ f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")
+
+ return header_y, footer_y
+
+
+def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
+ min_density: float = 0.005) -> bool:
+ """Check whether a horizontal strip contains meaningful ink.
+
+ Args:
+ inv: Inverted binarized image (white-on-black).
+ y_start: Top of the region (inclusive).
+ y_end: Bottom of the region (exclusive).
+ min_density: Fraction of white pixels required to count as content.
+
+ Returns:
+ True if the region contains text/graphics, False if empty margin.
+ """
+ if y_start >= y_end:
+ return False
+ strip = inv[y_start:y_end, :]
+ density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
+ return density > min_density
+
+
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                       img_w: int, img_h: int,
                       inv: Optional[np.ndarray] = None) -> None:
    """Append header/footer/margin regions to *regions* in-place.

    When *inv* is available, boundaries come from projection-gap analysis;
    otherwise the simple top_y/bottom_y content bounds are used. A strip is
    labelled 'header'/'footer' when it contains ink, or
    'margin_top'/'margin_bottom' when it is empty page margin.
    """
    gap_top: Optional[int] = None
    gap_bottom: Optional[int] = None
    if inv is not None:
        gap_top, gap_bottom = _detect_header_footer_gaps(inv, img_w, img_h)

    # --- Top strip: prefer the detected gap edge, else the content top ---
    if gap_top is not None and gap_top > 10:
        top_boundary = gap_top
    elif top_y > 10:
        top_boundary = top_y
    else:
        top_boundary = None

    if top_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
        rtype = 'header' if has_content else 'margin_top'
        regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
                    f"(has_content={has_content})")

    # --- Bottom strip: prefer the detected gap edge, else the content bottom ---
    if gap_bottom is not None and gap_bottom < img_h - 10:
        bottom_boundary = gap_bottom
    elif bottom_y < img_h - 10:
        bottom_boundary = bottom_y
    else:
        bottom_boundary = None

    if bottom_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
        rtype = 'footer' if has_content else 'margin_bottom'
        regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
                                  height=img_h - bottom_boundary))
        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
                    f"height={img_h - bottom_boundary}px (has_content={has_content})")
+
+
+# --- Main Entry Point ---
+
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect columns using two-phase approach: geometry then classification.

    Phase A: detect_column_geometry() — clustering word positions into columns.
    Phase B: positional_column_regions() — positional type assignment
        (no language scoring), after sub-column and broad-column splitting.

    Falls back to projection-based analyze_layout() if geometry detection fails.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        List of PageRegion objects with types, confidence, and method.
    """
    h, w = ocr_img.shape[:2]

    # Phase A: Geometry detection
    result = detect_column_geometry(ocr_img, dewarped_bgr)

    if result is None:
        # Fallback to projection-based layout
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        layout_img = create_layout_image(dewarped_bgr)
        return analyze_layout(layout_img, ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result
    content_w = right_x - left_x

    # Detect header/footer early so sub-column clustering ignores them
    header_y, footer_y = _detect_header_footer_gaps(_inv, w, h) if _inv is not None else (None, None)

    # Split sub-columns (e.g. page references) before classification
    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)

    # Split broad columns that contain EN+DE mixed via word-coverage gaps
    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)

    # Phase B: Positional classification (no language scoring)
    content_h = bottom_y - top_y
    regions = positional_column_regions(geometries, content_w, content_h, left_x)

    # Summary log: text-bearing columns only (header/footer/margins excluded)
    col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    methods = set(r.classification_method for r in regions if r.classification_method)
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")

    return regions
diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py
new file mode 100644
index 0000000..2f630c3
--- /dev/null
+++ b/klausur-service/backend/cv_ocr_engines.py
@@ -0,0 +1,1282 @@
+"""
+OCR engines (RapidOCR, TrOCR, LightOn), vocab postprocessing, and text cleaning.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import io
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+ IPA_AVAILABLE,
+ PageRegion,
+ RowGeometry,
+ _britfone_dict,
+ _ipa_convert_american,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+try:
+ from PIL import Image
+except ImportError:
+ Image = None # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Pipeline Step 5: Word Grid from Columns × Rows
+# =============================================================================
+
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Group OCR words into visual lines and render each as one string.

    Words are clustered into lines by vertical proximity (y_tolerance_px),
    then each line's words are joined with single spaces. Returns one
    string per visual line; empty input yields an empty list.
    """
    if not words:
        return []

    rendered: List[str] = []
    for line_words in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
        rendered.append(' '.join(w['text'] for w in line_words))
    return rendered
+
+
+def _rejoin_hyphenated(lines: List[str]) -> List[str]:
+ """Rejoin words split by line-break hyphenation.
+
+ E.g. ['Fuß-', 'boden'] → ['Fußboden']
+ ['some text-', 'thing here'] → ['something here']
+ """
+ if len(lines) <= 1:
+ return lines
+
+ result = []
+ i = 0
+ while i < len(lines):
+ line = lines[i]
+ # If line ends with '-' and there's a next line, rejoin
+ if i + 1 < len(lines) and line.rstrip().endswith('-'):
+ stripped = line.rstrip()
+ # Get the word fragment before hyphen (last word)
+ prefix = stripped[:-1] # remove trailing hyphen
+ next_line = lines[i + 1]
+ # Join: last word of this line + first word of next line
+ prefix_words = prefix.rsplit(' ', 1)
+ next_words = next_line.split(' ', 1)
+ if len(prefix_words) > 1:
+ joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
+ else:
+ joined = prefix_words[0] + next_words[0]
+ remainder = next_words[1] if len(next_words) > 1 else ''
+ if remainder:
+ result.append(joined + ' ' + remainder)
+ else:
+ result.append(joined)
+ i += 2
+ else:
+ result.append(line)
+ i += 1
+ return result
+
+
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Render OCR words as newline-separated text in reading order.

    Builds visual lines (grouped by Y-tolerance, sorted by X), repairs
    end-of-line hyphenation, and joins the lines with newlines.
    """
    raw_lines = _words_to_reading_order_lines(words, y_tolerance_px)
    merged = _rejoin_hyphenated(raw_lines)
    return '\n'.join(merged)
+
+
+# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
+
+_rapid_engine = None
+RAPIDOCR_AVAILABLE = False
+
+try:
+ from rapidocr import RapidOCR as _RapidOCRClass
+ from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
+ RAPIDOCR_AVAILABLE = True
+ logger.info("RapidOCR available — can be used as alternative to Tesseract")
+except ImportError:
+ logger.info("RapidOCR not installed — using Tesseract only")
+
+
def _get_rapid_engine():
    """Return the module-wide RapidOCR engine, creating it on first use.

    The engine is configured once with the PP-OCRv5 Latin server model so
    German umlauts (ä, ö, ü, ß) are recognized; later calls reuse it.
    """
    global _rapid_engine
    if _rapid_engine is not None:
        return _rapid_engine

    _rapid_engine = _RapidOCRClass(params={
        # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß)
        "Rec.lang_type": _LangRec.LATIN,
        "Rec.model_type": _ModelType.SERVER,
        "Rec.ocr_version": _OCRVersion.PPOCRV5,
        # Tighter detection boxes to reduce word merging
        "Det.unclip_ratio": 1.3,
        # Lower threshold to detect small chars (periods, ellipsis, phonetics)
        "Det.box_thresh": 0.4,
        # Silence verbose logging
        "Global.log_level": "critical",
    })
    logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
+
+
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """Run RapidOCR on a specific region, returning word dicts compatible with Tesseract format.

    Args:
        img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on color/gray).
        region: Region to crop and OCR.

    Returns:
        List of word dicts with text, left, top, width, height, conf,
        region_type. Coordinates are absolute page coordinates and the
        confidence is scaled to 0-100 like Tesseract's.
    """
    engine = _get_rapid_engine()

    # Crop region from BGR image (numpy slicing clamps to image bounds)
    crop = img_bgr[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    result = engine(crop)

    if result is None or result.boxes is None or result.txts is None:
        return []

    words = []
    # result.boxes: shape (N, 4, 2) — 4 corner points per text line
    # result.txts:  tuple of recognized strings
    # result.scores: tuple of confidences in [0, 1]
    for box, txt, score in zip(result.boxes, result.txts, result.scores):
        if not txt or not txt.strip():
            continue

        # Axis-aligned bbox from the (possibly rotated) quad corners
        # [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (clockwise from top-left).
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        left = int(min(xs))
        top = int(min(ys))
        w = int(max(xs) - left)
        h = int(max(ys) - top)

        words.append({
            'text': txt.strip(),
            'left': left + region.x,  # translate crop-local → absolute coords
            'top': top + region.y,
            'width': w,
            'height': h,
            'conf': int(score * 100),  # 0-100 like Tesseract
            'region_type': region.type,
        })

    return words
+
+
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """Run TrOCR on a region. Returns line-level word dicts (same format as ocr_region_rapid).

    Uses trocr_service.get_trocr_model() + _split_into_lines() for line segmentation.
    Bboxes are approximated from equal line-height distribution within the region.
    Falls back to Tesseract if TrOCR is not available.

    Args:
        img_bgr: Full-page BGR image.
        region: Region to crop and recognize.
        handwritten: Select the handwriting TrOCR model variant.

    Returns:
        One dict per recognized line (text, left, top, width, height, conf,
        region_type); empty list when nothing is recognized or on error.
    """
    # Imported lazily so the module loads even without the TrOCR service.
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available

    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        if region.height > 0 and region.width > 0:
            # ocr_region expects a grayscale page image
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
            if ocr_img_crop is not None:
                return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        return []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        lines = _split_into_lines(pil_crop)
        if not lines:
            # Segmentation failed — recognize the whole crop as one line
            lines = [pil_crop]

        # Run recognition line by line on whatever device the model lives on
        device = next(model.parameters()).device
        all_text = []
        confidences = []
        for line_img in lines:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if text_line:
                all_text.append(text_line)
                # Heuristic confidence: very short lines are less reliable
                confidences.append(0.85 if len(text_line) > 3 else 0.5)

        if not all_text:
            return []

        # Approximate per-line bboxes by slicing the region into equal bands
        avg_conf = int(sum(confidences) / len(confidences) * 100)
        line_h = region.height // max(len(all_text), 1)
        words = []
        for i, line in enumerate(all_text):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
+
+
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region. Returns line-level word dicts (same format as ocr_region_rapid).

    Falls back to RapidOCR or Tesseract if LightOnOCR is not available.
    The model returns plain text only, so line bboxes are approximated by
    slicing the region into equal-height bands; conf is fixed at 85.
    """
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import io  # NOTE(review): appears unused in this function — confirm before removing
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        # Vision-language model: prompt with the cropped image only.
        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        # Equal-height band per decoded line — approximate geometry only.
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        line_h = region.height // max(len(lines), 1)
        words = []
        for i, line in enumerate(lines):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
+
+
+# =============================================================================
+# Post-Processing: Deterministic Quality Fixes
+# =============================================================================
+
+# --- A. Character Confusion Fix (I/1/l) ---
+
+# Common OCR confusion pairs in vocabulary context
+_CHAR_CONFUSION_RULES = [
+ # "1" at word start followed by lowercase → likely "I" or "l"
+ # Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
+ (re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
+ # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
+ (re.compile(r'(? List[Dict[str, Any]]:
+ """Fix common OCR character confusions using context.
+
+ Deterministic rules:
+ - "1" at word start → "I" or "l" based on context
+ - Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I"
+ - "y " artifact at word boundaries → remove (e.g. "y you" → "you")
+ """
+ for entry in entries:
+ en = entry.get('english', '') or ''
+ de = entry.get('german', '') or ''
+ ex = entry.get('example', '') or ''
+
+ # Apply general rules to all fields
+ for pattern, replacement in _CHAR_CONFUSION_RULES:
+ en = pattern.sub(replacement, en)
+ de = pattern.sub(replacement, de)
+ ex = pattern.sub(replacement, ex)
+
+ # Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
+ de_lower_words = set(de.lower().replace(',', ' ').split())
+ if de_lower_words & _DE_INDICATORS_FOR_EN_I:
+ # Any remaining "1" in EN that looks like "I"
+ en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
+
+ # Fix "y " artifact before repeated word: "y you" → "you"
+ en = re.sub(r'\by\s+([a-z])', r'\1', en)
+ ex = re.sub(r'\by\s+([a-z])', r'\1', ex)
+
+ entry['english'] = en.strip()
+ entry['german'] = de.strip()
+ entry['example'] = ex.strip()
+
+ return entries
+
+
+# --- B. Comma-Separated Word Form Splitting ---
+
+def _is_singular_plural_pair(parts: List[str]) -> bool:
+ """Detect if comma-separated parts are singular/plural forms of the same word.
+
+ E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
+ "break, broke, broken" → False (different verb forms, OK to split).
+
+ Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
+ OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
+ """
+ if len(parts) != 2:
+ return False
+
+ a, b = parts[0].lower().strip(), parts[1].lower().strip()
+ if not a or not b:
+ return False
+
+ # Common prefix heuristic: if words share >= 50% of the shorter word,
+ # they are likely forms of the same word (Maus/Mäuse, child/children).
+ min_len = min(len(a), len(b))
+ common = 0
+ for ca, cb in zip(a, b):
+ if ca == cb:
+ common += 1
+ else:
+ break
+ if common >= max(2, min_len * 0.5):
+ return True
+
+ # Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
+ umlaut_map = str.maketrans('aou', 'äöü')
+ if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
+ return True
+
+ return False
+
+
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Split entries with comma-separated word forms into individual entries.

    E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
    → 3 entries: break/brechen, broke/brach, broken/gebrochen

    Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse"
    because those are forms of the same vocabulary entry.

    Only splits when both EN and DE have more than one comma-part, the
    counts match, and every part is short (≤ 3 words, i.e. word forms
    rather than sentences). Split-off entries are marked with
    ``split_from_comma`` and get an empty example (examples are attached
    later by _attach_example_sentences); row indices are renumbered at
    the end.
    """
    result: List[Dict[str, Any]] = []

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()

        # Split by comma (but not inside brackets or parentheses)
        en_parts = _split_by_comma(en)
        de_parts = _split_by_comma(de)

        # Only split if we have multiple parts and counts match
        should_split = False
        if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
            # All parts must be short (word forms, not sentences)
            if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
                # Do NOT split singular/plural pairs (2 parts that are
                # forms of the same word)
                if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts):
                    should_split = False
                else:
                    should_split = True

        if not should_split:
            result.append(entry)
            continue

        # Split into individual entries. The DE fallback to '' is
        # defensive only — counts are equal whenever should_split is True.
        for k in range(len(en_parts)):
            sub = dict(entry)  # shallow copy
            sub['english'] = en_parts[k].strip()
            sub['german'] = de_parts[k].strip() if k < len(de_parts) else ''
            sub['example'] = ''  # examples get attached later
            sub['split_from_comma'] = True
            result.append(sub)

    # Re-number
    for i, e in enumerate(result):
        e['row_index'] = i

    return result
+
+
+def _split_by_comma(text: str) -> List[str]:
+ """Split text by commas, but not inside brackets [...] or parens (...)."""
+ if ',' not in text:
+ return [text]
+
+ parts = []
+ depth_bracket = 0
+ depth_paren = 0
+ current = []
+
+ for ch in text:
+ if ch == '[':
+ depth_bracket += 1
+ elif ch == ']':
+ depth_bracket = max(0, depth_bracket - 1)
+ elif ch == '(':
+ depth_paren += 1
+ elif ch == ')':
+ depth_paren = max(0, depth_paren - 1)
+ elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
+ parts.append(''.join(current).strip())
+ current = []
+ continue
+ current.append(ch)
+
+ if current:
+ parts.append(''.join(current).strip())
+
+ # Filter empty parts
+ return [p for p in parts if p]
+
+
+# --- C. Example Sentence Attachment ---
+
+def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
+ """Find the vocab entry whose English word(s) best match the example sentence.
+
+ Returns index into vocab_entries, or -1 if no match found.
+ Uses word stem overlap: "a broken arm" matches "broken" or "break".
+ """
+ if not vocab_entries or not example_text:
+ return -1
+
+ example_lower = example_text.lower()
+ example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
+
+ best_idx = -1
+ best_score = 0
+
+ for i, entry in enumerate(vocab_entries):
+ en = (entry.get('english', '') or '').lower()
+ if not en:
+ continue
+
+ # Extract vocab words (split on space, comma, newline)
+ vocab_words = set(re.findall(r'[a-zäöüß]+', en))
+
+ # Score: how many vocab words appear in the example?
+ # Also check if example words share a common stem (first 4 chars)
+ direct_matches = vocab_words & example_words
+ score = len(direct_matches) * 10
+
+ # Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
+ if score == 0:
+ for vw in vocab_words:
+ if len(vw) < 3:
+ continue
+ stem = vw[:4] if len(vw) >= 4 else vw[:3]
+ for ew in example_words:
+ if len(ew) >= len(stem) and ew[:len(stem)] == stem:
+ score += 5
+ break
+
+ if score > best_score:
+ best_score = score
+ best_idx = i
+
+ return best_idx if best_score > 0 else -1
+
+
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fold translation-less sentence rows into the example field of matching vocab rows.

    Vocabulary worksheets often have:
        Row 1: break, broke, broken / brechen, brach, gebrochen
        Row 2: a broken arm   (no DE → example for "broken")
        Row 3: a broken plate (no DE → example for "broken")
        Row 4: egg / Ei       (has DE → new vocab entry)

    Deterministic rules:
    - A row is an example candidate when it has EN text, no real DE text
      (DE of ≤ 1 char counts as OCR noise — "Ei" with 2 chars is a valid
      German word, hence the threshold), and the EN text reads like a
      sentence (≥ 4 words or sentence-final punctuation). Short EN rows
      without DE are kept as vocab entries whose DE was missed by OCR.
    - The target entry is chosen by English word overlap
      (_find_best_vocab_match); without any overlap the nearest preceding
      vocab entry wins.
    - Multiple examples are joined with " | "; row indices are renumbered.
    """
    if not entries:
        return entries

    kept: List[Dict[str, Any]] = []      # rows that remain vocab entries
    pending: Dict[int, List[str]] = {}   # kept-index → collected example texts

    for row in entries:
        en_text = (row.get('english', '') or '').strip()
        de_text = (row.get('german', '') or '').strip()

        translation_present = len(de_text) > 1
        sentence_like = (
            len(en_text.split()) >= 4
            or en_text.rstrip().endswith(('.', '!', '?'))
        )

        if en_text and not translation_present and sentence_like and kept:
            target = _find_best_vocab_match(en_text, kept)
            if target < 0:
                # No word overlap → attach to the nearest preceding entry.
                target = len(kept) - 1
            pending.setdefault(target, []).append(en_text)
        else:
            kept.append(row)

    # Merge the collected examples into their matched entries.
    for target, texts in pending.items():
        if 0 <= target < len(kept):
            row = kept[target]
            prior = (row.get('example', '') or '').strip()
            joined = ' | '.join(texts)
            row['example'] = f"{prior} | {joined}" if prior else joined

    # Re-number
    for i, row in enumerate(kept):
        row['row_index'] = i

    return kept
+
+
+# --- D. Phonetic Bracket IPA Replacement ---
+
+# Pattern: word followed by any bracket type containing phonetic content.
+# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
+# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
+# This intentionally matches mixed brackets (e.g. {content]) because
+# Tesseract frequently misrecognizes bracket characters.
# Group 1 = the word before the bracket, group 2 = the (non-greedy) bracket
# interior; opener and closer may be mismatched types on purpose.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)

# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')

# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
# NOTE(review): not referenced in this chunk — presumably consumed by the
# full-page OCR path elsewhere in the file; verify before removing.
_MIN_WORD_CONF = 30
+
+
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up IPA for a word using the selected pronunciation dictionary.

    Args:
        word: English word to look up.
        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).

    Returns:
        IPA string or None if not found. The preferred source is tried
        first and the other one serves as fallback; eng_to_ipa results
        containing '*' (its unknown-word marker) are rejected.
    """
    key = word.lower().strip()
    if not key:
        return None

    def from_britfone() -> Optional[str]:
        if not _britfone_dict:
            return None
        return _britfone_dict.get(key) or None

    def from_american() -> Optional[str]:
        if not _ipa_convert_american:
            return None
        converted = _ipa_convert_american(key)
        if converted and '*' not in converted:
            return converted
        return None

    if pronunciation == 'british' and _britfone_dict:
        return from_britfone() or from_american()

    if pronunciation == 'american' and _ipa_convert_american:
        return from_american() or from_britfone()

    # Preferred source unavailable — try any available source.
    return from_britfone() or from_american()
+
+
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Swap OCR-garbled phonetic transcriptions for dictionary IPA.

    Patterns like "dance [du:ns]" become:
    - British: "dance [dˈɑːns]" (Britfone, MIT)
    - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    Only the ENGLISH field is touched: the German and example fields carry
    meaningful parenthetical content ("Eis (gefrorenes Wasser)",
    "(Salat-)Gurke", "(brauchen)") that must never be mistaken for
    phonetics. A replacement only happens when the word in front of the
    bracket exists in the IPA dictionary.
    """
    if not IPA_AVAILABLE:
        return entries

    fixed = 0
    for entry in entries:
        text = entry.get('english', '') or ''
        # Cheap pre-filter: skip entries without any bracket opener.
        if all(opener not in text for opener in '[{('):
            continue
        new_text = _replace_phonetics_in_text(text, pronunciation)
        if new_text == text:
            continue
        logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'")
        fixed += 1
        entry['english'] = new_text

    if fixed:
        logger.info(f"_fix_phonetic_brackets: {fixed} IPA replacements in {len(entries)} entries")
    return entries
+
+
# Grammar particles that appear in brackets after English words:
# cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
# Entries are lowercase; lookups must lowercase their tokens first.
_GRAMMAR_BRACKET_WORDS: frozenset = frozenset({
    # English prepositions/particles commonly in vocab tables
    'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
    # English grammar abbreviations used in vocab tables
    'sth', 'sb', 'adj', 'adv',
})
+
+
def _is_grammar_bracket_content(content: str) -> bool:
    """Return True if bracket content is grammar info in the ENGLISH field.

    Grammar info: cross (with), complain (about/of), agree (on/with)
    NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]

    Only the English field is ever processed for IPA replacement, so only
    English grammar particles need recognizing — every other bracket
    content is treated as (possibly garbled) IPA.
    """
    if not content:
        return False

    # Handle alternative lists like (about/of) or (on/with).
    tokens = [part.strip().lower() for part in content.split('/')]
    tokens = [part for part in tokens if part]
    if not tokens:
        return False

    # Every token must be a known grammar word for the whole bracket
    # to count as grammar info.
    return all(part in _GRAMMAR_BRACKET_WORDS for part in tokens)
+
+
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    Any bracket type (including mismatched pairs) is matched and replaced
    with dictionary IPA when the preceding word has an entry. Legitimate
    parenthetical content like (zer)brechen or (veranstaltung) is kept,
    and a second pass strips orphan brackets that hold garbled IPA with no
    word in front of them.
    """
    if not IPA_AVAILABLE:
        return text

    def swap_with_dictionary_ipa(match):
        word = match.group(1)
        inner = match.group(2).strip()
        original = match.group(0)

        # Multi-word bracket content is regular text, never phonetics.
        if len(inner.split()) > 3:
            return original

        ipa = _lookup_ipa(word, pronunciation)
        if not ipa:
            # Word unknown to the dictionary — leave the bracket alone.
            return original

        # Word has IPA → bracket content is phonetic (garbled or correct),
        # except grammar particles like cross (with), which must stay.
        if _is_grammar_bracket_content(inner):
            return original

        logger.debug(f"phonetic: '{original}' → '{word} [{ipa}]'")
        return f"{word} [{ipa}]"

    text = _PHONETIC_BRACKET_RE.sub(swap_with_dictionary_ipa, text)

    # Second pass: drop leftover orphan brackets (the main regex requires a
    # word in front; these have none), e.g. "[mais]", "{'mani setva]",
    # trailing "(kros]". Grammar parens like "(about/of)" and genuine IPA
    # (detected via Unicode IPA characters) survive.
    def drop_orphan_bracket(m):
        inner = m.group(1).strip()
        if _is_grammar_bracket_content(inner):
            return m.group(0)
        if any(ch in _IPA_CHARS for ch in inner):
            return m.group(0)
        logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
        return ''

    return re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', drop_orphan_bracket, text).strip()
+
+
+def _assign_row_words_to_columns(
+ row: RowGeometry,
+ columns: List[PageRegion],
+) -> Dict[int, List[Dict]]:
+ """Assign each word in a row to exactly one column.
+
+ Uses a two-pass strategy:
+ 1. Containment: if a word's center falls within a column's horizontal
+ bounds (with padding), assign it to that column.
+ 2. Nearest center: for words not contained by any column, fall back to
+ nearest column center distance.
+
+ This prevents long sentences in wide columns (e.g. example) from having
+ their rightmost words stolen by an adjacent column.
+
+ Args:
+ row: Row with words (relative coordinates).
+ columns: Sorted list of columns (absolute coordinates).
+
+ Returns:
+ Dict mapping col_index → list of words assigned to that column.
+ """
+ result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))}
+
+ if not row.words or not columns:
+ return result
+
+ left_x = row.x # content ROI left (absolute)
+
+ # Build non-overlapping column assignment ranges using midpoints.
+ # For adjacent columns, the boundary is the midpoint between them.
+ # This prevents words near column borders from being assigned to
+ # the wrong column (e.g. "We" at the start of an example sentence
+ # being stolen by the preceding DE column).
+ n = len(columns)
+ col_ranges_rel = [] # (assign_left, assign_right) per column
+ for ci, col in enumerate(columns):
+ col_left_rel = col.x - left_x
+ col_right_rel = col_left_rel + col.width
+
+ # Left boundary: midpoint to previous column, or 0
+ if ci == 0:
+ assign_left = 0
+ else:
+ prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
+ assign_left = (prev_right + col_left_rel) / 2
+
+ # Right boundary: midpoint to next column, or infinity (row width)
+ if ci == n - 1:
+ assign_right = row.width + 100 # generous for last column
+ else:
+ next_left = columns[ci + 1].x - left_x
+ assign_right = (col_right_rel + next_left) / 2
+
+ col_ranges_rel.append((assign_left, assign_right))
+
+ for w in row.words:
+ w_left = w['left']
+ w_right = w_left + w['width']
+ w_center_x = w_left + w['width'] / 2
+
+ # Primary: overlap-based matching — assign to column with most overlap.
+ # This is more robust than center-based for narrow columns (page_ref)
+ # where the last character's center may fall into the next column.
+ best_col = -1
+ best_overlap = 0
+ for ci, col in enumerate(columns):
+ col_left_rel = col.x - left_x
+ col_right_rel = col_left_rel + col.width
+ overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
+ if overlap > best_overlap:
+ best_overlap = overlap
+ best_col = ci
+
+ if best_col >= 0 and best_overlap > 0:
+ result[best_col].append(w)
+ else:
+ # Fallback: center-based range matching
+ assigned = False
+ for ci, (al, ar) in enumerate(col_ranges_rel):
+ if al <= w_center_x < ar:
+ result[ci].append(w)
+ assigned = True
+ break
+
+ if not assigned:
+ # Last resort: nearest column center
+ best_col = 0
+ col_left_0 = columns[0].x - left_x
+ best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
+ for ci in range(1, n):
+ col_left = columns[ci].x - left_x
+ dist = abs(w_center_x - (col_left + columns[ci].width / 2))
+ if dist < best_dist:
+ best_dist = dist
+ best_col = ci
+ result[best_col].append(w)
+
+ return result
+
+
# Regex: at least 2 consecutive letters (Latin + umlauts + accents)
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
# Single letter from the same alphabet — used to extract alpha-only residues.
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')

# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
# that do NOT appear here are treated as trailing OCR noise.
# Entries are lowercase; lookups must lowercase their tokens first.
_COMMON_SHORT_WORDS: set = {
    # EN 1-2 letter
    'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
    'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
    'or', 'so', 'to', 'up', 'us', 'we',
    # EN 3 letter
    'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
    'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
    'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
    'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
    'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
    'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
    'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
    'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
    'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
    'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
    'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
    'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
    'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
    'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
    'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
    'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
    'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
    'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
    'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
    'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
    'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
    'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
    'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
    'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
    'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
    'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
    'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
    'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
    'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
    'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
    'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
    'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
    'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
    'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
    'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
    'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
    'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
    'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
    'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
    'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
    'zap', 'zip', 'zoo',
    # DE 2-3 letter
    'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
    'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
    'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
    'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
    'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
    'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
    'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
    'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
    'wut', 'zum', 'zur',
}
+
# Known abbreviations found in EN/DE textbooks and dictionaries.
# Stored WITHOUT trailing period (the noise filter strips periods).
# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
# Entries are lowercase; lookups must lowercase their tokens first.
_KNOWN_ABBREVIATIONS: set = {
    # EN dictionary meta-words
    'sth', 'sb', 'smth', 'smb', 'sbd',
    # EN general
    'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
    'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
    # EN references / textbook
    'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
    'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
    'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
    'ans', 'wb', 'tb', 'vocab',
    # EN parts of speech / grammar
    'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
    'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
    'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
    'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
    'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
    'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
    'syn', 'ant', 'opp', 'var', 'orig',
    # EN titles
    'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
    # EN pronunciation
    'br', 'am', 'brit', 'amer',
    # EN units
    'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
    # DE general
    'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
    'bes', 'insb', 'insbes', 'bspw', 'ca',
    'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
    'inkl', 'exkl', 'zzgl', 'abzgl',
    # DE references
    'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
    'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
    's', 'sp', 'zit', 'zs', 'vlg',
    # DE grammar
    'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
    'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
    'trennb', 'untrennb', 'ugs', 'geh', 'pej',
    # DE regional
    'nordd', 'österr', 'schweiz',
    # Linguistic
    'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
    'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
    'count', 'uncount', 'indef', 'def', 'poss', 'demon',
}
+
+
def _is_noise_tail_token(token: str) -> bool:
    """Check if a token at the END of cell text is trailing OCR noise.

    Trailing fragments from image edges, borders and neighbouring cells
    are very common OCR artifacts, so this filter is deliberately more
    aggressive than a general word filter: any short token that is not in
    the dictionary of common EN/DE words counts as noise.

    Examples of noise: "Es)", "3", "ee", "B"
    Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]"
    """
    tok = token.strip()
    if not tok:
        return True

    # Ellipsis is legitimate cell content.
    if tok in ('...', '…'):
        return False

    # Phonetic bracket fragments are kept: [eg], [maus], ["a:mand], serva]
    if tok.startswith(('[', '["', "['")) or tok.endswith(']'):
        return False

    letters = _RE_ALPHA.findall(tok)
    if not letters:
        # Nothing alphabetic at all ("3", ")", "|") → noise.
        return True

    letters_only = ''.join(letters)

    # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep.
    if letters_only.lower() in _KNOWN_ABBREVIATIONS:
        return False

    # Drop ordinary trailing punctuation ("cupcakes." → "cupcakes"),
    # then remove dictionary punctuation (parens, hyphens, slashes,
    # dots) that is normal in entries like "(Salat-)Gurke",
    # "Tanz(veranstaltung)", "(zer)brechen", "wir/uns", "e.g.".
    depunct = re.sub(r'[.,;:!?]+$', '', tok) or tok
    residue = re.sub(r'[()\-/.,;:!?]', '', depunct)
    residue_letters = ''.join(_RE_ALPHA.findall(residue))
    # Anything non-alphabetic left over ("3d", "B|", "x7") marks noise.
    suspicious = bool(residue) and len(residue) > len(residue_letters)

    if not suspicious:
        # Long clean alpha words (4+ chars) are likely real.
        if len(letters_only) >= 4:
            return False
        # Short words must be in the common-word dictionary.
        if letters_only.lower() in _COMMON_SHORT_WORDS:
            return False

    # Default: short or suspicious → noise.
    return True
+
+
def _is_garbage_text(text: str) -> bool:
    """Decide whether a cell's full text is OCR garbage from image areas.

    Text counts as garbage when it contains no token that looks like a real
    dictionary word — catches e.g. "(ci]oeu" or "uanoaain.".
    """
    candidates = _RE_REAL_WORD.findall(text)

    if not candidates:
        # No alphabetic word at all — only dotted abbreviations may survive
        # (e.g. "e.g." collapses to "eg" once non-letters are removed).
        letters = ''.join(_RE_ALPHA.findall(text)).lower()
        return letters not in _KNOWN_ABBREVIATIONS

    vowel_set = 'aeiouäöü'
    for candidate in candidates:
        low = candidate.lower()
        # A known short word or abbreviation is proof of real content.
        if low in _COMMON_SHORT_WORDS or low in _KNOWN_ABBREVIATIONS:
            return False
        # Longer words (>= 4 chars): real EN/DE words sit at roughly
        # 20-60% vowels; extreme ratios ("uanoaain", "cioeu") are garbage.
        if len(low) >= 4:
            vowel_ratio = sum(c in vowel_set for c in low) / len(low)
            if 0.15 <= vowel_ratio <= 0.65:
                return False  # plausible vowel ratio → real word

    return True
+
+
def _clean_cell_text(text: str) -> str:
    """Strip OCR noise from a cell's text using three generic filters.

    1. No real alphabetic word (>= 2 letters) and not a dotted
       abbreviation ("e.g.", "z.B.") → empty string.
    2. Whole text classified as garbage (no dictionary word) → empty.
    3. Trailing noise tokens are removed from the end.
    """
    cleaned = text.strip()
    if not cleaned:
        return ''

    # Filter 1: must contain at least one real word, unless the letters
    # alone form a known abbreviation.
    if not _RE_REAL_WORD.search(cleaned):
        letters = ''.join(_RE_ALPHA.findall(cleaned)).lower()
        if letters not in _KNOWN_ABBREVIATIONS:
            return ''

    # Filter 2: reject whole-cell garbage from image regions.
    if _is_garbage_text(cleaned):
        return ''

    # Filter 3: drop noise tokens hanging off the end of the text.
    tokens = cleaned.split()
    while tokens and _is_noise_tail_token(tokens[-1]):
        del tokens[-1]

    return ' '.join(tokens) if tokens else ''
+
+
def _clean_cell_text_lite(text: str) -> str:
    """Noise filter for cell-first OCR, where each cell is cropped in isolation.

    With no neighbour content visible in the crop there is nothing to strip
    from the tail, so only two filters apply:

    1. No real alphabetic word (>= 2 letters) and not a known
       abbreviation → empty string.
    2. Whole text is garbage (no dictionary word) → empty string.
    """
    cleaned = text.strip()
    if not cleaned:
        return ''

    # Filter 1: require at least one real word, or a dotted abbreviation.
    if not _RE_REAL_WORD.search(cleaned):
        letters = ''.join(_RE_ALPHA.findall(cleaned)).lower()
        if letters not in _KNOWN_ABBREVIATIONS:
            return ''

    # Filter 2: reject whole-cell garbage from image regions.
    return '' if _is_garbage_text(cleaned) else cleaned
+
+
+# ---------------------------------------------------------------------------
+# Bold detection via stroke-width analysis (relative / page-level)
+# ---------------------------------------------------------------------------
+
def _measure_stroke_width(gray_crop: np.ndarray) -> float:
    """Estimate the mean stroke width of text in a grayscale cell crop.

    The crop is Otsu-binarised (ink → white), a distance transform gives the
    half stroke width at each ink pixel, and an erosion-based pseudo-skeleton
    selects stroke centres whose distance values are averaged.

    Returns:
        Mean stroke width as a percentage of the crop height (DPI-agnostic),
        or 0.0 when the crop is missing, too small, or nearly empty.
    """
    if gray_crop is None or gray_crop.size == 0:
        return 0.0
    height, width = gray_crop.shape[:2]
    if height < 10 or width < 10:
        return 0.0

    # Otsu inverse threshold: ink becomes white (255) on black background.
    _, ink = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    if cv2.countNonZero(ink) < 20:
        return 0.0

    # Distance to the nearest background pixel ≈ half stroke width.
    dist_map = cv2.distanceTransform(ink, cv2.DIST_L2, 3)

    # Approximate skeleton: erode repeatedly, stopping before it vanishes.
    cross = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    skeleton = ink.copy()
    for _ in range(max(1, min(height, width) // 6)):
        shrunk = cv2.erode(skeleton, cross)
        if cv2.countNonZero(shrunk) < 5:
            break
        skeleton = shrunk

    centre_mask = skeleton > 0
    if not np.any(centre_mask):
        return 0.0
    # Normalise by cell height so the metric is comparable across DPIs.
    return float(np.mean(dist_map[centre_mask])) / max(height, 1) * 100
+
+
def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
                         img_w: int, img_h: int) -> None:
    """Mark bold cells by comparing each cell's stroke width to the page median.

    Pass 1 measures the normalised stroke width of every cell with text;
    Pass 2 flags cells whose stroke width exceeds 1.4× the page median.
    The relative threshold adapts automatically to font, DPI and scan
    quality. Mutates each cell dict in-place via the 'is_bold' key.
    """
    if ocr_img is None:
        return

    # Pass 1: per-cell stroke width (0.0 for empty / unmeasurable cells).
    per_cell: List[float] = []
    measured: List[float] = []
    for cell in cells:
        stroke = 0.0
        if cell.get('text', '').strip():
            box = cell['bbox_px']
            top = max(0, box['y'])
            bottom = min(img_h, box['y'] + box['h'])
            left = max(0, box['x'])
            right = min(img_w, box['x'] + box['w'])
            if bottom > top and right > left:
                stroke = _measure_stroke_width(ocr_img[top:bottom, left:right])
        per_cell.append(stroke)
        if stroke > 0:
            measured.append(stroke)

    # A meaningful median needs a minimum sample of measured cells.
    if len(measured) < 3:
        return

    page_median = float(np.median(measured))
    if page_median <= 0:
        return

    # Pass 2: significantly thicker strokes than the page norm → bold.
    for cell, stroke in zip(cells, per_cell):
        cell['is_bold'] = stroke > 0 and (stroke / page_median) > 1.4
+
+
+# ---------------------------------------------------------------------------
diff --git a/klausur-service/backend/cv_preprocessing.py b/klausur-service/backend/cv_preprocessing.py
new file mode 100644
index 0000000..133d47f
--- /dev/null
+++ b/klausur-service/backend/cv_preprocessing.py
@@ -0,0 +1,1166 @@
+"""
+Image I/O, orientation detection, deskew, and dewarp for the CV vocabulary pipeline.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import time
+from collections import defaultdict
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+ CV2_AVAILABLE,
+ TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+# Guarded imports — mirror cv_vocab_types guards
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+try:
+ import pytesseract
+ from PIL import Image
+except ImportError:
+ pytesseract = None # type: ignore[assignment]
+ Image = None # type: ignore[assignment,misc]
+
+
def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
    """Render a PDF page to a high-resolution numpy array (BGR).

    Args:
        pdf_data: Raw PDF bytes.
        page_number: 0-indexed page number (must be non-negative).
        zoom: Zoom factor (3.0 = 432 DPI).

    Returns:
        numpy array in BGR format.

    Raises:
        ValueError: If the requested page does not exist.
    """
    import fitz  # PyMuPDF

    pdf_doc = fitz.open(stream=pdf_data, filetype="pdf")
    # BUGFIX: the document was previously leaked when the page check raised
    # or rendering failed — close it in a finally block instead.
    try:
        # Also reject negative indices: fitz would otherwise index from the
        # end of the document and silently render the wrong page.
        if page_number < 0 or page_number >= pdf_doc.page_count:
            raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)")

        page = pdf_doc[page_number]
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        # Convert raw pixmap samples to a numpy image, then to BGR.
        img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        if pix.n == 4:  # RGBA
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
        elif pix.n == 3:  # RGB
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
        else:  # Grayscale
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
        return img_bgr
    finally:
        # Always release the document, even when validation/rendering fails.
        pdf_doc.close()
+
+
def render_image_high_res(image_data: bytes) -> np.ndarray:
    """Decode raw image bytes (PNG/JPEG) into a BGR numpy array.

    Args:
        image_data: Raw image bytes.

    Returns:
        numpy array in BGR format.

    Raises:
        ValueError: If the bytes cannot be decoded as an image.
    """
    decoded = cv2.imdecode(np.frombuffer(image_data, dtype=np.uint8), cv2.IMREAD_COLOR)
    if decoded is None:
        raise ValueError("Could not decode image data")
    return decoded
+
+
+# =============================================================================
+# Stage 1b: Orientation Detection (0°/90°/180°/270°)
+# =============================================================================
+
def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
    """Detect page orientation via Tesseract OSD and rotate if needed.

    Handles upside-down scans (180°) common with book scanners where
    every other page is flipped due to the scanner hinge.

    Returns:
        (corrected_image, rotation_degrees) — rotation is 0, 90, 180, or 270.
    """
    if pytesseract is None:
        return img_bgr, 0

    # Map OSD's reported rotation to the cv2 flag that undoes it.
    rotation_flags = {
        90: cv2.ROTATE_90_COUNTERCLOCKWISE,
        180: cv2.ROTATE_180,
        270: cv2.ROTATE_90_CLOCKWISE,
    }

    try:
        # Tesseract OSD works on grayscale/RGB input.
        pil_gray = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY))
        osd = pytesseract.image_to_osd(pil_gray, output_type=pytesseract.Output.DICT)

        rotate = osd.get("rotate", 0)
        confidence = osd.get("orientation_conf", 0.0)
        logger.info(f"OSD: orientation={rotate}° confidence={confidence:.1f}")

        # Already upright, low confidence, or an angle we cannot map: no-op.
        if rotate == 0 or confidence < 1.0 or rotate not in rotation_flags:
            return img_bgr, 0

        corrected = cv2.rotate(img_bgr, rotation_flags[rotate])
        logger.info(f"OSD: rotated {rotate}° to fix orientation")
        return corrected, rotate

    except Exception as e:
        logger.warning(f"OSD orientation detection failed: {e}")
        return img_bgr, 0
+
+
+# =============================================================================
+# Stage 2: Deskew (Rotation Correction)
+# =============================================================================
+
def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
    """Correct page rotation using Hough line detection.

    Detects near-horizontal line segments, takes the median of their angles
    (capped at ±5°) and rotates the image by that amount.

    Args:
        img: BGR image.

    Returns:
        Tuple of (corrected image, detected angle in degrees).
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Binarise so line detection sees ink, not paper.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    segments = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
                               minLineLength=img.shape[1] // 4, maxLineGap=20)
    if segments is None or len(segments) < 3:
        return img, 0.0

    # Keep only the angles of near-horizontal segments (within ±15°).
    angles = []
    for seg in segments:
        x1, y1, x2, y2 = seg[0]
        theta = np.degrees(np.arctan2(y2 - y1, x2 - x1))
        if abs(theta) < 15:
            angles.append(theta)
    if not angles:
        return img, 0.0

    median_angle = float(np.median(angles))

    # Cap the correction at ±5° and ignore sub-0.1° noise.
    if abs(median_angle) > 5.0:
        median_angle = 5.0 * np.sign(median_angle)
    if abs(median_angle) < 0.1:
        return img, 0.0

    h, w = img.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), median_angle, 1.0)
    corrected = cv2.warpAffine(img, M, (w, h),
                               flags=cv2.INTER_LINEAR,
                               borderMode=cv2.BORDER_REPLICATE)

    logger.info(f"Deskew: corrected {median_angle:.2f}° rotation")
    return corrected, median_angle
+
+
def deskew_image_by_word_alignment(
    image_data: bytes,
    lang: str = "eng+deu",
    downscale_factor: float = 0.5,
) -> Tuple[bytes, float]:
    """Correct rotation by fitting a line through left-most word starts per text line.

    More robust than Hough-based deskew for vocabulary worksheets where text lines
    have consistent left-alignment. Runs a quick Tesseract pass on a downscaled
    copy to find word positions, computes the dominant left-edge column, fits a
    line through those points and rotates the full-resolution image.

    Args:
        image_data: Raw image bytes (PNG/JPEG).
        lang: Tesseract language string for the quick pass.
        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).

    Returns:
        Tuple of (rotated image as PNG bytes, detected angle in degrees).
        The original bytes and 0.0 are returned whenever detection is not
        possible (missing deps, decode failure, too few aligned lines, or a
        negligible angle below 0.05°).
    """
    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
        return image_data, 0.0

    # 1. Decode image
    img_array = np.frombuffer(image_data, dtype=np.uint8)
    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if img is None:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return image_data, 0.0

    orig_h, orig_w = img.shape[:2]

    # 2. Downscale for fast Tesseract pass
    small_w = int(orig_w * downscale_factor)
    small_h = int(orig_h * downscale_factor)
    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)

    # 3. Quick Tesseract — word-level positions (PSM 6 = uniform text block)
    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
    try:
        data = pytesseract.image_to_data(
            pil_small, lang=lang, config="--psm 6 --oem 3",
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
        return image_data, 0.0

    # 4. Per text-line, find the left-most word start
    # Group by (block_num, par_num, line_num)
    line_groups: Dict[tuple, list] = defaultdict(list)
    for i in range(len(data["text"])):
        text = (data["text"][i] or "").strip()
        conf = int(data["conf"][i])
        # Skip empty results and low-confidence (< 20) word boxes.
        if not text or conf < 20:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        line_groups[key].append(i)

    if len(line_groups) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
        return image_data, 0.0

    # For each line, pick the word with smallest 'left' → compute (left_x, center_y)
    # Scale back to original resolution
    scale = 1.0 / downscale_factor
    points = []  # list of (x, y) in original-image coords
    for key, indices in line_groups.items():
        best_idx = min(indices, key=lambda i: data["left"][i])
        lx = data["left"][best_idx] * scale
        top = data["top"][best_idx] * scale
        h = data["height"][best_idx] * scale
        cy = top + h / 2.0
        points.append((lx, cy))

    # 5. Find dominant left-edge column + compute angle
    xs = np.array([p[0] for p in points])
    ys = np.array([p[1] for p in points])
    median_x = float(np.median(xs))
    tolerance = orig_w * 0.03  # 3% of image width

    # Keep only lines whose left edge sits near the dominant column —
    # indented or outlier lines would otherwise skew the fit.
    mask = np.abs(xs - median_x) <= tolerance
    filtered_xs = xs[mask]
    filtered_ys = ys[mask]

    if len(filtered_xs) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
        return image_data, 0.0

    # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a)
    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
    slope = coeffs[0]  # dx/dy
    angle_rad = np.arctan(slope)
    angle_deg = float(np.degrees(angle_rad))

    # Clamp to ±5°
    angle_deg = max(-5.0, min(5.0, angle_deg))

    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points "
                f"(total lines: {len(line_groups)})")

    if abs(angle_deg) < 0.05:
        return image_data, 0.0

    # 6. Rotate full-res image
    center = (orig_w // 2, orig_h // 2)
    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    # Encode back to PNG
    success, png_buf = cv2.imencode(".png", rotated)
    if not success:
        logger.warning("deskew_by_word_alignment: PNG encoding failed")
        return image_data, 0.0

    return png_buf.tobytes(), angle_deg
+
+
+def _projection_gradient_score(profile: np.ndarray) -> float:
+ """Score a projection profile by the L2-norm of its first derivative.
+
+ Higher score = sharper transitions between text-lines and gaps,
+ i.e. better row/column alignment.
+ """
+ diff = np.diff(profile)
+ return float(np.sum(diff * diff))
+
+
def deskew_image_iterative(
    img: np.ndarray,
    coarse_range: float = 5.0,
    coarse_step: float = 0.1,
    fine_range: float = 0.15,
    fine_step: float = 0.02,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Iterative deskew using vertical-edge projection optimisation.

    The key insight: at the correct rotation angle, vertical features
    (word left-edges, column borders) become truly vertical, producing
    the sharpest peaks in the vertical projection of vertical edges.

    Method:
    1. Detect vertical edges via Sobel-X on the central crop.
    2. Coarse sweep: rotate edge image, compute vertical projection
       gradient score. The angle where vertical edges align best wins.
    3. Fine sweep: refine around the coarse winner.

    Args:
        img: BGR image (full resolution).
        coarse_range: half-range in degrees for the coarse sweep.
        coarse_step: step size in degrees for the coarse sweep.
        fine_range: half-range around the coarse winner for the fine sweep.
        fine_step: step size in degrees for the fine sweep.

    Returns:
        (rotated_bgr, angle_degrees, debug_dict) — angle is clamped to ±5°
        and 0.0 (with the unmodified image) when below 0.05°.
    """
    h, w = img.shape[:2]
    debug: Dict[str, Any] = {}

    # --- Grayscale + vertical edge detection ---
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Central crop (15%-85% height, 10%-90% width) to avoid page margins
    y_lo, y_hi = int(h * 0.15), int(h * 0.85)
    x_lo, x_hi = int(w * 0.10), int(w * 0.90)
    gray_crop = gray[y_lo:y_hi, x_lo:x_hi]

    # Sobel-X → absolute vertical edges
    sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
    edges = np.abs(sobel_x)
    # Normalise to 0-255 for consistent scoring
    edge_max = edges.max()
    if edge_max > 0:
        edges = (edges / edge_max * 255).astype(np.uint8)
    else:
        # Completely flat crop — nothing to align against.
        return img, 0.0, {"error": "no edges detected"}

    crop_h, crop_w = edges.shape[:2]
    crop_center = (crop_w // 2, crop_h // 2)

    # Trim margin after rotation to avoid border artifacts
    trim_y = max(4, int(crop_h * 0.03))
    trim_x = max(4, int(crop_w * 0.03))

    def _sweep_edges(angles: np.ndarray) -> list:
        """Score each angle by vertical projection gradient of vertical edges."""
        results = []
        for angle in angles:
            if abs(angle) < 1e-6:
                # Zero rotation — reuse the edge image as-is.
                rotated = edges
            else:
                M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
                # INTER_NEAREST: no interpolation blur that would soften peaks.
                rotated = cv2.warpAffine(edges, M, (crop_w, crop_h),
                                         flags=cv2.INTER_NEAREST,
                                         borderMode=cv2.BORDER_REPLICATE)
            # Trim borders to avoid edge artifacts
            trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
            v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
            score = _projection_gradient_score(v_profile)
            results.append((float(angle), score))
        return results

    # --- Phase 1: coarse sweep ---
    # The half-step added to the stop value makes the range inclusive.
    coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
    coarse_results = _sweep_edges(coarse_angles)
    best_coarse = max(coarse_results, key=lambda x: x[1])
    best_coarse_angle, best_coarse_score = best_coarse

    debug["coarse_best_angle"] = round(best_coarse_angle, 2)
    debug["coarse_best_score"] = round(best_coarse_score, 1)
    debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]

    # --- Phase 2: fine sweep around coarse winner ---
    fine_lo = best_coarse_angle - fine_range
    fine_hi = best_coarse_angle + fine_range
    fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
    fine_results = _sweep_edges(fine_angles)
    best_fine = max(fine_results, key=lambda x: x[1])
    best_fine_angle, best_fine_score = best_fine

    debug["fine_best_angle"] = round(best_fine_angle, 2)
    debug["fine_best_score"] = round(best_fine_score, 1)
    debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]

    final_angle = best_fine_angle

    # Clamp to ±5°
    final_angle = max(-5.0, min(5.0, final_angle))

    logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}° fine={best_fine_angle:.2f}° -> {final_angle:.2f}°")

    # Sub-0.05° corrections are below measurement noise — skip rotation.
    if abs(final_angle) < 0.05:
        return img, 0.0, debug

    # --- Rotate full-res image ---
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, final_angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    return rotated, final_angle, debug
+
+
def _measure_textline_slope(img: np.ndarray) -> float:
    """Measure residual text-line slope via Tesseract word-position regression.

    Groups Tesseract words by (block, par, line), fits a linear regression
    per line (y = slope * x + b), and returns the trimmed-mean slope in
    degrees. Positive = text rises to the right, negative = falls.

    This is the most direct measurement of remaining rotation after deskew.

    Returns:
        Trimmed-mean line slope in degrees, or 0.0 when dependencies are
        missing or fewer than 3 usable lines were found.
    """
    import math as _math

    if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
        return 0.0

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(
        Image.fromarray(gray),
        output_type=pytesseract.Output.DICT,
        config="--psm 6",
    )

    # Group word centres by text line
    lines: Dict[tuple, list] = {}
    for i in range(len(data["text"])):
        txt = (data["text"][i] or "").strip()
        # Single-character words and low-confidence (< 30) boxes are
        # unreliable anchors for the regression.
        if len(txt) < 2 or int(data["conf"][i]) < 30:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        cx = data["left"][i] + data["width"][i] / 2.0
        cy = data["top"][i] + data["height"][i] / 2.0
        lines.setdefault(key, []).append((cx, cy))

    # Per-line linear regression → slope angle
    slopes: list = []
    for pts in lines.values():
        if len(pts) < 3:
            continue
        pts.sort(key=lambda p: p[0])
        xs = np.array([p[0] for p in pts], dtype=np.float64)
        ys = np.array([p[1] for p in pts], dtype=np.float64)
        # Lines spanning less than 15% of the image width yield noisy slopes.
        if xs[-1] - xs[0] < w * 0.15:
            continue  # skip short lines
        A = np.vstack([xs, np.ones_like(xs)]).T
        result = np.linalg.lstsq(A, ys, rcond=None)
        slope = result[0][0]
        slopes.append(_math.degrees(_math.atan(slope)))

    if len(slopes) < 3:
        return 0.0

    # Trimmed mean (drop 10% extremes on each side)
    slopes.sort()
    trim = max(1, len(slopes) // 10)
    trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes
    if not trimmed:
        return 0.0

    return sum(trimmed) / len(trimmed)
+
+
def deskew_two_pass(
    img: np.ndarray,
    coarse_range: float = 5.0,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Multi-pass deskew: iterative projection plus two residual checks.

    Pass 1: ``deskew_image_iterative()`` (vertical-edge projection, wide range).
    Pass 2: ``deskew_image_by_word_alignment()`` on the already-corrected image
            to detect and fix residual skew that the projection method missed.
    Pass 3: Tesseract text-line regression (``_measure_textline_slope()``) as a
            final residual check on the result of passes 1-2.

    All applied corrections are summed into the returned total angle. A
    residual from Pass 2 or Pass 3 below 0.3° is ignored (already good
    enough); each later pass is best-effort and falls back to a 0.0
    contribution on failure.

    Returns:
        (corrected_bgr, total_angle_degrees, debug_dict)
    """
    debug: Dict[str, Any] = {}

    # --- Pass 1: iterative projection ---
    corrected, angle1, dbg1 = deskew_image_iterative(
        img.copy(), coarse_range=coarse_range,
    )
    debug["pass1_angle"] = round(angle1, 3)
    debug["pass1_method"] = "iterative"
    debug["pass1_debug"] = dbg1

    # --- Pass 2: word-alignment residual check on corrected image ---
    angle2 = 0.0
    try:
        # Encode the corrected image to PNG bytes for word-alignment
        ok, buf = cv2.imencode(".png", corrected)
        if ok:
            corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
            if abs(angle2) >= 0.3:
                # Significant residual — decode and use the second correction
                arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
                corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
                if corrected2 is not None:
                    corrected = corrected2
                    logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° applied "
                                f"(total={angle1 + angle2:.2f}°)")
                else:
                    # Decode failure — discard the pass-2 correction entirely.
                    angle2 = 0.0
            else:
                logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° < 0.3° — skipped")
                angle2 = 0.0
    except Exception as e:
        logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
        angle2 = 0.0

    # --- Pass 3: Tesseract text-line regression residual check ---
    # The most reliable final check: measure actual text-line slopes
    # using Tesseract word positions and linear regression per line.
    angle3 = 0.0
    try:
        residual = _measure_textline_slope(corrected)
        debug["pass3_raw"] = round(residual, 3)
        if abs(residual) >= 0.3:
            h3, w3 = corrected.shape[:2]
            center3 = (w3 // 2, h3 // 2)
            M3 = cv2.getRotationMatrix2D(center3, residual, 1.0)
            corrected = cv2.warpAffine(
                corrected, M3, (w3, h3),
                flags=cv2.INTER_LINEAR,
                borderMode=cv2.BORDER_REPLICATE,
            )
            angle3 = residual
            logger.info(
                "deskew_two_pass: pass3 text-line residual=%.2f° applied",
                residual,
            )
        else:
            logger.info(
                "deskew_two_pass: pass3 text-line residual=%.2f° < 0.3° — skipped",
                residual,
            )
    except Exception as e:
        logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)

    total_angle = angle1 + angle2 + angle3
    debug["pass2_angle"] = round(angle2, 3)
    debug["pass2_method"] = "word_alignment"
    debug["pass3_angle"] = round(angle3, 3)
    debug["pass3_method"] = "textline_regression"
    debug["total_angle"] = round(total_angle, 3)

    logger.info(
        "deskew_two_pass: pass1=%.2f° + pass2=%.2f° + pass3=%.2f° = %.2f°",
        angle1, angle2, angle3, total_angle,
    )

    return corrected, total_angle, debug
+
+
+# =============================================================================
+# Stage 3: Dewarp (Book Curvature Correction)
+# =============================================================================
+
def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
    """Detect the vertical shear angle of the page.

    After deskew (horizontal lines aligned), vertical features like column
    edges may still be tilted. This measures that tilt by tracking the
    strongest vertical edge across horizontal strips.

    The result is a shear angle in degrees: the angular difference between
    true vertical and the detected column edge.

    Returns:
        Dict with keys: method, shear_degrees, confidence. The defaults
        (0.0 / 0.0) are returned when too few edge points are found.
    """
    import math

    h, w = img.shape[:2]
    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Vertical Sobel to find vertical edges
    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    # BUGFIX: a plain np.abs(...).astype(np.uint8) wraps values > 255 modulo
    # 256 (a ksize=3 Sobel response can reach 4*255), corrupting the very
    # edges we want to track. convertScaleAbs saturates to 255 instead.
    abs_sobel = cv2.convertScaleAbs(sobel_x)

    # Binarize with Otsu
    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    num_strips = 20
    strip_h = h // num_strips
    if strip_h == 0:
        # Image shorter than the strip count — nothing reliable to measure.
        return result
    edge_positions = []  # (y_center, x_position)

    for i in range(num_strips):
        y_start = i * strip_h
        y_end = min((i + 1) * strip_h, h)
        strip = binary[y_start:y_end, :]

        # Project vertically (sum along y-axis)
        projection = np.sum(strip, axis=0).astype(np.float64)
        if projection.max() == 0:
            continue

        # Find the strongest vertical edge in left 40% of image
        search_w = int(w * 0.4)
        left_proj = projection[:search_w]
        if left_proj.max() == 0:
            continue

        # Smooth and find peak
        kernel_size = max(3, w // 100)
        if kernel_size % 2 == 0:
            kernel_size += 1
        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
        x_pos = float(np.argmax(smoothed))
        y_center = (y_start + y_end) / 2.0
        edge_positions.append((y_center, x_pos))

    if len(edge_positions) < 8:
        return result

    ys = np.array([p[0] for p in edge_positions])
    xs = np.array([p[1] for p in edge_positions])

    # Remove outliers (> 2 std from median)
    median_x = np.median(xs)
    std_x = max(np.std(xs), 1.0)
    mask = np.abs(xs - median_x) < 2 * std_x
    ys = ys[mask]
    xs = xs[mask]

    if len(ys) < 6:
        return result

    # Fit straight line: x = slope * y + intercept.
    # The slope is the horizontal drift of the vertical edge per pixel of height.
    straight_coeffs = np.polyfit(ys, xs, 1)
    slope = straight_coeffs[0]  # dx/dy in pixels
    fitted = np.polyval(straight_coeffs, ys)
    residuals = xs - fitted
    rmse = float(np.sqrt(np.mean(residuals ** 2)))

    # Convert slope to angle: arctan(dx/dy) in degrees
    shear_degrees = math.degrees(math.atan(slope))

    # Confidence grows with the number of inlier strips, shrinks with fit error.
    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(float(confidence), 2)

    return result
+
+
def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear angle by maximising variance of horizontal text-line projections.

    Principle: horizontal text lines produce a row-projection profile with sharp
    peaks (high variance) when the image is correctly aligned. Any residual shear
    smears the peaks and reduces variance. We sweep ±3° and pick the angle whose
    corrected projection has the highest variance.

    Works best on pages with clear horizontal banding (vocabulary tables, prose).
    Complements _detect_shear_angle() which needs strong vertical edges.

    Returns:
        Dict with keys: method, shear_degrees, confidence.
    """
    import math
    result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Otsu binarisation
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Work at half resolution for speed
    small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
    sh, sw = small.shape

    # 2-pass angle sweep for 10x better precision:
    # Pass 1: Coarse sweep ±3° in 0.5° steps (13 values)
    # Pass 2: Fine sweep ±0.5° around coarse best in 0.05° steps (21 values)

    def _sweep_variance(angles_list):
        """Return [(angle, row-projection variance)] for each candidate angle."""
        results = []
        for angle_deg in angles_list:
            if abs(angle_deg) < 0.001:
                # Near-zero shear — score the unmodified image.
                rotated = small
            else:
                # Horizontal shear centred vertically:
                # x' = x + tan(angle) * (y - sh/2)
                shear_tan = math.tan(math.radians(angle_deg))
                M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
                rotated = cv2.warpAffine(small, M, (sw, sh),
                                         flags=cv2.INTER_NEAREST,
                                         borderMode=cv2.BORDER_CONSTANT)
            profile = np.sum(rotated, axis=1).astype(float)
            results.append((angle_deg, float(np.var(profile))))
        return results

    # Pass 1: coarse
    coarse_angles = [a * 0.5 for a in range(-6, 7)]  # 13 values
    coarse_results = _sweep_variance(coarse_angles)
    coarse_best = max(coarse_results, key=lambda x: x[1])

    # Pass 2: fine around coarse best
    fine_center = coarse_best[0]
    fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)]  # 21 values
    fine_results = _sweep_variance(fine_angles)
    fine_best = max(fine_results, key=lambda x: x[1])

    best_angle = fine_best[0]
    best_variance = fine_best[1]
    variances = coarse_results + fine_results

    # Confidence: how much sharper is the best angle vs. the mean?
    all_mean = sum(v for _, v in variances) / len(variances)
    if all_mean > 0 and best_variance > all_mean:
        confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
    else:
        confidence = 0.0

    result["shear_degrees"] = round(best_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
+
+
def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear using Hough transform on printed table / ruled lines.

    Vocabulary worksheets have near-horizontal printed table borders. After
    deskew these should be exactly horizontal; any residual tilt equals the
    vertical shear angle (with inverted sign).

    The sign convention: a horizontal line tilting +α degrees (left end lower)
    means the page has vertical shear of -α degrees (left column edge drifts
    to the left going downward).

    Returns:
        Dict with keys: method, shear_degrees, confidence.
    """
    result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}

    w = img.shape[1]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    segments = cv2.HoughLinesP(
        edges, rho=1, theta=np.pi / 360,
        threshold=int(w * 0.08),
        minLineLength=int(w * 0.15),
        maxLineGap=20,
    )
    if segments is None or len(segments) < 3:
        return result

    # Collect (angle, length) for near-horizontal segments only.
    candidates: List[Tuple[float, float]] = []
    for seg in segments:
        x1, y1, x2, y2 = seg[0]
        if x1 == x2:
            continue  # perfectly vertical — irrelevant here
        theta = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
        if abs(theta) <= 5.0:
            seg_len = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
            candidates.append((theta, seg_len))

    if len(candidates) < 3:
        return result

    # Length-weighted median of the segment angles: longer table borders
    # carry more weight than short incidental strokes.
    thetas = np.array([t for t, _ in candidates])
    lengths = np.array([l for _, l in candidates])
    order = np.argsort(thetas)
    sorted_thetas = thetas[order]
    cum_weight = np.cumsum(lengths[order])
    pick = int(np.searchsorted(cum_weight, cum_weight[-1] / 2.0))
    median_theta = float(sorted_thetas[min(pick, len(sorted_thetas) - 1)])

    # Confidence: fraction of segments agreeing within 1° of the median.
    agree = sum(1 for t, _ in candidates if abs(t - median_theta) < 1.0)
    confidence = min(1.0, agree / max(len(candidates), 1)) * 0.85

    # Horizontal-line tilt maps to vertical shear with inverted sign.
    result["shear_degrees"] = round(-median_theta, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
+
+
def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear by measuring text-line straightness (Method D).

    Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
    bounding boxes, groups them into vertical columns by X-proximity,
    and measures how the left-edge X position drifts with Y (vertical
    position). The drift dx/dy is the tangent of the shear angle.

    This directly measures vertical shear (column tilt) rather than
    horizontal text-line slope, which is already corrected by deskew.

    Args:
        img: BGR image (already deskewed).

    Returns:
        Dict with keys: method, shear_degrees, confidence.
    """
    import math
    result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}

    # BUGFIX: pytesseract is not among this module's visible top-level
    # imports; referencing it bare raised NameError inside the try below,
    # which the broad `except Exception` silently swallowed — turning this
    # detector into a permanent no-op. Import explicitly, degrade cleanly.
    try:
        import pytesseract
    except ImportError:
        return result

    h, w = img.shape[:2]
    # Downscale 50% for speed
    scale = 0.5
    small = cv2.resize(img, (int(w * scale), int(h * scale)),
                       interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    pil_img = Image.fromarray(gray)

    try:
        data = pytesseract.image_to_data(
            pil_img, lang='eng+deu', config='--psm 11 --oem 3',
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        return result

    # Collect word left-edges (x) and vertical centres (y)
    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        # ROBUSTNESS: 'conf' may be an int, float, or numeric string like
        # '96.0' depending on the pytesseract version — int() alone raises
        # ValueError on the latter, so parse via float().
        conf = int(float(data['conf'][i]))
        if not text or conf < 20 or len(text) < 2:
            continue
        left_x = float(data['left'][i])
        cy = data['top'][i] + data['height'][i] / 2.0
        word_w = float(data['width'][i])
        words.append((left_x, cy, word_w))

    if len(words) < 15:
        return result

    # --- Group words into vertical columns by left-edge X proximity ---
    # Sort by x, then cluster words whose left-edges are within x_tol
    avg_w = sum(ww for _, _, ww in words) / len(words)
    x_tol = max(avg_w * 0.4, 8)  # tolerance for "same column"

    words_by_x = sorted(words, key=lambda w: w[0])
    columns: List[List[Tuple[float, float]]] = []  # each: [(left_x, cy), ...]
    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
    cur_x = words_by_x[0][0]

    for lx, cy, _ in words_by_x[1:]:
        if abs(lx - cur_x) <= x_tol:
            cur_col.append((lx, cy))
            # Update running x as an exponential moving average of the
            # cluster's left edges (comment previously claimed "median")
            cur_x = cur_x * 0.8 + lx * 0.2
        else:
            if len(cur_col) >= 5:
                columns.append(cur_col)
            cur_col = [(lx, cy)]
            cur_x = lx
    if len(cur_col) >= 5:
        columns.append(cur_col)

    if len(columns) < 2:
        return result

    # --- For each column, measure X-drift as a function of Y ---
    # Fit: left_x = a * cy + b  →  a = dx/dy = tan(shear_angle)
    drifts = []
    for col in columns:
        ys = np.array([p[1] for p in col])
        xs = np.array([p[0] for p in col])
        y_range = ys.max() - ys.min()
        if y_range < h * scale * 0.3:
            continue  # column must span at least 30% of image height
        # Linear regression: x = a*y + b
        coeffs = np.polyfit(ys, xs, 1)
        drifts.append(coeffs[0])  # dx/dy

    if len(drifts) < 2:
        return result

    # Median dx/dy → shear angle
    # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
    median_drift = float(np.median(drifts))
    shear_degrees = math.degrees(math.atan(median_drift))

    # Confidence from column count + drift consistency
    drift_std = float(np.std(drifts))
    consistency = max(0.0, 1.0 - drift_std * 50)  # tighter penalty for drift variance
    count_factor = min(1.0, len(drifts) / 4.0)
    confidence = count_factor * 0.5 + consistency * 0.5

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
                "shear=%.3f°, conf=%.2f",
                len(columns), len(drifts), median_drift,
                shear_degrees, confidence)
    return result
+
+
def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
    """Check whether the dewarp correction actually improved alignment.

    Compares horizontal projection variance before and after correction.
    Higher variance means sharper text-line peaks, which indicates better
    horizontal alignment.

    Args:
        original: BGR image before shear correction.
        corrected: BGR image after shear correction.

    Returns:
        True if the correction improved the image, False if it should
        be discarded.
    """
    def _h_proj_variance(img: np.ndarray) -> float:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255,
                                  cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        # ROBUSTNESS: guard against degenerate (1px wide/tall) inputs —
        # cv2.resize rejects a zero target dimension.
        half_w = max(1, binary.shape[1] // 2)
        half_h = max(1, binary.shape[0] // 2)
        small = cv2.resize(binary, (half_w, half_h),
                           interpolation=cv2.INTER_AREA)
        profile = np.sum(small, axis=1).astype(float)
        return float(np.var(profile))

    # Correction must improve variance (even by a tiny margin)
    return _h_proj_variance(corrected) > _h_proj_variance(original)
+
+
def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Shear an image horizontally around its vertical centre.

    Every row is shifted sideways in proportion to its distance from the
    image's vertical midpoint, which straightens tilted vertical features
    (column edges) while leaving horizontal text lines untouched.

    Args:
        img: BGR image.
        shear_degrees: Shear angle in degrees. Positive = shift top-right/bottom-left.

    Returns:
        Corrected image.
    """
    import math
    rows, cols = img.shape[:2]
    t = math.tan(math.radians(shear_degrees))

    # x' = x + t*(y - rows/2), y' = y — i.e. the affine matrix
    # [1  t  -rows/2*t]
    # [0  1      0    ]
    shear_matrix = np.float32([
        [1, t, -rows / 2.0 * t],
        [0, 1, 0],
    ])

    return cv2.warpAffine(img, shear_matrix, (cols, rows),
                          flags=cv2.INTER_LINEAR,
                          borderMode=cv2.BORDER_REPLICATE)
+
+
+def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
+ """Combine multiple shear detections into a single weighted estimate (v2).
+
+ Ensemble v2 changes vs v1:
+ - Minimum confidence raised to 0.5 (was 0.3)
+ - text_lines method gets 1.5× weight boost (most reliable detector)
+ - Outlier filter at 1° from weighted mean
+
+ Returns:
+ (shear_degrees, ensemble_confidence, methods_used_str)
+ """
+ # Confidence threshold — lowered from 0.5 to 0.35 to catch subtle shear
+ # that individual methods detect with moderate confidence.
+ _MIN_CONF = 0.35
+
+ # text_lines gets a weight boost as the most content-aware method
+ _METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
+
+ accepted = []
+ for d in detections:
+ if d["confidence"] < _MIN_CONF:
+ continue
+ boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
+ effective_conf = d["confidence"] * boost
+ accepted.append((d["shear_degrees"], effective_conf, d["method"]))
+
+ if not accepted:
+ return 0.0, 0.0, "none"
+
+ if len(accepted) == 1:
+ deg, conf, method = accepted[0]
+ return deg, min(conf, 1.0), method
+
+ # First pass: weighted mean
+ total_w = sum(c for _, c, _ in accepted)
+ w_mean = sum(d * c for d, c, _ in accepted) / total_w
+
+ # Outlier filter: keep results within 1° of weighted mean
+ filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
+ if not filtered:
+ filtered = accepted # fallback: keep all
+
+ # Second pass: weighted mean on filtered results
+ total_w2 = sum(c for _, c, _ in filtered)
+ final_deg = sum(d * c for d, c, _ in filtered) / total_w2
+
+ # Ensemble confidence: average of individual confidences, boosted when
+ # methods agree (all within 0.5° of each other)
+ avg_conf = total_w2 / len(filtered)
+ spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
+ agreement_bonus = 0.15 if spread < 0.5 else 0.0
+ ensemble_conf = min(1.0, avg_conf + agreement_bonus)
+
+ methods_str = "+".join(m for _, _, m in filtered)
+ return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
+
+
def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Correct vertical shear after deskew (v2 with quality gate).

    Deskew aligns horizontal text lines, but vertical features (column
    edges) may remain tilted afterwards. This routine estimates that tilt
    with an ensemble of complementary detectors, fuses their results, and
    undoes the shear with an affine warp.

    Detectors (all run in ~150ms total):
      A. _detect_shear_angle() — vertical edge profile (~50ms)
      B. _detect_shear_by_projection() — horizontal text-line variance (~30ms)
      C. _detect_shear_by_hough() — Hough lines on table borders (~20ms)
      D. _detect_shear_by_text_lines() — text-line straightness (~50ms)

    Quality gate: horizontal projection variance is compared before vs
    after correction; a correction that worsened alignment is discarded.

    Args:
        img: BGR image (already deskewed).
        use_ensemble: If False, fall back to single-method behaviour (method A only).

    Returns:
        Tuple of (corrected_image, dewarp_info).
        dewarp_info keys: method, shear_degrees, confidence, detections.
    """
    no_correction = {
        "method": "none",
        "shear_degrees": 0.0,
        "confidence": 0.0,
        "detections": [],
    }

    if not CV2_AVAILABLE:
        return img, no_correction

    started = time.time()

    if use_ensemble:
        detections = [
            _detect_shear_angle(img),
            _detect_shear_by_projection(img),
            _detect_shear_by_hough(img),
            _detect_shear_by_text_lines(img),
        ]
        shear_deg, confidence, method = _ensemble_shear(detections)
    else:
        detections = [_detect_shear_angle(img)]
        shear_deg = detections[0]["shear_degrees"]
        confidence = detections[0]["confidence"]
        method = detections[0]["method"]

    elapsed = time.time() - started

    # Per-detector log slots fall back to 0.0 when fewer than 4 ran.
    def _slot(idx: int, key: str) -> float:
        return detections[idx][key] if len(detections) > idx else 0.0

    logger.info(
        "dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | "
        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
        shear_deg, confidence, method, elapsed,
        _slot(0, "shear_degrees"), _slot(0, "confidence"),
        _slot(1, "shear_degrees"), _slot(1, "confidence"),
        _slot(2, "shear_degrees"), _slot(2, "confidence"),
        _slot(3, "shear_degrees"), _slot(3, "confidence"),
    )

    # Individual detections are reported even when no correction is applied.
    summary = [
        {"method": d["method"], "shear_degrees": d["shear_degrees"],
         "confidence": d["confidence"]}
        for d in detections
    ]

    # Shear below 0.08° is truly irrelevant for OCR; ensemble confidence
    # must reach 0.4 (lowered from 0.5 so that several agreeing
    # moderate-confidence detectors can still trigger a correction).
    if abs(shear_deg) < 0.08 or confidence < 0.4:
        no_correction["detections"] = summary
        return img, no_correction

    # Negate the detected angle to straighten the page.
    corrected = _apply_shear(img, -shear_deg)

    # Quality gate only for larger corrections: below 0.5° the projection
    # variance delta is noise, and leaving ~0.4° uncorrected costs more
    # (~25px drift at the edges of tall scans) than a tiny wrong correction.
    if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
        logger.info("dewarp: quality gate REJECTED correction (%.3f°) — "
                    "projection variance did not improve", shear_deg)
        no_correction["detections"] = summary
        return img, no_correction

    return corrected, {
        "method": method,
        "shear_degrees": shear_deg,
        "confidence": confidence,
        "detections": summary,
    }
+
+
def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Apply shear correction with a manually chosen angle.

    Args:
        img: BGR image (deskewed, before dewarp).
        shear_degrees: Shear angle in degrees to correct.

    Returns:
        Corrected image; the input is returned unchanged for angles
        below the 0.001° noise floor.
    """
    if abs(shear_degrees) >= 0.001:
        return _apply_shear(img, -shear_degrees)
    return img
+
diff --git a/klausur-service/backend/cv_review.py b/klausur-service/backend/cv_review.py
new file mode 100644
index 0000000..b3e0bc6
--- /dev/null
+++ b/klausur-service/backend/cv_review.py
@@ -0,0 +1,1184 @@
+"""
+Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import json
+import logging
+import os
+import re
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+ CV_PIPELINE_AVAILABLE,
+ PageRegion,
+ PipelineResult,
+ VocabRow,
+)
+from cv_preprocessing import (
+ deskew_image,
+ dewarp_image,
+ render_image_high_res,
+ render_pdf_high_res,
+)
+from cv_layout import (
+ analyze_layout,
+ create_layout_image,
+ create_ocr_image,
+)
+from cv_ocr_engines import (
+ _fix_character_confusion,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+ import cv2
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+
+try:
+ import pytesseract
+ from PIL import Image
+except ImportError:
+ pytesseract = None # type: ignore[assignment]
+ Image = None # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Stage 6: Multi-Pass OCR
+# =============================================================================
+
def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
               psm: int, fallback_psm: Optional[int] = None,
               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
    """Run Tesseract OCR on a specific region with given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode.
        fallback_psm: If confidence too low, retry with this PSM per line.
        min_confidence: Minimum average confidence before fallback.

    Returns:
        List of word dicts with text, position (absolute page coordinates),
        and confidence. Empty when the crop is empty or Tesseract fails.
    """
    # Crop region
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    # Convert to PIL for pytesseract
    pil_img = Image.fromarray(crop)

    # Run Tesseract with specified PSM
    config = f'--psm {psm} --oem 3'
    try:
        data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                         output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"Tesseract failed for region {region.type}: {e}")
        return []

    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        # ROBUSTNESS: depending on the pytesseract/Tesseract version, 'conf'
        # entries may be ints, floats, or strings like '96.0'; int() alone
        # raises ValueError on float strings, so parse via float().
        conf = int(float(data['conf'][i]))
        if not text or conf < 10:
            continue
        words.append({
            'text': text,
            'left': data['left'][i] + region.x,  # Absolute coords
            'top': data['top'][i] + region.y,
            'width': data['width'][i],
            'height': data['height'][i],
            'conf': conf,
            'region_type': region.type,
        })

    # Check average confidence; retry line-by-line when it is too low
    if words and fallback_psm is not None:
        avg_conf = sum(w['conf'] for w in words) / len(words)
        if avg_conf < min_confidence:
            logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
                        f"trying fallback PSM {fallback_psm}")
            words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)

    return words
+
+
def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
                             lang: str, psm: int) -> List[Dict[str, Any]]:
    """OCR a region line by line (fallback for low-confidence regions).

    Splits the region into horizontal strips based on text density
    (horizontal ink projection), then OCRs each strip individually with
    the given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode used for each line strip.

    Returns:
        List of word dicts in absolute page coordinates.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    # Find text lines via horizontal projection (ink becomes white after invert)
    inv = cv2.bitwise_not(crop)
    h_proj = np.sum(inv, axis=1)
    peak = np.max(h_proj)  # hoisted: previously computed twice
    threshold = peak * 0.05 if peak > 0 else 0

    # Find line boundaries: maximal runs of rows above the ink threshold
    lines = []
    in_text = False
    line_start = 0
    for y in range(len(h_proj)):
        if h_proj[y] > threshold and not in_text:
            line_start = y
            in_text = True
        elif h_proj[y] <= threshold and in_text:
            if y - line_start > 5:  # Minimum line height
                lines.append((line_start, y))
            in_text = False
    # Close a line that runs to the bottom edge of the crop
    if in_text and len(h_proj) - line_start > 5:
        lines.append((line_start, len(h_proj)))

    all_words = []
    config = f'--psm {psm} --oem 3'

    for line_y_start, line_y_end in lines:
        # Add small padding so ascenders/descenders are not clipped
        pad = 3
        y1 = max(0, line_y_start - pad)
        y2 = min(crop.shape[0], line_y_end + pad)
        line_crop = crop[y1:y2, :]

        if line_crop.size == 0:
            continue

        pil_img = Image.fromarray(line_crop)
        try:
            data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                             output_type=pytesseract.Output.DICT)
        except Exception:
            continue  # best-effort: skip strips Tesseract cannot handle

        for i in range(len(data['text'])):
            text = data['text'][i].strip()
            # ROBUSTNESS: parse conf via float() — some pytesseract versions
            # return '96.0'-style strings that int() rejects.
            conf = int(float(data['conf'][i]))
            if not text or conf < 10:
                continue
            all_words.append({
                'text': text,
                'left': data['left'][i] + region.x,
                'top': data['top'][i] + region.y + y1,
                'width': data['width'][i],
                'height': data['height'][i],
                'conf': conf,
                'region_type': region.type,
            })

    return all_words
+
+
def run_multi_pass_ocr(ocr_img: np.ndarray,
                       regions: List[PageRegion],
                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
    """Run OCR on each detected region with optimized settings.

    Per-region dispatch: EN/DE columns use their single language with
    PSM 4, the example column adds a per-line fallback, everything else
    uses the default language with PSM 6. Header/footer/margin regions
    are skipped entirely.

    Args:
        ocr_img: Binarized full-page image.
        regions: Detected page regions.
        lang: Default language.

    Returns:
        Dict mapping region type to list of word dicts.
    """
    results: Dict[str, List[Dict]] = {}

    skip_types = {'header', 'footer', 'margin_top', 'margin_bottom',
                  'margin_left', 'margin_right'}

    # OCR settings per special region type; anything else gets the default.
    settings_by_type = {
        'column_en': dict(lang='eng', psm=4),
        'column_de': dict(lang='deu', psm=4),
        'column_example': dict(lang=lang, psm=6,
                               fallback_psm=7, min_confidence=40.0),
    }
    default_settings = dict(lang=lang, psm=6)

    for region in regions:
        if region.type in skip_types:
            # Non-content regions carry no vocabulary text
            continue

        kwargs = settings_by_type.get(region.type, default_settings)
        words = ocr_region(ocr_img, region, **kwargs)

        results[region.type] = words
        logger.info(f"OCR {region.type}: {len(words)} words")

    return results
+
+
+# =============================================================================
+# Stage 7: Line Alignment → Vocabulary Entries
+# =============================================================================
+
+def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
+ """Group words by Y position into lines, sorted by X within each line."""
+ if not words:
+ return []
+
+ sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
+ lines: List[List[Dict]] = []
+ current_line: List[Dict] = [sorted_words[0]]
+ current_y = sorted_words[0]['top']
+
+ for word in sorted_words[1:]:
+ if abs(word['top'] - current_y) <= y_tolerance_px:
+ current_line.append(word)
+ else:
+ current_line.sort(key=lambda w: w['left'])
+ lines.append(current_line)
+ current_line = [word]
+ current_y = word['top']
+
+ if current_line:
+ current_line.sort(key=lambda w: w['left'])
+ lines.append(current_line)
+
+ return lines
+
+
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    Uses Y-coordinate matching to pair English words, German translations,
    and example sentences that appear on the same line. English lines act
    as the primary reference: each EN line picks the closest DE and example
    lines within y_tolerance_px, and unmatched example lines are treated as
    wrapped continuations of the nearest preceding row.

    Args:
        ocr_results: Dict mapping region type to word lists.
        regions: Detected regions (for reference).
        y_tolerance_px: Max Y-distance to consider words on the same row.

    Returns:
        List of VocabRow objects, sorted by vertical position.
    """
    # If no vocabulary columns detected (e.g. plain text page), return empty
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    # Group words into lines per column
    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    # Small helpers over a line (= list of word dicts)
    def line_y_center(line: List[Dict]) -> float:
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    # Build EN entries as the primary reference
    vocab_rows: List[VocabRow] = []

    for en_line in en_lines:
        en_y = line_y_center(en_line)
        en_text = line_text(en_line)
        en_conf = line_confidence(en_line)

        # Skip very short or likely header content
        if len(en_text.strip()) < 2:
            continue

        # Find matching DE line (closest Y-center within tolerance)
        de_text = ""
        de_conf = 0.0
        best_de_dist = float('inf')
        best_de_idx = -1
        for idx, de_line in enumerate(de_lines):
            dist = abs(line_y_center(de_line) - en_y)
            if dist < y_tolerance_px and dist < best_de_dist:
                best_de_dist = dist
                best_de_idx = idx

        if best_de_idx >= 0:
            de_text = line_text(de_lines[best_de_idx])
            de_conf = line_confidence(de_lines[best_de_idx])

        # Find matching example line (same nearest-Y rule)
        ex_text = ""
        ex_conf = 0.0
        best_ex_dist = float('inf')
        best_ex_idx = -1
        for idx, ex_line in enumerate(ex_lines):
            dist = abs(line_y_center(ex_line) - en_y)
            if dist < y_tolerance_px and dist < best_ex_dist:
                best_ex_dist = dist
                best_ex_idx = idx

        if best_ex_idx >= 0:
            ex_text = line_text(ex_lines[best_ex_idx])
            ex_conf = line_confidence(ex_lines[best_ex_idx])

        # Average confidence over the columns that actually matched; a
        # missing DE/example match does not drag the row confidence down.
        avg_conf = en_conf
        conf_count = 1
        if de_conf > 0:
            avg_conf += de_conf
            conf_count += 1
        if ex_conf > 0:
            avg_conf += ex_conf
            conf_count += 1

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=avg_conf / conf_count,
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in example column:
    # If an example line has no matching EN/DE, append to previous entry
    # NOTE(review): matched_ex_ys holds row anchor positions (EN line
    # centers), not the example lines' own centers — the proximity test
    # below marks a line "already matched" when it sits near any populated
    # row, which is an approximation; verify against wrapped-row fixtures.
    matched_ex_ys = set()
    for row in vocab_rows:
        if row.example:
            matched_ex_ys.add(row.y_position)

    for ex_line in ex_lines:
        ex_y = line_y_center(ex_line)
        # Check if already matched
        already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
        if already_matched:
            continue

        # Find nearest previous vocab row strictly above the example line,
        # within three tolerances (generous enough for wrapped sentences)
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row

        if best_row:
            continuation = line_text(ex_line).strip()
            if continuation:
                best_row.example = (best_row.example + " " + continuation).strip()

    # Sort by Y position
    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows
+
+
+# =============================================================================
+# Stage 8: Optional LLM Post-Correction
+# =============================================================================
+
async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
                           confidence_threshold: float = 50.0,
                           enabled: bool = False) -> List[VocabRow]:
    """Optionally send low-confidence regions to Qwen-VL for correction.

    Disabled by default; currently a stub even when enabled.

    Args:
        img: Original BGR image.
        vocab_rows: Current vocabulary rows.
        confidence_threshold: Rows below this get LLM correction.
        enabled: Whether to actually run LLM correction.

    Returns:
        Corrected vocabulary rows (currently always the input, unchanged).
    """
    if enabled:
        # TODO: Implement Qwen-VL correction for low-confidence entries:
        #   1. crop the relevant region from img for each row below threshold
        #   2. send crop + OCR text to Qwen-VL
        #   3. replace text if the LLM provides a confident correction
        logger.info(f"LLM post-correction skipped (not yet implemented)")
    return vocab_rows
+
+
+# =============================================================================
+# Orchestrator
+# =============================================================================
+
async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Stages: render → deskew → dewarp (optional) → dual image prep →
    layout analysis → multi-pass OCR → line alignment → LLM correction
    (optional). Per-stage durations are recorded in result.stages.

    Args:
        pdf_data: Raw PDF bytes (mutually exclusive with image_data).
        image_data: Raw image bytes (mutually exclusive with pdf_data).
        page_number: 0-indexed page number (for PDF).
        zoom: PDF rendering zoom factor.
        enable_dewarp: Whether to run dewarp stage.
        enable_llm_correction: Whether to run LLM post-correction.
        lang: Tesseract language string.

    Returns:
        PipelineResult with vocabulary and timing info; on failure the
        error field is set and partial stage timings are preserved.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    total_start = time.time()

    try:
        # Stage 1: Render
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew (rotate so text lines are horizontal)
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s")

        # Stage 3: Dewarp (vertical shear correction; diagnostic info is discarded here)
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation (one image tuned for OCR, one for layout)
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)

        # Stage 5: Layout analysis
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR (per-region language/PSM settings)
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment (pair EN/DE/example columns into rows)
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)

        # Stage 8: Optional LLM correction.
        # NOTE(review): the enabled flag is not forwarded — llm_post_correct
        # runs with its default enabled=False, so this stage is a no-op even
        # when enable_llm_correction is True. Harmless while llm_post_correct
        # is a stub, but the flag must be forwarded once it is implemented.
        if enable_llm_correction:
            t = time.time()
            vocab_rows = await llm_post_correct(img, vocab_rows)
            result.stages['llm_correction'] = round(time.time() - t, 2)

        # Convert to output format
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german  # Skip empty rows
        ]

        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        # Top-level boundary: report the failure in the result instead of
        # raising, keeping whatever stage timings were collected so far.
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)

    return result
+
+
+# ---------------------------------------------------------------------------
+# LLM-based OCR Correction (Step 6)
+# ---------------------------------------------------------------------------
+
# Runtime dependencies and configuration for the LLM reviewer. Imported
# mid-file; json/re are aliased as _json/_re — presumably to keep this late
# block self-contained alongside the module-top json/re imports (TODO confirm).
import httpx
import os
import json as _json
import re as _re

# Ollama endpoint and review-model settings, overridable via environment.
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)

# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]"
_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]')

# Regex: digit adjacent to a letter — the hallmark of OCR digit↔letter confusion.
# Matches digits 0,1,5,6,8 (common OCR confusions: 0→O, 1→l/I, 5→S, 6→G, 8→B)
# when they appear inside or next to a word character.
_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])')
+
+
+def _entry_needs_review(entry: Dict) -> bool:
+ """Check if an entry should be sent to the LLM for review.
+
+ Sends all non-empty entries that don't have IPA phonetic transcriptions.
+ The LLM prompt and _is_spurious_change() guard against unwanted changes.
+ """
+ en = entry.get("english", "") or ""
+ de = entry.get("german", "") or ""
+
+ # Skip completely empty entries
+ if not en.strip() and not de.strip():
+ return False
+ # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them
+ if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de):
+ return False
+ return True
+
+
def _build_llm_prompt(table_lines: List[Dict]) -> str:
    """Build the LLM correction prompt for a batch of entries.

    The prompt is deliberately in German (runtime text — do not translate)
    and restricts the model to single-character digit→letter OCR fixes,
    then appends the batch as pretty-printed JSON. The trailing /no_think
    marker presumably suppresses chain-of-thought output on the configured
    qwen3 model — verify against the Ollama/Qwen documentation.

    Args:
        table_lines: Batch of entry dicts to embed as JSON.

    Returns:
        Complete prompt string ready for the generate call.
    """
    return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).

DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.

NUR diese Korrekturen sind erlaubt:
- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball"
- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old"
- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin"
- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See"
- Ziffer 6 statt G oder g: "6eld" → "Geld"
- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help"

ABSOLUT VERBOTEN — aendere NIEMALS:
- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst
- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN
- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst
- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren
- Beispielsaetze in der ex-Spalte — NIEMALS aendern

Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.

Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).

/no_think

Eingabe:
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
+
+
# Characters OCR commonly misreads, mapped to the set of plausible intended
# characters. Module-level so the table is built once, not on every call.
_OCR_CHAR_MAP = {
    # Digits mistaken for letters
    '0': set('oOgG'),
    '1': set('lLiI'),
    '5': set('sS'),
    '6': set('gG'),
    '8': set('bB'),
    # Non-letter symbols mistaken for letters
    '|': set('lLiI1'),  # pipe → lowercase l, capital I, or digit 1
    'l': set('iI|1'),   # lowercase l ↔ capital I (and reverse)
}
# Matches any "suspicious" character that may be an OCR digit/pipe error.
# Compiled once here instead of on every call.
_OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]')


def _is_spurious_change(old_val: str, new_val: str) -> bool:
    """Detect LLM changes that are likely wrong and should be discarded.

    Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are
    legitimate OCR corrections. Everything else is rejected.

    Filters out:
    - Case-only changes
    - Changes that don't contain any digit→letter fix
    - Completely different words (LLM translating or hallucinating)
    - Additions or removals of whole words (count changed)

    Returns:
        True when the change should be discarded as spurious.
    """
    if not old_val or not new_val:
        return False

    # Case-only change — never a real OCR error
    if old_val.lower() == new_val.lower():
        return True

    # If the word count changed significantly, the LLM rewrote rather than fixed
    if abs(len(old_val.split()) - len(new_val.split())) > 1:
        return True

    # Core rule: a legitimate correction replaces a digit with the corresponding
    # letter. If the change doesn't include such a substitution, reject it.
    has_valid_fix = False
    if len(old_val) == len(new_val):
        # Same length → character-level diff by position.
        for oc, nc in zip(old_val, new_val):
            if oc == nc:
                continue
            if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
                has_valid_fix = True
            elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
                # Reverse check (e.g. l→I where new is the "correct" char)
                has_valid_fix = True
    elif abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
        # Length changed by 1: accept if the old value contained a
        # suspicious char (merge/split during correction).
        has_valid_fix = True

    # No recognisable digit↔letter substitution → reject as translation
    # or hallucination.
    return not has_valid_fix
+
+
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """Compare original entries with LLM-corrected ones, return (changes, corrected_entries).

    Args:
        originals: Entries as extracted from OCR ('english'/'german'/'example' fields).
        corrected: Parsed LLM output rows ('en'/'de'/'ex' keys), parallel to *originals*.

    Returns:
        (changes, entries_out) — accepted field diffs and the entry list with
        accepted fixes applied (entries beyond len(corrected) pass through).
    """
    changes: List[Dict] = []
    entries_out: List[Dict] = []
    for i, orig in enumerate(originals):
        if i >= len(corrected):
            # LLM returned fewer rows than we sent — keep the original as-is.
            entries_out.append(dict(orig))
            continue
        c = corrected[i]
        entry = dict(orig)
        for field_name, key in (("english", "en"), ("german", "de"), ("example", "ex")):
            raw = c.get(key)
            # LLM output is untrusted JSON: tolerate null and numeric values
            # instead of crashing on .strip().
            new_val = (raw if isinstance(raw, str) else "" if raw is None else str(raw)).strip()
            old_val = (orig.get(field_name, "") or "").strip()
            if not new_val or new_val == old_val:
                continue
            # Filter spurious LLM changes (translations, hallucinations)
            if _is_spurious_change(old_val, new_val):
                continue
            changes.append({
                "row_index": orig.get("row_index", i),
                "field": field_name,
                "old": old_val,
                "new": new_val,
            })
            entry[field_name] = new_val
            entry["llm_corrected"] = True
        entries_out.append(entry)
    return changes, entries_out
+
+
# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────

# Selects the review backend: "spell" uses the deterministic pyspellchecker
# path below, "llm" routes review through the Ollama model instead.
REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm"

try:
    from spellchecker import SpellChecker as _SpellChecker
    # distance=1: restrict candidate search to one edit — conservative fixes only.
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
except ImportError:
    # NOTE: _en_spell/_de_spell remain undefined in this case — callers must
    # check _SPELL_AVAILABLE before dereferencing them.
    _SPELL_AVAILABLE = False
    logger.warning("pyspellchecker not installed — falling back to LLM review")
+
# ─── Page-Ref Normalization ───────────────────────────────────────────────────
# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60"
_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE)


def _normalize_page_ref(text: str) -> str:
    """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'."""
    if not text:
        return text

    def _dotted(match):
        # Always emit a lowercase 'p.' regardless of the matched case.
        return "p." + match.group(1)

    return _PAGE_REF_RE.sub(_dotted, text)
+
+
# Suspicious OCR chars → ordered list of most-likely correct replacements.
# Order matters: _spell_fix_token tries candidates front to back and keeps
# the first one the dictionary accepts.
_SPELL_SUBS: Dict[str, List[str]] = {
    '0': ['O', 'o'],
    '1': ['l', 'I'],
    '5': ['S', 's'],
    '6': ['G', 'g'],
    '8': ['B', 'b'],
    '|': ['I', 'l', '1'],
}
# Fast membership test over the suspicious characters above.
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())

# Tokenizer: word tokens (letters + pipe) alternating with separators.
# NOTE(review): the word class contains no digits, so a digit splits a token
# ("8en" → separator "8" + token "en") — confirm consumers expect that.
_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')
+
+
def _spell_dict_knows(word: str) -> bool:
    """True if word is known in EN or DE dictionary."""
    if not _SPELL_AVAILABLE:
        return False
    lowered = word.lower()
    if _en_spell.known([lowered]):
        return True
    return bool(_de_spell.known([lowered]))
+
+
# OCR umlaut confusion: scanners often drop umlaut dots (ä→a, ö→o, ü→u/i).
# Module-level so the table is built once, not on every call.
_UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
                'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'}


def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
    """Return corrected form of token, or None if no fix needed/possible.

    *field* is 'english' or 'german' — used to pick the right dictionary
    for general spell correction (step 4 below).
    """
    has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)

    # 1. Already known word → no fix needed
    if _spell_dict_knows(token):
        return None

    # 2. Digit/pipe substitution
    if has_suspicious:
        # Standalone pipe → capital I
        if token == '|':
            return 'I'
        # Dictionary-backed single-char substitution: candidates are tried in
        # priority order; the first one the dictionary accepts wins.
        for i, ch in enumerate(token):
            if ch not in _SPELL_SUBS:
                continue
            for replacement in _SPELL_SUBS[ch]:
                candidate = token[:i] + replacement + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate
        # Structural rule: suspicious char at position 0 + rest is all lowercase
        # letters → take the top-priority replacement without dictionary
        # confirmation. The first entry of every _SPELL_SUBS list is a letter,
        # so the former "not candidate[0].isdigit()" guard was dead code.
        first = token[0]
        if first in _SPELL_SUBS and len(token) >= 2:
            rest = token[1:]
            if rest.isalpha() and rest.islower():
                return _SPELL_SUBS[first][0] + rest

    # 3. OCR umlaut confusion — try single-char umlaut substitutions and
    # check against the dictionary (German fields only).
    if len(token) >= 3 and token.isalpha() and field == "german":
        for i, ch in enumerate(token):
            if ch in _UMLAUT_SUBS:
                candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate

    # 4. General spell correction for unknown words (no digits/pipes),
    # e.g. "beautful" → "beautiful". Guarded on _SPELL_AVAILABLE: without
    # pyspellchecker the _en_spell/_de_spell names do not exist and the
    # original code raised NameError here.
    if _SPELL_AVAILABLE and not has_suspicious and len(token) >= 3 and token.isalpha():
        spell = _en_spell if field == "english" else _de_spell if field == "german" else None
        if spell is not None:
            correction = spell.correction(token.lower())
            if correction and correction != token.lower():
                # Preserve original capitalisation pattern
                if token[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if _spell_dict_knows(correction):
                    return correction
    return None
+
+
def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
    """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).

    *field* is 'english' or 'german' — forwarded to _spell_fix_token for
    dictionary selection.

    NOTE(review): the middle of this function was destroyed in the patch by
    an angle-bracket-eating extraction step (a lookbehind regex was fused
    with the following def line). The token loop below is reconstructed from
    the surviving comments and the _spell_fix_token contract — verify it
    against the original source before merging.
    """
    if not text:
        return text, False
    has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
    # If no suspicious chars AND no alpha chars that could be misspelled, skip
    if not has_suspicious and not any(c.isalpha() for c in text):
        return text, False
    # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
    fixed = _re.sub(r'\|(?=[.,])', '1', text)
    changed = fixed != text
    # Fix whitespace-separated tokens one by one; attached punctuation is
    # stripped for the lookup and preserved in the output.
    out_words: List[str] = []
    for word in fixed.split(' '):
        core = word.strip('.,;:!?()"')
        if core:
            replacement = _spell_fix_token(core, field=field)
            if replacement and replacement != core:
                word = word.replace(core, replacement, 1)
                changed = True
        out_words.append(word)
    return ' '.join(out_words), changed


def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Rule-based OCR correction: spell-checker + structural heuristics.

    Deterministic — never translates, never touches IPA, never hallucinates.
    """
    t0 = time.time()
    changes: List[Dict] = []
    all_corrected: List[Dict] = []
    for i, entry in enumerate(entries):
        e = dict(entry)
        # Page-ref normalization (always, regardless of review status)
        old_ref = (e.get("source_page") or "").strip()
        if old_ref:
            new_ref = _normalize_page_ref(old_ref)
            if new_ref != old_ref:
                changes.append({
                    "row_index": e.get("row_index", i),
                    "field": "source_page",
                    "old": old_ref,
                    "new": new_ref,
                })
                e["source_page"] = new_ref
                e["llm_corrected"] = True
        if not _entry_needs_review(e):
            all_corrected.append(e)
            continue
        for field_name in ("english", "german", "example"):
            old_val = (e.get(field_name) or "").strip()
            if not old_val:
                continue
            # example field is mixed-language — try German first (for umlauts)
            lang = "german" if field_name in ("german", "example") else "english"
            new_val, was_changed = _spell_fix_field(old_val, field=lang)
            if was_changed and new_val != old_val:
                changes.append({
                    "row_index": e.get("row_index", i),
                    "field": field_name,
                    "old": old_val,
                    "new": new_val,
                })
                e[field_name] = new_val
                e["llm_corrected"] = True
        all_corrected.append(e)
    duration_ms = int((time.time() - t0) * 1000)
    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": 0,
        "model_used": "spell-checker",
        "duration_ms": duration_ms,
    }
+
+
async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Async generator yielding SSE-compatible events for spell-checker review.

    Emits a 'meta' event, one 'batch' event covering all entries (the
    rule-based pass is fast enough to run in one shot), then 'complete'.
    """
    total = len(entries)
    yield {
        "type": "meta",
        "total_entries": total,
        "to_review": total,
        "skipped": 0,
        "model": "spell-checker",
        "batch_size": batch_size,
    }
    outcome = spell_review_entries_sync(entries)
    found = outcome["changes"]
    elapsed = outcome["duration_ms"]
    reviewed_rows = [e.get("row_index", i) for i, e in enumerate(entries)]
    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": reviewed_rows,
        "changes": found,
        "duration_ms": elapsed,
        "progress": {"current": total, "total": total},
    }
    yield {
        "type": "complete",
        "changes": found,
        "model_used": "spell-checker",
        "duration_ms": elapsed,
        "total_entries": total,
        "reviewed": total,
        "skipped": 0,
        "corrections_found": len(found),
        "entries_corrected": outcome["entries_corrected"],
    }
+
+# ─── End Spell-Checker ────────────────────────────────────────────────────────
+
+
async def llm_review_entries(
    entries: List[Dict],
    model: Optional[str] = None,
) -> Dict:
    """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm).

    Args:
        entries: Vocabulary entries ('english'/'german'/'example' fields).
        model: Ollama model name; defaults to OLLAMA_REVIEW_MODEL.
               (Annotation fixed: was the implicit-Optional `model: str = None`.)

    Returns:
        Dict with entries_original, entries_corrected, changes,
        skipped_count, model_used and duration_ms.
    """
    # Deterministic rule-based path — preferred whenever available.
    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        return spell_review_entries_sync(entries)
    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # Filter: only entries that need review (cheap digit-pattern gate)
    reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]

    if not reviewable:
        # Nothing suspicious — skip the LLM round-trip entirely.
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [e for _, e in reviewable]
    # Compact table form keeps the prompt small: row index + three fields.
    table_lines = [
        {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
        for e in review_entries
    ]

    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, len(entries) - len(reviewable))
    logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))

    prompt = _build_llm_prompt(table_lines)

    t0 = time.time()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(
            f"{_OLLAMA_URL}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "think": False,  # qwen3: disable chain-of-thought (Ollama >=0.6)
                "options": {"temperature": 0.1, "num_predict": 8192},
            },
        )
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - t0) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
    logger.debug("LLM review raw response (first 500): %.500s", content)

    corrected = _parse_llm_json_array(content)
    logger.info("LLM review: parsed %d corrected entries, applying diff...", len(corrected))
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Merge corrected entries back into the full list (skipped ones unchanged)
    all_corrected = [dict(e) for e in entries]
    for batch_idx, (orig_idx, _) in enumerate(reviewable):
        if batch_idx < len(corrected_entries):
            all_corrected[orig_idx] = corrected_entries[batch_idx]

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": len(entries) - len(reviewable),
        "model_used": model,
        "duration_ms": duration_ms,
    }
+
+
async def llm_review_entries_streaming(
    entries: List[Dict],
    model: Optional[str] = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.

    Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
    visible in the UI — this is the only place the fix now runs (removed from Step 1
    of build_vocab_pipeline_streaming).

    Args:
        entries: Vocabulary entries; mutated in place by the phase-0 fix.
        model: Ollama model name; defaults to OLLAMA_REVIEW_MODEL (LLM path only).
        batch_size: Entries per LLM request (LLM path only).

    Yields:
        Event dicts with a 'type' key: 'meta', one 'batch' per chunk, 'complete'.
    """
    # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
    _CONF_FIELDS = ('english', 'german', 'example')
    # Snapshot the reviewed fields so the in-place fix can be diffed afterwards.
    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
    _fix_character_confusion(entries)  # modifies in-place, returns same list
    char_changes = [
        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
        for i in range(len(entries))
        for f in _CONF_FIELDS
        if originals[i][f] != entries[i].get(f, '')
    ]

    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        # Inject char_changes as a batch right after the meta event from the spell checker
        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
            if not _meta_sent and event.get('type') == 'meta' and char_changes:
                _meta_sent = True
                yield {
                    'type': 'batch',
                    'changes': char_changes,
                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
                    'progress': {'current': 0, 'total': len(entries)},
                }
        return

    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    # LLM path: emit char_changes first (before meta) so they appear in the UI
    if char_changes:
        yield {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }

    model = model or OLLAMA_REVIEW_MODEL

    # Separate reviewable from skipped entries (cheap digit-pattern gate)
    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    # meta event
    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]
    total_duration_ms = 0
    reviewed_count = 0

    # Process in batches so the UI gets incremental progress events
    for batch_start in range(0, total_to_review, batch_size):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        # Compact table form keeps the prompt small: row index + three fields.
        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d — sending %d entries to %s",
                    batch_start // batch_size, len(batch_entries), model)

        t0 = time.time()
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,  # qwen3: disable chain-of-thought
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
        batch_ms = int((time.time() - t0) * 1000)
        total_duration_ms += batch_ms

        logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content))
        logger.debug("LLM review streaming raw (first 500): %.500s", content)

        corrected = _parse_llm_json_array(content)
        logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected))
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Merge back into the full list (positions outside this batch untouched)
        for batch_idx, (orig_idx, _) in enumerate(batch_items):
            if batch_idx < len(batch_corrected):
                all_corrected[orig_idx] = batch_corrected[batch_idx]

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        # Yield batch result
        yield {
            "type": "batch",
            "batch_index": batch_start // batch_size,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    # Complete event
    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }
+
+
def _sanitize_for_json(text: str) -> str:
    """Remove or escape control characters that break JSON parsing.

    Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid
    JSON whitespace. Removes all other ASCII control characters (0x00-0x1f)
    that are only valid inside JSON strings when properly escaped.
    """
    # Replace literal control chars (except \\t \\n \\r) with a space
    return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)


def _parse_llm_json_array(text: str) -> List[Dict]:
    """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags).

    Returns [] (with a warning logged) when no parseable array is found.
    """
    # Strip qwen3 <think>...</think> blocks (present even with think=False on
    # some builds). FIX: the tag literals had been lost, leaving a no-op
    # pattern r'.*?' that stripped nothing.
    text = _re.sub(r'<think>.*?</think>', '', text, flags=_re.DOTALL)
    # Strip markdown code fences
    text = _re.sub(r'```json\s*', '', text)
    text = _re.sub(r'```\s*', '', text)
    # Sanitize control characters before JSON parsing
    text = _sanitize_for_json(text)
    # Find first [ ... last ] (greedy, tolerates prose around the array)
    match = _re.search(r'\[.*\]', text, _re.DOTALL)
    if match:
        try:
            return _json.loads(match.group())
        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
            logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200])
    else:
        logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200])
    return []
diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 1c4961d..940381b 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -1,8163 +1,35 @@
"""
CV-based Document Reconstruction Pipeline for Vocabulary Extraction.
-Uses classical Computer Vision techniques for high-quality OCR:
-- High-resolution PDF rendering (432 DPI)
-- Deskew (rotation correction via Hough Lines)
-- Dewarp (book curvature correction) — pass-through initially
-- Dual image preparation (binarized for OCR, CLAHE for layout)
-- Projection-profile layout analysis (column/row detection)
-- Multi-pass Tesseract OCR with region-specific PSM settings
-- Y-coordinate line alignment for vocabulary matching
-- Optional LLM post-correction for low-confidence regions
+Re-export facade — all logic lives in the sub-modules:
+
+ cv_vocab_types Dataklassen, Konstanten, IPA, Feature-Flags
+ cv_preprocessing Bild-I/O, Orientierung, Deskew, Dewarp
+ cv_layout Dokumenttyp, Spalten, Zeilen, Klassifikation
+ cv_ocr_engines OCR-Engines, Vocab-Postprocessing, Text-Cleaning
+ cv_cell_grid Cell-Grid (v2 + Legacy), Vocab-Konvertierung
+ cv_review LLM/Spell Review, Pipeline-Orchestrierung
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
-import io
-import logging
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import dataclass, field
-from typing import Any, Dict, Generator, List, Optional, Tuple
-
-import numpy as np
-
-logger = logging.getLogger(__name__)
-
-# --- Availability Guards ---
-
-try:
- import cv2
- CV2_AVAILABLE = True
-except ImportError:
- cv2 = None
- CV2_AVAILABLE = False
- logger.warning("OpenCV not available — CV pipeline disabled")
-
-try:
- import pytesseract
- from PIL import Image
- TESSERACT_AVAILABLE = True
-except ImportError:
- pytesseract = None
- Image = None
- TESSERACT_AVAILABLE = False
- logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
-
-CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
-
-# --- IPA Dictionary ---
-
-import json
-import os
-import re
-
-IPA_AVAILABLE = False
-_ipa_convert_american = None
-_britfone_dict: Dict[str, str] = {}
-
-try:
- import eng_to_ipa as _eng_to_ipa
- _ipa_convert_american = _eng_to_ipa.convert
- IPA_AVAILABLE = True
- logger.info("eng_to_ipa available — American IPA lookup enabled")
-except ImportError:
- logger.info("eng_to_ipa not installed — American IPA disabled")
-
-# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
-_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
-if os.path.exists(_britfone_path):
- try:
- with open(_britfone_path, 'r', encoding='utf-8') as f:
- _britfone_dict = json.load(f)
- IPA_AVAILABLE = True
- logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
- except Exception as e:
- logger.warning(f"Failed to load Britfone: {e}")
-else:
- logger.info("Britfone not found — British IPA disabled")
-
-# --- Language Detection Constants ---
-
-GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht',
- 'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird',
- 'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 'noch', 'aber', 'hat', 'nur',
- 'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben',
- 'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'}
-
-ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of',
- 'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from',
- 'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
- 'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he',
- 'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'}
-
-
-# --- Data Classes ---
-
-@dataclass
-class PageRegion:
- """A detected region on the page."""
- type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom'
- x: int
- y: int
- width: int
- height: int
- classification_confidence: float = 1.0 # 0.0-1.0
- classification_method: str = "" # 'content', 'position_enhanced', 'position_fallback'
-
-
-@dataclass
-class ColumnGeometry:
- """Geometrisch erkannte Spalte vor Typ-Klassifikation."""
- index: int # 0-basiert, links->rechts
- x: int
- y: int
- width: int
- height: int
- word_count: int
- words: List[Dict] # Wort-Dicts aus Tesseract (text, conf, left, top, ...)
- width_ratio: float # width / content_width (0.0-1.0)
- is_sub_column: bool = False # True if created by _detect_sub_columns() split
-
-
-@dataclass
-class RowGeometry:
- """Geometrisch erkannte Zeile mit Kopf-/Fusszeilen-Klassifikation."""
- index: int # 0-basiert, oben→unten
- x: int # absolute left (= content left_x)
- y: int # absolute y start
- width: int # content width
- height: int # Zeilenhoehe in px
- word_count: int
- words: List[Dict]
- row_type: str = 'content' # 'content' | 'header' | 'footer'
- gap_before: int = 0 # Gap in px ueber dieser Zeile
-
-
-@dataclass
-class VocabRow:
- """A single vocabulary entry assembled from multi-column OCR."""
- english: str = ""
- german: str = ""
- example: str = ""
- source_page: str = ""
- confidence: float = 0.0
- y_position: int = 0
-
-
-@dataclass
-class PipelineResult:
- """Complete result of the CV pipeline."""
- vocabulary: List[Dict[str, Any]] = field(default_factory=list)
- word_count: int = 0
- columns_detected: int = 0
- duration_seconds: float = 0.0
- stages: Dict[str, float] = field(default_factory=dict)
- error: Optional[str] = None
- image_width: int = 0
- image_height: int = 0
-
-
-@dataclass
-class DocumentTypeResult:
- """Result of automatic document type detection."""
- doc_type: str # 'vocab_table' | 'full_text' | 'generic_table'
- confidence: float # 0.0-1.0
- pipeline: str # 'cell_first' | 'full_page'
- skip_steps: List[str] = field(default_factory=list) # e.g. ['columns', 'rows']
- features: Dict[str, Any] = field(default_factory=dict) # debug info
-
-
-# =============================================================================
-# Stage 1: High-Resolution PDF Rendering
-# =============================================================================
-
-def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
- """Render a PDF page to a high-resolution numpy array (BGR).
-
- Args:
- pdf_data: Raw PDF bytes.
- page_number: 0-indexed page number.
- zoom: Zoom factor (3.0 = 432 DPI).
-
- Returns:
- numpy array in BGR format.
- """
- import fitz # PyMuPDF
-
- pdf_doc = fitz.open(stream=pdf_data, filetype="pdf")
- if page_number >= pdf_doc.page_count:
- raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)")
-
- page = pdf_doc[page_number]
- mat = fitz.Matrix(zoom, zoom)
- pix = page.get_pixmap(matrix=mat)
-
- # Convert to numpy BGR
- img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
- if pix.n == 4: # RGBA
- img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
- elif pix.n == 3: # RGB
- img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
- else: # Grayscale
- img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
-
- pdf_doc.close()
- return img_bgr
-
-
-def render_image_high_res(image_data: bytes) -> np.ndarray:
- """Load an image (PNG/JPEG) into a numpy array (BGR).
-
- Args:
- image_data: Raw image bytes.
-
- Returns:
- numpy array in BGR format.
- """
- img_array = np.frombuffer(image_data, dtype=np.uint8)
- img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
- if img_bgr is None:
- raise ValueError("Could not decode image data")
- return img_bgr
-
-
-# =============================================================================
-# Stage 1b: Orientation Detection (0°/90°/180°/270°)
-# =============================================================================
-
-def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
- """Detect page orientation via Tesseract OSD and rotate if needed.
-
- Handles upside-down scans (180°) common with book scanners where
- every other page is flipped due to the scanner hinge.
-
- Returns:
- (corrected_image, rotation_degrees) — rotation is 0, 90, 180, or 270.
- """
- if pytesseract is None:
- return img_bgr, 0
-
- try:
- # Tesseract OSD needs a grayscale or RGB image
- gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
- pil_img = Image.fromarray(gray)
-
- osd = pytesseract.image_to_osd(pil_img, output_type=pytesseract.Output.DICT)
- rotate = osd.get("rotate", 0)
- confidence = osd.get("orientation_conf", 0.0)
-
- logger.info(f"OSD: orientation={rotate}° confidence={confidence:.1f}")
-
- if rotate == 0 or confidence < 1.0:
- return img_bgr, 0
-
- # Apply rotation
- if rotate == 180:
- corrected = cv2.rotate(img_bgr, cv2.ROTATE_180)
- elif rotate == 90:
- corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_COUNTERCLOCKWISE)
- elif rotate == 270:
- corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_CLOCKWISE)
- else:
- return img_bgr, 0
-
- logger.info(f"OSD: rotated {rotate}° to fix orientation")
- return corrected, rotate
-
- except Exception as e:
- logger.warning(f"OSD orientation detection failed: {e}")
- return img_bgr, 0
-
-
-# =============================================================================
-# Stage 2: Deskew (Rotation Correction)
-# =============================================================================
-
-def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
- """Correct rotation using Hough Line detection.
-
- Args:
- img: BGR image.
-
- Returns:
- Tuple of (corrected image, detected angle in degrees).
- """
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- # Binarize for line detection
- _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-
- # Detect lines
- lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
- minLineLength=img.shape[1] // 4, maxLineGap=20)
-
- if lines is None or len(lines) < 3:
- return img, 0.0
-
- # Compute angles of near-horizontal lines
- angles = []
- for line in lines:
- x1, y1, x2, y2 = line[0]
- angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
- if abs(angle) < 15: # Only near-horizontal
- angles.append(angle)
-
- if not angles:
- return img, 0.0
-
- median_angle = float(np.median(angles))
-
- # Limit correction to ±5°
- if abs(median_angle) > 5.0:
- median_angle = 5.0 * np.sign(median_angle)
-
- if abs(median_angle) < 0.1:
- return img, 0.0
-
- # Rotate
- h, w = img.shape[:2]
- center = (w // 2, h // 2)
- M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
- corrected = cv2.warpAffine(img, M, (w, h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
-
- logger.info(f"Deskew: corrected {median_angle:.2f}° rotation")
- return corrected, median_angle
-
-
-def deskew_image_by_word_alignment(
- image_data: bytes,
- lang: str = "eng+deu",
- downscale_factor: float = 0.5,
-) -> Tuple[bytes, float]:
- """Correct rotation by fitting a line through left-most word starts per text line.
-
- More robust than Hough-based deskew for vocabulary worksheets where text lines
- have consistent left-alignment. Runs a quick Tesseract pass on a downscaled
- copy to find word positions, computes the dominant left-edge column, fits a
- line through those points and rotates the full-resolution image.
-
- Args:
- image_data: Raw image bytes (PNG/JPEG).
- lang: Tesseract language string for the quick pass.
- downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).
-
- Returns:
- Tuple of (rotated image as PNG bytes, detected angle in degrees).
- """
- if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
- return image_data, 0.0
-
- # 1. Decode image
- img_array = np.frombuffer(image_data, dtype=np.uint8)
- img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
- if img is None:
- logger.warning("deskew_by_word_alignment: could not decode image")
- return image_data, 0.0
-
- orig_h, orig_w = img.shape[:2]
-
- # 2. Downscale for fast Tesseract pass
- small_w = int(orig_w * downscale_factor)
- small_h = int(orig_h * downscale_factor)
- small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)
-
- # 3. Quick Tesseract — word-level positions
- pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
- try:
- data = pytesseract.image_to_data(
- pil_small, lang=lang, config="--psm 6 --oem 3",
- output_type=pytesseract.Output.DICT,
- )
- except Exception as e:
- logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
- return image_data, 0.0
-
- # 4. Per text-line, find the left-most word start
- # Group by (block_num, par_num, line_num)
- from collections import defaultdict
- line_groups: Dict[tuple, list] = defaultdict(list)
- for i in range(len(data["text"])):
- text = (data["text"][i] or "").strip()
- conf = int(data["conf"][i])
- if not text or conf < 20:
- continue
- key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
- line_groups[key].append(i)
-
- if len(line_groups) < 5:
- logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
- return image_data, 0.0
-
- # For each line, pick the word with smallest 'left' → compute (left_x, center_y)
- # Scale back to original resolution
- scale = 1.0 / downscale_factor
- points = [] # list of (x, y) in original-image coords
- for key, indices in line_groups.items():
- best_idx = min(indices, key=lambda i: data["left"][i])
- lx = data["left"][best_idx] * scale
- top = data["top"][best_idx] * scale
- h = data["height"][best_idx] * scale
- cy = top + h / 2.0
- points.append((lx, cy))
-
- # 5. Find dominant left-edge column + compute angle
- xs = np.array([p[0] for p in points])
- ys = np.array([p[1] for p in points])
- median_x = float(np.median(xs))
- tolerance = orig_w * 0.03 # 3% of image width
-
- mask = np.abs(xs - median_x) <= tolerance
- filtered_xs = xs[mask]
- filtered_ys = ys[mask]
-
- if len(filtered_xs) < 5:
- logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
- return image_data, 0.0
-
- # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a)
- coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
- slope = coeffs[0] # dx/dy
- angle_rad = np.arctan(slope)
- angle_deg = float(np.degrees(angle_rad))
-
- # Clamp to ±5°
- angle_deg = max(-5.0, min(5.0, angle_deg))
-
- logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points "
- f"(total lines: {len(line_groups)})")
-
- if abs(angle_deg) < 0.05:
- return image_data, 0.0
-
- # 6. Rotate full-res image
- center = (orig_w // 2, orig_h // 2)
- M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
- rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
-
- # Encode back to PNG
- success, png_buf = cv2.imencode(".png", rotated)
- if not success:
- logger.warning("deskew_by_word_alignment: PNG encoding failed")
- return image_data, 0.0
-
- return png_buf.tobytes(), angle_deg
-
-
-def _projection_gradient_score(profile: np.ndarray) -> float:
- """Score a projection profile by the L2-norm of its first derivative.
-
- Higher score = sharper transitions between text-lines and gaps,
- i.e. better row/column alignment.
- """
- diff = np.diff(profile)
- return float(np.sum(diff * diff))
-
-
-def deskew_image_iterative(
- img: np.ndarray,
- coarse_range: float = 5.0,
- coarse_step: float = 0.1,
- fine_range: float = 0.15,
- fine_step: float = 0.02,
-) -> Tuple[np.ndarray, float, Dict[str, Any]]:
- """Iterative deskew using vertical-edge projection optimisation.
-
- The key insight: at the correct rotation angle, vertical features
- (word left-edges, column borders) become truly vertical, producing
- the sharpest peaks in the vertical projection of vertical edges.
-
- Method:
- 1. Detect vertical edges via Sobel-X on the central crop.
- 2. Coarse sweep: rotate edge image, compute vertical projection
- gradient score. The angle where vertical edges align best wins.
- 3. Fine sweep: refine around the coarse winner.
-
- Args:
- img: BGR image (full resolution).
- coarse_range: half-range in degrees for the coarse sweep.
- coarse_step: step size in degrees for the coarse sweep.
- fine_range: half-range around the coarse winner for the fine sweep.
- fine_step: step size in degrees for the fine sweep.
-
- Returns:
- (rotated_bgr, angle_degrees, debug_dict)
- """
- h, w = img.shape[:2]
- debug: Dict[str, Any] = {}
-
- # --- Grayscale + vertical edge detection ---
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # Central crop (15%-85% height, 10%-90% width) to avoid page margins
- y_lo, y_hi = int(h * 0.15), int(h * 0.85)
- x_lo, x_hi = int(w * 0.10), int(w * 0.90)
- gray_crop = gray[y_lo:y_hi, x_lo:x_hi]
-
- # Sobel-X → absolute vertical edges
- sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
- edges = np.abs(sobel_x)
- # Normalise to 0-255 for consistent scoring
- edge_max = edges.max()
- if edge_max > 0:
- edges = (edges / edge_max * 255).astype(np.uint8)
- else:
- return img, 0.0, {"error": "no edges detected"}
-
- crop_h, crop_w = edges.shape[:2]
- crop_center = (crop_w // 2, crop_h // 2)
-
- # Trim margin after rotation to avoid border artifacts
- trim_y = max(4, int(crop_h * 0.03))
- trim_x = max(4, int(crop_w * 0.03))
-
- def _sweep_edges(angles: np.ndarray) -> list:
- """Score each angle by vertical projection gradient of vertical edges."""
- results = []
- for angle in angles:
- if abs(angle) < 1e-6:
- rotated = edges
- else:
- M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
- rotated = cv2.warpAffine(edges, M, (crop_w, crop_h),
- flags=cv2.INTER_NEAREST,
- borderMode=cv2.BORDER_REPLICATE)
- # Trim borders to avoid edge artifacts
- trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
- v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
- score = _projection_gradient_score(v_profile)
- results.append((float(angle), score))
- return results
-
- # --- Phase 1: coarse sweep ---
- coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
- coarse_results = _sweep_edges(coarse_angles)
- best_coarse = max(coarse_results, key=lambda x: x[1])
- best_coarse_angle, best_coarse_score = best_coarse
-
- debug["coarse_best_angle"] = round(best_coarse_angle, 2)
- debug["coarse_best_score"] = round(best_coarse_score, 1)
- debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]
-
- # --- Phase 2: fine sweep around coarse winner ---
- fine_lo = best_coarse_angle - fine_range
- fine_hi = best_coarse_angle + fine_range
- fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
- fine_results = _sweep_edges(fine_angles)
- best_fine = max(fine_results, key=lambda x: x[1])
- best_fine_angle, best_fine_score = best_fine
-
- debug["fine_best_angle"] = round(best_fine_angle, 2)
- debug["fine_best_score"] = round(best_fine_score, 1)
- debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]
-
- final_angle = best_fine_angle
-
- # Clamp to ±5°
- final_angle = max(-5.0, min(5.0, final_angle))
-
- logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}° fine={best_fine_angle:.2f}° -> {final_angle:.2f}°")
-
- if abs(final_angle) < 0.05:
- return img, 0.0, debug
-
- # --- Rotate full-res image ---
- center = (w // 2, h // 2)
- M = cv2.getRotationMatrix2D(center, final_angle, 1.0)
- rotated = cv2.warpAffine(img, M, (w, h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
-
- return rotated, final_angle, debug
-
-
-def _measure_textline_slope(img: np.ndarray) -> float:
- """Measure residual text-line slope via Tesseract word-position regression.
-
- Groups Tesseract words by (block, par, line), fits a linear regression
- per line (y = slope * x + b), and returns the trimmed-mean slope in
- degrees. Positive = text rises to the right, negative = falls.
-
- This is the most direct measurement of remaining rotation after deskew.
- """
- import math as _math
-
- if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
- return 0.0
-
- h, w = img.shape[:2]
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- data = pytesseract.image_to_data(
- Image.fromarray(gray),
- output_type=pytesseract.Output.DICT,
- config="--psm 6",
- )
-
- # Group word centres by text line
- lines: Dict[tuple, list] = {}
- for i in range(len(data["text"])):
- txt = (data["text"][i] or "").strip()
- if len(txt) < 2 or int(data["conf"][i]) < 30:
- continue
- key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
- cx = data["left"][i] + data["width"][i] / 2.0
- cy = data["top"][i] + data["height"][i] / 2.0
- lines.setdefault(key, []).append((cx, cy))
-
- # Per-line linear regression → slope angle
- slopes: list = []
- for pts in lines.values():
- if len(pts) < 3:
- continue
- pts.sort(key=lambda p: p[0])
- xs = np.array([p[0] for p in pts], dtype=np.float64)
- ys = np.array([p[1] for p in pts], dtype=np.float64)
- if xs[-1] - xs[0] < w * 0.15:
- continue # skip short lines
- A = np.vstack([xs, np.ones_like(xs)]).T
- result = np.linalg.lstsq(A, ys, rcond=None)
- slope = result[0][0]
- slopes.append(_math.degrees(_math.atan(slope)))
-
- if len(slopes) < 3:
- return 0.0
-
- # Trimmed mean (drop 10% extremes on each side)
- slopes.sort()
- trim = max(1, len(slopes) // 10)
- trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes
- if not trimmed:
- return 0.0
-
- return sum(trimmed) / len(trimmed)
-
-
-def deskew_two_pass(
- img: np.ndarray,
- coarse_range: float = 5.0,
-) -> Tuple[np.ndarray, float, Dict[str, Any]]:
- """Two-pass deskew: iterative projection + word-alignment residual check.
-
- Pass 1: ``deskew_image_iterative()`` (vertical-edge projection, wide range).
- Pass 2: ``deskew_image_by_word_alignment()`` on the already-corrected image
- to detect and fix residual skew that the projection method missed.
-
- The two corrections are summed. If the residual from Pass 2 is below
- 0.3° it is ignored (already good enough).
-
- Returns:
- (corrected_bgr, total_angle_degrees, debug_dict)
- """
- debug: Dict[str, Any] = {}
-
- # --- Pass 1: iterative projection ---
- corrected, angle1, dbg1 = deskew_image_iterative(
- img.copy(), coarse_range=coarse_range,
- )
- debug["pass1_angle"] = round(angle1, 3)
- debug["pass1_method"] = "iterative"
- debug["pass1_debug"] = dbg1
-
- # --- Pass 2: word-alignment residual check on corrected image ---
- angle2 = 0.0
- try:
- # Encode the corrected image to PNG bytes for word-alignment
- ok, buf = cv2.imencode(".png", corrected)
- if ok:
- corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
- if abs(angle2) >= 0.3:
- # Significant residual — decode and use the second correction
- arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
- corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
- if corrected2 is not None:
- corrected = corrected2
- logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° applied "
- f"(total={angle1 + angle2:.2f}°)")
- else:
- angle2 = 0.0
- else:
- logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° < 0.3° — skipped")
- angle2 = 0.0
- except Exception as e:
- logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
- angle2 = 0.0
-
- # --- Pass 3: Tesseract text-line regression residual check ---
- # The most reliable final check: measure actual text-line slopes
- # using Tesseract word positions and linear regression per line.
- angle3 = 0.0
- try:
- residual = _measure_textline_slope(corrected)
- debug["pass3_raw"] = round(residual, 3)
- if abs(residual) >= 0.3:
- h3, w3 = corrected.shape[:2]
- center3 = (w3 // 2, h3 // 2)
- M3 = cv2.getRotationMatrix2D(center3, residual, 1.0)
- corrected = cv2.warpAffine(
- corrected, M3, (w3, h3),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE,
- )
- angle3 = residual
- logger.info(
- "deskew_two_pass: pass3 text-line residual=%.2f° applied",
- residual,
- )
- else:
- logger.info(
- "deskew_two_pass: pass3 text-line residual=%.2f° < 0.3° — skipped",
- residual,
- )
- except Exception as e:
- logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)
-
- total_angle = angle1 + angle2 + angle3
- debug["pass2_angle"] = round(angle2, 3)
- debug["pass2_method"] = "word_alignment"
- debug["pass3_angle"] = round(angle3, 3)
- debug["pass3_method"] = "textline_regression"
- debug["total_angle"] = round(total_angle, 3)
-
- logger.info(
- "deskew_two_pass: pass1=%.2f° + pass2=%.2f° + pass3=%.2f° = %.2f°",
- angle1, angle2, angle3, total_angle,
- )
-
- return corrected, total_angle, debug
-
-
-# =============================================================================
-# Stage 3: Dewarp (Book Curvature Correction)
-# =============================================================================
-
-def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
- """Detect the vertical shear angle of the page.
-
- After deskew (horizontal lines aligned), vertical features like column
- edges may still be tilted. This measures that tilt by tracking the
- strongest vertical edge across horizontal strips.
-
- The result is a shear angle in degrees: the angular difference between
- true vertical and the detected column edge.
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- h, w = img.shape[:2]
- result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}
-
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # Vertical Sobel to find vertical edges
- sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
- abs_sobel = np.abs(sobel_x).astype(np.uint8)
-
- # Binarize with Otsu
- _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-
- num_strips = 20
- strip_h = h // num_strips
- edge_positions = [] # (y_center, x_position)
-
- for i in range(num_strips):
- y_start = i * strip_h
- y_end = min((i + 1) * strip_h, h)
- strip = binary[y_start:y_end, :]
-
- # Project vertically (sum along y-axis)
- projection = np.sum(strip, axis=0).astype(np.float64)
- if projection.max() == 0:
- continue
-
- # Find the strongest vertical edge in left 40% of image
- search_w = int(w * 0.4)
- left_proj = projection[:search_w]
- if left_proj.max() == 0:
- continue
-
- # Smooth and find peak
- kernel_size = max(3, w // 100)
- if kernel_size % 2 == 0:
- kernel_size += 1
- smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
- x_pos = float(np.argmax(smoothed))
- y_center = (y_start + y_end) / 2.0
- edge_positions.append((y_center, x_pos))
-
- if len(edge_positions) < 8:
- return result
-
- ys = np.array([p[0] for p in edge_positions])
- xs = np.array([p[1] for p in edge_positions])
-
- # Remove outliers (> 2 std from median)
- median_x = np.median(xs)
- std_x = max(np.std(xs), 1.0)
- mask = np.abs(xs - median_x) < 2 * std_x
- ys = ys[mask]
- xs = xs[mask]
-
- if len(ys) < 6:
- return result
-
- # Fit straight line: x = slope * y + intercept
- # The slope tells us the tilt of the vertical edge
- straight_coeffs = np.polyfit(ys, xs, 1)
- slope = straight_coeffs[0] # dx/dy in pixels
- fitted = np.polyval(straight_coeffs, ys)
- residuals = xs - fitted
- rmse = float(np.sqrt(np.mean(residuals ** 2)))
-
- # Convert slope to angle: arctan(dx/dy) in degrees
- import math
- shear_degrees = math.degrees(math.atan(slope))
-
- confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)
-
- result["shear_degrees"] = round(shear_degrees, 3)
- result["confidence"] = round(float(confidence), 2)
-
- return result
-
-
-def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
- """Detect shear angle by maximising variance of horizontal text-line projections.
-
- Principle: horizontal text lines produce a row-projection profile with sharp
- peaks (high variance) when the image is correctly aligned. Any residual shear
- smears the peaks and reduces variance. We sweep ±3° and pick the angle whose
- corrected projection has the highest variance.
-
- Works best on pages with clear horizontal banding (vocabulary tables, prose).
- Complements _detect_shear_angle() which needs strong vertical edges.
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- import math
- result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}
-
- h, w = img.shape[:2]
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- # Otsu binarisation
- _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-
- # Work at half resolution for speed
- small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
- sh, sw = small.shape
-
- # 2-pass angle sweep for 10x better precision:
- # Pass 1: Coarse sweep ±3° in 0.5° steps (13 values)
- # Pass 2: Fine sweep ±0.5° around coarse best in 0.05° steps (21 values)
-
- def _sweep_variance(angles_list):
- results = []
- for angle_deg in angles_list:
- if abs(angle_deg) < 0.001:
- rotated = small
- else:
- shear_tan = math.tan(math.radians(angle_deg))
- M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
- rotated = cv2.warpAffine(small, M, (sw, sh),
- flags=cv2.INTER_NEAREST,
- borderMode=cv2.BORDER_CONSTANT)
- profile = np.sum(rotated, axis=1).astype(float)
- results.append((angle_deg, float(np.var(profile))))
- return results
-
- # Pass 1: coarse
- coarse_angles = [a * 0.5 for a in range(-6, 7)] # 13 values
- coarse_results = _sweep_variance(coarse_angles)
- coarse_best = max(coarse_results, key=lambda x: x[1])
-
- # Pass 2: fine around coarse best
- fine_center = coarse_best[0]
- fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)] # 21 values
- fine_results = _sweep_variance(fine_angles)
- fine_best = max(fine_results, key=lambda x: x[1])
-
- best_angle = fine_best[0]
- best_variance = fine_best[1]
- variances = coarse_results + fine_results
-
- # Confidence: how much sharper is the best angle vs. the mean?
- all_mean = sum(v for _, v in variances) / len(variances)
- if all_mean > 0 and best_variance > all_mean:
- confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
- else:
- confidence = 0.0
-
- result["shear_degrees"] = round(best_angle, 3)
- result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
- return result
-
-
-def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
- """Detect shear using Hough transform on printed table / ruled lines.
-
- Vocabulary worksheets have near-horizontal printed table borders. After
- deskew these should be exactly horizontal; any residual tilt equals the
- vertical shear angle (with inverted sign).
-
- The sign convention: a horizontal line tilting +α degrees (left end lower)
- means the page has vertical shear of -α degrees (left column edge drifts
- to the left going downward).
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}
-
- h, w = img.shape[:2]
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
- edges = cv2.Canny(gray, 50, 150, apertureSize=3)
-
- min_len = int(w * 0.15)
- lines = cv2.HoughLinesP(
- edges, rho=1, theta=np.pi / 360,
- threshold=int(w * 0.08),
- minLineLength=min_len,
- maxLineGap=20,
- )
-
- if lines is None or len(lines) < 3:
- return result
-
- horizontal_angles: List[Tuple[float, float]] = []
- for line in lines:
- x1, y1, x2, y2 = line[0]
- if x1 == x2:
- continue
- angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
- if abs(angle) <= 5.0:
- length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
- horizontal_angles.append((angle, length))
-
- if len(horizontal_angles) < 3:
- return result
-
- # Weighted median
- angles_arr = np.array([a for a, _ in horizontal_angles])
- weights_arr = np.array([l for _, l in horizontal_angles])
- sorted_idx = np.argsort(angles_arr)
- s_angles = angles_arr[sorted_idx]
- s_weights = weights_arr[sorted_idx]
- cum = np.cumsum(s_weights)
- mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0))
- median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)])
-
- agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0)
- confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85
-
- # Sign inversion: horizontal line tilt is complementary to vertical shear
- shear_degrees = -median_angle
-
- result["shear_degrees"] = round(shear_degrees, 3)
- result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
- return result
-
-
-def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
- """Detect shear by measuring text-line straightness (Method D).
-
- Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
- bounding boxes, groups them into vertical columns by X-proximity,
- and measures how the left-edge X position drifts with Y (vertical
- position). The drift dx/dy is the tangent of the shear angle.
-
- This directly measures vertical shear (column tilt) rather than
- horizontal text-line slope, which is already corrected by deskew.
-
- Returns:
- Dict with keys: method, shear_degrees, confidence.
- """
- import math
- result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}
-
- h, w = img.shape[:2]
- # Downscale 50% for speed
- scale = 0.5
- small = cv2.resize(img, (int(w * scale), int(h * scale)),
- interpolation=cv2.INTER_AREA)
- gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
- pil_img = Image.fromarray(gray)
-
- try:
- data = pytesseract.image_to_data(
- pil_img, lang='eng+deu', config='--psm 11 --oem 3',
- output_type=pytesseract.Output.DICT,
- )
- except Exception:
- return result
-
- # Collect word left-edges (x) and vertical centres (y)
- words = []
- for i in range(len(data['text'])):
- text = data['text'][i].strip()
- conf = int(data['conf'][i])
- if not text or conf < 20 or len(text) < 2:
- continue
- left_x = float(data['left'][i])
- cy = data['top'][i] + data['height'][i] / 2.0
- word_w = float(data['width'][i])
- words.append((left_x, cy, word_w))
-
- if len(words) < 15:
- return result
-
- # --- Group words into vertical columns by left-edge X proximity ---
- # Sort by x, then cluster words whose left-edges are within x_tol
- avg_w = sum(ww for _, _, ww in words) / len(words)
- x_tol = max(avg_w * 0.4, 8) # tolerance for "same column"
-
- words_by_x = sorted(words, key=lambda w: w[0])
- columns: List[List[Tuple[float, float]]] = [] # each: [(left_x, cy), ...]
- cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
- cur_x = words_by_x[0][0]
-
- for lx, cy, _ in words_by_x[1:]:
- if abs(lx - cur_x) <= x_tol:
- cur_col.append((lx, cy))
- # Update running x as median of cluster
- cur_x = cur_x * 0.8 + lx * 0.2
- else:
- if len(cur_col) >= 5:
- columns.append(cur_col)
- cur_col = [(lx, cy)]
- cur_x = lx
- if len(cur_col) >= 5:
- columns.append(cur_col)
-
- if len(columns) < 2:
- return result
-
- # --- For each column, measure X-drift as a function of Y ---
- # Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle)
- drifts = []
- for col in columns:
- ys = np.array([p[1] for p in col])
- xs = np.array([p[0] for p in col])
- y_range = ys.max() - ys.min()
- if y_range < h * scale * 0.3:
- continue # column must span at least 30% of image height
- # Linear regression: x = a*y + b
- coeffs = np.polyfit(ys, xs, 1)
- drifts.append(coeffs[0]) # dx/dy
-
- if len(drifts) < 2:
- return result
-
- # Median dx/dy → shear angle
- # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
- median_drift = float(np.median(drifts))
- shear_degrees = math.degrees(math.atan(median_drift))
-
- # Confidence from column count + drift consistency
- drift_std = float(np.std(drifts))
- consistency = max(0.0, 1.0 - drift_std * 50) # tighter penalty for drift variance
- count_factor = min(1.0, len(drifts) / 4.0)
- confidence = count_factor * 0.5 + consistency * 0.5
-
- result["shear_degrees"] = round(shear_degrees, 3)
- result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
- logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
- "shear=%.3f°, conf=%.2f",
- len(columns), len(drifts), median_drift,
- shear_degrees, confidence)
- return result
-
-
-def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
- """Check whether the dewarp correction actually improved alignment.
-
- Compares horizontal projection variance before and after correction.
- Higher variance means sharper text-line peaks, which indicates better
- horizontal alignment.
-
- Returns True if the correction improved the image, False if it should
- be discarded.
- """
- def _h_proj_variance(img: np.ndarray) -> float:
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
- _, binary = cv2.threshold(gray, 0, 255,
- cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
- small = cv2.resize(binary, (binary.shape[1] // 2, binary.shape[0] // 2),
- interpolation=cv2.INTER_AREA)
- profile = np.sum(small, axis=1).astype(float)
- return float(np.var(profile))
-
- var_before = _h_proj_variance(original)
- var_after = _h_proj_variance(corrected)
-
- # Correction must improve variance (even by a tiny margin)
- return var_after > var_before
-
-
-def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
- """Apply a vertical shear correction to an image.
-
- Shifts each row horizontally proportional to its distance from the
- vertical center. This corrects the tilt of vertical features (columns)
- without affecting horizontal alignment (text lines).
-
- Args:
- img: BGR image.
- shear_degrees: Shear angle in degrees. Positive = shift top-right/bottom-left.
-
- Returns:
- Corrected image.
- """
- import math
- h, w = img.shape[:2]
- shear_tan = math.tan(math.radians(shear_degrees))
-
- # Affine matrix: shift x by shear_tan * (y - h/2)
- # [1 shear_tan -h/2*shear_tan]
- # [0 1 0 ]
- M = np.float32([
- [1, shear_tan, -h / 2.0 * shear_tan],
- [0, 1, 0],
- ])
-
- corrected = cv2.warpAffine(img, M, (w, h),
- flags=cv2.INTER_LINEAR,
- borderMode=cv2.BORDER_REPLICATE)
- return corrected
-
-
-def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
- """Combine multiple shear detections into a single weighted estimate (v2).
-
- Ensemble v2 changes vs v1:
- - Minimum confidence raised to 0.5 (was 0.3)
- - text_lines method gets 1.5× weight boost (most reliable detector)
- - Outlier filter at 1° from weighted mean
-
- Returns:
- (shear_degrees, ensemble_confidence, methods_used_str)
- """
- # Confidence threshold — lowered from 0.5 to 0.35 to catch subtle shear
- # that individual methods detect with moderate confidence.
- _MIN_CONF = 0.35
-
- # text_lines gets a weight boost as the most content-aware method
- _METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
-
- accepted = []
- for d in detections:
- if d["confidence"] < _MIN_CONF:
- continue
- boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
- effective_conf = d["confidence"] * boost
- accepted.append((d["shear_degrees"], effective_conf, d["method"]))
-
- if not accepted:
- return 0.0, 0.0, "none"
-
- if len(accepted) == 1:
- deg, conf, method = accepted[0]
- return deg, min(conf, 1.0), method
-
- # First pass: weighted mean
- total_w = sum(c for _, c, _ in accepted)
- w_mean = sum(d * c for d, c, _ in accepted) / total_w
-
- # Outlier filter: keep results within 1° of weighted mean
- filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
- if not filtered:
- filtered = accepted # fallback: keep all
-
- # Second pass: weighted mean on filtered results
- total_w2 = sum(c for _, c, _ in filtered)
- final_deg = sum(d * c for d, c, _ in filtered) / total_w2
-
- # Ensemble confidence: average of individual confidences, boosted when
- # methods agree (all within 0.5° of each other)
- avg_conf = total_w2 / len(filtered)
- spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
- agreement_bonus = 0.15 if spread < 0.5 else 0.0
- ensemble_conf = min(1.0, avg_conf + agreement_bonus)
-
- methods_str = "+".join(m for _, _, m in filtered)
- return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
-
-
-def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
- """Correct vertical shear after deskew (v2 with quality gate).
-
- After deskew aligns horizontal text lines, vertical features (column
- edges) may still be tilted. This detects the tilt angle using an ensemble
- of four complementary methods and applies an affine shear correction.
-
- Methods (all run in ~150ms total):
- A. _detect_shear_angle() — vertical edge profile (~50ms)
- B. _detect_shear_by_projection() — horizontal text-line variance (~30ms)
- C. _detect_shear_by_hough() — Hough lines on table borders (~20ms)
- D. _detect_shear_by_text_lines() — text-line straightness (~50ms)
-
- Quality gate: after correction, horizontal projection variance is compared
- before vs after. If correction worsened alignment, it is discarded.
-
- Args:
- img: BGR image (already deskewed).
- use_ensemble: If False, fall back to single-method behaviour (method A only).
-
- Returns:
- Tuple of (corrected_image, dewarp_info).
- dewarp_info keys: method, shear_degrees, confidence, detections.
- """
- no_correction = {
- "method": "none",
- "shear_degrees": 0.0,
- "confidence": 0.0,
- "detections": [],
- }
-
- if not CV2_AVAILABLE:
- return img, no_correction
-
- t0 = time.time()
-
- if use_ensemble:
- det_a = _detect_shear_angle(img)
- det_b = _detect_shear_by_projection(img)
- det_c = _detect_shear_by_hough(img)
- det_d = _detect_shear_by_text_lines(img)
- detections = [det_a, det_b, det_c, det_d]
- shear_deg, confidence, method = _ensemble_shear(detections)
- else:
- det_a = _detect_shear_angle(img)
- detections = [det_a]
- shear_deg = det_a["shear_degrees"]
- confidence = det_a["confidence"]
- method = det_a["method"]
-
- duration = time.time() - t0
-
- logger.info(
- "dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | "
- "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
- shear_deg, confidence, method, duration,
- detections[0]["shear_degrees"], detections[0]["confidence"],
- detections[1]["shear_degrees"] if len(detections) > 1 else 0.0,
- detections[1]["confidence"] if len(detections) > 1 else 0.0,
- detections[2]["shear_degrees"] if len(detections) > 2 else 0.0,
- detections[2]["confidence"] if len(detections) > 2 else 0.0,
- detections[3]["shear_degrees"] if len(detections) > 3 else 0.0,
- detections[3]["confidence"] if len(detections) > 3 else 0.0,
- )
-
- # Always include individual detections (even when no correction applied)
- _all_detections = [
- {"method": d["method"], "shear_degrees": d["shear_degrees"],
- "confidence": d["confidence"]}
- for d in detections
- ]
-
- # Thresholds: very small shear (<0.08°) is truly irrelevant for OCR.
- # For ensemble confidence, require at least 0.4 (lowered from 0.5 to
- # catch moderate-confidence detections from multiple agreeing methods).
- if abs(shear_deg) < 0.08 or confidence < 0.4:
- no_correction["detections"] = _all_detections
- return img, no_correction
-
- # Apply correction (negate the detected shear to straighten)
- corrected = _apply_shear(img, -shear_deg)
-
- # Quality gate: verify the correction actually improved alignment.
- # For small corrections (< 0.5°), the projection variance change can be
- # negligible, so we skip the quality gate — the cost of a tiny wrong
- # correction is much less than the cost of leaving 0.4° uncorrected
- # (which shifts content ~25px at image edges on tall scans).
- if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
- logger.info("dewarp: quality gate REJECTED correction (%.3f°) — "
- "projection variance did not improve", shear_deg)
- no_correction["detections"] = _all_detections
- return img, no_correction
-
- info = {
- "method": method,
- "shear_degrees": shear_deg,
- "confidence": confidence,
- "detections": _all_detections,
- }
-
- return corrected, info
-
-
-def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
- """Apply shear correction with a manual angle.
-
- Args:
- img: BGR image (deskewed, before dewarp).
- shear_degrees: Shear angle in degrees to correct.
-
- Returns:
- Corrected image.
- """
- if abs(shear_degrees) < 0.001:
- return img
- return _apply_shear(img, -shear_degrees)
-
-
-# =============================================================================
-# Document Type Detection
-# =============================================================================
-
def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult:
    """Detect whether the page is a vocab table, generic table, or full text.

    Uses projection profiles and text density analysis — no OCR required.
    Runs in < 2 seconds.

    Decision rules (step 4 below):
      * >= 2 internal vertical gaps and >= 5 row gaps -> 'vocab_table'
      * >= 1 internal vertical gap  and >= 3 row gaps -> 'generic_table'
      * 0 internal vertical gaps                      -> 'full_text'
      * anything else (ambiguous)                     -> 'vocab_table' @ 0.5

    Args:
        ocr_img: Binarized grayscale image (for projection profiles).
        img_bgr: BGR color image. NOTE: unused in this implementation;
            kept in the signature for callers.

    Returns:
        DocumentTypeResult with doc_type, confidence, pipeline, skip_steps.
    """
    # Guard: empty/missing image → safe full-text fallback.
    if ocr_img is None or ocr_img.size == 0:
        return DocumentTypeResult(
            doc_type='full_text', confidence=0.5, pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features={'error': 'empty image'},
        )

    h, w = ocr_img.shape[:2]

    # --- 1. Vertical projection profile → detect column gaps ---
    # Sum dark pixels along each column (x-axis). Gaps = valleys in the profile.
    # Invert: dark pixels on white background → high values = text.
    vert_proj = np.sum(ocr_img < 128, axis=0).astype(float)

    # Smooth the profile to avoid noise spikes
    kernel_size = max(3, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1
    vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same')

    # Find significant vertical gaps (columns of near-zero text density)
    # A gap must be at least 1% of image width and have < 5% of max density
    max_density = max(vert_smooth.max(), 1)
    gap_threshold = max_density * 0.05
    min_gap_width = max(5, w // 100)

    in_gap = False
    gap_count = 0
    gap_start = 0
    vert_gaps = []

    for x in range(w):
        if vert_smooth[x] < gap_threshold:
            if not in_gap:
                in_gap = True
                gap_start = x
        else:
            if in_gap:
                gap_width = x - gap_start
                if gap_width >= min_gap_width:
                    gap_count += 1
                    vert_gaps.append((gap_start, x, gap_width))
                in_gap = False
    # NOTE: a gap still open at x == w-1 is never recorded. It would end at
    # the image edge and thus be removed by the margin filter below anyway.

    # Filter out margin gaps (within 10% of image edges)
    margin_threshold = w * 0.10
    internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold]
    internal_gap_count = len(internal_gaps)

    # --- 2. Horizontal projection profile → detect row gaps ---
    horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float)
    h_kernel = max(3, h // 200)
    if h_kernel % 2 == 0:
        h_kernel += 1
    horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same')

    h_max = max(horiz_smooth.max(), 1)
    h_gap_threshold = h_max * 0.05
    min_row_gap = max(3, h // 200)

    row_gap_count = 0
    in_gap = False
    # gap_start is intentionally reused from the vertical scan above;
    # it is always re-assigned before being read in this loop.
    for y in range(h):
        if horiz_smooth[y] < h_gap_threshold:
            if not in_gap:
                in_gap = True
                gap_start = y
        else:
            if in_gap:
                if y - gap_start >= min_row_gap:
                    row_gap_count += 1
                in_gap = False

    # --- 3. Text density distribution (4×4 grid) ---
    # Uneven density across cells (high std) suggests structured layout;
    # only mean/std are exported as features, the decision below relies on
    # gap counts.
    grid_rows, grid_cols = 4, 4
    cell_h, cell_w = h // grid_rows, w // grid_cols
    densities = []
    for gr in range(grid_rows):
        for gc in range(grid_cols):
            cell = ocr_img[gr * cell_h:(gr + 1) * cell_h,
                           gc * cell_w:(gc + 1) * cell_w]
            if cell.size > 0:
                d = float(np.count_nonzero(cell < 128)) / cell.size
                densities.append(d)

    density_std = float(np.std(densities)) if densities else 0
    density_mean = float(np.mean(densities)) if densities else 0

    # Diagnostic features returned with every result (first 10 gaps only,
    # to keep the payload small).
    features = {
        'vertical_gaps': gap_count,
        'internal_vertical_gaps': internal_gap_count,
        'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]],
        'row_gaps': row_gap_count,
        'density_mean': round(density_mean, 4),
        'density_std': round(density_std, 4),
        'image_size': (w, h),
    }

    # --- 4. Decision tree ---
    # Use internal_gap_count (excludes margin gaps) for column detection.
    if internal_gap_count >= 2 and row_gap_count >= 5:
        # Multiple internal vertical gaps + many row gaps → table
        confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count >= 1 and row_gap_count >= 3:
        # Some internal structure, likely a table
        confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01)
        return DocumentTypeResult(
            doc_type='generic_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count == 0:
        # No internal column gaps → full text (regardless of density)
        confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15)
        return DocumentTypeResult(
            doc_type='full_text',
            confidence=round(confidence, 2),
            pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features=features,
        )
    else:
        # Ambiguous — default to vocab_table (most common use case)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=0.5,
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
-
-
-# =============================================================================
-# Stage 4: Dual Image Preparation
-# =============================================================================
-
def create_ocr_image(img: np.ndarray) -> np.ndarray:
    """Produce a binarized copy of *img* tuned for Tesseract.

    Pipeline: grayscale → illumination flattening (divide by a heavily
    blurred copy of itself) → Gaussian adaptive threshold → 3×3 median
    filter to remove salt-and-pepper specks.

    Args:
        img: BGR image.

    Returns:
        Binary image (white text on black background inverted to black on white).
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Flatten uneven illumination: dividing by a strongly blurred copy
    # normalizes the background toward a uniform white level.
    background = cv2.GaussianBlur(grayscale, (51, 51), 0)
    flattened = cv2.divide(grayscale, background, scale=255)

    # Adaptive (per-neighbourhood) binarization copes with residual
    # lighting gradients better than a single global threshold.
    thresholded = cv2.adaptiveThreshold(
        flattened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )

    # Gentle speck removal without eroding glyph strokes.
    return cv2.medianBlur(thresholded, 3)
-
-
def create_layout_image(img: np.ndarray) -> np.ndarray:
    """Return a contrast-boosted grayscale image for layout analysis.

    Applies CLAHE (clip limit 2.0, 8×8 tiles) so faint structure survives
    the downstream projection-profile stages.

    Args:
        img: BGR image.

    Returns:
        Enhanced grayscale image.
    """
    as_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equalizer.apply(as_gray)
-
-
-# =============================================================================
-# Stage 5: Layout Analysis (Projection Profiles)
-# =============================================================================
-
-def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
- """Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask."""
- out = mask.copy()
- n = len(out)
- i = 0
- while i < n:
- if out[i]:
- start = i
- while i < n and out[i]:
- i += 1
- if (i - start) < min_width:
- out[start:i] = False
- else:
- i += 1
- return out
-
-
def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
    """Find the bounding box of actual text content (excluding page margins).

    Scan artefacts (thin black lines at page edges) are filtered out by
    discarding contiguous projection runs narrower than 1 % of the image
    dimension (min 5 px). The box is padded by 5 px vertically and 2 px
    horizontally (clamped to the image).

    Fix: the previous reverse scans used ``range(n - 1, 0, -1)`` and so
    never examined index 0 — content present only in the first row/column
    was missed. Using ``np.flatnonzero`` on the filtered masks examines
    every index and replaces the four hand-rolled scan loops.

    Args:
        inv: Inverted binary image (white text on black), 2-D array.

    Returns:
        Tuple of (left_x, right_x, top_y, bottom_y). Falls back to the
        full image extent on an axis where no content is found.
    """
    h, w = inv.shape[:2]
    threshold = 0.005  # min fraction of ink per row/column to count as content

    # --- Horizontal projection for top/bottom ---
    h_proj = np.sum(inv, axis=1).astype(float) / (w * 255)
    h_mask = _filter_narrow_runs(h_proj > threshold, max(5, h // 100))

    rows = np.flatnonzero(h_mask)
    if rows.size:
        top_y = max(0, int(rows[0]) - 5)
        bottom_y = min(h, int(rows[-1]) + 5)
    else:
        # No content rows detected — keep the full vertical extent.
        top_y, bottom_y = 0, h

    # --- Vertical projection for left/right margins ---
    # Restricted to the detected vertical band so header/footer artefacts
    # outside it cannot widen the horizontal bounds.
    v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
    v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj
    v_mask = _filter_narrow_runs(v_proj_norm > threshold, max(5, w // 100))

    cols = np.flatnonzero(v_mask)
    if cols.size:
        left_x = max(0, int(cols[0]) - 2)
        right_x = min(w, int(cols[-1]) + 2)
    else:
        left_x, right_x = 0, w

    return left_x, right_x, top_y, bottom_y
-
-
def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
    """Detect columns, header, and footer using projection profiles.

    Uses content-bounds detection to exclude page margins before searching
    for column separators within the actual text area. Valley positions are
    computed in content-relative coordinates and converted back to absolute
    image coordinates before building PageRegion objects.

    Args:
        layout_img: CLAHE-enhanced grayscale image. NOTE: unused in this
            implementation — only ocr_img is analysed; kept for callers.
        ocr_img: Binarized image for text density analysis.

    Returns:
        List of PageRegion objects describing detected regions
        (2–3 columns, or a single full-width column as fallback, plus
        header/footer regions appended by _add_header_footer).
    """
    h, w = ocr_img.shape[:2]

    # Invert: black text on white → white text on black for projection
    inv = cv2.bitwise_not(ocr_img)

    # --- Find actual content bounds (exclude page margins) ---
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")

    if content_w < w * 0.3 or content_h < h * 0.3:
        # Fallback if detection seems wrong
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    # --- Vertical projection within content area to find column separators ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    v_proj = np.sum(content_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj

    # Smooth the projection profile
    kernel_size = max(5, content_w // 50)
    if kernel_size % 2 == 0:
        kernel_size += 1
    v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Debug: log projection profile statistics
    p_mean = float(np.mean(v_proj_smooth))
    p_median = float(np.median(v_proj_smooth))
    p_min = float(np.min(v_proj_smooth))
    p_max = float(np.max(v_proj_smooth))
    logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
                f"mean={p_mean:.4f}, median={p_median:.4f}")

    # Find valleys using multiple threshold strategies
    # Strategy 1: relative to median (catches clear separators)
    # Strategy 2: local minima approach (catches subtle gaps)
    threshold = max(p_median * 0.3, p_mean * 0.2)
    logger.info(f"Layout: valley threshold={threshold:.4f}")

    in_valley = v_proj_smooth < threshold

    # Find contiguous valley regions.
    # Valley tuple layout: (start, end, center, width, depth) — all in
    # content-relative x coordinates.
    all_valleys = []
    start = None
    for x in range(len(v_proj_smooth)):
        if in_valley[x] and start is None:
            start = x
        elif not in_valley[x] and start is not None:
            valley_width = x - start
            valley_depth = float(np.min(v_proj_smooth[start:x]))
            # Valley must be at least 3px wide
            if valley_width >= 3:
                all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
            start = None

    logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — "
                f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")

    # Filter: valleys must be inside the content area (not at edges)
    inner_margin = int(content_w * 0.08)
    valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]

    # If no valleys found with strict threshold, try local minima approach
    if len(valleys) < 2:
        logger.info("Layout: trying local minima approach for column detection")
        # Divide content into 20 segments, find the 2 lowest
        seg_count = 20
        seg_width = content_w // seg_count
        seg_scores = []
        for i in range(seg_count):
            sx = i * seg_width
            ex = min((i + 1) * seg_width, content_w)
            seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
            seg_scores.append((i, sx, ex, seg_mean))

        seg_scores.sort(key=lambda s: s[3])
        logger.info(f"Layout: segment scores (lowest 5): "
                    f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")

        # Find two lowest non-adjacent segments that create reasonable columns
        candidate_valleys = []
        for seg_idx, sx, ex, seg_mean in seg_scores:
            # Must not be at the edges
            if seg_idx <= 1 or seg_idx >= seg_count - 2:
                continue
            # Must be significantly lower than overall mean
            if seg_mean < p_mean * 0.6:
                center = (sx + ex) // 2
                candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))

        if len(candidate_valleys) >= 2:
            # Pick the best pair: non-adjacent, creating reasonable column widths
            candidate_valleys.sort(key=lambda v: v[2])
            best_pair = None
            best_score = float('inf')
            for i in range(len(candidate_valleys)):
                for j in range(i + 1, len(candidate_valleys)):
                    c1 = candidate_valleys[i][2]
                    c2 = candidate_valleys[j][2]
                    # Must be at least 20% apart
                    if (c2 - c1) < content_w * 0.2:
                        continue
                    col1 = c1
                    col2 = c2 - c1
                    col3 = content_w - c2
                    # Each column at least 15%
                    # (threshold below is 12% — the comment predates a tweak)
                    if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12:
                        continue
                    # Lower score = more even 3-way split.
                    parts = sorted([col1, col2, col3])
                    score = parts[2] - parts[0]
                    if score < best_score:
                        best_score = score
                        best_pair = (candidate_valleys[i], candidate_valleys[j])

            if best_pair:
                valleys = list(best_pair)
                logger.info(f"Layout: local minima found 2 valleys: "
                            f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    logger.info(f"Layout: final {len(valleys)} valleys: "
                f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    regions = []

    if len(valleys) >= 2:
        # 3-column layout detected
        valleys.sort(key=lambda v: v[2])

        if len(valleys) == 2:
            sep1_center = valleys[0][2]
            sep2_center = valleys[1][2]
        else:
            # Pick the two valleys that best divide into 3 parts
            # Prefer wider valleys (more likely true separators)
            best_pair = None
            best_score = float('inf')
            for i in range(len(valleys)):
                for j in range(i + 1, len(valleys)):
                    c1, c2 = valleys[i][2], valleys[j][2]
                    # Each column should be at least 15% of content width
                    col1 = c1
                    col2 = c2 - c1
                    col3 = content_w - c2
                    if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15:
                        continue
                    # Score: lower is better (more even distribution)
                    parts = sorted([col1, col2, col3])
                    score = parts[2] - parts[0]
                    # Bonus for wider valleys (subtract valley width)
                    score -= (valleys[i][3] + valleys[j][3]) * 0.5
                    if score < best_score:
                        best_score = score
                        best_pair = (c1, c2)
            if best_pair:
                sep1_center, sep2_center = best_pair
            else:
                # No pair met the 15% constraint — fall back to the two
                # leftmost valleys.
                sep1_center = valleys[0][2]
                sep2_center = valleys[1][2]

        # Convert from content-relative to absolute coordinates
        abs_sep1 = sep1_center + left_x
        abs_sep2 = sep2_center + left_x

        logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")

        # Columns start at x=0 / end at x=w, so page margins are folded
        # into the first and last column.
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep1, y=top_y,
            width=abs_sep2 - abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_example', x=abs_sep2, y=top_y,
            width=w - abs_sep2, height=content_h
        ))

    elif len(valleys) == 1:
        # 2-column layout
        abs_sep = valleys[0][2] + left_x

        logger.info(f"Layout: 2 columns at separator x={abs_sep}")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep, y=top_y,
            width=w - abs_sep, height=content_h
        ))

    else:
        # No columns detected — run full-page OCR as single column
        logger.warning("Layout: no column separators found, using full page")
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=w, height=content_h
        ))

    # Add header/footer info (gap-based detection with fallback)
    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
    col_count = len([r for r in regions if r.type.startswith('column')])
    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

    return regions
-
-
-# =============================================================================
-# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
-# =============================================================================
-
-# --- Phase A: Geometry Detection ---
-
def _detect_columns_by_clustering(
    word_dicts: List[Dict],
    left_edges: List[int],
    edge_word_indices: List[int],
    content_w: int,
    content_h: int,
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used when the primary gap-based algorithm finds fewer than 2 gaps.

    Algorithm: sort word left-edges, chain-cluster them (a new cluster
    starts when the gap to the previous edge exceeds ~1 % of content
    width), keep clusters with sufficient vertical coverage, merge
    clusters closer than ~6 % of content width, and build geometries
    from the surviving cluster starts.

    Args:
        word_dicts: Word boxes ('text', 'conf', 'left', 'top', ...) with
            coordinates relative to the content ROI.
        left_edges: Left-edge x positions of confident words.
        edge_word_indices: For each left edge, its index into word_dicts.
        content_w / content_h: Content-area dimensions in pixels.
        left_x / right_x / top_y / bottom_y: Content bounds (absolute).
        inv: Optional inverted binary image, passed through to the result.

    Returns:
        The tuple produced by _build_geometries_from_starts, or None when
        fewer than 3 significant (or merged) clusters are found.
    """
    # Edges within ~1% of content width (min 10px) of each other join the
    # same cluster.
    tolerance = max(10, int(content_w * 0.01))
    sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])

    # Chain clustering over the sorted edges; cluster_widxs keeps the
    # word indices so vertical coverage can be measured per cluster.
    clusters = []
    cluster_widxs = []
    cur_edges = [sorted_pairs[0][0]]
    cur_widxs = [sorted_pairs[0][1]]
    for edge, widx in sorted_pairs[1:]:
        if edge - cur_edges[-1] <= tolerance:
            cur_edges.append(edge)
            cur_widxs.append(widx)
        else:
            clusters.append(cur_edges)
            cluster_widxs.append(cur_widxs)
            cur_edges = [edge]
            cur_widxs = [widx]
    clusters.append(cur_edges)
    cluster_widxs.append(cur_widxs)

    # A cluster counts as a column start if its words span enough of the
    # page vertically (primary), or span somewhat less but contain at
    # least MIN_WORDS_SECONDARY words (secondary).
    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5

    cluster_infos = []
    for c_edges, c_widxs in zip(clusters, cluster_widxs):
        if len(c_edges) < 2:
            continue
        y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
        y_span = max(y_positions) - min(y_positions)
        y_coverage = y_span / content_h if content_h > 0 else 0.0
        cluster_infos.append({
            'mean_x': int(np.mean(c_edges)),
            'count': len(c_edges),
            'min_edge': min(c_edges),
            'max_edge': max(c_edges),
            'y_min': min(y_positions),
            'y_max': max(y_positions),
            'y_coverage': y_coverage,
        })

    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    primary_set = set(id(c) for c in primary)
    secondary = [c for c in cluster_infos
                 if id(c) not in primary_set
                 and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
                 and c['count'] >= MIN_WORDS_SECONDARY]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])

    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    # Merge clusters whose centers are closer than ~6% of content width;
    # the merged center is the count-weighted mean (integer division).
    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
            prev = merged[-1]
            total = prev['count'] + s['count']
            avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
            prev['mean_x'] = avg_x
            prev['count'] = total
            prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
            prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
        else:
            merged.append(s.copy())

    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")

    # Column starts: cluster min_edge converted to absolute x, with a
    # small left margin so glyph edges are not clipped.
    margin_px = max(6, int(content_w * 0.003))
    return _build_geometries_from_starts(
        [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
    )
-
-
def _detect_sub_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    top_y: int = 0,
    header_y: Optional[int] = None,
    footer_y: Optional[int] = None,
    _edge_tolerance: int = 8,
    _min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
    """Split columns that contain internal sub-columns based on left-edge alignment.

    For each column, clusters word left-edges into alignment bins (within
    ``_edge_tolerance`` px). The leftmost bin whose word count reaches
    ``_min_col_start_ratio`` of the column total is treated as the true column
    start. Any words to the left of that bin form a sub-column, provided they
    number >= 2 and < 35 % of total.

    Word ``left`` values are relative to the content ROI (offset by *left_x*),
    while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
    bridges the two coordinate systems.

    If *header_y* / *footer_y* are provided (absolute y-coordinates), words
    in header/footer regions are excluded from alignment clustering to avoid
    polluting the bins with page numbers or chapter titles. Word ``top``
    values are relative to *top_y*.

    Returns a new list of ColumnGeometry — potentially longer than the input.
    """
    if content_w <= 0:
        return geometries

    result: List[ColumnGeometry] = []
    for geo in geometries:
        # Only consider wide-enough columns with enough words
        if geo.width_ratio < 0.15 or geo.word_count < 5:
            result.append(geo)
            continue

        # Collect left-edges of confident words, excluding header/footer
        # Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y)
        min_top_rel = (header_y - top_y) if header_y is not None else None
        max_top_rel = (footer_y - top_y) if footer_y is not None else None

        confident = [w for w in geo.words
                     if w.get('conf', 0) >= 30
                     and (min_top_rel is None or w['top'] >= min_top_rel)
                     and (max_top_rel is None or w['top'] <= max_top_rel)]
        if len(confident) < 3:
            result.append(geo)
            continue

        # --- Cluster left-edges into alignment bins ---
        # Chain clustering: a new bin starts when the gap to the previous
        # edge exceeds _edge_tolerance.
        sorted_edges = sorted(w['left'] for w in confident)
        bins: List[Tuple[int, int, int, int]] = []  # (center, count, min_edge, max_edge)
        cur = [sorted_edges[0]]
        for i in range(1, len(sorted_edges)):
            if sorted_edges[i] - cur[-1] <= _edge_tolerance:
                cur.append(sorted_edges[i])
            else:
                bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
                cur = [sorted_edges[i]]
        bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))

        # --- Find the leftmost bin qualifying as a real column start ---
        # (bins are naturally ordered left-to-right because edges were sorted)
        total = len(confident)
        min_count = max(3, int(total * _min_col_start_ratio))
        col_start_bin = None
        for b in bins:
            if b[1] >= min_count:
                col_start_bin = b
                break

        if col_start_bin is None:
            result.append(geo)
            continue

        # Words to the left of the column-start bin are sub-column candidates
        split_threshold = col_start_bin[2] - _edge_tolerance
        sub_words = [w for w in geo.words if w['left'] < split_threshold]
        main_words = [w for w in geo.words if w['left'] >= split_threshold]

        # Count only body words (excluding header/footer) for the threshold check
        # so that header/footer words don't artificially trigger a split.
        sub_body = [w for w in sub_words
                    if (min_top_rel is None or w['top'] >= min_top_rel)
                    and (max_top_rel is None or w['top'] <= max_top_rel)]
        if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
            result.append(geo)
            continue

        # --- Build two sub-column geometries ---
        # Word 'left' values are relative to left_x; geo.x is absolute.
        # Convert the split position from relative to absolute coordinates.
        # The split line is placed halfway between the rightmost sub-column
        # word and the main column's start bin.
        max_sub_left = max(w['left'] for w in sub_words)
        split_rel = (max_sub_left + col_start_bin[2]) // 2
        split_abs = split_rel + left_x

        sub_x = geo.x
        sub_width = split_abs - geo.x
        main_x = split_abs
        main_width = (geo.x + geo.width) - split_abs

        # Degenerate split (split line outside the column) — keep as-is.
        if sub_width <= 0 or main_width <= 0:
            result.append(geo)
            continue

        sub_geo = ColumnGeometry(
            index=0,
            x=sub_x,
            y=geo.y,
            width=sub_width,
            height=geo.height,
            word_count=len(sub_words),
            words=sub_words,
            width_ratio=sub_width / content_w if content_w > 0 else 0.0,
            is_sub_column=True,
        )
        main_geo = ColumnGeometry(
            index=0,
            x=main_x,
            y=geo.y,
            width=main_width,
            height=geo.height,
            word_count=len(main_words),
            words=main_words,
            width_ratio=main_width / content_w if content_w > 0 else 0.0,
            is_sub_column=True,
        )

        result.append(sub_geo)
        result.append(main_geo)

        logger.info(
            f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
            f"(rel={split_rel}), sub={len(sub_words)} words, "
            f"main={len(main_words)} words, "
            f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
        )

    # Re-index by left-to-right order
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result
-
-
def _split_broad_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    _broad_threshold: float = 0.35,
    _min_gap_px: int = 15,
    _min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
    """Split overly broad columns that contain two language blocks (EN+DE).

    Uses word-coverage gap analysis: builds a per-pixel coverage array from the
    words inside each broad column, finds the largest horizontal gap, and splits
    the column at that gap.

    Args:
        geometries: Column geometries from _detect_sub_columns.
        content_w: Width of the content area in pixels.
        left_x: Left edge of content ROI in absolute image coordinates.
        _broad_threshold: Minimum width_ratio to consider a column "broad".
        _min_gap_px: Minimum gap width (pixels) to trigger a split.
        _min_words_per_split: Both halves must have at least this many words.

    Returns:
        Updated list of ColumnGeometry (possibly with more columns).
    """
    result: List[ColumnGeometry] = []

    logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
                f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")

    for geo in geometries:
        # Narrow or sparsely populated columns are passed through unchanged.
        if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
            result.append(geo)
            continue

        # Build word-coverage array (per pixel within column)
        col_left_rel = geo.x - left_x  # column left in content-relative coords
        coverage = np.zeros(geo.width, dtype=np.float32)

        for wd in geo.words:
            # wd['left'] is relative to left_x (content ROI)
            wl = wd['left'] - col_left_rel
            wr = wl + wd.get('width', 0)
            wl = max(0, int(wl))
            wr = min(geo.width, int(wr))
            if wr > wl:
                coverage[wl:wr] += 1.0

        # Light smoothing (kernel=3px) to avoid noise
        if len(coverage) > 3:
            kernel = np.ones(3, dtype=np.float32) / 3.0
            coverage = np.convolve(coverage, kernel, mode='same')

        # Normalise to [0, 1]
        cmax = coverage.max()
        if cmax > 0:
            coverage /= cmax

        # Find INTERNAL gaps where coverage < 0.5
        # Exclude edge gaps (touching pixel 0 or geo.width) — those are margins.
        low_mask = coverage < 0.5
        all_gaps = []  # (start, end, width) runs of low coverage
        _gs = None
        for px in range(len(low_mask)):
            if low_mask[px]:
                if _gs is None:
                    _gs = px
            else:
                if _gs is not None:
                    all_gaps.append((_gs, px, px - _gs))
                    _gs = None
        # Close a gap still open at the right edge of the column.
        if _gs is not None:
            all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))

        # Filter: only internal gaps (not touching column edges)
        _edge_margin = 10  # pixels from edge to ignore
        internal_gaps = [g for g in all_gaps
                         if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
        best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None

        logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
                    f"{[g for g in all_gaps if g[2] >= 5]}, "
                    f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
                    f"best={best_gap}")

        if best_gap is None or best_gap[2] < _min_gap_px:
            result.append(geo)
            continue

        gap_center = (best_gap[0] + best_gap[1]) // 2

        # Split words by midpoint relative to gap
        left_words = []
        right_words = []
        for wd in geo.words:
            wl = wd['left'] - col_left_rel
            mid = wl + wd.get('width', 0) / 2.0
            if mid < gap_center:
                left_words.append(wd)
            else:
                right_words.append(wd)

        # Reject splits that would leave either half with too few words.
        if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
            result.append(geo)
            continue

        # Build two new ColumnGeometry objects
        split_x_abs = geo.x + gap_center
        left_w = gap_center
        right_w = geo.width - gap_center

        left_geo = ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=left_w,
            height=geo.height,
            word_count=len(left_words),
            words=left_words,
            width_ratio=left_w / content_w if content_w else 0,
            is_sub_column=True,
        )
        right_geo = ColumnGeometry(
            index=0,
            x=split_x_abs,
            y=geo.y,
            width=right_w,
            height=geo.height,
            word_count=len(right_words),
            words=right_words,
            width_ratio=right_w / content_w if content_w else 0,
            is_sub_column=True,
        )

        logger.info(
            f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
            f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
            f"left={len(left_words)} words (w={left_w}), "
            f"right={len(right_words)} words (w={right_w})"
        )

        result.append(left_geo)
        result.append(right_geo)

    # Re-index left-to-right
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result
-
-
def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
    inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
    """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs."""
    # Each column extends to the next column's start; the last one runs
    # to the right content bound.
    column_ends = [s for s, _ in col_starts[1:]] + [right_x]

    geometries: List[ColumnGeometry] = []
    for idx, ((start_x, _count), end_x) in enumerate(zip(col_starts, column_ends)):
        col_width = end_x - start_x

        # Word 'left' values are relative to the content ROI (left_x).
        rel_lo = start_x - left_x
        rel_hi = rel_lo + col_width
        members = [w for w in word_dicts if rel_lo <= w['left'] < rel_hi]

        geometries.append(ColumnGeometry(
            index=idx,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(members),
            words=members,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
-
-
-def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
- """Detect column geometry using whitespace-gap analysis with word validation.
-
- Phase A of the two-phase column detection. Uses vertical projection
- profiles to find whitespace gaps between columns, then validates that
- no gap cuts through a word bounding box.
-
- Falls back to clustering-based detection if fewer than 2 gaps are found.
-
- Args:
- ocr_img: Binarized grayscale image for layout analysis.
- dewarped_bgr: Original BGR image (for Tesseract word detection).
-
- Returns:
- Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
- or None if detection fails entirely.
- """
- h, w = ocr_img.shape[:2]
-
- # --- Step 1: Find content bounds ---
- inv = cv2.bitwise_not(ocr_img)
- left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
- content_w = right_x - left_x
- content_h = bottom_y - top_y
-
- if content_w < w * 0.3 or content_h < h * 0.3:
- left_x, right_x = 0, w
- top_y, bottom_y = 0, h
- content_w, content_h = w, h
-
- logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
- f"y=[{top_y}..{bottom_y}] ({content_h}px)")
-
- # --- Step 2: Get word bounding boxes from Tesseract ---
- # Crop from left_x to full image width (not right_x) so words at the right
- # edge of the last column are included even if they extend past the detected
- # content boundary (right_x).
- content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
- pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))
-
- try:
- data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
- except Exception as e:
- logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
- return None
-
- word_dicts = []
- left_edges = []
- edge_word_indices = []
- n_words = len(data['text'])
- for i in range(n_words):
- conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
- text = str(data['text'][i]).strip()
- if conf < 30 or not text:
- continue
- lx = int(data['left'][i])
- ty = int(data['top'][i])
- bw = int(data['width'][i])
- bh = int(data['height'][i])
- left_edges.append(lx)
- edge_word_indices.append(len(word_dicts))
- word_dicts.append({
- 'text': text, 'conf': conf,
- 'left': lx, 'top': ty, 'width': bw, 'height': bh,
- })
-
- if len(left_edges) < 5:
- logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
- return None
-
- logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")
-
- # --- Step 2b: Segment by sub-headers ---
- # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
- # text bands that pollute the vertical projection. We detect large
- # horizontal gaps (= whitespace rows separating sections) and use only
- # the tallest content segment for the projection. This makes column
- # detection immune to sub-headers, illustrations, and section dividers.
- content_strip = inv[top_y:bottom_y, left_x:right_x]
- h_proj_row = np.sum(content_strip, axis=1).astype(float)
- h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row
-
- # Find horizontal gaps (near-empty rows)
- H_GAP_THRESH = 0.02 # rows with <2% ink density are "empty"
- h_in_gap = h_proj_row_norm < H_GAP_THRESH
- H_MIN_GAP = max(5, content_h // 200) # min gap height ~5-7px
-
- h_gaps: List[Tuple[int, int]] = []
- h_gap_start = None
- for y_idx in range(len(h_in_gap)):
- if h_in_gap[y_idx]:
- if h_gap_start is None:
- h_gap_start = y_idx
- else:
- if h_gap_start is not None:
- if y_idx - h_gap_start >= H_MIN_GAP:
- h_gaps.append((h_gap_start, y_idx))
- h_gap_start = None
- if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
- h_gaps.append((h_gap_start, len(h_in_gap)))
-
- # Identify "large" gaps (significantly bigger than median) that indicate
- # section boundaries (sub-headers, chapter titles).
- if len(h_gaps) >= 3:
- gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
- median_gap_h = gap_sizes[len(gap_sizes) // 2]
- large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
- large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
- else:
- large_gaps = h_gaps
-
- # Build content segments between large gaps and pick the tallest
- seg_boundaries = [0]
- for gs, ge in large_gaps:
- seg_boundaries.append(gs)
- seg_boundaries.append(ge)
- seg_boundaries.append(content_h)
-
- segments = []
- for i in range(0, len(seg_boundaries) - 1, 2):
- seg_top = seg_boundaries[i]
- seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
- seg_height = seg_bot - seg_top
- if seg_height > 20: # ignore tiny fragments
- segments.append((seg_top, seg_bot, seg_height))
-
- if segments:
- segments.sort(key=lambda s: s[2], reverse=True)
- best_seg = segments[0]
- proj_strip = content_strip[best_seg[0]:best_seg[1], :]
- effective_h = best_seg[2]
- if len(segments) > 1:
- logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
- f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
- f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
- else:
- proj_strip = content_strip
- effective_h = content_h
-
- # --- Step 3: Vertical projection profile ---
- v_proj = np.sum(proj_strip, axis=0).astype(float)
- v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj
-
- # Smooth the projection to avoid noise-induced micro-gaps
- kernel_size = max(5, content_w // 80)
- if kernel_size % 2 == 0:
- kernel_size += 1 # keep odd for symmetry
- v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
-
- # --- Step 4: Find whitespace gaps ---
- # Threshold: areas with very little ink density are gaps
- median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
- gap_threshold = max(median_density * 0.15, 0.005)
-
- in_gap = v_smooth < gap_threshold
- MIN_GAP_WIDTH = max(8, content_w // 200) # min ~8px or 0.5% of content width
-
- # Collect contiguous gap regions
- raw_gaps = [] # (start_x_rel, end_x_rel) relative to content ROI
- gap_start = None
- for x in range(len(in_gap)):
- if in_gap[x]:
- if gap_start is None:
- gap_start = x
- else:
- if gap_start is not None:
- gap_width = x - gap_start
- if gap_width >= MIN_GAP_WIDTH:
- raw_gaps.append((gap_start, x))
- gap_start = None
- # Handle gap at the right edge
- if gap_start is not None:
- gap_width = len(in_gap) - gap_start
- if gap_width >= MIN_GAP_WIDTH:
- raw_gaps.append((gap_start, len(in_gap)))
-
- logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
- f"min_width={MIN_GAP_WIDTH}px): "
- f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")
-
- # --- Step 5: Validate gaps against word bounding boxes ---
- # When using a segment for projection, only validate against words
- # inside that segment — words from sub-headers or other sections
- # would incorrectly overlap with real column gaps.
- if segments and len(segments) > 1:
- seg_top_abs = best_seg[0] # relative to content strip
- seg_bot_abs = best_seg[1]
- segment_words = [wd for wd in word_dicts
- if wd['top'] >= seg_top_abs
- and wd['top'] + wd['height'] <= seg_bot_abs]
- logger.info(f"ColumnGeometry: filtering words to segment: "
- f"{len(segment_words)}/{len(word_dicts)} words")
- else:
- segment_words = word_dicts
-
- validated_gaps = []
- for gap_start_rel, gap_end_rel in raw_gaps:
- # Check if any word overlaps with this gap region
- overlapping = False
- for wd in segment_words:
- word_left = wd['left']
- word_right = wd['left'] + wd['width']
- if word_left < gap_end_rel and word_right > gap_start_rel:
- overlapping = True
- break
-
- if not overlapping:
- validated_gaps.append((gap_start_rel, gap_end_rel))
- else:
- # Try to shift the gap to avoid the overlapping word(s)
- # Find the tightest word boundaries within the gap region
- min_word_left = content_w
- max_word_right = 0
- for wd in segment_words:
- word_left = wd['left']
- word_right = wd['left'] + wd['width']
- if word_left < gap_end_rel and word_right > gap_start_rel:
- min_word_left = min(min_word_left, word_left)
- max_word_right = max(max_word_right, word_right)
-
- # Try gap before the overlapping words
- if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
- validated_gaps.append((gap_start_rel, min_word_left))
- logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
- # Try gap after the overlapping words
- elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
- validated_gaps.append((max_word_right, gap_end_rel))
- logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
- else:
- logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
- f"discarded (word overlap, no room to shift)")
-
- logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
- f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")
-
- # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
- # When pixel-based projection fails (e.g. due to illustrations or colored
- # bands), use word bounding boxes to find clear vertical gaps. This is
- # immune to decorative graphics that Tesseract doesn't recognise as words.
- if len(validated_gaps) < 2:
- logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
- word_coverage = np.zeros(content_w, dtype=np.int32)
- for wd in segment_words:
- wl = max(0, wd['left'])
- wr = min(wd['left'] + wd['width'], content_w)
- if wr > wl:
- word_coverage[wl:wr] += 1
-
- # Smooth slightly to bridge tiny 1-2px noise gaps between words
- wc_kernel = max(3, content_w // 300)
- if wc_kernel % 2 == 0:
- wc_kernel += 1
- wc_smooth = np.convolve(word_coverage.astype(float),
- np.ones(wc_kernel) / wc_kernel, mode='same')
-
- wc_in_gap = wc_smooth < 0.5 # effectively zero word coverage
- WC_MIN_GAP = max(4, content_w // 300)
-
- wc_gaps: List[Tuple[int, int]] = []
- wc_gap_start = None
- for x in range(len(wc_in_gap)):
- if wc_in_gap[x]:
- if wc_gap_start is None:
- wc_gap_start = x
- else:
- if wc_gap_start is not None:
- if x - wc_gap_start >= WC_MIN_GAP:
- wc_gaps.append((wc_gap_start, x))
- wc_gap_start = None
- if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP:
- wc_gaps.append((wc_gap_start, len(wc_in_gap)))
-
- logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
- f"(min_width={WC_MIN_GAP}px): "
- f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")
-
- if len(wc_gaps) >= 2:
- validated_gaps = wc_gaps
-
- # --- Step 6: Fallback to clustering if too few gaps ---
- if len(validated_gaps) < 2:
- logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
- return _detect_columns_by_clustering(
- word_dicts, left_edges, edge_word_indices,
- content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
- )
-
- # --- Step 7: Derive column boundaries from gaps ---
- # Sort gaps by position
- validated_gaps.sort(key=lambda g: g[0])
-
- # Identify margin gaps (first and last) vs interior gaps
- # A margin gap touches the edge of the content area (within 2% tolerance)
- edge_tolerance = max(10, int(content_w * 0.02))
-
- is_left_margin = validated_gaps[0][0] <= edge_tolerance
- is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance
-
- # Interior gaps define column boundaries
- # Column starts at the end of a gap, ends at the start of the next gap
- col_starts = []
-
- if is_left_margin:
- # First column starts after the left margin gap
- first_gap_end = validated_gaps[0][1]
- interior_gaps = validated_gaps[1:]
- else:
- # No left margin gap — first column starts at content left edge
- first_gap_end = 0
- interior_gaps = validated_gaps[:]
-
- if is_right_margin:
- # Last gap is right margin — don't use it as column start
- interior_gaps_for_boundaries = interior_gaps[:-1]
- right_boundary = validated_gaps[-1][0] # last column ends at right margin gap start
- else:
- interior_gaps_for_boundaries = interior_gaps
- right_boundary = content_w
-
- # First column
- col_starts.append(left_x + first_gap_end)
-
- # Columns between interior gaps
- for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
- col_starts.append(left_x + gap_end_rel)
-
- # Count words per column region (for logging)
- col_start_counts = []
- for i, start_x in enumerate(col_starts):
- if i + 1 < len(col_starts):
- next_start = col_starts[i + 1]
- else:
- # Rightmost column always extends to full image width (w).
- # The page margin contains only white space — extending the OCR
- # crop to the image edge is safe and prevents text near the right
- # border from being cut off.
- next_start = w
-
- col_left_rel = start_x - left_x
- col_right_rel = next_start - left_x
- n_words_in_col = sum(1 for w in word_dicts
- if col_left_rel <= w['left'] < col_right_rel)
- col_start_counts.append((start_x, n_words_in_col))
-
- logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
- f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
- f"{col_start_counts}")
-
- # --- Step 8: Build ColumnGeometry objects ---
- # Determine right edge for each column
- all_boundaries = []
- for i, start_x in enumerate(col_starts):
- if i + 1 < len(col_starts):
- end_x = col_starts[i + 1]
- else:
- # Rightmost column always extends to full image width (w).
- end_x = w
- all_boundaries.append((start_x, end_x))
-
- geometries = []
- for i, (start_x, end_x) in enumerate(all_boundaries):
- col_width = end_x - start_x
- col_left_rel = start_x - left_x
- col_right_rel = col_left_rel + col_width
- col_words = [w for w in word_dicts
- if col_left_rel <= w['left'] < col_right_rel]
-
- geometries.append(ColumnGeometry(
- index=i,
- x=start_x,
- y=top_y,
- width=col_width,
- height=content_h,
- word_count=len(col_words),
- words=col_words,
- width_ratio=col_width / content_w if content_w > 0 else 0.0,
- ))
-
- logger.info(f"ColumnGeometry: {len(geometries)} columns: "
- f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
-
- # --- Step 9: Filter phantom narrow columns ---
- # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
- # columns (< 3% of content width) with zero or no words. These are not
- # real columns — remove them and close the gap between neighbors.
- min_real_col_w = max(20, int(content_w * 0.03))
- filtered_geoms = [g for g in geometries
- if not (g.word_count < 3 and g.width < min_real_col_w)]
- if len(filtered_geoms) < len(geometries):
- n_removed = len(geometries) - len(filtered_geoms)
- logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
- f"(width < {min_real_col_w}px and words < 3)")
- # Extend each remaining column to close gaps with its right neighbor
- for i, g in enumerate(filtered_geoms):
- if i + 1 < len(filtered_geoms):
- g.width = filtered_geoms[i + 1].x - g.x
- else:
- g.width = w - g.x
- g.index = i
- col_left_rel = g.x - left_x
- col_right_rel = col_left_rel + g.width
- g.words = [w for w in word_dicts
- if col_left_rel <= w['left'] < col_right_rel]
- g.word_count = len(g.words)
- geometries = filtered_geoms
- logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
- f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
-
- return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
-
-
def expand_narrow_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int,
    word_dicts: List[Dict],
) -> List[ColumnGeometry]:
    """Expand narrow columns into adjacent whitespace gaps (in place).

    Narrow columns (marker, page_ref, < 10% content width) often lose
    content at image edges due to residual shear.  This expands them toward
    the neighbouring columns, but never past the nearest word in a
    neighbour (a 4px safety margin is kept).  NOTE(review): an earlier
    docstring also claimed a "40% of the gap" cap, but no such cap is
    implemented below — expansion is limited only by neighbour word
    positions (or the neighbour's own edge when it holds no words).

    After a column is expanded, any neighbour it now overlaps is shrunk so
    the columns stay disjoint, and the word lists, word counts and width
    ratios of every touched column are recomputed from ``word_dicts``.

    Must be called AFTER _detect_sub_columns() so that sub-column splits
    (which create the narrowest columns) have already happened.

    Args:
        geometries: Columns sorted left to right; mutated in place.
        content_w: Content-area width in pixels (denominator for ratios).
        left_x: Absolute X offset of the content area; word boxes in
            ``word_dicts`` are relative to this offset.
        word_dicts: All detected word boxes with 'left'/'top'/'width'/'height'.

    Returns:
        The same list object, with narrow columns expanded.
    """
    _NARROW_THRESHOLD_PCT = 10.0  # columns under 10% of content width are "narrow"
    _MIN_WORD_MARGIN = 4          # px kept clear of the nearest neighbour word

    if len(geometries) < 2:
        return geometries

    logger.info("ExpandNarrowCols: input %d cols: %s",
                len(geometries),
                [(i, g.x, g.width, round(g.width / content_w * 100, 1))
                 for i, g in enumerate(geometries)])

    for i, g in enumerate(geometries):
        col_pct = g.width / content_w * 100 if content_w > 0 else 100
        if col_pct >= _NARROW_THRESHOLD_PCT:
            continue

        expanded = False
        orig_pct = col_pct

        # --- try expanding to the LEFT ---
        if i > 0:
            left_nb = geometries[i - 1]
            # Gap can be 0 if sub-column split created adjacent columns.
            # In that case, look at where the neighbor's rightmost words
            # actually are — there may be unused space we can claim.
            nb_words_right = [wd['left'] + wd.get('width', 0)
                              for wd in left_nb.words]
            if nb_words_right:
                rightmost_word_abs = left_x + max(nb_words_right)
                safe_left_abs = rightmost_word_abs + _MIN_WORD_MARGIN
            else:
                # No words in neighbor → we can take up to neighbor's start
                safe_left_abs = left_nb.x + _MIN_WORD_MARGIN

            if safe_left_abs < g.x:
                # Grow leftwards: move x left and widen by the same amount.
                g.width += (g.x - safe_left_abs)
                g.x = safe_left_abs
                expanded = True

        # --- try expanding to the RIGHT ---
        if i + 1 < len(geometries):
            right_nb = geometries[i + 1]
            nb_words_left = [wd['left'] for wd in right_nb.words]
            if nb_words_left:
                leftmost_word_abs = left_x + min(nb_words_left)
                safe_right_abs = leftmost_word_abs - _MIN_WORD_MARGIN
            else:
                safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN

            cur_right = g.x + g.width
            if safe_right_abs > cur_right:
                g.width = safe_right_abs - g.x
                expanded = True

        if expanded:
            # Re-derive this column's word membership from its new span.
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [wd for wd in word_dicts
                       if col_left_rel <= wd['left'] < col_right_rel]
            g.word_count = len(g.words)
            g.width_ratio = g.width / content_w if content_w > 0 else 0.0
            logger.info(
                "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d",
                i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count)

            # --- Shrink overlapping neighbors to match new boundaries ---
            # Left neighbor: its right edge must not exceed our new left edge
            if i > 0:
                left_nb = geometries[i - 1]
                nb_right = left_nb.x + left_nb.width
                if nb_right > g.x:
                    left_nb.width = g.x - left_nb.x
                    if left_nb.width < 0:
                        # Degenerate overlap — clamp rather than go negative.
                        left_nb.width = 0
                    left_nb.width_ratio = left_nb.width / content_w if content_w > 0 else 0.0
                    # Re-assign words
                    nb_left_rel = left_nb.x - left_x
                    nb_right_rel = nb_left_rel + left_nb.width
                    left_nb.words = [wd for wd in word_dicts
                                     if nb_left_rel <= wd['left'] < nb_right_rel]
                    left_nb.word_count = len(left_nb.words)

            # Right neighbor: its left edge must not be before our new right edge
            if i + 1 < len(geometries):
                right_nb = geometries[i + 1]
                my_right = g.x + g.width
                if right_nb.x < my_right:
                    # Preserve the neighbour's right edge while pushing its
                    # left edge to our boundary (width shrinks accordingly).
                    old_right_edge = right_nb.x + right_nb.width
                    right_nb.x = my_right
                    right_nb.width = old_right_edge - right_nb.x
                    if right_nb.width < 0:
                        right_nb.width = 0
                    right_nb.width_ratio = right_nb.width / content_w if content_w > 0 else 0.0
                    # Re-assign words
                    nb_left_rel = right_nb.x - left_x
                    nb_right_rel = nb_left_rel + right_nb.width
                    right_nb.words = [wd for wd in word_dicts
                                      if nb_left_rel <= wd['left'] < nb_right_rel]
                    right_nb.word_count = len(right_nb.words)

    return geometries
-
-
-# =============================================================================
-# Row Geometry Detection (horizontal whitespace-gap analysis)
-# =============================================================================
-
def detect_row_geometry(
    inv: np.ndarray,
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
) -> List['RowGeometry']:
    """Detect row geometry using horizontal whitespace-gap analysis.

    Mirrors the vertical gap approach used for columns, but operates on
    horizontal projection profiles to find gaps between text lines.
    Also classifies header/footer rows based on gap size.

    Pipeline:
        1. Mask the inverted page to word bounding boxes so illustrations
           (ink without Tesseract words) don't merge adjacent rows.
        2. Build a smoothed horizontal ink-density projection.
        3. Collect contiguous low-density runs as raw gaps.
        4. Validate gaps against word boxes, shifting or discarding gaps
           that overlap words.
        5. Classify header/footer boundaries from unusually large gaps in
           the top/bottom 15% of the content area.
        6. Emit RowGeometry spans between consecutive gaps.
        7. Regularize row boundaries from word centre-lines via
           _regularize_row_grid().

    NOTE(review): cv2.bitwise_and is called unguarded — assumes OpenCV is
    importable in this module; TODO confirm against the module's import
    guards.

    Args:
        inv: Inverted binarized image (white text on black bg, full page).
        word_dicts: Word bounding boxes from Tesseract (relative to content ROI).
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects sorted top to bottom.  Empty list when
        the content area is degenerate (< 10px in either dimension); falls
        back to _build_rows_from_word_grouping() when fewer than two
        validated gaps are found.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile (text-only, images masked out) ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]

    # Build a word-coverage mask so that image regions (high ink density but no
    # Tesseract words) are ignored. Only pixels within/near word bounding boxes
    # contribute to the projection. This prevents large illustrations from
    # merging multiple vocabulary rows into one.
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255

    masked_strip = cv2.bitwise_and(content_strip, word_mask)
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    # Normalize to [0, 1] ink density per row (255 = fully-inked pixel).
    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

    # --- Step 2: Smoothing + threshold ---
    # Box-filter smoothing bridges 1-2px noise dips; odd kernel keeps symmetry.
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Gap threshold is relative to the median non-zero density so it adapts
    # to scan darkness; the 0.003 floor guards near-blank pages.
    median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = []  # (start_y_rel, end_y_rel) relative to content ROI
    gap_start = None
    for y in range(len(in_gap)):
        if in_gap[y]:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    # Close a gap that runs to the bottom edge of the content area.
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        overlapping = False
        for wd in word_dicts:
            word_top = wd['top']
            word_bottom = wd['top'] + wd['height']
            if word_top < gap_end_rel and word_bottom > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid overlapping words: find the
            # tightest word extent inside the gap, then keep whichever
            # sub-gap (above or below the words) is still tall enough.
            min_word_top = content_h
            max_word_bottom = 0
            for wd in word_dicts:
                word_top = wd['top']
                word_bottom = wd['top'] + wd['height']
                if word_top < gap_end_rel and word_bottom > gap_start_rel:
                    min_word_top = min(min_word_top, word_top)
                    max_word_bottom = max(max_word_bottom, word_bottom)

            if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
                validated_gaps.append((gap_start_rel, min_word_top))
            elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
                validated_gaps.append((max_word_bottom, gap_end_rel))
            else:
                logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    HEADER_FOOTER_ZONE = 0.15  # top/bottom 15% of content may hold header/footer
    GAP_MULTIPLIER = 2.0       # a gap > 2x median marks a section boundary

    gap_sizes = [g[1] - g[0] for g in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    header_boundary_rel = None  # y below which is header
    footer_boundary_rel = None  # y above which is footer

    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    # Find largest gap in header zone
    best_header_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]):
                best_header_gap = (gs, ge)

    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    # Find largest gap in footer zone
    best_footer_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]):
                best_footer_gap = (gs, ge)

    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between gaps
    row_boundaries = []  # (start_y_rel, end_y_rel)

    # Top of content to first gap
    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
        row_boundaries.append((0, validated_gaps[0][0]))

    # Between gaps
    for i in range(len(validated_gaps) - 1):
        row_start = validated_gaps[i][1]
        row_end = validated_gaps[i + 1][0]
        if row_end - row_start > 0:
            row_boundaries.append((row_start, row_end))

    # Last gap to bottom of content
    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Determine row type by where the row's midpoint falls relative to
        # the detected header/footer boundaries.
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # Collect words in this row (membership by word vertical center)
        row_words = [w for w in word_dicts
                     if w['top'] + w['height'] / 2 >= row_start_rel
                     and w['top'] + w['height'] / 2 < row_end_rel]

        # Gap before this row
        gap_before = 0
        if idx == 0 and validated_gaps[0][0] > 0:
            gap_before = validated_gaps[0][0]
        elif idx > 0:
            # Find the gap just before this row boundary
            for gs, ge in validated_gaps:
                if ge == row_start_rel:
                    gap_before = ge - gs
                    break

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    # Derive precise row boundaries from word vertical centers. Detects
    # section breaks (headings, paragraphs) and builds per-section grids.
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows
-
-
-def _regularize_row_grid(
- rows: List['RowGeometry'],
- word_dicts: List[Dict],
- left_x: int, right_x: int,
- top_y: int,
- content_w: int, content_h: int,
- inv: np.ndarray,
-) -> List['RowGeometry']:
- """Rebuild row boundaries from word center-lines with section-break awareness.
-
- Instead of overlaying a rigid grid, this derives row positions bottom-up
- from the words themselves:
-
- 1. Group words into line clusters (by Y proximity).
- 2. For each cluster compute center_y (median of word vertical centers)
- and letter_height (median of word heights).
- 3. Compute the pitch (distance between consecutive centers).
- 4. Detect section breaks where the gap is >1.8× the median pitch
- (headings, sub-headings, paragraph breaks).
- 5. Within each section, use the local pitch to place row boundaries
- at the midpoints between consecutive centers.
- 6. Validate that ≥85% of words land in a grid row; otherwise fall back.
-
- Header/footer rows from the gap-based detection are preserved.
- """
- content_rows = [r for r in rows if r.row_type == 'content']
- non_content = [r for r in rows if r.row_type != 'content']
-
- if len(content_rows) < 5:
- return rows
-
- # --- Step A: Group ALL words into line clusters ---
- # Collect words that belong to content rows (deduplicated)
- content_words: List[Dict] = []
- seen_keys: set = set()
- for r in content_rows:
- for w in r.words:
- key = (w['left'], w['top'], w['width'], w['height'])
- if key not in seen_keys:
- seen_keys.add(key)
- content_words.append(w)
-
- if len(content_words) < 5:
- return rows
-
- # Compute median word height (excluding outliers like tall brackets/IPA)
- word_heights = sorted(w['height'] for w in content_words)
- median_wh = word_heights[len(word_heights) // 2]
-
- # Compute median gap-based row height — this is the actual line height
- # as detected by the horizontal projection. We use 40% of this as
- # grouping tolerance. This is much more reliable than using word height
- # alone, because words on the same line can have very different heights
- # (e.g. lowercase vs uppercase, brackets, phonetic symbols).
- gap_row_heights = sorted(r.height for r in content_rows)
- median_row_h = gap_row_heights[len(gap_row_heights) // 2]
-
- # Tolerance: 40% of row height. Words on the same line should have
- # centers within this range. Even if a word's bbox is taller/shorter,
- # its center should stay within half a row height of the line center.
- y_tol = max(10, int(median_row_h * 0.4))
-
- # Sort by center_y, then group by proximity
- words_by_center = sorted(content_words,
- key=lambda w: (w['top'] + w['height'] / 2, w['left']))
- line_clusters: List[List[Dict]] = []
- current_line: List[Dict] = [words_by_center[0]]
- current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2
-
- for w in words_by_center[1:]:
- w_center = w['top'] + w['height'] / 2
- if abs(w_center - current_center) <= y_tol:
- current_line.append(w)
- else:
- current_line.sort(key=lambda w: w['left'])
- line_clusters.append(current_line)
- current_line = [w]
- current_center = w_center
-
- if current_line:
- current_line.sort(key=lambda w: w['left'])
- line_clusters.append(current_line)
-
- if len(line_clusters) < 3:
- return rows
-
- # --- Step B: Compute center_y per cluster ---
- # center_y = median of (word_top + word_height/2) across all words in cluster
- # letter_h = median of word heights, but excluding outlier-height words
- # (>2× median) so that tall brackets/IPA don't skew the height
- cluster_info: List[Dict] = []
- for cl_words in line_clusters:
- centers = [w['top'] + w['height'] / 2 for w in cl_words]
- # Filter outlier heights for letter_h computation
- normal_heights = [w['height'] for w in cl_words
- if w['height'] <= median_wh * 2.0]
- if not normal_heights:
- normal_heights = [w['height'] for w in cl_words]
- center_y = float(np.median(centers))
- letter_h = float(np.median(normal_heights))
- cluster_info.append({
- 'center_y_rel': center_y, # relative to content ROI
- 'center_y_abs': center_y + top_y, # absolute
- 'letter_h': letter_h,
- 'words': cl_words,
- })
-
- cluster_info.sort(key=lambda c: c['center_y_rel'])
-
- # --- Step B2: Merge clusters that are too close together ---
- # Even with center-based grouping, some edge cases can produce
- # spurious clusters. Merge any pair whose centers are closer
- # than 30% of the row height (they're definitely the same text line).
- merge_threshold = max(8, median_row_h * 0.3)
- merged: List[Dict] = [cluster_info[0]]
- for cl in cluster_info[1:]:
- prev = merged[-1]
- if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
- # Merge: combine words, recompute center
- combined_words = prev['words'] + cl['words']
- centers = [w['top'] + w['height'] / 2 for w in combined_words]
- normal_heights = [w['height'] for w in combined_words
- if w['height'] <= median_wh * 2.0]
- if not normal_heights:
- normal_heights = [w['height'] for w in combined_words]
- prev['center_y_rel'] = float(np.median(centers))
- prev['center_y_abs'] = prev['center_y_rel'] + top_y
- prev['letter_h'] = float(np.median(normal_heights))
- prev['words'] = combined_words
- else:
- merged.append(cl)
-
- cluster_info = merged
-
- if len(cluster_info) < 3:
- return rows
-
- # --- Step C: Compute pitches and detect section breaks ---
- pitches: List[float] = []
- for i in range(1, len(cluster_info)):
- pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
- pitches.append(pitch)
-
- if not pitches:
- return rows
-
- median_pitch = float(np.median(pitches))
- if median_pitch <= 5:
- return rows
-
- # A section break is where the gap between line centers is much larger
- # than the normal pitch (sub-headings, section titles, etc.)
- BREAK_FACTOR = 1.8
-
- # --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
- sections: List[List[Dict]] = []
- current_section: List[Dict] = [cluster_info[0]]
-
- for i in range(1, len(cluster_info)):
- gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
- if gap > median_pitch * BREAK_FACTOR:
- sections.append(current_section)
- current_section = [cluster_info[i]]
- else:
- current_section.append(cluster_info[i])
-
- if current_section:
- sections.append(current_section)
-
- # --- Step E: Build row boundaries per section ---
- grid_rows: List[RowGeometry] = []
-
- for section in sections:
- if not section:
- continue
-
- if len(section) == 1:
- # Single-line section (likely a heading)
- cl = section[0]
- half_h = max(cl['letter_h'], median_pitch * 0.4)
- row_top = cl['center_y_abs'] - half_h
- row_bot = cl['center_y_abs'] + half_h
- grid_rows.append(RowGeometry(
- index=0,
- x=left_x,
- y=round(row_top),
- width=content_w,
- height=round(row_bot - row_top),
- word_count=len(cl['words']),
- words=cl['words'],
- row_type='content',
- gap_before=0,
- ))
- continue
-
- # Compute local pitch for this section
- local_pitches = []
- for i in range(1, len(section)):
- local_pitches.append(
- section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
- )
- local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch
-
- # Row boundaries are placed at midpoints between consecutive centers.
- # First row: top = center - local_pitch/2
- # Last row: bottom = center + local_pitch/2
- for i, cl in enumerate(section):
- if i == 0:
- row_top = cl['center_y_abs'] - local_pitch / 2
- else:
- # Midpoint between this center and previous center
- prev_center = section[i - 1]['center_y_abs']
- row_top = (prev_center + cl['center_y_abs']) / 2
-
- if i == len(section) - 1:
- row_bot = cl['center_y_abs'] + local_pitch / 2
- else:
- next_center = section[i + 1]['center_y_abs']
- row_bot = (cl['center_y_abs'] + next_center) / 2
-
- # Clamp to reasonable bounds
- row_top = max(top_y, row_top)
- row_bot = min(top_y + content_h, row_bot)
-
- if row_bot - row_top < 5:
- continue
-
- grid_rows.append(RowGeometry(
- index=0,
- x=left_x,
- y=round(row_top),
- width=content_w,
- height=round(row_bot - row_top),
- word_count=len(cl['words']),
- words=cl['words'],
- row_type='content',
- gap_before=0,
- ))
-
- if not grid_rows:
- return rows
-
- # --- Step F: Re-assign words to grid rows ---
- # Words may have shifted slightly; assign each word to the row whose
- # center is closest to the word's vertical center.
- for gr in grid_rows:
- gr.words = []
-
- for w in content_words:
- w_center = w['top'] + top_y + w['height'] / 2
- best_row = None
- best_dist = float('inf')
- for gr in grid_rows:
- row_center = gr.y + gr.height / 2
- dist = abs(w_center - row_center)
- if dist < best_dist:
- best_dist = dist
- best_row = gr
- if best_row is not None and best_dist < median_pitch:
- best_row.words.append(w)
-
- for gr in grid_rows:
- gr.word_count = len(gr.words)
-
- # --- Step G: Validate ---
- words_placed = sum(gr.word_count for gr in grid_rows)
- if len(content_words) > 0:
- match_ratio = words_placed / len(content_words)
- if match_ratio < 0.85:
- logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
- f"of words, keeping gap-based rows")
- return rows
-
- # Remove empty grid rows (no words assigned)
- grid_rows = [gr for gr in grid_rows if gr.word_count > 0]
-
- # --- Step H: Merge header/footer + re-index ---
- result = list(non_content) + grid_rows
- result.sort(key=lambda r: r.y)
- for i, r in enumerate(result):
- r.index = i
-
- row_heights = [gr.height for gr in grid_rows]
- min_h = min(row_heights) if row_heights else 0
- max_h = max(row_heights) if row_heights else 0
- logger.info(f"RowGrid: word-center grid applied "
- f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
- f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
- f"{len(sections)} sections, "
- f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
- f"was {len(content_rows)} gap-based rows)")
-
- return result
-
-
-def _build_rows_from_word_grouping(
- word_dicts: List[Dict],
- left_x: int, right_x: int,
- top_y: int, bottom_y: int,
- content_w: int, content_h: int,
-) -> List['RowGeometry']:
- """Fallback: build rows by grouping words by Y position.
-
- Uses _group_words_into_lines() with a generous tolerance.
- No header/footer detection in fallback mode.
- """
- if not word_dicts:
- return []
-
- y_tolerance = max(20, content_h // 100)
- lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance)
-
- rows = []
- for idx, line_words in enumerate(lines):
- if not line_words:
- continue
- min_top = min(w['top'] for w in line_words)
- max_bottom = max(w['top'] + w['height'] for w in line_words)
- row_height = max_bottom - min_top
-
- rows.append(RowGeometry(
- index=idx,
- x=left_x,
- y=top_y + min_top,
- width=content_w,
- height=row_height,
- word_count=len(line_words),
- words=line_words,
- row_type='content',
- gap_before=0,
- ))
-
- logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
- return rows
-
-
-# --- Phase B: Content-Based Classification ---
-
-def _score_language(words: List[Dict]) -> Dict[str, float]:
- """Score the language of a column's words.
-
- Analyzes function words, umlauts, and capitalization patterns
- to determine whether text is English or German.
-
- Args:
- words: List of word dicts with 'text' and 'conf' keys.
-
- Returns:
- Dict with 'eng' and 'deu' scores (0.0-1.0).
- """
- if not words:
- return {'eng': 0.0, 'deu': 0.0}
-
- # Only consider words with decent confidence
- good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0]
- if not good_words:
- return {'eng': 0.0, 'deu': 0.0}
-
- total = len(good_words)
- en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS)
- de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS)
-
- # Check for umlauts (strong German signal)
- raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40]
- umlaut_count = sum(1 for t in raw_texts
- for c in t if c in 'äöüÄÖÜß')
-
- # German capitalization: nouns are capitalized mid-sentence
- # Count words that start with uppercase but aren't at position 0
- cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2)
-
- en_score = en_hits / total if total > 0 else 0.0
- de_score = de_hits / total if total > 0 else 0.0
-
- # Boost German score for umlauts
- if umlaut_count > 0:
- de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))
-
- # Boost German score for high capitalization ratio (typical for German nouns)
- if total > 5:
- cap_ratio = cap_words / total
- if cap_ratio > 0.3:
- de_score = min(1.0, de_score + 0.1)
-
- return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}
-
-
-def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
- """Score the role of a column based on its geometry and content patterns.
-
- Args:
- geom: ColumnGeometry with words and dimensions.
-
- Returns:
- Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'.
- """
- scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0}
-
- if not geom.words:
- return scores
-
- texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40]
- if not texts:
- return scores
-
- avg_word_len = sum(len(t) for t in texts) / len(texts)
- has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,'))
- digit_words = sum(1 for t in texts if any(c.isdigit() for c in t))
- digit_ratio = digit_words / len(texts) if texts else 0.0
-
- # Reference: narrow + mostly numbers/page references
- if geom.width_ratio < 0.12:
- scores['reference'] = 0.5
- if digit_ratio > 0.4:
- scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5)
-
- # Marker: narrow + few short entries
- if geom.width_ratio < 0.06 and geom.word_count <= 15:
- scores['marker'] = 0.7
- if avg_word_len < 4:
- scores['marker'] = 0.9
- # Very narrow non-edge column → strong marker regardless of word count
- if geom.width_ratio < 0.04 and geom.index > 0:
- scores['marker'] = max(scores['marker'], 0.9)
-
- # Sentence: longer words + punctuation present
- if geom.width_ratio > 0.15 and has_punctuation > 2:
- scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts))
- if avg_word_len > 4:
- scores['sentence'] = min(1.0, scores['sentence'] + 0.2)
-
- # Vocabulary: medium width + medium word length
- if 0.10 < geom.width_ratio < 0.45:
- scores['vocabulary'] = 0.4
- if 3 < avg_word_len < 8:
- scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3)
-
- return {k: round(v, 3) for k, v in scores.items()}
-
-
-def _build_margin_regions(
- all_regions: List[PageRegion],
- left_x: int,
- right_x: int,
- img_w: int,
- top_y: int,
- content_h: int,
-) -> List[PageRegion]:
- """Create margin_left / margin_right PageRegions from content bounds.
-
- Margins represent the space between the image edge and the first/last
- content column. They are used downstream for faithful page
- reconstruction but are skipped during OCR.
- """
- margins: List[PageRegion] = []
- # Minimum gap (px) to create a margin region
- _min_gap = 5
-
- if left_x > _min_gap:
- margins.append(PageRegion(
- type='margin_left', x=0, y=top_y,
- width=left_x, height=content_h,
- classification_confidence=1.0,
- classification_method='content_bounds',
- ))
-
- # Right margin: from end of last content column to image edge
- non_margin = [r for r in all_regions
- if r.type not in ('margin_left', 'margin_right', 'header', 'footer',
- 'margin_top', 'margin_bottom')]
- if non_margin:
- last_col_end = max(r.x + r.width for r in non_margin)
- else:
- last_col_end = right_x
- if img_w - last_col_end > _min_gap:
- margins.append(PageRegion(
- type='margin_right', x=last_col_end, y=top_y,
- width=img_w - last_col_end, height=content_h,
- classification_confidence=1.0,
- classification_method='content_bounds',
- ))
-
- if margins:
- logger.info(f"Margins: {[(m.type, m.x, m.width) for m in margins]} "
- f"(left_x={left_x}, right_x={right_x}, img_w={img_w})")
-
- return margins
-
-
-def positional_column_regions(
- geometries: List[ColumnGeometry],
- content_w: int,
- content_h: int,
- left_x: int,
-) -> List[PageRegion]:
- """Classify columns by position only (no language scoring).
-
- Structural columns (page_ref, column_marker) are identified by geometry.
- Remaining content columns are labelled left→right as column_en, column_de,
- column_example. The names are purely positional – no language analysis.
- """
- structural: List[PageRegion] = []
- content_cols: List[ColumnGeometry] = []
-
- for g in geometries:
- rel_x = g.x - left_x
- # page_ref: narrow column in the leftmost 20% region
- if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20:
- structural.append(PageRegion(
- type='page_ref', x=g.x, y=g.y,
- width=g.width, height=content_h,
- classification_confidence=0.95,
- classification_method='positional',
- ))
- # column_marker: very narrow, few words
- elif g.width_ratio < 0.06 and g.word_count <= 15:
- structural.append(PageRegion(
- type='column_marker', x=g.x, y=g.y,
- width=g.width, height=content_h,
- classification_confidence=0.95,
- classification_method='positional',
- ))
- # empty or near-empty narrow column → treat as margin/structural
- elif g.word_count <= 2 and g.width_ratio < 0.15:
- structural.append(PageRegion(
- type='column_marker', x=g.x, y=g.y,
- width=g.width, height=content_h,
- classification_confidence=0.85,
- classification_method='positional',
- ))
- else:
- content_cols.append(g)
-
- # Single content column → plain text page
- if len(content_cols) == 1:
- g = content_cols[0]
- return structural + [PageRegion(
- type='column_text', x=g.x, y=g.y,
- width=g.width, height=content_h,
- classification_confidence=0.9,
- classification_method='positional',
- )]
-
- # No content columns
- if not content_cols:
- return structural
-
- # Sort content columns left→right and assign positional labels
- content_cols.sort(key=lambda g: g.x)
-
- # With exactly 2 content columns: if the left one is very wide (>35%),
- # it likely contains EN+DE combined, so the right one is examples.
- if (len(content_cols) == 2
- and content_cols[0].width_ratio > 0.35
- and content_cols[1].width_ratio > 0.20):
- labels = ['column_en', 'column_example']
- else:
- labels = ['column_en', 'column_de', 'column_example']
-
- regions = list(structural)
- for i, g in enumerate(content_cols):
- label = labels[i] if i < len(labels) else 'column_example'
- regions.append(PageRegion(
- type=label, x=g.x, y=g.y,
- width=g.width, height=content_h,
- classification_confidence=0.95,
- classification_method='positional',
- ))
-
- logger.info(f"PositionalColumns: {len(structural)} structural, "
- f"{len(content_cols)} content → "
- f"{[r.type for r in regions]}")
- return regions
-
-
-def classify_column_types(geometries: List[ColumnGeometry],
- content_w: int,
- top_y: int,
- img_w: int,
- img_h: int,
- bottom_y: int,
- left_x: int = 0,
- right_x: int = 0,
- inv: Optional[np.ndarray] = None) -> List[PageRegion]:
- """Classify column types using a 3-level fallback chain.
-
- Level 1: Content-based (language + role scoring)
- Level 2: Position + language (old rules enhanced with language detection)
- Level 3: Pure position (exact old code, no regression)
-
- Args:
- geometries: List of ColumnGeometry from Phase A.
- content_w: Total content width.
- top_y: Top Y of content area.
- img_w: Full image width.
- img_h: Full image height.
- bottom_y: Bottom Y of content area.
- left_x: Left content bound (from _find_content_bounds).
- right_x: Right content bound (from _find_content_bounds).
-
- Returns:
- List of PageRegion with types, confidence, and method.
- """
- content_h = bottom_y - top_y
-
- def _with_margins(result: List[PageRegion]) -> List[PageRegion]:
- """Append margin_left / margin_right regions to *result*."""
- margins = _build_margin_regions(result, left_x, right_x, img_w, top_y, content_h)
- return result + margins
-
- # Special case: single column → plain text page
- if len(geometries) == 1:
- geom = geometries[0]
- return _with_margins([PageRegion(
- type='column_text', x=geom.x, y=geom.y,
- width=geom.width, height=geom.height,
- classification_confidence=0.9,
- classification_method='content',
- )])
-
- # --- Pre-filter: first/last columns with very few words → column_ignore ---
- # Sub-columns from _detect_sub_columns() are exempt: they intentionally
- # have few words (page refs, markers) and should not be discarded.
- ignore_regions = []
- active_geometries = []
- for idx, g in enumerate(geometries):
- if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column:
- ignore_regions.append(PageRegion(
- type='column_ignore', x=g.x, y=g.y,
- width=g.width, height=content_h,
- classification_confidence=0.95,
- classification_method='content',
- ))
- logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)")
- else:
- active_geometries.append(g)
-
- # Re-index active geometries for classification
- for new_idx, g in enumerate(active_geometries):
- g.index = new_idx
- geometries = active_geometries
-
- # Handle edge case: all columns ignored or only 1 left
- if len(geometries) == 0:
- return _with_margins(ignore_regions)
- if len(geometries) == 1:
- geom = geometries[0]
- ignore_regions.append(PageRegion(
- type='column_text', x=geom.x, y=geom.y,
- width=geom.width, height=geom.height,
- classification_confidence=0.9,
- classification_method='content',
- ))
- return _with_margins(ignore_regions)
-
- # --- Score all columns ---
- lang_scores = [_score_language(g.words) for g in geometries]
- role_scores = [_score_role(g) for g in geometries]
-
- logger.info(f"ClassifyColumns: language scores: "
- f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}")
- logger.info(f"ClassifyColumns: role scores: "
- f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}")
-
- # --- Level 1: Content-based classification ---
- regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h)
- if regions is not None:
- logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
- _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
- return _with_margins(ignore_regions + regions)
-
- # --- Level 2: Position + language enhanced ---
- regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
- if regions is not None:
- logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
- _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
- return _with_margins(ignore_regions + regions)
-
- # --- Level 3: Pure position fallback (old code, no regression) ---
- logger.info("ClassifyColumns: Level 3 (position fallback)")
- regions = _classify_by_position_fallback(geometries, content_w, content_h)
- _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
- return _with_margins(ignore_regions + regions)
-
-
-def _classify_by_content(geometries: List[ColumnGeometry],
- lang_scores: List[Dict[str, float]],
- role_scores: List[Dict[str, float]],
- content_w: int,
- content_h: int) -> Optional[List[PageRegion]]:
- """Level 1: Classify columns purely by content analysis.
-
- Requires clear language signals to distinguish EN/DE columns.
- Returns None if language signals are too weak.
- """
- regions = []
- assigned = set()
-
- # Step 1: Assign structural roles first (reference, marker)
- # left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref
- left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0
-
- for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
- is_left_side = geom.x < left_20_threshold
- has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
- if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
- regions.append(PageRegion(
- type='page_ref', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=rs['reference'],
- classification_method='content',
- ))
- assigned.add(i)
- elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
- regions.append(PageRegion(
- type='column_marker', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=rs['marker'],
- classification_method='content',
- ))
- assigned.add(i)
- elif geom.width_ratio < 0.05 and not is_left_side:
- # Narrow column on the right side → marker, not page_ref
- regions.append(PageRegion(
- type='column_marker', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=0.8,
- classification_method='content',
- ))
- assigned.add(i)
-
- # Step 2: Among remaining columns, find EN and DE by language scores
- remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
- for i in range(len(geometries)) if i not in assigned]
-
- if len(remaining) < 2:
- # Not enough columns for EN/DE pair
- if len(remaining) == 1:
- i, geom, ls, rs = remaining[0]
- regions.append(PageRegion(
- type='column_text', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=0.6,
- classification_method='content',
- ))
- regions.sort(key=lambda r: r.x)
- return regions
-
- # Check if we have enough language signal
- en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
- de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]
-
- # Position tiebreaker: when language signals are weak, use left=EN, right=DE
- if (not en_candidates or not de_candidates) and len(remaining) >= 2:
- max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
- max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
- if max_eng < 0.15 and max_deu < 0.15:
- # Both signals weak — fall back to positional: left=EN, right=DE
- sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
- best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
- best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
- logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
- en_conf = 0.4
- de_conf = 0.4
-
- regions.append(PageRegion(
- type='column_en', x=best_en[1].x, y=best_en[1].y,
- width=best_en[1].width, height=content_h,
- classification_confidence=en_conf,
- classification_method='content',
- ))
- assigned.add(best_en[0])
-
- regions.append(PageRegion(
- type='column_de', x=best_de[1].x, y=best_de[1].y,
- width=best_de[1].width, height=content_h,
- classification_confidence=de_conf,
- classification_method='content',
- ))
- assigned.add(best_de[0])
-
- # Assign remaining as example
- for i, geom, ls, rs in remaining:
- if i not in assigned:
- regions.append(PageRegion(
- type='column_example', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=0.4,
- classification_method='content',
- ))
- regions.sort(key=lambda r: r.x)
- return regions
-
- if not en_candidates or not de_candidates:
- # Language signals too weak for content-based classification
- logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
- return None
-
- # Pick the best EN and DE candidates
- best_en = max(en_candidates, key=lambda x: x[2]['eng'])
- best_de = max(de_candidates, key=lambda x: x[2]['deu'])
-
- # Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
- # Example sentences contain English function words ("the", "a", "is") which inflate
- # the eng score of the Example column. When the best EN candidate sits to the RIGHT
- # of the DE column and there is another EN candidate to the LEFT, prefer the left one
- # — it is almost certainly the real vocabulary column.
- if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
- left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
- if left_of_de:
- alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
- logger.info(
- f"ClassifyColumns: Level 1 position fix — best EN col {best_en[0]} "
- f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
- f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
- best_en = alt_en
-
- if best_en[0] == best_de[0]:
- # Same column scored highest for both — ambiguous
- logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
- return None
-
- en_conf = best_en[2]['eng']
- de_conf = best_de[2]['deu']
-
- regions.append(PageRegion(
- type='column_en', x=best_en[1].x, y=best_en[1].y,
- width=best_en[1].width, height=content_h,
- classification_confidence=round(en_conf, 2),
- classification_method='content',
- ))
- assigned.add(best_en[0])
-
- regions.append(PageRegion(
- type='column_de', x=best_de[1].x, y=best_de[1].y,
- width=best_de[1].width, height=content_h,
- classification_confidence=round(de_conf, 2),
- classification_method='content',
- ))
- assigned.add(best_de[0])
-
- # Step 3: Remaining columns → example or text based on role scores
- for i, geom, ls, rs in remaining:
- if i in assigned:
- continue
- if rs['sentence'] > 0.4:
- regions.append(PageRegion(
- type='column_example', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=round(rs['sentence'], 2),
- classification_method='content',
- ))
- else:
- regions.append(PageRegion(
- type='column_example', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=0.5,
- classification_method='content',
- ))
-
- regions.sort(key=lambda r: r.x)
- return regions
-
-
-def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
- lang_scores: List[Dict[str, float]],
- content_w: int,
- content_h: int) -> Optional[List[PageRegion]]:
- """Level 2: Position-based rules enhanced with language confirmation.
-
- Uses the old positional heuristics but confirms EN/DE assignment
- with language scores (swapping if needed).
- """
- regions = []
- untyped = list(range(len(geometries)))
- first_x = geometries[0].x if geometries else 0
- left_20_threshold = first_x + content_w * 0.20
-
- # Rule 1: Leftmost narrow column → page_ref (only if in left 20%, no strong language)
- g0 = geometries[0]
- ls0 = lang_scores[0]
- has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
- if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
- regions.append(PageRegion(
- type='page_ref', x=g0.x, y=g0.y,
- width=g0.width, height=content_h,
- classification_confidence=0.8,
- classification_method='position_enhanced',
- ))
- untyped.remove(0)
-
- # Rule 2: Narrow columns with few words → marker
- for i in list(untyped):
- geom = geometries[i]
- if geom.width_ratio < 0.06 and geom.word_count <= 15:
- regions.append(PageRegion(
- type='column_marker', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=0.7,
- classification_method='position_enhanced',
- ))
- untyped.remove(i)
-
- # Rule 3: Rightmost remaining → column_example (if 3+ remaining)
- if len(untyped) >= 3:
- last_idx = untyped[-1]
- geom = geometries[last_idx]
- regions.append(PageRegion(
- type='column_example', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=0.7,
- classification_method='position_enhanced',
- ))
- untyped.remove(last_idx)
-
- # Rule 4: First two remaining → EN/DE, but check language to possibly swap
- if len(untyped) >= 2:
- idx_a = untyped[0]
- idx_b = untyped[1]
- ls_a = lang_scores[idx_a]
- ls_b = lang_scores[idx_b]
-
- # Default: first=EN, second=DE (old behavior)
- en_idx, de_idx = idx_a, idx_b
- conf = 0.7
-
- # Swap if language signals clearly indicate the opposite
- if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
- en_idx, de_idx = idx_b, idx_a
- conf = 0.85
- logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores")
-
- regions.append(PageRegion(
- type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
- width=geometries[en_idx].width, height=content_h,
- classification_confidence=conf,
- classification_method='position_enhanced',
- ))
- regions.append(PageRegion(
- type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
- width=geometries[de_idx].width, height=content_h,
- classification_confidence=conf,
- classification_method='position_enhanced',
- ))
- untyped = untyped[2:]
- elif len(untyped) == 1:
- idx = untyped[0]
- geom = geometries[idx]
- regions.append(PageRegion(
- type='column_en', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=0.5,
- classification_method='position_enhanced',
- ))
- untyped = []
-
- # Remaining → example
- for idx in untyped:
- geom = geometries[idx]
- regions.append(PageRegion(
- type='column_example', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=0.5,
- classification_method='position_enhanced',
- ))
-
- regions.sort(key=lambda r: r.x)
- return regions
-
-
-def _classify_by_position_fallback(geometries: List[ColumnGeometry],
- content_w: int,
- content_h: int) -> List[PageRegion]:
- """Level 3: Pure position-based fallback (identical to old code).
-
- Guarantees no regression from the previous behavior.
- """
- regions = []
- untyped = list(range(len(geometries)))
- first_x = geometries[0].x if geometries else 0
- left_20_threshold = first_x + content_w * 0.20
-
- # Rule 1: Leftmost narrow column → page_ref (only if in left 20%)
- g0 = geometries[0]
- if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
- regions.append(PageRegion(
- type='page_ref', x=g0.x, y=g0.y,
- width=g0.width, height=content_h,
- classification_confidence=1.0,
- classification_method='position_fallback',
- ))
- untyped.remove(0)
-
- # Rule 2: Narrow + few words → marker
- for i in list(untyped):
- geom = geometries[i]
- if geom.width_ratio < 0.06 and geom.word_count <= 15:
- regions.append(PageRegion(
- type='column_marker', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=1.0,
- classification_method='position_fallback',
- ))
- untyped.remove(i)
-
- # Rule 3: Rightmost remaining → example (if 3+)
- if len(untyped) >= 3:
- last_idx = untyped[-1]
- geom = geometries[last_idx]
- regions.append(PageRegion(
- type='column_example', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=1.0,
- classification_method='position_fallback',
- ))
- untyped.remove(last_idx)
-
- # Rule 4: First remaining → EN, second → DE
- if len(untyped) >= 2:
- en_idx = untyped[0]
- de_idx = untyped[1]
- regions.append(PageRegion(
- type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
- width=geometries[en_idx].width, height=content_h,
- classification_confidence=1.0,
- classification_method='position_fallback',
- ))
- regions.append(PageRegion(
- type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
- width=geometries[de_idx].width, height=content_h,
- classification_confidence=1.0,
- classification_method='position_fallback',
- ))
- untyped = untyped[2:]
- elif len(untyped) == 1:
- idx = untyped[0]
- geom = geometries[idx]
- regions.append(PageRegion(
- type='column_en', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=1.0,
- classification_method='position_fallback',
- ))
- untyped = []
-
- for idx in untyped:
- geom = geometries[idx]
- regions.append(PageRegion(
- type='column_example', x=geom.x, y=geom.y,
- width=geom.width, height=content_h,
- classification_confidence=1.0,
- classification_method='position_fallback',
- ))
-
- regions.sort(key=lambda r: r.x)
- return regions
-
-
-def _detect_header_footer_gaps(
- inv: np.ndarray,
- img_w: int,
- img_h: int,
-) -> Tuple[Optional[int], Optional[int]]:
- """Detect header/footer boundaries via horizontal projection gap analysis.
-
- Scans the full-page inverted image for large horizontal gaps in the top/bottom
- 20% that separate header/footer content from the main body.
-
- Returns:
- (header_y, footer_y) — absolute y-coordinates.
- header_y = bottom edge of header region (None if no header detected).
- footer_y = top edge of footer region (None if no footer detected).
- """
- HEADER_FOOTER_ZONE = 0.20
- GAP_MULTIPLIER = 2.0
-
- # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
- actual_h = min(inv.shape[0], img_h)
- roi = inv[:actual_h, :]
- h_proj = np.sum(roi, axis=1).astype(float)
- proj_w = roi.shape[1]
- h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj
-
- # Step 2: Smoothing
- kernel_size = max(3, actual_h // 200)
- if kernel_size % 2 == 0:
- kernel_size += 1
- h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
-
- # Step 3: Gap threshold
- positive = h_smooth[h_smooth > 0]
- median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
- gap_threshold = max(median_density * 0.15, 0.003)
-
- in_gap = h_smooth < gap_threshold
- MIN_GAP_HEIGHT = max(3, actual_h // 500)
-
- # Step 4: Collect contiguous gaps
- raw_gaps: List[Tuple[int, int]] = []
- gap_start: Optional[int] = None
- for y in range(len(in_gap)):
- if in_gap[y]:
- if gap_start is None:
- gap_start = y
- else:
- if gap_start is not None:
- gap_height = y - gap_start
- if gap_height >= MIN_GAP_HEIGHT:
- raw_gaps.append((gap_start, y))
- gap_start = None
- if gap_start is not None:
- gap_height = len(in_gap) - gap_start
- if gap_height >= MIN_GAP_HEIGHT:
- raw_gaps.append((gap_start, len(in_gap)))
-
- if not raw_gaps:
- return None, None
-
- # Step 5: Compute median gap size and large-gap threshold
- gap_sizes = [g[1] - g[0] for g in raw_gaps]
- median_gap = float(np.median(gap_sizes))
- large_gap_threshold = median_gap * GAP_MULTIPLIER
-
- # Step 6: Find largest qualifying gap in header / footer zones
- # A separator gap must have content on BOTH sides — edge-touching gaps
- # (e.g. dewarp padding at bottom) are not valid separators.
- EDGE_MARGIN = max(5, actual_h // 400)
- header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
- footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
-
- header_y: Optional[int] = None
- footer_y: Optional[int] = None
-
- best_header_size = 0
- for gs, ge in raw_gaps:
- if gs <= EDGE_MARGIN:
- continue # skip gaps touching the top edge
- gap_mid = (gs + ge) / 2
- gap_size = ge - gs
- if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
- if gap_size > best_header_size:
- best_header_size = gap_size
- header_y = ge # bottom edge of gap
-
- best_footer_size = 0
- for gs, ge in raw_gaps:
- if ge >= actual_h - EDGE_MARGIN:
- continue # skip gaps touching the bottom edge
- gap_mid = (gs + ge) / 2
- gap_size = ge - gs
- if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
- if gap_size > best_footer_size:
- best_footer_size = gap_size
- footer_y = gs # top edge of gap
-
- if header_y is not None:
- logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
- f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
- if footer_y is not None:
- logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
- f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")
-
- return header_y, footer_y
-
-
-def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
- min_density: float = 0.005) -> bool:
- """Check whether a horizontal strip contains meaningful ink.
-
- Args:
- inv: Inverted binarized image (white-on-black).
- y_start: Top of the region (inclusive).
- y_end: Bottom of the region (exclusive).
- min_density: Fraction of white pixels required to count as content.
-
- Returns:
- True if the region contains text/graphics, False if empty margin.
- """
- if y_start >= y_end:
- return False
- strip = inv[y_start:y_end, :]
- density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
- return density > min_density
-
-
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                       img_w: int, img_h: int,
                       inv: Optional[np.ndarray] = None) -> None:
    """Add header/footer/margin regions in-place.

    Uses gap-based detection when *inv* is provided, otherwise falls back
    to simple top_y/bottom_y bounds.

    Region types depend on whether there is actual content (text/graphics):
    - 'header' / 'footer' — region contains text (e.g. title, page number)
    - 'margin_top' / 'margin_bottom' — region is empty page margin

    Args:
        regions: Region list to append to (mutated in place).
        top_y: Fallback top boundary of the content area (absolute px).
        bottom_y: Fallback bottom boundary of the content area (absolute px).
        img_w: Full image width in pixels.
        img_h: Full image height in pixels.
        inv: Optional inverted binarized image enabling gap detection and
            content checks; without it only the fallback bounds are used.
    """
    header_y: Optional[int] = None
    footer_y: Optional[int] = None

    if inv is not None:
        header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)

    # --- Top region ---
    # Prefer the detected gap boundary; a boundary within 10px of the page
    # edge is treated as noise and ignored.
    top_boundary = header_y if header_y is not None and header_y > 10 else (
        top_y if top_y > 10 else None
    )
    if top_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
        rtype = 'header' if has_content else 'margin_top'
        regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
                    f"(has_content={has_content})")

    # --- Bottom region ---
    # Same 10px edge-noise guard, mirrored for the bottom of the page.
    bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else (
        bottom_y if bottom_y < img_h - 10 else None
    )
    if bottom_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
        rtype = 'footer' if has_content else 'margin_bottom'
        regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
                                  height=img_h - bottom_boundary))
        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
                    f"height={img_h - bottom_boundary}px (has_content={has_content})")
-
-
-# --- Main Entry Point ---
-
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect columns using two-phase approach: geometry then content classification.

    Phase A: detect_column_geometry() — clustering word positions into columns.
    Phase B: classify_column_types() — content-based type assignment with fallback.

    Falls back to projection-based analyze_layout() if geometry detection fails.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        List of PageRegion objects with types, confidence, and method.
    """
    h, w = ocr_img.shape[:2]

    # Phase A: Geometry detection
    result = detect_column_geometry(ocr_img, dewarped_bgr)

    if result is None:
        # Fallback to projection-based layout
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        layout_img = create_layout_image(dewarped_bgr)
        return analyze_layout(layout_img, ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result
    content_w = right_x - left_x

    # Detect header/footer early so sub-column clustering ignores them
    header_y, footer_y = _detect_header_footer_gaps(_inv, w, h) if _inv is not None else (None, None)

    # Split sub-columns (e.g. page references) before classification
    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)

    # Split broad columns that contain EN+DE mixed via word-coverage gaps
    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)

    # Phase B: Positional classification (no language scoring)
    content_h = bottom_y - top_y
    regions = positional_column_regions(geometries, content_w, content_h, left_x)

    # Summary logging: count only column-like regions, exclude header/footer/margins.
    col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    methods = set(r.classification_method for r in regions if r.classification_method)
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")

    return regions
-
-
-# =============================================================================
-# Pipeline Step 5: Word Grid from Columns × Rows
-# =============================================================================
-
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Group OCR words into visual lines in reading order.

    Args:
        words: Tesseract-style word dicts; each needs a 'text' key plus the
            geometry used by _group_words_into_lines.
        y_tolerance_px: Vertical tolerance for clustering words into one line.

    Returns:
        A list of line strings (one per visual line in the cell).
    """
    if not words:
        return []

    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    return [' '.join(w['text'] for w in line) for line in lines]
-
-
-def _rejoin_hyphenated(lines: List[str]) -> List[str]:
- """Rejoin words split by line-break hyphenation.
-
- E.g. ['Fuß-', 'boden'] → ['Fußboden']
- ['some text-', 'thing here'] → ['something here']
- """
- if len(lines) <= 1:
- return lines
-
- result = []
- i = 0
- while i < len(lines):
- line = lines[i]
- # If line ends with '-' and there's a next line, rejoin
- if i + 1 < len(lines) and line.rstrip().endswith('-'):
- stripped = line.rstrip()
- # Get the word fragment before hyphen (last word)
- prefix = stripped[:-1] # remove trailing hyphen
- next_line = lines[i + 1]
- # Join: last word of this line + first word of next line
- prefix_words = prefix.rsplit(' ', 1)
- next_words = next_line.split(' ', 1)
- if len(prefix_words) > 1:
- joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
- else:
- joined = prefix_words[0] + next_words[0]
- remainder = next_words[1] if len(next_words) > 1 else ''
- if remainder:
- result.append(joined + ' ' + remainder)
- else:
- result.append(joined)
- i += 2
- else:
- result.append(line)
- i += 1
- return result
-
-
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words into text in correct reading order, preserving line breaks.

    Clusters words into visual lines (Y-tolerance, X-sorted), repairs
    hyphenated line breaks, and returns the lines newline-joined.
    """
    raw_lines = _words_to_reading_order_lines(words, y_tolerance_px)
    return '\n'.join(_rejoin_hyphenated(raw_lines))
-
-
-# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
-
-_rapid_engine = None
-RAPIDOCR_AVAILABLE = False
-
-try:
- from rapidocr import RapidOCR as _RapidOCRClass
- from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
- RAPIDOCR_AVAILABLE = True
- logger.info("RapidOCR available — can be used as alternative to Tesseract")
-except ImportError:
- logger.info("RapidOCR not installed — using Tesseract only")
-
-
def _get_rapid_engine():
    """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support.

    Returns the module-level singleton, creating it on first call.

    NOTE(review): no locking around the lazy init — presumably only called
    from a single thread; confirm before using from a thread pool.
    """
    global _rapid_engine
    if _rapid_engine is None:
        _rapid_engine = _RapidOCRClass(params={
            # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß)
            "Rec.lang_type": _LangRec.LATIN,
            "Rec.model_type": _ModelType.SERVER,
            "Rec.ocr_version": _OCRVersion.PPOCRV5,
            # Tighter detection boxes to reduce word merging
            "Det.unclip_ratio": 1.3,
            # Lower threshold to detect small chars (periods, ellipsis, phonetics)
            "Det.box_thresh": 0.4,
            # Silence verbose logging
            "Global.log_level": "critical",
        })
        logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
-
-
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """Run RapidOCR on a specific region, returning word dicts compatible with Tesseract format.

    Args:
        img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on color/gray).
        region: Region to crop and OCR.

    Returns:
        List of word dicts with text, left, top, width, height, conf, region_type.
        Coordinates are absolute (page space), not crop-relative.
    """
    engine = _get_rapid_engine()

    # Crop region from BGR image
    crop = img_bgr[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    result = engine(crop)

    if result is None or result.boxes is None or result.txts is None:
        return []

    words = []
    boxes = result.boxes    # shape (N, 4, 2) — 4 corner points per text line
    txts = result.txts      # tuple of strings
    scores = result.scores  # tuple of floats

    # Idiom fix: previous version used enumerate() with an unused index.
    for box, txt, score in zip(boxes, txts, scores):
        if not txt or not txt.strip():
            continue

        # box is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (clockwise from top-left);
        # convert the quadrilateral to an axis-aligned bounding box.
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        left = int(min(xs))
        top = int(min(ys))
        w = int(max(xs) - left)
        h = int(max(ys) - top)

        words.append({
            'text': txt.strip(),
            'left': left + region.x,  # Absolute coords
            'top': top + region.y,
            'width': w,
            'height': h,
            'conf': int(score * 100),  # 0-100 like Tesseract
            'region_type': region.type,
        })

    return words
-
-
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """Run TrOCR on a region. Returns line-level word dicts (same format as ocr_region_rapid).

    Uses trocr_service.get_trocr_model() + _split_into_lines() for line segmentation.
    Bboxes are approximated from equal line-height distribution within the region.
    Falls back to Tesseract if TrOCR is not available.

    Args:
        img_bgr: Full-page BGR image.
        region: Region to crop and OCR.
        handwritten: Select the handwritten TrOCR model variant.

    Returns:
        List of line-level dicts (text, left, top, width, height, conf,
        region_type); empty list on failure.
    """
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available

    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        if region.height > 0 and region.width > 0:
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
            if ocr_img_crop is not None:
                return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        return []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        lines = _split_into_lines(pil_crop)
        if not lines:
            # Line segmentation found nothing — OCR the whole crop as one line.
            lines = [pil_crop]

        device = next(model.parameters()).device
        all_text = []
        confidences = []
        for line_img in lines:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if text_line:
                all_text.append(text_line)
                # Heuristic confidence: very short outputs are less trustworthy.
                confidences.append(0.85 if len(text_line) > 3 else 0.5)

        if not all_text:
            return []

        avg_conf = int(sum(confidences) / len(confidences) * 100)
        # Approximate per-line bboxes by splitting the region height evenly.
        line_h = region.height // max(len(all_text), 1)
        words = []
        for i, line in enumerate(all_text):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
-
-
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region. Returns line-level word dicts (same format as ocr_region_rapid).

    Falls back to RapidOCR or Tesseract if LightOnOCR is not available.

    Args:
        img_bgr: Full-page BGR image.
        region: Region to crop and OCR.

    Returns:
        List of line-level dicts (text, left, top, width, height, conf,
        region_type); empty list on failure.
    """
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import io
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        # Vision-LLM style invocation: single image, chat-template prompt.
        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        # Approximate per-line bboxes by splitting the region height evenly;
        # fixed conf=85 because the model yields no per-line scores.
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        line_h = region.height // max(len(lines), 1)
        words = []
        for i, line in enumerate(lines):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
-
-
-# =============================================================================
-# Post-Processing: Deterministic Quality Fixes
-# =============================================================================
-
-# --- A. Character Confusion Fix (I/1/l) ---
-
-# Common OCR confusion pairs in vocabulary context
-_CHAR_CONFUSION_RULES = [
- # "1" at word start followed by lowercase → likely "I" or "l"
- # Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
- (re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
- # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
- (re.compile(r'(? List[Dict[str, Any]]:
- """Fix common OCR character confusions using context.
-
- Deterministic rules:
- - "1" at word start → "I" or "l" based on context
- - Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I"
- - "y " artifact at word boundaries → remove (e.g. "y you" → "you")
- """
- for entry in entries:
- en = entry.get('english', '') or ''
- de = entry.get('german', '') or ''
- ex = entry.get('example', '') or ''
-
- # Apply general rules to all fields
- for pattern, replacement in _CHAR_CONFUSION_RULES:
- en = pattern.sub(replacement, en)
- de = pattern.sub(replacement, de)
- ex = pattern.sub(replacement, ex)
-
- # Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
- de_lower_words = set(de.lower().replace(',', ' ').split())
- if de_lower_words & _DE_INDICATORS_FOR_EN_I:
- # Any remaining "1" in EN that looks like "I"
- en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
-
- # Fix "y " artifact before repeated word: "y you" → "you"
- en = re.sub(r'\by\s+([a-z])', r'\1', en)
- ex = re.sub(r'\by\s+([a-z])', r'\1', ex)
-
- entry['english'] = en.strip()
- entry['german'] = de.strip()
- entry['example'] = ex.strip()
-
- return entries
-
-
-# --- B. Comma-Separated Word Form Splitting ---
-
-def _is_singular_plural_pair(parts: List[str]) -> bool:
- """Detect if comma-separated parts are singular/plural forms of the same word.
-
- E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
- "break, broke, broken" → False (different verb forms, OK to split).
-
- Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
- OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
- """
- if len(parts) != 2:
- return False
-
- a, b = parts[0].lower().strip(), parts[1].lower().strip()
- if not a or not b:
- return False
-
- # Common prefix heuristic: if words share >= 50% of the shorter word,
- # they are likely forms of the same word (Maus/Mäuse, child/children).
- min_len = min(len(a), len(b))
- common = 0
- for ca, cb in zip(a, b):
- if ca == cb:
- common += 1
- else:
- break
- if common >= max(2, min_len * 0.5):
- return True
-
- # Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
- umlaut_map = str.maketrans('aou', 'äöü')
- if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
- return True
-
- return False
-
-
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Split entries with comma-separated word forms into individual entries.

    E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
    → 3 entries: break/brechen, broke/brach, broken/gebrochen

    Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse"
    because those are forms of the same vocabulary entry.

    Only splits when both EN and DE have the same number of comma-parts,
    parts are short (word forms, not sentences), and at least 3 parts
    (to avoid splitting pairs that likely belong together).

    Args:
        entries: Vocab entry dicts with 'english'/'german' keys.

    Returns:
        New list with qualifying entries expanded; split entries carry
        'split_from_comma' = True and all entries get fresh 'row_index'.
    """
    result: List[Dict[str, Any]] = []

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()

        # Split by comma (but not inside brackets or parentheses)
        en_parts = _split_by_comma(en)
        de_parts = _split_by_comma(de)

        # Only split if we have multiple parts and counts match
        should_split = False
        if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
            # All parts must be short (word forms, not sentences)
            if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
                # Do NOT split singular/plural pairs (2 parts that are
                # forms of the same word)
                if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts):
                    should_split = False
                else:
                    should_split = True

        if not should_split:
            result.append(entry)
            continue

        # Split into individual entries, pairing EN/DE parts by position.
        for k in range(len(en_parts)):
            sub = dict(entry)  # shallow copy
            sub['english'] = en_parts[k].strip()
            sub['german'] = de_parts[k].strip() if k < len(de_parts) else ''
            sub['example'] = ''  # examples get attached later
            sub['split_from_comma'] = True
            result.append(sub)

    # Re-number
    for i, e in enumerate(result):
        e['row_index'] = i

    return result
-
-
-def _split_by_comma(text: str) -> List[str]:
- """Split text by commas, but not inside brackets [...] or parens (...)."""
- if ',' not in text:
- return [text]
-
- parts = []
- depth_bracket = 0
- depth_paren = 0
- current = []
-
- for ch in text:
- if ch == '[':
- depth_bracket += 1
- elif ch == ']':
- depth_bracket = max(0, depth_bracket - 1)
- elif ch == '(':
- depth_paren += 1
- elif ch == ')':
- depth_paren = max(0, depth_paren - 1)
- elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
- parts.append(''.join(current).strip())
- current = []
- continue
- current.append(ch)
-
- if current:
- parts.append(''.join(current).strip())
-
- # Filter empty parts
- return [p for p in parts if p]
-
-
-# --- C. Example Sentence Attachment ---
-
-def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
- """Find the vocab entry whose English word(s) best match the example sentence.
-
- Returns index into vocab_entries, or -1 if no match found.
- Uses word stem overlap: "a broken arm" matches "broken" or "break".
- """
- if not vocab_entries or not example_text:
- return -1
-
- example_lower = example_text.lower()
- example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
-
- best_idx = -1
- best_score = 0
-
- for i, entry in enumerate(vocab_entries):
- en = (entry.get('english', '') or '').lower()
- if not en:
- continue
-
- # Extract vocab words (split on space, comma, newline)
- vocab_words = set(re.findall(r'[a-zäöüß]+', en))
-
- # Score: how many vocab words appear in the example?
- # Also check if example words share a common stem (first 4 chars)
- direct_matches = vocab_words & example_words
- score = len(direct_matches) * 10
-
- # Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
- if score == 0:
- for vw in vocab_words:
- if len(vw) < 3:
- continue
- stem = vw[:4] if len(vw) >= 4 else vw[:3]
- for ew in example_words:
- if len(ew) >= len(stem) and ew[:len(stem)] == stem:
- score += 5
- break
-
- if score > best_score:
- best_score = score
- best_idx = i
-
- return best_idx if best_score > 0 else -1
-
-
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach rows with EN text but no DE translation as examples to matching vocab entries.

    Vocabulary worksheets often have:
        Row 1: break, broke, broken / brechen, brach, gebrochen
        Row 2: a broken arm   (no DE → example for "broken")
        Row 3: a broken plate (no DE → example for "broken")
        Row 4: egg / Ei       (has DE → new vocab entry)

    Rules (deterministic, generic):
    - A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars)
    - Find the best matching vocab entry by checking which entry's English words
      appear in the example sentence (semantic matching via word overlap)
    - Fall back to the nearest preceding entry if no word match found
    - Multiple examples get joined with " | "

    Args:
        entries: Vocab entry dicts in row order ('english'/'german'/'example').

    Returns:
        Vocab entries only (example rows removed), with examples attached
        and 'row_index' renumbered.
    """
    if not entries:
        return entries

    # Separate into vocab entries (have DE) and example candidates (no DE)
    vocab_entries: List[Dict[str, Any]] = []
    examples_for: Dict[int, List[str]] = {}  # vocab_index → list of example texts

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        ex = (entry.get('example', '') or '').strip()

        # Treat single-char DE as OCR noise, not real translation.
        # "Ei" (2 chars) is a valid German word, so threshold is 1.
        has_de = len(de) > 1
        has_en = bool(en)

        # Heuristic: a row without DE is an "example sentence" only if
        # the EN text looks like a sentence (>= 4 words, or contains
        # typical sentence punctuation). Short EN text (1-3 words) is
        # more likely a vocab entry whose DE was missed by OCR.
        _looks_like_sentence = (
            len(en.split()) >= 4
            or en.rstrip().endswith(('.', '!', '?'))
        )
        # The `vocab_entries` guard ensures an example can never precede
        # its vocab entry (first row is always kept as vocab).
        is_example_candidate = (
            has_en and not has_de and _looks_like_sentence and vocab_entries
        )

        if is_example_candidate:
            # This is an example sentence — find best matching vocab entry
            example_text = en

            match_idx = _find_best_vocab_match(en, vocab_entries)
            if match_idx < 0:
                # No word match → fall back to last entry
                match_idx = len(vocab_entries) - 1

            if match_idx not in examples_for:
                examples_for[match_idx] = []
            examples_for[match_idx].append(example_text)
        else:
            vocab_entries.append(entry)

    # Attach examples to their matched vocab entries
    for idx, example_list in examples_for.items():
        if 0 <= idx < len(vocab_entries):
            entry = vocab_entries[idx]
            existing_ex = (entry.get('example', '') or '').strip()
            new_examples = ' | '.join(example_list)
            entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples

    # Re-number
    for i, e in enumerate(vocab_entries):
        e['row_index'] = i

    return vocab_entries
-
-
-# --- D. Phonetic Bracket IPA Replacement ---
-
-# Pattern: word followed by any bracket type containing phonetic content.
-# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
-# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
-# This intentionally matches mixed brackets (e.g. {content]) because
-# Tesseract frequently misrecognizes bracket characters.
-_PHONETIC_BRACKET_RE = re.compile(
- r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
+from cv_vocab_types import * # noqa: F401,F403
+from cv_preprocessing import * # noqa: F401,F403
+from cv_layout import * # noqa: F401,F403
+from cv_ocr_engines import * # noqa: F401,F403
+from cv_cell_grid import * # noqa: F401,F403
+from cv_review import * # noqa: F401,F403
+
+# Private names used by consumers — not covered by wildcard re-exports.
+from cv_preprocessing import _apply_shear # noqa: F401
+from cv_layout import ( # noqa: F401
+ _detect_header_footer_gaps,
+ _detect_sub_columns,
+ _split_broad_columns,
)
-
-# Unicode IPA characters — used to distinguish correct IPA (from dictionary
-# lookup) from garbled OCR content when stripping orphan brackets.
-_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')
-
-# Minimum word confidence for full-page Tesseract results (0-100).
-# Words below this threshold are OCR noise (scanner shadows, borders).
-_MIN_WORD_CONF = 30
-
-
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up IPA for a word using the selected pronunciation dictionary.

    Relies on module-level `_britfone_dict` (dict) and `_ipa_convert_american`
    (callable) being populated elsewhere in the module; either may be falsy
    when its backing package is unavailable. A '*' in the American converter's
    output marks an unknown word and is treated as a miss.

    Args:
        word: English word to look up.
        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).
            Any other value tries whichever source is available.

    Returns:
        IPA string or None if not found.
    """
    word_lower = word.lower().strip()
    if not word_lower:
        return None

    if pronunciation == 'british' and _britfone_dict:
        ipa = _britfone_dict.get(word_lower)
        if ipa:
            return ipa
        # Fallback to American if not in Britfone
        if _ipa_convert_american:
            result = _ipa_convert_american(word_lower)
            if result and '*' not in result:
                return result
        return None

    if pronunciation == 'american' and _ipa_convert_american:
        result = _ipa_convert_american(word_lower)
        if result and '*' not in result:
            return result
        # Fallback to Britfone if not in CMU
        if _britfone_dict:
            ipa = _britfone_dict.get(word_lower)
            if ipa:
                return ipa
        return None

    # Try any available source
    if _britfone_dict:
        ipa = _britfone_dict.get(word_lower)
        if ipa:
            return ipa
    if _ipa_convert_american:
        result = _ipa_convert_american(word_lower)
        if result and '*' not in result:
            return result

    return None
-
-
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Replace OCR'd phonetic transcriptions with dictionary IPA.

    Detects patterns like "dance [du:ns]" and replaces with correct IPA:
    - British: "dance [dˈɑːns]" (Britfone, MIT)
    - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    Only replaces if the word before brackets is found in the dictionary.
    No-op when IPA_AVAILABLE is False. Mutates entries in place and returns
    the same list.

    Args:
        entries: Vocab entry dicts; only the 'english' field is touched.
        pronunciation: Passed through to _replace_phonetics_in_text.

    Returns:
        The same entries list, with 'english' fields rewritten where needed.
    """
    if not IPA_AVAILABLE:
        return entries

    # IPA phonetics only appear in the ENGLISH field of vocab tables.
    # German and example fields contain meaningful parenthetical content:
    #   german:  "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
    #   example: "(sich beschweren)", "(brauchen)", "(jammern)"
    # These must NEVER be processed as phonetic transcriptions.
    replaced_count = 0
    for entry in entries:
        text = entry.get('english', '') or ''
        # Cheap pre-filter: skip entries without any opening bracket.
        if not any(ch in text for ch in '[{('):
            continue
        new_text = _replace_phonetics_in_text(text, pronunciation)
        if new_text != text:
            logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'")
            replaced_count += 1
            entry['english'] = new_text

    if replaced_count:
        logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
    return entries
-
-
-# Grammar particles that appear in brackets after English words:
-# cross (with), complain (about/of), agree (on/with), look (sth) up
-# These must NOT be replaced with IPA. Only used for the English field
-# (German/example fields are never processed for IPA replacement).
-_GRAMMAR_BRACKET_WORDS = frozenset({
- # English prepositions/particles commonly in vocab tables
- 'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
- 'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
- # English grammar abbreviations used in vocab tables
- 'sth', 'sb', 'adj', 'adv',
-})
-
-
-def _is_grammar_bracket_content(content: str) -> bool:
- """Return True if bracket content is grammar info in the ENGLISH field.
-
- Grammar info: cross (with), complain (about/of), agree (on/with)
- NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
-
- Since we only process the English field, we only need to recognize
- English grammar particles. Everything else is (garbled) IPA.
- """
- if not content:
- return False
-
- # Split on / for patterns like (about/of), (on/with)
- tokens = [t.strip().lower() for t in content.split('/') if t.strip()]
- if not tokens:
- return False
-
- # ALL tokens must be known grammar words
- return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
-
-
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.

    Args:
        text: English-field text possibly containing bracketed phonetics.
        pronunciation: Dictionary preference forwarded to _lookup_ipa.

    Returns:
        Text with phonetic brackets normalized to "[IPA]" and garbled
        orphan brackets stripped; unchanged when IPA_AVAILABLE is False.
    """
    if not IPA_AVAILABLE:
        return text

    def replacer(match):
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)

        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match

        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)

        if ipa:
            # Word has IPA → bracket content is phonetic (garbled or correct).
            # Exception: grammar particles like cross (with) — keep those.
            if _is_grammar_bracket_content(bracket_content):
                return full_match
            logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
            return f"{word} [{ipa}]"

        # No IPA for this word — keep as-is
        return full_match

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    # Second pass: strip remaining orphan brackets that are garbled IPA.
    # These have no word before them (the main regex requires \b word \s* bracket).
    # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
    # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
    def _strip_orphan_bracket(m):
        content = m.group(1).strip()
        # Keep grammar info: (sich beschweren), (about/of)
        if _is_grammar_bracket_content(content):
            return m.group(0)
        # Keep correct IPA (contains Unicode IPA characters)
        if any(ch in _IPA_CHARS for ch in content):
            return m.group(0)
        logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
        return ''

    # Mixed opener/closer pairs are matched on purpose — Tesseract often
    # misreads one of the bracket characters.
    text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
    text = text.strip()

    return text
-
-
def _assign_row_words_to_columns(
    row: RowGeometry,
    columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
    """Distribute every word of a row into exactly one column bucket.

    Three cascading passes decide the owner of each word:

    1. Overlap: the column sharing the largest horizontal overlap with the
       word wins. Robust for narrow columns (page_ref) where the last
       glyph's centre may drift into the neighbour.
    2. Midpoint range: if no column overlaps at all, the word centre is
       matched against non-overlapping assignment ranges whose boundaries
       are the midpoints between adjacent columns. This stops border words
       (e.g. a leading "We" of an example sentence) from being stolen.
    3. Nearest centre: anything still unassigned goes to the column whose
       centre is closest.

    Args:
        row: Row with words (word coordinates are row-relative).
        columns: Columns sorted by x (absolute coordinates).

    Returns:
        Mapping col_index -> list of words owned by that column.
    """
    buckets: Dict[int, List[Dict]] = {idx: [] for idx in range(len(columns))}
    if not columns or not row.words:
        return buckets

    origin = row.x  # absolute x of the content ROI's left edge
    num_cols = len(columns)

    # Pre-compute the midpoint assignment range (row-relative) per column.
    ranges = []
    for idx, col in enumerate(columns):
        rel_left = col.x - origin
        rel_right = rel_left + col.width

        if idx == 0:
            lo = 0
        else:
            prev = columns[idx - 1]
            prev_rel_right = prev.x - origin + prev.width
            lo = (prev_rel_right + rel_left) / 2

        if idx == num_cols - 1:
            hi = row.width + 100  # generous slack for the last column
        else:
            next_rel_left = columns[idx + 1].x - origin
            hi = (rel_right + next_rel_left) / 2

        ranges.append((lo, hi))

    for word in row.words:
        word_left = word['left']
        word_right = word_left + word['width']
        word_centre = word_left + word['width'] / 2

        # Pass 1: column with the greatest horizontal overlap.
        target = -1
        top_overlap = 0
        for idx, col in enumerate(columns):
            rel_left = col.x - origin
            rel_right = rel_left + col.width
            overlap = max(0, min(word_right, rel_right) - max(word_left, rel_left))
            if overlap > top_overlap:
                top_overlap = overlap
                target = idx

        if target >= 0 and top_overlap > 0:
            buckets[target].append(word)
            continue

        # Pass 2: first midpoint range containing the word centre.
        hit = next(
            (idx for idx, (lo, hi) in enumerate(ranges) if lo <= word_centre < hi),
            None,
        )
        if hit is not None:
            buckets[hit].append(word)
            continue

        # Pass 3: nearest column centre (first minimum on ties).
        def _centre_dist(idx: int) -> float:
            rel_left = columns[idx].x - origin
            return abs(word_centre - (rel_left + columns[idx].width / 2))

        buckets[min(range(num_cols), key=_centre_dist)].append(word)

    return buckets
-
-
-# Regex: at least 2 consecutive letters (Latin + umlauts + accents)
-_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
-_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')
-
-# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
-# that do NOT appear here are treated as trailing OCR noise.
-_COMMON_SHORT_WORDS: set = {
- # EN 1-2 letter
- 'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
- 'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
- 'or', 'so', 'to', 'up', 'us', 'we',
- # EN 3 letter
- 'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
- 'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
- 'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
- 'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
- 'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
- 'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
- 'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
- 'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
- 'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
- 'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
- 'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
- 'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
- 'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
- 'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
- 'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
- 'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
- 'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
- 'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
- 'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
- 'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
- 'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
- 'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
- 'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
- 'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
- 'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
- 'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
- 'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
- 'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
- 'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
- 'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
- 'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
- 'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
- 'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
- 'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
- 'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
- 'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
- 'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
- 'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
- 'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
- 'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
- 'zap', 'zip', 'zoo',
- # DE 2-3 letter
- 'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
- 'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
- 'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
- 'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
- 'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
- 'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
- 'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
- 'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
- 'wut', 'zum', 'zur',
-}
-
-# Known abbreviations found in EN/DE textbooks and dictionaries.
-# Stored WITHOUT trailing period (the noise filter strips periods).
-# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
-_KNOWN_ABBREVIATIONS: set = {
- # EN dictionary meta-words
- 'sth', 'sb', 'smth', 'smb', 'sbd',
- # EN general
- 'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
- 'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
- # EN references / textbook
- 'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
- 'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
- 'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
- 'ans', 'wb', 'tb', 'vocab',
- # EN parts of speech / grammar
- 'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
- 'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
- 'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
- 'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
- 'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
- 'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
- 'syn', 'ant', 'opp', 'var', 'orig',
- # EN titles
- 'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
- # EN pronunciation
- 'br', 'am', 'brit', 'amer',
- # EN units
- 'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
- # DE general
- 'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
- 'bes', 'insb', 'insbes', 'bspw', 'ca',
- 'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
- 'inkl', 'exkl', 'zzgl', 'abzgl',
- # DE references
- 'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
- 'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
- 's', 'sp', 'zit', 'zs', 'vlg',
- # DE grammar
- 'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
- 'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
- 'trennb', 'untrennb', 'ugs', 'geh', 'pej',
- # DE regional
- 'nordd', 'österr', 'schweiz',
- # Linguistic
- 'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
- 'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
- 'count', 'uncount', 'indef', 'def', 'poss', 'demon',
-}
-
-
-def _is_noise_tail_token(token: str) -> bool:
- """Check if a token at the END of cell text is trailing OCR noise.
-
- Trailing fragments are very common OCR artifacts from image edges,
- borders, and neighbouring cells. This is more aggressive than a
- general word filter: any short token that isn't in the dictionary
- of common EN/DE words is considered noise.
-
- Examples of noise: "Es)", "3", "ee", "B"
- Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]"
- """
- t = token.strip()
- if not t:
- return True
-
- # Keep ellipsis
- if t in ('...', '…'):
- return False
-
- # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
- if t.startswith('[') or t.startswith('["') or t.startswith("['"):
- return False
- if t.endswith(']'):
- return False
-
- # Pure non-alpha → noise ("3", ")", "|")
- alpha_chars = _RE_ALPHA.findall(t)
- if not alpha_chars:
- return True
-
- # Extract only alpha characters for dictionary lookup
- cleaned = ''.join(alpha_chars)
-
- # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep
- if cleaned.lower() in _KNOWN_ABBREVIATIONS:
- return False
-
- # Strip normal trailing punctuation before checking for internal noise.
- stripped_punct = re.sub(r'[.,;:!?]+$', '', t) # "cupcakes." → "cupcakes"
- t_check = stripped_punct if stripped_punct else t
-
- # Check for legitimate punctuation patterns vs. real noise.
- # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir",
- # "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen"
- # Noise: "3d", "B|", "x7"
- # Strategy: strip common dictionary punctuation (parens, hyphens, slashes),
- # THEN check if residual contains only alpha characters.
- t_inner = t_check
- # Remove all parentheses, hyphens, slashes, and dots — these are normal
- # in dictionary entries: "(Salat-)Gurke", "Tanz(veranstaltung)",
- # "(zer)brechen", "wir/uns", "e.g."
- t_inner = re.sub(r'[()\-/.,;:!?]', '', t_inner)
- # Now check: does the inner form still have non-alpha noise?
- inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
- has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False
-
- # Long alpha words (4+ chars) without internal noise are likely real
- if len(cleaned) >= 4 and not has_internal_noise:
- return False
-
- # Short words: check dictionary (uses only alpha chars)
- if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise:
- return False
-
- # Default: short or suspicious → noise
- return True
-
-
-def _is_garbage_text(text: str) -> bool:
- """Check if entire cell text is OCR garbage from image areas.
-
- Garbage text = no recognizable dictionary word. Catches
- "(ci]oeu", "uanoaain." etc.
- """
- words = _RE_REAL_WORD.findall(text)
- if not words:
- # Check if any token is a known abbreviation (e.g. "e.g.")
- alpha_only = ''.join(_RE_ALPHA.findall(text)).lower()
- if alpha_only in _KNOWN_ABBREVIATIONS:
- return False
- return True
-
- for w in words:
- wl = w.lower()
- # Known short word or abbreviation → not garbage
- if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS:
- return False
- # Long word (>= 4 chars): check vowel/consonant ratio.
- # Real EN/DE words have 20-60% vowels. Garbage like "uanoaain"
- # or "cioeu" has unusual ratios (too many or too few vowels).
- if len(wl) >= 4:
- vowels = sum(1 for c in wl if c in 'aeiouäöü')
- ratio = vowels / len(wl)
- if 0.15 <= ratio <= 0.65:
- return False # plausible vowel ratio → real word
-
- return True
-
-
-def _clean_cell_text(text: str) -> str:
- """Remove OCR noise from cell text. Generic filters:
-
- 1. If the entire text has no real alphabetic word (>= 2 letters), clear.
- 2. If the entire text is garbage (no dictionary word), clear.
- 3. Strip trailing noise tokens from the end of the text.
- """
- stripped = text.strip()
- if not stripped:
- return ''
-
- # --- Filter 1: No real word at all ---
- if not _RE_REAL_WORD.search(stripped):
- # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e."
- alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower()
- if alpha_only not in _KNOWN_ABBREVIATIONS:
- return ''
-
- # --- Filter 2: Entire text is garbage ---
- if _is_garbage_text(stripped):
- return ''
-
- # --- Filter 3: Strip trailing noise tokens ---
- tokens = stripped.split()
- while tokens and _is_noise_tail_token(tokens[-1]):
- tokens.pop()
- if not tokens:
- return ''
-
- return ' '.join(tokens)
-
-
-def _clean_cell_text_lite(text: str) -> str:
- """Simplified noise filter for cell-first OCR (isolated cell crops).
-
- Since each cell is OCR'd in isolation (no neighbour content visible),
- trailing-noise stripping is unnecessary. Only 2 filters remain:
-
- 1. No real alphabetic word (>= 2 letters) and not a known abbreviation → empty.
- 2. Entire text is garbage (no dictionary word) → empty.
- """
- stripped = text.strip()
- if not stripped:
- return ''
-
- # --- Filter 1: No real word at all ---
- if not _RE_REAL_WORD.search(stripped):
- alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower()
- if alpha_only not in _KNOWN_ABBREVIATIONS:
- return ''
-
- # --- Filter 2: Entire text is garbage ---
- if _is_garbage_text(stripped):
- return ''
-
- return stripped
-
-
-# ---------------------------------------------------------------------------
-# Bold detection via stroke-width analysis (relative / page-level)
-# ---------------------------------------------------------------------------
-
-def _measure_stroke_width(gray_crop: np.ndarray) -> float:
- """Measure mean stroke width in a binarised cell crop.
-
- Returns a DPI-normalised value (mean stroke width as % of crop height),
- or 0.0 if measurement is not possible.
- """
- if gray_crop is None or gray_crop.size == 0:
- return 0.0
- h, w = gray_crop.shape[:2]
- if h < 10 or w < 10:
- return 0.0
-
- # Binarise: text = white (255), background = black (0)
- _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
- if cv2.countNonZero(bw) < 20:
- return 0.0
-
- # Distance transform: value at each white pixel = distance to nearest black
- dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3)
-
- # Skeleton via morphological thinning
- kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
- thin = bw.copy()
- for _ in range(max(1, min(h, w) // 6)):
- eroded = cv2.erode(thin, kernel)
- if cv2.countNonZero(eroded) < 5:
- break
- thin = eroded
-
- skeleton_pts = thin > 0
- if not np.any(skeleton_pts):
- return 0.0
- mean_stroke = float(np.mean(dist[skeleton_pts]))
- return mean_stroke / max(h, 1) * 100 # normalised: % of cell height
-
-
def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
                         img_w: int, img_h: int) -> None:
    """Mark bold cells by comparing each stroke width to the page median.

    Pass 1 measures the stroke width of every cell with text; pass 2 flags
    cells whose stroke is more than 1.4× the page median as bold. Using the
    page median makes the decision independent of font, DPI and scan
    quality. Modifies `cells` in place (sets the 'is_bold' key); does
    nothing when no image is available or fewer than 3 cells are measurable.
    """
    if ocr_img is None:
        return

    # Pass 1: stroke width per cell (0.0 when empty or unmeasurable).
    strokes: List[float] = []
    for cell in cells:
        width = 0.0
        if cell.get('text', '').strip():
            box = cell['bbox_px']
            top = max(0, box['y'])
            bottom = min(img_h, box['y'] + box['h'])
            left = max(0, box['x'])
            right = min(img_w, box['x'] + box['w'])
            if bottom > top and right > left:
                width = _measure_stroke_width(ocr_img[top:bottom, left:right])
        strokes.append(width)

    measured = [s for s in strokes if s > 0]
    if len(measured) < 3:
        # Too few samples for a meaningful median — leave all as non-bold.
        return

    page_median = float(np.median(measured))
    if page_median <= 0:
        return

    # Pass 2: anything noticeably thicker than the page median is bold.
    for cell, width in zip(cells, strokes):
        cell['is_bold'] = width > 0 and (width / page_median) > 1.4
-
-
-# ---------------------------------------------------------------------------
-# Cell-First OCR (v2) — each cell cropped and OCR'd in isolation
-# ---------------------------------------------------------------------------
-
-def _ocr_cell_crop(
- row_idx: int,
- col_idx: int,
- row: RowGeometry,
- col: PageRegion,
- ocr_img: np.ndarray,
- img_bgr: Optional[np.ndarray],
- img_w: int,
- img_h: int,
- engine_name: str,
- lang: str,
- lang_map: Dict[str, str],
-) -> Dict[str, Any]:
- """OCR a single cell by cropping the exact column×row intersection.
-
- No padding beyond cell boundaries → no neighbour bleeding.
- """
- # Display bbox: exact column × row intersection
- disp_x = col.x
- disp_y = row.y
- disp_w = col.width
- disp_h = row.height
-
- # Crop boundaries: add small internal padding (3px each side) to avoid
- # clipping characters near column/row edges (e.g. parentheses, descenders).
- # Stays within image bounds but may extend slightly beyond strict cell.
- # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
- _PAD = 3
- cx = max(0, disp_x - _PAD)
- cy = max(0, disp_y - _PAD)
- cx2 = min(img_w, disp_x + disp_w + _PAD)
- cy2 = min(img_h, disp_y + disp_h + _PAD)
- cw = cx2 - cx
- ch = cy2 - cy
-
- empty_cell = {
- 'cell_id': f"R{row_idx:02d}_C{col_idx}",
- 'row_index': row_idx,
- 'col_index': col_idx,
- 'col_type': col.type,
- 'text': '',
- 'confidence': 0.0,
- 'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
- 'bbox_pct': {
- 'x': round(disp_x / img_w * 100, 2) if img_w else 0,
- 'y': round(disp_y / img_h * 100, 2) if img_h else 0,
- 'w': round(disp_w / img_w * 100, 2) if img_w else 0,
- 'h': round(disp_h / img_h * 100, 2) if img_h else 0,
- },
- 'ocr_engine': 'cell_crop_v2',
- 'is_bold': False,
- }
-
- if cw <= 0 or ch <= 0:
- logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
- return empty_cell
-
- # --- Pixel-density check: skip truly empty cells ---
- if ocr_img is not None:
- crop = ocr_img[cy:cy + ch, cx:cx + cw]
- if crop.size > 0:
- dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
- if dark_ratio < 0.005:
- logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
- row_idx, col_idx, dark_ratio, cw, ch)
- return empty_cell
-
- # --- Prepare crop for OCR ---
- cell_lang = lang_map.get(col.type, lang)
- psm = _select_psm_for_column(col.type, col.width, row.height)
- text = ''
- avg_conf = 0.0
- used_engine = 'cell_crop_v2'
-
- if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
- cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
- words = ocr_region_trocr(img_bgr, cell_region,
- handwritten=(engine_name == "trocr-handwritten"))
- elif engine_name == "lighton" and img_bgr is not None:
- cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
- words = ocr_region_lighton(img_bgr, cell_region)
- elif engine_name == "rapid" and img_bgr is not None:
- # Upscale small BGR crops for RapidOCR.
- # Cell crops typically have height 35-55px but width >300px.
- # _ensure_minimum_crop_size only scales when EITHER dim < min_dim,
- # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78).
- # For very short heights (< 80px), force 3× upscale for better OCR
- # of small characters like periods, ellipsis, and phonetic symbols.
- bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
- if bgr_crop.size == 0:
- words = []
- else:
- crop_h, crop_w = bgr_crop.shape[:2]
- if crop_h < 80:
- # Force 3× upscale for short rows — small chars need more pixels
- scale = 3.0
- bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
- interpolation=cv2.INTER_CUBIC)
- else:
- bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
- up_h, up_w = bgr_up.shape[:2]
- scale_x = up_w / max(crop_w, 1)
- scale_y = up_h / max(crop_h, 1)
- was_scaled = (up_w != crop_w or up_h != crop_h)
- logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
- row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
- tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
- words = ocr_region_rapid(bgr_up, tmp_region)
- # Remap positions back to original image coords
- if words and was_scaled:
- for w in words:
- w['left'] = int(w['left'] / scale_x) + cx
- w['top'] = int(w['top'] / scale_y) + cy
- w['width'] = int(w['width'] / scale_x)
- w['height'] = int(w['height'] / scale_y)
- elif words:
- for w in words:
- w['left'] += cx
- w['top'] += cy
- else:
- # Tesseract: upscale tiny crops for better recognition
- if ocr_img is not None:
- crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
- upscaled = _ensure_minimum_crop_size(crop_slice)
- up_h, up_w = upscaled.shape[:2]
- tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
- words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
- # Remap word positions back to original image coordinates
- if words and (up_w != cw or up_h != ch):
- sx = cw / max(up_w, 1)
- sy = ch / max(up_h, 1)
- for w in words:
- w['left'] = int(w['left'] * sx) + cx
- w['top'] = int(w['top'] * sy) + cy
- w['width'] = int(w['width'] * sx)
- w['height'] = int(w['height'] * sy)
- elif words:
- for w in words:
- w['left'] += cx
- w['top'] += cy
- else:
- words = []
-
- # Filter low-confidence words
- _MIN_WORD_CONF = 30
- if words:
- words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
-
- if words:
- y_tol = max(15, ch)
- text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
- avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
- logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
- row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
- else:
- logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
- row_idx, col_idx, cw, ch, psm, engine_name)
-
- # --- PSM 7 fallback for still-empty Tesseract cells ---
- if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
- crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
- upscaled = _ensure_minimum_crop_size(crop_slice)
- up_h, up_w = upscaled.shape[:2]
- tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
- psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
- if psm7_words:
- psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
- if psm7_words:
- p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
- if p7_text.strip():
- text = p7_text
- avg_conf = round(
- sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
- )
- used_engine = 'cell_crop_v2_psm7'
-
- # --- Noise filter ---
- if text.strip():
- pre_filter = text
- text = _clean_cell_text_lite(text)
- if not text:
- logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
- row_idx, col_idx, pre_filter)
- avg_conf = 0.0
-
- result = dict(empty_cell)
- result['text'] = text
- result['confidence'] = avg_conf
- result['ocr_engine'] = used_engine
- return result
-
-
# Threshold: columns narrower than this (% of image width) use single-cell
# crop OCR instead of full-page word assignment.
#
# Broad columns (>= threshold): Full-page Tesseract word assignment.
# Better for multi-word content (sentences, IPA brackets, punctuation).
# Examples: EN vocabulary, DE translation, example sentences.
#
# Narrow columns (< threshold): Isolated cell-crop OCR.
# Prevents neighbour bleeding from adjacent broad columns.
# Examples: page_ref, marker, numbering columns.
#
# 15% was empirically validated across vocab table scans with 3-5 columns.
# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width.
# The 15% boundary cleanly separates the two groups.
_NARROW_COL_THRESHOLD_PCT: float = 15.0
-
-
def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.

    Drop-in replacement for build_cell_grid() — same signature & return type.

    Strategy:
    - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
      words (from row.words). Handles IPA brackets, punctuation, sentence
      continuity correctly.
    - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
      neighbour bleeding from adjacent broad columns.

    Args:
        ocr_img: Preprocessed (grayscale) page image.
        column_regions: Detected column regions (filtered here by type).
        row_geometries: Detected rows; only row_type == 'content' is used,
            headers/footers only bound the gap-healing step.
        img_w / img_h: Page dimensions for percentage bboxes.
        lang: Fallback Tesseract language string.
        ocr_engine: "auto" (→ tesseract), "rapid", "lighton",
            "trocr-printed" or "trocr-handwritten".
        img_bgr: Original colour page for the non-Tesseract engines.

    Returns:
        (cells, columns_meta): cell dicts sorted by (row, col) with all-empty
        rows removed, and per-column metadata (index/type/x/width).
    """
    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"

    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    # Filter phantom rows (word_count=0) and artifact rows
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # Filter columns: drop non-table region types entirely.
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # Heal row gaps — use header/footer boundaries when present, otherwise
    # the outermost content rows themselves.
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    # col.type → Tesseract language; anything unmapped falls back to `lang`.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- Classify columns as broad vs narrow ---
    narrow_col_indices = set()
    for ci, col in enumerate(relevant_cols):
        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
        if col_pct < _NARROW_COL_THRESHOLD_PCT:
            narrow_col_indices.add(ci)

    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
                f"{len(narrow_col_indices)} narrow columns (cell-crop)")

    # --- Phase 1: Broad columns via full-page word assignment ---
    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Assign full-page words to columns for this row
        col_words = _assign_row_words_to_columns(row, relevant_cols)

        for col_idx, col in enumerate(relevant_cols):
            if col_idx not in narrow_col_indices:
                # BROAD column: use pre-assigned full-page words
                words = col_words.get(col_idx, [])
                # Filter low-confidence words.
                # NOTE(review): _MIN_WORD_CONF is not defined in this
                # function; within this chunk it appears only as a local
                # inside _ocr_cell_crop. Confirm a module-level definition
                # exists, otherwise this line raises NameError.
                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

                if words:
                    # y-tolerance of at least the row height → treat the row
                    # as one text line unless genuinely multi-line.
                    y_tol = max(15, row.height)
                    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
                    avg_conf = 0.0

                # Apply noise filter
                text = _clean_cell_text(text)

                cell = {
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
                    'col_index': col_idx,
                    'col_type': col.type,
                    'text': text,
                    'confidence': avg_conf,
                    'bbox_px': {
                        'x': col.x, 'y': row.y,
                        'w': col.width, 'h': row.height,
                    },
                    'bbox_pct': {
                        'x': round(col.x / img_w * 100, 2) if img_w else 0,
                        'y': round(row.y / img_h * 100, 2) if img_h else 0,
                        'w': round(col.width / img_w * 100, 2) if img_w else 0,
                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
                    },
                    'ocr_engine': 'word_lookup',
                    'is_bold': False,
                }
                cells.append(cell)

    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
    narrow_tasks = []
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            if col_idx in narrow_col_indices:
                narrow_tasks.append((row_idx, col_idx, row, col))

    if narrow_tasks:
        # Tesseract crops are CPU-light per call → more workers; heavier
        # model-based engines get fewer to limit memory pressure.
        max_workers = 4 if engine_name == "tesseract" else 2
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {
                pool.submit(
                    _ocr_cell_crop,
                    ri, ci, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ): (ri, ci)
                for ri, ci, row, col in narrow_tasks
            }
            for future in as_completed(futures):
                try:
                    cell = future.result()
                    cells.append(cell)
                except Exception as e:
                    ri, ci = futures[future]
                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")

    # Sort cells by (row_index, col_index)
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

    # Bold detection disabled: cell-level stroke-width analysis cannot
    # distinguish bold from non-bold when cells contain mixed formatting
    # (e.g. "cookie ['kuki]" — bold word + non-bold phonetics).
    # TODO: word-level bold detection would require per-word bounding boxes.

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name} (hybrid)")

    return cells, columns_meta
-
-
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 — yields each cell as OCR'd.

    Unlike build_cell_grid_v2 this OCRs EVERY cell via _ocr_cell_crop
    (no broad/narrow hybrid, no thread pool, no empty-row pruning), so
    progress can be reported per cell.

    Yields:
        (cell_dict, columns_meta, total_cells) for each row×column cell
        in reading order. Returns without yielding when no usable content
        rows or columns exist.
    """
    # Resolve engine — default to Tesseract for cell-first OCR.
    # Tesseract excels at isolated text crops (binarized, upscaled).
    # RapidOCR is optimized for full-page scene-text and produces artifacts
    # on small cell crops (extra chars, missing punctuation, garbled IPA).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        engine_name = "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    # Keep only content rows that actually contain words.
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    content_rows = [r for r in content_rows if r.word_count > 0]
    if not content_rows:
        return

    # Drop non-table region types entirely (same set as build_cell_grid_v2).
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    if not content_rows:
        return

    # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2)
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    # col.type → Tesseract language; anything unmapped falls back to `lang`.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # Fixed grand total so consumers can show progress (cells are yielded
    # even when empty — no pruning in the streaming variant).
    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_cell_crop(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells
-
-
-# ---------------------------------------------------------------------------
-# Narrow-column OCR helpers (Proposal B) — DEPRECATED (kept for legacy build_cell_grid)
-# ---------------------------------------------------------------------------
-
-def _compute_cell_padding(col_width: int, img_w: int) -> int:
- """Adaptive padding for OCR crops based on column width.
-
- Narrow columns (page_ref, marker) need more surrounding context so
- Tesseract can segment characters correctly. Wide columns keep the
- minimal 4 px padding to avoid pulling in neighbours.
- """
- col_pct = col_width / img_w * 100 if img_w > 0 else 100
- if col_pct < 5:
- return max(20, col_width // 2)
- if col_pct < 10:
- return max(12, col_width // 4)
- if col_pct < 15:
- return 8
- return 4
-
-
-def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
- max_scale: int = 3) -> np.ndarray:
- """Upscale tiny crops so Tesseract gets enough pixel data.
-
- If either dimension is below *min_dim*, the crop is bicubic-upscaled
- so the smallest dimension reaches *min_dim* (capped at *max_scale* ×).
- """
- h, w = crop.shape[:2]
- if h >= min_dim and w >= min_dim:
- return crop
- scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
- if scale <= 1.0:
- return crop
- new_w = int(w * scale)
- new_h = int(h * scale)
- return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
-
-
-def _select_psm_for_column(col_type: str, col_width: int,
- row_height: int) -> int:
- """Choose the best Tesseract PSM for a given column geometry.
-
- - page_ref columns are almost always single short tokens → PSM 8
- - Very narrow or short cells → PSM 7 (single text line)
- - Everything else → PSM 6 (uniform block)
- """
- if col_type in ('page_ref', 'marker'):
- return 8 # single word
- if col_width < 100 or row_height < 30:
- return 7 # single line
- return 6 # uniform block
-
-
def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection) via word lookup.

    Primary path: reuse word boxes already assigned to this cell by the
    full-page Tesseract pass (``preassigned_words``). When that yields no
    text, up to three fallbacks run, in this order:

    1. Cell-crop OCR with an adaptive PSM (crop is upscaled first for
       narrow columns).
    2. PSM 7 (single-line) retry on the same crop.
    3. RapidOCR over the full row strip, filtered to this column's
       X-range (narrow columns only, requires ``img_bgr``).

    Args:
        row_idx: Row position in the grid (used for the cell_id).
        col_idx: Column position in the grid (used for the cell_id).
        row: Row geometry (y/height).
        col: Column region (x/width and column type).
        ocr_img: Binarized full-page image for Tesseract crops.
        img_bgr: Color page image; required by Rapid/TrOCR/LightOn engines.
        img_w: Page width in pixels (for percent bboxes).
        img_h: Page height in pixels (for percent bboxes).
        use_rapid: Whether RapidOCR is the resolved fallback engine.
        engine_name: Resolved engine id ('tesseract', 'rapid', 'lighton', ...).
        lang: Default Tesseract language string.
        lang_map: Per-column-type Tesseract language overrides.
        preassigned_words: Word dicts (keys incl. text/conf/left/top/
            width/height) already assigned to this cell by the caller.

    Returns:
        Cell dict with text, confidence, pixel + percent bboxes, and the
        name of the mechanism that produced the final text: 'word_lookup',
        'cell_ocr_fallback', 'cell_ocr_psm7', or 'row_strip_rapid'.
    """
    # Display bbox: exact column × row intersection (no padding)
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # OCR crop: adaptive padding — narrow columns get more context
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False

    # Degenerate geometry → return an empty cell immediately.
    if disp_w <= 0 or disp_h <= 0:
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': '',
            'confidence': 0.0,
            'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
            'bbox_pct': {
                'x': round(col.x / img_w * 100, 2),
                'y': round(row.y / img_h * 100, 2),
                'w': round(col.width / img_w * 100, 2),
                'h': round(row.height / img_h * 100, 2),
            },
            'ocr_engine': 'word_lookup',
        }

    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

    # Filter low-confidence words (OCR noise from images/artifacts).
    # Tesseract gives low confidence to misread image edges, borders,
    # and other non-text elements.
    _MIN_WORD_CONF = 30
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Use row height as Y-tolerance so all words within a single row
        # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
        # across two lines due to slight vertical offset).
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0

    # --- FALLBACK: Cell-OCR for empty cells ---
    # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
    # Re-run OCR on the cell crop to catch what word-lookup missed.
    # To avoid wasting time on truly empty cells, check pixel density first:
    # only run Tesseract if the cell crop contains enough dark pixels to
    # plausibly contain text.
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                # Threshold: pixels darker than 180 (on 0-255 grayscale).
                # Use 0.5% to catch even small text like "Ei" (2 chars)
                # in an otherwise empty cell.
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005
        if _run_fallback:
            # For narrow columns, upscale the crop before OCR
            if is_narrow and ocr_img is not None:
                _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
                _upscaled = _ensure_minimum_crop_size(_crop_slice)
                if _upscaled is not _crop_slice:
                    # Build a temporary full-size image with the upscaled crop
                    # placed at origin so ocr_region can crop it cleanly.
                    _up_h, _up_w = _upscaled.shape[:2]
                    _tmp_region = PageRegion(
                        type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                    )
                    _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                    cell_lang = lang_map.get(col.type, lang)
                    # NOTE(review): ocr_region is a module-level name not
                    # visible in this excerpt's import list — confirm it is
                    # imported/defined alongside the other OCR helpers.
                    fallback_words = ocr_region(_upscaled, _tmp_region,
                                                lang=cell_lang, psm=_cell_psm)
                    # Remap word positions back to original image coordinates
                    _sx = cell_w / max(_up_w, 1)
                    _sy = cell_h / max(_up_h, 1)
                    for _fw in (fallback_words or []):
                        _fw['left'] = int(_fw['left'] * _sx) + cell_x
                        _fw['top'] = int(_fw['top'] * _sy) + cell_y
                        _fw['width'] = int(_fw['width'] * _sx)
                        _fw['height'] = int(_fw['height'] * _sy)
                else:
                    # No upscaling needed, use adaptive PSM
                    cell_region = PageRegion(
                        type=col.type, x=cell_x, y=cell_y,
                        width=cell_w, height=cell_h,
                    )
                    _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                    cell_lang = lang_map.get(col.type, lang)
                    fallback_words = ocr_region(ocr_img, cell_region,
                                                lang=cell_lang, psm=_cell_psm)
            else:
                cell_region = PageRegion(
                    type=col.type,
                    x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                # Engine dispatch: TrOCR/LightOn/Rapid need the color image;
                # otherwise fall back to Tesseract with an adaptive PSM.
                if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                    fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
                elif engine_name == "lighton" and img_bgr is not None:
                    fallback_words = ocr_region_lighton(img_bgr, cell_region)
                elif use_rapid and img_bgr is not None:
                    fallback_words = ocr_region_rapid(img_bgr, cell_region)
                else:
                    _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                    cell_lang = lang_map.get(col.type, lang)
                    fallback_words = ocr_region(ocr_img, cell_region,
                                                lang=cell_lang, psm=_cell_psm)

            if fallback_words:
                # Apply same confidence filter to fallback words
                fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
                if fallback_words:
                    # Y-tolerance from average word height (half of it).
                    fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
                    fb_y_tol = max(10, int(fb_avg_h * 0.5))
                    fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
                    if fb_text.strip():
                        text = fb_text
                        avg_conf = round(
                            sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                        )
                        used_engine = 'cell_ocr_fallback'

    # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
    if not text.strip() and _run_fallback and not use_rapid:
        _fb_region = PageRegion(
            type=col.type, x=cell_x, y=cell_y,
            width=cell_w, height=cell_h,
        )
        cell_lang = lang_map.get(col.type, lang)
        psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if psm7_words:
                p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
                if p7_text.strip():
                    text = p7_text
                    avg_conf = round(
                        sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                    )
                    used_engine = 'cell_ocr_psm7'

    # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
    # If a narrow cell is still empty, OCR the entire row strip with
    # RapidOCR (which handles small text better) and assign words by
    # X-position overlap with this column.
    if not text.strip() and is_narrow and img_bgr is not None:
        row_region = PageRegion(
            type='_row_strip', x=0, y=row.y,
            width=img_w, height=row.height,
        )
        strip_words = ocr_region_rapid(img_bgr, row_region)
        if strip_words:
            # Filter to words overlapping this column's X-range
            col_left = col.x
            col_right = col.x + col.width
            col_words = []
            for sw in strip_words:
                sw_left = sw.get('left', 0)
                sw_right = sw_left + sw.get('width', 0)
                overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                # Keep a word when >30% of its width lies inside the column.
                if overlap > sw.get('width', 1) * 0.3:
                    col_words.append(sw)
            if col_words:
                col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
                if col_words:
                    rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                    if rs_text.strip():
                        text = rs_text
                        avg_conf = round(
                            sum(w['conf'] for w in col_words) / len(col_words), 1
                        )
                        used_engine = 'row_strip_rapid'

    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0

    return {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': text,
        'confidence': avg_conf,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2),
            'y': round(disp_y / img_h * 100, 2),
            'w': round(disp_w / img_w * 100, 2),
            'h': round(disp_h / img_h * 100, 2),
        },
        'ocr_engine': used_engine,
    }
-
-
-def _is_artifact_row(row: RowGeometry) -> bool:
- """Return True if this row contains only scan artifacts, not real text.
-
- Artifact rows (scanner shadows, noise) typically produce only single-character
- detections. A real content row always has at least one token with 2+ characters.
- """
- if row.word_count == 0:
- return True
- texts = [w.get('text', '').strip() for w in row.words]
- return all(len(t) <= 1 for t in texts)
-
-
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height to fill vertical gaps caused by removed rows.

    After filtering out empty or artifact rows, the survivors may have
    vertical gaps where the discarded rows used to sit. Each row is
    mutated in place so its top/bottom extend to the midpoint of the
    adjacent gap, letting OCR crops cover the full available content
    area. The first row is stretched up to *top_bound*, the last one
    down to *bottom_bound*.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # Snapshot (top, bottom) spans before any mutation.
    spans = [(r.y, r.y + r.height) for r in rows]

    for i, row in enumerate(rows):
        top, bottom = spans[i]

        # Healed top: bound for the first row, otherwise the midpoint of
        # the gap between the previous row's bottom and this row's top.
        if i == 0:
            healed_top = top_bound
        else:
            above = spans[i - 1][1]
            space = top - above
            healed_top = above + space // 2 if space > 1 else top

        # Healed bottom: bound for the last row, otherwise the midpoint
        # of the gap down to the next row's top.
        if i == n - 1:
            healed_bottom = bottom_bound
        else:
            below = spans[i + 1][0]
            space = below - bottom
            healed_bottom = bottom + space // 2 if space > 1 else bottom

        row.y = healed_top
        row.height = max(5, healed_bottom - healed_top)

    logger.debug(
        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
-
-
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Generic Cell-Grid: Columns × Rows → cells with OCR text.

    This is the layout-agnostic foundation. Every column (except
    column_ignore and the other skip types) is intersected with every
    content row to produce numbered cells. Pipeline:

    1. Resolve the OCR engine ('auto' prefers RapidOCR when available
       and a color image was supplied).
    2. Filter rows: content only → word_count > 0 → non-artifact.
    3. Heal vertical gaps left by the removed rows.
    4. OCR each column×row cell via _ocr_single_cell (word lookup +
       per-cell fallbacks).
    5. Batch fallback: still-empty cells with visible pixels are re-OCR'd
       per column strip (one OCR call per column instead of per cell).
    6. Drop rows whose cells are all empty after OCR.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'.
        img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR).

    Returns:
        (cells, columns_meta) where cells is a list of cell dicts and
        columns_meta describes the columns used.
    """
    # Resolve engine choice
    # NOTE(review): RAPIDOCR_AVAILABLE and ocr_region (used below) are
    # module-level names not visible in this excerpt's import list —
    # confirm they are defined/imported at module scope.
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    # Filter phantom rows: rows with no Tesseract words assigned are
    # inter-line whitespace gaps that would produce garbage OCR.
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    # Use columns only — skip ignore, header, footer, page_ref
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    # Filter artifact rows: rows whose detected words are all single characters
    # are caused by scanner shadows or noise, not real text.
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
    # to fill the space so OCR crops are not artificially narrow.
    # Bounds come from the columns' vertical extent here (unlike the v2
    # variant, which uses header/footer rows).
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Sort columns left-to-right
    relevant_cols.sort(key=lambda c: c.x)

    # Build columns_meta
    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    # Choose OCR language per column type (Tesseract only)
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Pre-assign each word to exactly one column (nearest center)
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            cells.append(cell)

    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
    # Collect cells that are still empty but have visible pixels.
    # Instead of calling Tesseract once per cell (expensive), crop an entire
    # column strip and run OCR once, then assign words to cells by Y position.
    empty_by_col: Dict[int, List[int]] = {}  # col_idx → [cell list indices]
    for ci, cell in enumerate(cells):
        # Cells already retried via PSM 7 are excluded from batching.
        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
            bpx = cell['bbox_px']
            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
            if w > 0 and h > 0 and ocr_img is not None:
                crop = ocr_img[y:y + h, x:x + w]
                if crop.size > 0:
                    # Same pixel-density gate as _ocr_single_cell: >0.5%
                    # of pixels darker than 180 on 0-255 grayscale.
                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                    if dark_ratio > 0.005:
                        empty_by_col.setdefault(cell['col_index'], []).append(ci)

    for col_idx, cell_indices in empty_by_col.items():
        if len(cell_indices) < 3:
            continue  # Not worth batching for < 3 cells

        # Find the column strip bounding box (union of all empty cell bboxes)
        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
        col_x = cells[cell_indices[0]]['bbox_px']['x']
        col_w = cells[cell_indices[0]]['bbox_px']['w']

        strip_region = PageRegion(
            type=relevant_cols[col_idx].type,
            x=col_x, y=min_y,
            width=col_w, height=max_y_h - min_y,
        )
        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)

        # One strip OCR call with the resolved engine.
        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        # Same minimum word confidence as _ocr_single_cell (_MIN_WORD_CONF).
        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
        if not strip_words:
            continue

        # Assign words to cells by Y overlap
        for ci in cell_indices:
            cell_y = cells[ci]['bbox_px']['y']
            cell_h = cells[ci]['bbox_px']['h']
            cell_mid_y = cell_y + cell_h / 2

            # A word belongs to the cell whose vertical center is within
            # 80% of the cell height of the word's own center.
            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if matched_words:
                matched_words.sort(key=lambda w: w['left'])
                batch_text = ' '.join(w['text'] for w in matched_words)
                batch_text = _clean_cell_text(batch_text)
                if batch_text.strip():
                    cells[ci]['text'] = batch_text
                    cells[ci]['confidence'] = round(
                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
                    )
                    cells[ci]['ocr_engine'] = 'batch_column_ocr'

    batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
    if batch_filled > 0:
        logger.info(
            f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
            f"empty cells in column {col_idx}"
        )

    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
-
-
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    # Engine resolution mirrors build_cell_grid().
    rapid_active = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        rapid_active = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if rapid_active else "tesseract"
    elif ocr_engine == "rapid":
        if RAPIDOCR_AVAILABLE:
            rapid_active = True
        else:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        engine_name = "rapid" if rapid_active else "tesseract"
    else:
        engine_name = "tesseract"

    rows = [r for r in row_geometries if r.row_type == 'content']
    if not rows:
        return

    # Rows without any Tesseract words are inter-line whitespace gaps
    # that would only produce garbage OCR — drop them.
    rows_with_words = [r for r in rows if r.word_count > 0]
    dropped = len(rows) - len(rows_with_words)
    if dropped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {dropped} phantom rows (word_count=0)")
    if not rows_with_words:
        return

    excluded_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    cols = [c for c in column_regions if c.type not in excluded_types]
    if not cols:
        return

    # Drop artifact rows (single-char noise) and heal the resulting gaps,
    # bounded by the columns' vertical extent (same as build_cell_grid).
    real_rows = [r for r in rows_with_words if not _is_artifact_row(r)]
    n_artifacts = len(rows_with_words) - len(real_rows)
    if n_artifacts > 0:
        logger.info(f"build_cell_grid_streaming: skipped {n_artifacts} artifact rows")
    if not real_rows:
        return
    _heal_row_gaps(
        real_rows,
        top_bound=min(c.y for c in cols),
        bottom_bound=max(c.y + c.height for c in cols),
    )

    cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(real_rows) * len(cols)

    for ri, row in enumerate(real_rows):
        # Pre-assign every word of this row to its nearest column.
        assigned = _assign_row_words_to_columns(row, cols)
        for ci, col in enumerate(cols):
            cell = _ocr_single_cell(
                ri, ci, row, col,
                ocr_img, img_bgr, img_w, img_h,
                rapid_active, engine_name, lang, lang_map,
                preassigned_words=assigned[ci],
            )
            yield cell, columns_meta, total_cells
-
-
-def _cells_to_vocab_entries(
- cells: List[Dict[str, Any]],
- columns_meta: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
- """Map generic cells to vocab entries with english/german/example fields.
-
- Groups cells by row_index, maps col_type → field name, and produces
- one entry per row (only rows with at least one non-empty field).
- """
- # Determine image dimensions from first cell (for row-level bbox)
- col_type_to_field = {
- 'column_en': 'english',
- 'column_de': 'german',
- 'column_example': 'example',
- 'page_ref': 'source_page',
- 'column_marker': 'marker',
- }
- bbox_key_map = {
- 'column_en': 'bbox_en',
- 'column_de': 'bbox_de',
- 'column_example': 'bbox_ex',
- 'page_ref': 'bbox_ref',
- 'column_marker': 'bbox_marker',
- }
-
- # Group cells by row_index
- rows: Dict[int, List[Dict]] = {}
- for cell in cells:
- ri = cell['row_index']
- rows.setdefault(ri, []).append(cell)
-
- entries: List[Dict[str, Any]] = []
- for row_idx in sorted(rows.keys()):
- row_cells = rows[row_idx]
- entry: Dict[str, Any] = {
- 'row_index': row_idx,
- 'english': '',
- 'german': '',
- 'example': '',
- 'source_page': '',
- 'marker': '',
- 'confidence': 0.0,
- 'bbox': None,
- 'bbox_en': None,
- 'bbox_de': None,
- 'bbox_ex': None,
- 'bbox_ref': None,
- 'bbox_marker': None,
- 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
- }
-
- confidences = []
- for cell in row_cells:
- col_type = cell['col_type']
- field = col_type_to_field.get(col_type)
- if field:
- entry[field] = cell['text']
- bbox_field = bbox_key_map.get(col_type)
- if bbox_field:
- entry[bbox_field] = cell['bbox_pct']
- if cell['confidence'] > 0:
- confidences.append(cell['confidence'])
-
- # Compute row-level bbox as union of all cell bboxes
- all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
- if all_bboxes:
- min_x = min(b['x'] for b in all_bboxes)
- min_y = min(b['y'] for b in all_bboxes)
- max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
- max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
- entry['bbox'] = {
- 'x': round(min_x, 2),
- 'y': round(min_y, 2),
- 'w': round(max_x2 - min_x, 2),
- 'h': round(max_y2 - min_y, 2),
- }
-
- entry['confidence'] = round(
- sum(confidences) / len(confidences), 1
- ) if confidences else 0.0
-
- # Only include if at least one mapped field has text
- has_content = any(
- entry.get(f)
- for f in col_type_to_field.values()
- )
- if has_content:
- entries.append(entry)
-
- return entries
-
-
-# Regex: line starts with phonetic bracket content only (no real word before it)
-_PHONETIC_ONLY_RE = re.compile(
- r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
+from cv_ocr_engines import ( # noqa: F401
+ _fix_character_confusion,
+ _fix_phonetic_brackets,
)
-
-
def _is_phonetic_only_text(text: str) -> bool:
    """Check if text consists only of phonetic transcription.

    Phonetic-only patterns:
        ['mani serva]      → True
        [dɑːns]            → True
        ["a:mand]          → True
        almond ['a:mand]   → False (has real word before bracket)
        Mandel             → False
    """
    stripped = text.strip()
    if not stripped:
        return False
    # A phonetic transcription always involves at least one bracket.
    if not ('[' in stripped or ']' in stripped):
        return False
    # Strip bracketed spans, then stray bracket/quote/space characters.
    leftover = re.sub(r"\[.*?\]", '', stripped)
    leftover = re.sub(r"[\[\]'\"()\s]", '', leftover)
    # Phonetic-only if fewer than 2 alphabetic characters remain.
    remaining_alpha = ''.join(_RE_ALPHA.findall(leftover))
    return len(remaining_alpha) < 2
-
-
def _merge_phonetic_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge rows that contain only phonetic transcription into previous entry.

    In dictionary pages, phonetic transcription sometimes wraps to the
    next row, e.g.:
        Row 28: EN="it's a money-saver" DE="es spart Kosten"
        Row 29: EN="['mani serva]" DE=""
    Row 29 is phonetic-only → its text is appended to row 28's EN field.
    """
    if len(entries) < 2:
        return entries

    result: List[Dict[str, Any]] = []
    for entry in entries:
        en_text = (entry.get('english') or '').strip()
        de_text = (entry.get('german') or '').strip()
        ex_text = (entry.get('example') or '').strip()

        # Only a row with phonetic-only EN and empty DE (and a preceding
        # entry to attach to) is treated as a wrapped transcription.
        is_wrap = bool(result) and _is_phonetic_only_text(en_text) and not de_text
        if not is_wrap:
            result.append(entry)
            continue

        target = result[-1]
        target_en = (target.get('english') or '').strip()
        target['english'] = (target_en + ' ' + en_text) if target_en else en_text
        # Carry a wrapped example along as well.
        if ex_text:
            target_ex = (target.get('example') or '').strip()
            target['example'] = (target_ex + ' ' + ex_text).strip() if target_ex else ex_text
        logger.debug(
            f"Merged phonetic row {entry.get('row_index')} "
            f"into previous entry: {target['english']!r}"
        )

    return result
-
-
def _merge_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge multi-line vocabulary entries where text wraps to the next row.

    A row continues the previous entry when all of the following hold:
      - EN has text, but DE is empty
      - EN starts with a lowercase letter (not a new vocab entry)
      - the previous entry's EN does NOT end with a sentence terminator (.!?)
      - the continuation has fewer than 4 words (not an example sentence)
      - the row was not already merged as phonetic

    Example:
        Row 5: EN="to put up" DE="aufstellen"
        Row 6: EN="with sth." DE=""
        → Merged: EN="to put up with sth." DE="aufstellen"
    """
    if len(entries) < 2:
        return entries

    result: List[Dict[str, Any]] = []
    for entry in entries:
        en_text = (entry.get('english') or '').strip()
        de_text = (entry.get('german') or '').strip()

        # Candidate continuations need a predecessor, EN text, empty DE.
        if not (result and en_text and not de_text):
            result.append(entry)
            continue

        # Phonetic-only rows are handled by _merge_phonetic_continuation_rows.
        if _is_phonetic_only_text(en_text):
            result.append(entry)
            continue

        lead_char = next((ch for ch in en_text if ch.isalpha()), '')
        begins_lower = lead_char and lead_char.islower()
        few_words = len(en_text.split()) < 4
        target = result[-1]
        target_en = (target.get('english') or '').strip()
        ends_sentence = target_en and target_en[-1] in '.!?'

        if begins_lower and few_words and not ends_sentence:
            target['english'] = (target_en + ' ' + en_text).strip()
            # Carry a wrapped example along as well.
            ex_text = (entry.get('example') or '').strip()
            if ex_text:
                target_ex = (target.get('example') or '').strip()
                target['example'] = (target_ex + ' ' + ex_text).strip() if target_ex else ex_text
            logger.debug(
                f"Merged continuation row {entry.get('row_index')} "
                f"into previous entry: {target['english']!r}"
            )
        else:
            result.append(entry)

    return result
-
-
def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.

    Wrapper around build_cell_grid() that adds vocabulary-specific logic:
    cells are mapped to english/german/example entries and then run
    through a deterministic post-processing pipeline (continuation-row
    merging, IPA lookup, comma splitting, example attachment). When no
    vocab columns were detected, the raw cells are returned unchanged.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w: Page width in pixels.
        img_h: Page height in pixels.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', or 'auto'.
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        List of entry dicts with english/german/example text and bbox
        info (percent), or the raw cell list for non-vocab layouts.
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )
    if not cells:
        return []

    # Without EN/DE columns there is no vocab layout to map onto.
    detected_types = {meta['type'] for meta in columns_meta}
    if not detected_types & {'column_en', 'column_de'}:
        logger.info("build_word_grid: no vocab columns — returning raw cells")
        return cells

    entries = _cells_to_vocab_entries(cells, columns_meta)
    n_raw = len(entries)

    # Deterministic post-processing (no LLM):
    # 0a/0b — fold wrapped phonetic / continuation rows into their entry.
    entries = _merge_phonetic_continuation_rows(entries)
    entries = _merge_continuation_rows(entries)
    # 1 — character-confusion fixes (| → I, 1 → I, 8 → B) run later in
    #     llm_review_entries_streaming so the user sees them in Step 6.
    # 2 — replace OCR'd phonetics with dictionary IPA.
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
    # 3 — split comma-separated word forms (break, broke, broken → 3 entries).
    entries = _split_comma_entries(entries)
    # 4 — attach example-only rows to the preceding entry.
    entries = _attach_example_sentences(entries)

    engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
    logger.info(f"build_word_grid: {len(entries)} entries from "
                f"{n_raw} raw → {len(entries)} after post-processing "
                f"(engine={engine_name})")

    return entries
-
-
-# =============================================================================
-# Stage 6: Multi-Pass OCR
-# =============================================================================
-
def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
               psm: int, fallback_psm: Optional[int] = None,
               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
    """Run Tesseract OCR on a specific region with given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode.
        fallback_psm: If confidence too low, retry with this PSM per line.
        min_confidence: Minimum average confidence before fallback.

    Returns:
        List of word dicts with text, position, confidence.
    """
    x0, y0 = region.x, region.y
    crop = ocr_img[y0:y0 + region.height, x0:x0 + region.width]
    if crop.size == 0:
        return []

    # Hand the cropped strip to Tesseract via PIL
    try:
        data = pytesseract.image_to_data(
            Image.fromarray(crop),
            lang=lang,
            config=f'--psm {psm} --oem 3',
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning(f"Tesseract failed for region {region.type}: {e}")
        return []

    words: List[Dict[str, Any]] = []
    for i in range(len(data['text'])):
        txt = data['text'][i].strip()
        confidence = int(data['conf'][i])
        # Drop empty tokens and very-low-confidence noise
        if not txt or confidence < 10:
            continue
        words.append({
            'text': txt,
            'left': data['left'][i] + x0,  # Absolute coords
            'top': data['top'][i] + y0,
            'width': data['width'][i],
            'height': data['height'][i],
            'conf': confidence,
            'region_type': region.type,
        })

    # Low average confidence → retry the region line by line with fallback PSM
    if words and fallback_psm is not None:
        avg = sum(w['conf'] for w in words) / len(words)
        if avg < min_confidence:
            logger.info(f"Region {region.type}: avg confidence {avg:.0f}% < {min_confidence}%, "
                        f"trying fallback PSM {fallback_psm}")
            words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)

    return words
-
-
def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
                             lang: str, psm: int) -> List[Dict[str, Any]]:
    """OCR a region line by line (fallback for low-confidence regions).

    Splits the region into horizontal strips based on text density,
    then OCRs each strip individually with the given PSM.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]
    if crop.size == 0:
        return []

    # Horizontal ink projection of the inverted image marks text rows
    projection = np.sum(cv2.bitwise_not(crop), axis=1)
    peak = np.max(projection)
    cutoff = peak * 0.05 if peak > 0 else 0

    # Collect (start, end) spans where the projection exceeds the cutoff
    spans = []
    start = None
    for y in range(len(projection)):
        above = projection[y] > cutoff
        if above and start is None:
            start = y
        elif not above and start is not None:
            if y - start > 5:  # Minimum line height
                spans.append((start, y))
            start = None
    if start is not None and len(projection) - start > 5:
        spans.append((start, len(projection)))

    collected: List[Dict[str, Any]] = []
    tess_config = f'--psm {psm} --oem 3'

    for span_start, span_end in spans:
        # Small vertical padding so ascenders/descenders are not clipped
        pad = 3
        y1 = max(0, span_start - pad)
        y2 = min(crop.shape[0], span_end + pad)
        strip = crop[y1:y2, :]
        if strip.size == 0:
            continue

        try:
            data = pytesseract.image_to_data(Image.fromarray(strip), lang=lang,
                                             config=tess_config,
                                             output_type=pytesseract.Output.DICT)
        except Exception:
            # Best-effort: a failed strip is simply skipped
            continue

        for i in range(len(data['text'])):
            txt = data['text'][i].strip()
            confidence = int(data['conf'][i])
            if not txt or confidence < 10:
                continue
            collected.append({
                'text': txt,
                'left': data['left'][i] + region.x,
                # y1 offsets strip coords back into region space
                'top': data['top'][i] + region.y + y1,
                'width': data['width'][i],
                'height': data['height'][i],
                'conf': confidence,
                'region_type': region.type,
            })

    return collected
-
-
def run_multi_pass_ocr(ocr_img: np.ndarray,
                       regions: List[PageRegion],
                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
    """Run OCR on each detected region with optimized settings.

    Args:
        ocr_img: Binarized full-page image.
        regions: Detected page regions.
        lang: Default language.

    Returns:
        Dict mapping region type to list of word dicts.
    """
    # Non-content regions that never get OCR'd
    skip_types = {'header', 'footer', 'margin_top', 'margin_bottom',
                  'margin_left', 'margin_right'}

    results: Dict[str, List[Dict]] = {}
    for region in regions:
        if region.type in skip_types:
            continue

        # Per-region OCR parameters: language-specific columns use PSM 4,
        # the example column gets a low-confidence fallback pass.
        if region.type == 'column_en':
            kwargs = {'lang': 'eng', 'psm': 4}
        elif region.type == 'column_de':
            kwargs = {'lang': 'deu', 'psm': 4}
        elif region.type == 'column_example':
            kwargs = {'lang': lang, 'psm': 6,
                      'fallback_psm': 7, 'min_confidence': 40.0}
        else:
            kwargs = {'lang': lang, 'psm': 6}

        words = ocr_region(ocr_img, region, **kwargs)
        results[region.type] = words
        logger.info(f"OCR {region.type}: {len(words)} words")

    return results
-
-
-# =============================================================================
-# Stage 7: Line Alignment → Vocabulary Entries
-# =============================================================================
-
-def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
- """Group words by Y position into lines, sorted by X within each line."""
- if not words:
- return []
-
- sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
- lines: List[List[Dict]] = []
- current_line: List[Dict] = [sorted_words[0]]
- current_y = sorted_words[0]['top']
-
- for word in sorted_words[1:]:
- if abs(word['top'] - current_y) <= y_tolerance_px:
- current_line.append(word)
- else:
- current_line.sort(key=lambda w: w['left'])
- lines.append(current_line)
- current_line = [word]
- current_y = word['top']
-
- if current_line:
- current_line.sort(key=lambda w: w['left'])
- lines.append(current_line)
-
- return lines
-
-
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    Uses Y-coordinate matching to pair English words, German translations,
    and example sentences that appear on the same line. English lines are
    the primary reference: each EN line seeds one VocabRow; DE and example
    lines are attached to the nearest EN line within tolerance.

    Args:
        ocr_results: Dict mapping region type to word lists.
        regions: Detected regions (for reference).
        y_tolerance_px: Max Y-distance to consider words on the same row.

    Returns:
        List of VocabRow objects, sorted by Y position.
    """
    # If no vocabulary columns detected (e.g. plain text page), return empty
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    # Group words into lines per column
    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    # Helpers over a line (list of word dicts); callers guarantee non-empty
    # lines for the first two.
    def line_y_center(line: List[Dict]) -> float:
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    # Build EN entries as the primary reference
    vocab_rows: List[VocabRow] = []

    for en_line in en_lines:
        en_y = line_y_center(en_line)
        en_text = line_text(en_line)
        en_conf = line_confidence(en_line)

        # Skip very short or likely header content
        if len(en_text.strip()) < 2:
            continue

        # Find matching DE line — nearest line within tolerance.
        # NOTE(review): a DE line may be matched by more than one EN line;
        # appears tolerated in practice — confirm if duplicates matter.
        de_text = ""
        de_conf = 0.0
        best_de_dist = float('inf')
        best_de_idx = -1
        for idx, de_line in enumerate(de_lines):
            dist = abs(line_y_center(de_line) - en_y)
            if dist < y_tolerance_px and dist < best_de_dist:
                best_de_dist = dist
                best_de_idx = idx

        if best_de_idx >= 0:
            de_text = line_text(de_lines[best_de_idx])
            de_conf = line_confidence(de_lines[best_de_idx])

        # Find matching example line (same nearest-within-tolerance rule)
        ex_text = ""
        ex_conf = 0.0
        best_ex_dist = float('inf')
        best_ex_idx = -1
        for idx, ex_line in enumerate(ex_lines):
            dist = abs(line_y_center(ex_line) - en_y)
            if dist < y_tolerance_px and dist < best_ex_dist:
                best_ex_dist = dist
                best_ex_idx = idx

        if best_ex_idx >= 0:
            ex_text = line_text(ex_lines[best_ex_idx])
            ex_conf = line_confidence(ex_lines[best_ex_idx])

        # Average confidence over the columns that actually matched
        avg_conf = en_conf
        conf_count = 1
        if de_conf > 0:
            avg_conf += de_conf
            conf_count += 1
        if ex_conf > 0:
            avg_conf += ex_conf
            conf_count += 1

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=avg_conf / conf_count,
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in example column:
    # If an example line has no matching EN/DE, append to previous entry.
    # matched_ex_ys holds the EN-line Y of rows that already got an example;
    # the proximity test below is therefore approximate by design.
    matched_ex_ys = set()
    for row in vocab_rows:
        if row.example:
            matched_ex_ys.add(row.y_position)

    for ex_line in ex_lines:
        ex_y = line_y_center(ex_line)
        # Check if already matched
        already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
        if already_matched:
            continue

        # Find nearest previous vocab row (strictly above, within 3× tolerance)
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row

        if best_row:
            continuation = line_text(ex_line).strip()
            if continuation:
                best_row.example = (best_row.example + " " + continuation).strip()

    # Sort by Y position
    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows
-
-
-# =============================================================================
-# Stage 8: Optional LLM Post-Correction
-# =============================================================================
-
async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
                           confidence_threshold: float = 50.0,
                           enabled: bool = False) -> List[VocabRow]:
    """Optionally send low-confidence regions to Qwen-VL for correction.

    Default: disabled. Enable per parameter. Currently a stub: when enabled
    it logs and returns the rows unchanged.

    Args:
        img: Original BGR image.
        vocab_rows: Current vocabulary rows.
        confidence_threshold: Rows below this get LLM correction.
        enabled: Whether to actually run LLM correction.

    Returns:
        Corrected vocabulary rows (currently always the input, unchanged).
    """
    if not enabled:
        return vocab_rows

    # TODO: Implement Qwen-VL correction for low-confidence entries
    # For each row with confidence < threshold:
    #   1. Crop the relevant region from img
    #   2. Send crop + OCR text to Qwen-VL
    #   3. Replace text if LLM provides a confident correction
    # Fix: dropped needless f-prefix on a placeholder-free string (F541).
    logger.info("LLM post-correction skipped (not yet implemented)")
    return vocab_rows
-
-
-# =============================================================================
-# Orchestrator
-# =============================================================================
-
async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Args:
        pdf_data: Raw PDF bytes (mutually exclusive with image_data).
        image_data: Raw image bytes (mutually exclusive with pdf_data).
        page_number: 0-indexed page number (for PDF).
        zoom: PDF rendering zoom factor.
        enable_dewarp: Whether to run dewarp stage.
        enable_llm_correction: Whether to run LLM post-correction.
        lang: Tesseract language string.

    Returns:
        PipelineResult with vocabulary and timing info; on failure the
        result carries .error instead of raising.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    total_start = time.time()

    try:
        # Stage 1: Render
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s")

        # Stage 3: Dewarp (optional; timing recorded only when run)
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation (binarized for OCR, clean for layout)
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)

        # Stage 5: Layout analysis
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)

        # Stage 8: Optional LLM correction.
        # Fix: previously called without enabled=..., so llm_post_correct's
        # own default (enabled=False) made this a guaranteed no-op even when
        # enable_llm_correction was True.
        if enable_llm_correction:
            t = time.time()
            vocab_rows = await llm_post_correct(img, vocab_rows, enabled=enable_llm_correction)
            result.stages['llm_correction'] = round(time.time() - t, 2)

        # Convert to output format
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german  # Skip empty rows
        ]

        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        # Boundary handler: errors are reported via result.error, not raised.
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)

    return result
-
-
-# ---------------------------------------------------------------------------
-# LLM-based OCR Correction (Step 6)
-# ---------------------------------------------------------------------------
-
-import httpx
-import os
-import json as _json
-import re as _re
-
# Ollama endpoint and review-model configuration (all env-overridable).
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
# Number of entries sent to the LLM per review batch.
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)

# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]"
# (matches a bracket pair containing at least one IPA-specific character).
_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]')

# Regex: digit adjacent to a letter — the hallmark of OCR digit↔letter confusion.
# Matches digits 0,1,5,6,8 (common OCR confusions: 0→O, 1→l/I, 5→S, 6→G, 8→B)
# when they appear inside or next to a word character.
_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])')
-
-
-def _entry_needs_review(entry: Dict) -> bool:
- """Check if an entry should be sent to the LLM for review.
-
- Sends all non-empty entries that don't have IPA phonetic transcriptions.
- The LLM prompt and _is_spurious_change() guard against unwanted changes.
- """
- en = entry.get("english", "") or ""
- de = entry.get("german", "") or ""
-
- # Skip completely empty entries
- if not en.strip() and not de.strip():
- return False
- # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them
- if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de):
- return False
- return True
-
-
def _build_llm_prompt(table_lines: List[Dict]) -> str:
    """Build the LLM correction prompt for a batch of entries.

    The prompt is deliberately in German (matching the target audience's
    workbooks) and restricts the model to single-character digit↔letter
    fixes only; the batch is appended as pretty-printed JSON.

    Args:
        table_lines: Batch entries as {"row", "en", "de", "ex"} dicts.

    Returns:
        Complete prompt string ready for the Ollama chat endpoint.
    """
    return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).

DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.

NUR diese Korrekturen sind erlaubt:
- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball"
- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old"
- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin"
- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See"
- Ziffer 6 statt G oder g: "6eld" → "Geld"
- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help"

ABSOLUT VERBOTEN — aendere NIEMALS:
- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst
- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN
- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst
- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren
- Beispielsaetze in der ex-Spalte — NIEMALS aendern

Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.

Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).

/no_think

Eingabe:
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
-
-
-def _is_spurious_change(old_val: str, new_val: str) -> bool:
- """Detect LLM changes that are likely wrong and should be discarded.
-
- Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are
- legitimate OCR corrections. Everything else is rejected.
-
- Filters out:
- - Case-only changes
- - Changes that don't contain any digit→letter fix
- - Completely different words (LLM translating or hallucinating)
- - Additions or removals of whole words (count changed)
- """
- if not old_val or not new_val:
- return False
-
- # Case-only change — never a real OCR error
- if old_val.lower() == new_val.lower():
- return True
-
- # If the word count changed significantly, the LLM rewrote rather than fixed
- old_words = old_val.split()
- new_words = new_val.split()
- if abs(len(old_words) - len(new_words)) > 1:
- return True
-
- # Core rule: a legitimate correction replaces a digit with the corresponding
- # letter. If the change doesn't include such a substitution, reject it.
- # Build a set of (old_char, new_char) pairs that differ between old and new.
- # Use character-level diff heuristic: if lengths are close, zip and compare.
- # Map of characters that OCR commonly misreads → set of correct replacements
- _OCR_CHAR_MAP = {
- # Digits mistaken for letters
- '0': set('oOgG'),
- '1': set('lLiI'),
- '5': set('sS'),
- '6': set('gG'),
- '8': set('bB'),
- # Non-letter symbols mistaken for letters
- '|': set('lLiI1'), # pipe → lowercase l, capital I, or digit 1
- 'l': set('iI|1'), # lowercase l → capital I (and reverse)
- }
- has_valid_fix = False
- if len(old_val) == len(new_val):
- for oc, nc in zip(old_val, new_val):
- if oc != nc:
- if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
- has_valid_fix = True
- elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
- # Reverse check (e.g. l→I where new is the "correct" char)
- has_valid_fix = True
- else:
- # Length changed by 1: accept if old had a suspicious char sequence
- _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]')
- if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
- has_valid_fix = True
-
- if not has_valid_fix:
- return True # Reject — looks like translation or hallucination
-
- return False
-
-
-def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
- """Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
- changes = []
- entries_out = []
- for i, orig in enumerate(originals):
- if i < len(corrected):
- c = corrected[i]
- entry = dict(orig)
- for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
- new_val = c.get(key, "").strip()
- old_val = (orig.get(field_name, "") or "").strip()
- if new_val and new_val != old_val:
- # Filter spurious LLM changes
- if _is_spurious_change(old_val, new_val):
- continue
- changes.append({
- "row_index": orig.get("row_index", i),
- "field": field_name,
- "old": old_val,
- "new": new_val,
- })
- entry[field_name] = new_val
- entry["llm_corrected"] = True
- entries_out.append(entry)
- else:
- entries_out.append(dict(orig))
- return changes, entries_out
-
-
-# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────
-
# Review-engine selection: rule-based spell checker (default) or LLM.
REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")  # "spell" (default) | "llm"

# Probe pyspellchecker once at import time; the review entry points fall
# back to the LLM path when the package is missing.
try:
    from spellchecker import SpellChecker as _SpellChecker
    # distance=1: only candidates one edit away — keeps corrections conservative
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
except ImportError:
    _SPELL_AVAILABLE = False
    logger.warning("pyspellchecker not installed — falling back to LLM review")
-
-# ─── Page-Ref Normalization ───────────────────────────────────────────────────
-# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60"
-_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE)
-
-
-def _normalize_page_ref(text: str) -> str:
- """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'."""
- if not text:
- return text
- return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)
-
-
-# Suspicious OCR chars → ordered list of most-likely correct replacements
-_SPELL_SUBS: Dict[str, List[str]] = {
- '0': ['O', 'o'],
- '1': ['l', 'I'],
- '5': ['S', 's'],
- '6': ['G', 'g'],
- '8': ['B', 'b'],
- '|': ['I', 'l', '1'],
-}
-_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())
-
-# Tokenizer: word tokens (letters + pipe) alternating with separators
-_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')
-
-
def _spell_dict_knows(word: str) -> bool:
    """True if *word* (case-insensitively) is known in the EN or DE dictionary."""
    if not _SPELL_AVAILABLE:
        return False
    lowered = word.lower()
    for checker in (_en_spell, _de_spell):
        if checker.known([lowered]):
            return True
    return False
-
-
def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
    """Return corrected form of token, or None if no fix needed/possible.

    Tries, in order: (1) dictionary lookup (known word → leave alone),
    (2) digit/pipe→letter substitution, (3) umlaut restoration for German,
    (4) general one-edit spell correction.

    *field* is 'english' or 'german' — used to pick the right dictionary
    for general spell correction (step 4 below).
    """
    has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)

    # 1. Already known word → no fix needed
    if _spell_dict_knows(token):
        return None

    # 2. Digit/pipe substitution (existing logic)
    if has_suspicious:
        # Standalone pipe → capital I
        if token == '|':
            return 'I'
        # Dictionary-backed single-char substitution: first candidate the
        # dictionary accepts wins (replacement lists are ordered by likelihood)
        for i, ch in enumerate(token):
            if ch not in _SPELL_SUBS:
                continue
            for replacement in _SPELL_SUBS[ch]:
                candidate = token[:i] + replacement + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate
        # Structural rule: suspicious char at position 0 + rest is all lowercase letters
        first = token[0]
        if first in _SPELL_SUBS and len(token) >= 2:
            rest = token[1:]
            if rest.isalpha() and rest.islower():
                candidate = _SPELL_SUBS[first][0] + rest
                # Guard is defensive: the first replacement is a letter for
                # every current _SPELL_SUBS entry
                if not candidate[0].isdigit():
                    return candidate

    # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u)
    # Try single-char umlaut substitutions and check against dictionary.
    if len(token) >= 3 and token.isalpha() and field == "german":
        _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
                        'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'}
        for i, ch in enumerate(token):
            if ch in _UMLAUT_SUBS:
                candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate

    # 4. General spell correction for unknown words (no digits/pipes)
    # e.g. "beautful" → "beautiful"
    if not has_suspicious and len(token) >= 3 and token.isalpha():
        spell = _en_spell if field == "english" else _de_spell if field == "german" else None
        if spell is not None:
            correction = spell.correction(token.lower())
            if correction and correction != token.lower():
                # Preserve original capitalisation pattern
                if token[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                # Only accept corrections the dictionary itself confirms
                if _spell_dict_knows(correction):
                    return correction
    return None
-
-
-def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
- """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).
-
- *field* is 'english' or 'german' — forwarded to _spell_fix_token for
- dictionary selection.
- """
- if not text:
- return text, False
- has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
- # If no suspicious chars AND no alpha chars that could be misspelled, skip
- if not has_suspicious and not any(c.isalpha() for c in text):
- return text, False
- # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
- fixed = _re.sub(r'(? Dict:
- """Rule-based OCR correction: spell-checker + structural heuristics.
-
- Deterministic — never translates, never touches IPA, never hallucinates.
- """
- t0 = time.time()
- changes: List[Dict] = []
- all_corrected: List[Dict] = []
- for i, entry in enumerate(entries):
- e = dict(entry)
- # Page-ref normalization (always, regardless of review status)
- old_ref = (e.get("source_page") or "").strip()
- if old_ref:
- new_ref = _normalize_page_ref(old_ref)
- if new_ref != old_ref:
- changes.append({
- "row_index": e.get("row_index", i),
- "field": "source_page",
- "old": old_ref,
- "new": new_ref,
- })
- e["source_page"] = new_ref
- e["llm_corrected"] = True
- if not _entry_needs_review(e):
- all_corrected.append(e)
- continue
- for field_name in ("english", "german", "example"):
- old_val = (e.get(field_name) or "").strip()
- if not old_val:
- continue
- # example field is mixed-language — try German first (for umlauts)
- lang = "german" if field_name in ("german", "example") else "english"
- new_val, was_changed = _spell_fix_field(old_val, field=lang)
- if was_changed and new_val != old_val:
- changes.append({
- "row_index": e.get("row_index", i),
- "field": field_name,
- "old": old_val,
- "new": new_val,
- })
- e[field_name] = new_val
- e["llm_corrected"] = True
- all_corrected.append(e)
- duration_ms = int((time.time() - t0) * 1000)
- return {
- "entries_original": entries,
- "entries_corrected": all_corrected,
- "changes": changes,
- "skipped_count": 0,
- "model_used": "spell-checker",
- "duration_ms": duration_ms,
- }
-
-
async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Async generator yielding SSE-compatible events for spell-checker review.

    Emits one 'meta' event, a single 'batch' event covering all entries
    (the spell checker runs synchronously in one pass), and a final
    'complete' event — mirroring the LLM streaming protocol.
    """
    total = len(entries)

    yield {
        "type": "meta",
        "total_entries": total,
        "to_review": total,
        "skipped": 0,
        "model": "spell-checker",
        "batch_size": batch_size,
    }

    outcome = spell_review_entries_sync(entries)
    found = outcome["changes"]
    elapsed = outcome["duration_ms"]

    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)],
        "changes": found,
        "duration_ms": elapsed,
        "progress": {"current": total, "total": total},
    }

    yield {
        "type": "complete",
        "changes": found,
        "model_used": "spell-checker",
        "duration_ms": elapsed,
        "total_entries": total,
        "reviewed": total,
        "skipped": 0,
        "corrections_found": len(found),
        "entries_corrected": outcome["entries_corrected"],
    }
-
-# ─── End Spell-Checker ────────────────────────────────────────────────────────
-
-
async def llm_review_entries(
    entries: List[Dict],
    model: Optional[str] = None,
) -> Dict:
    """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm).

    Args:
        entries: Vocabulary entry dicts (english/german/example/row_index).
        model: Ollama model name override; defaults to OLLAMA_REVIEW_MODEL.

    Returns:
        Dict with entries_original, entries_corrected, changes,
        skipped_count, model_used and duration_ms.
    """
    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        return spell_review_entries_sync(entries)
    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # Filter: only entries that need review
    reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]

    if not reviewable:
        # Nothing to review — return a no-op result in the standard shape
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [e for _, e in reviewable]
    # Compact keys (row/en/de/ex) keep the prompt short
    table_lines = [
        {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
        for e in review_entries
    ]

    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, len(entries) - len(reviewable))
    logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))

    prompt = _build_llm_prompt(table_lines)

    t0 = time.time()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(
            f"{_OLLAMA_URL}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "think": False,  # qwen3: disable chain-of-thought (Ollama >=0.6)
                "options": {"temperature": 0.1, "num_predict": 8192},
            },
        )
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - t0) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
    logger.debug("LLM review raw response (first 500): %.500s", content)

    corrected = _parse_llm_json_array(content)
    logger.info("LLM review: parsed %d corrected entries, applying diff...", len(corrected))
    # _diff_batch filters spurious changes and returns accepted corrections
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Merge corrected entries back into the full list
    all_corrected = [dict(e) for e in entries]
    for batch_idx, (orig_idx, _) in enumerate(reviewable):
        if batch_idx < len(corrected_entries):
            all_corrected[orig_idx] = corrected_entries[batch_idx]

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": len(entries) - len(reviewable),
        "model_used": model,
        "duration_ms": duration_ms,
    }
-
-
async def llm_review_entries_streaming(
    entries: List[Dict],
    model: str = None,  # None -> falls back to OLLAMA_REVIEW_MODEL below
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.

    Yields dict events of three types: 'meta' (once, totals/model/batch
    size), 'batch' (per processed batch, carrying 'changes' and
    'progress') and 'complete' (once, with aggregated corrections).

    Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
    visible in the UI — this is the only place the fix now runs (removed from Step 1
    of build_vocab_pipeline_streaming).
    """
    # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
    _CONF_FIELDS = ('english', 'german', 'example')
    # Snapshot field values before the in-place fix so we can diff below.
    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
    _fix_character_confusion(entries)  # modifies in-place, returns same list
    char_changes = [
        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
        for i in range(len(entries))
        for f in _CONF_FIELDS
        if originals[i][f] != entries[i].get(f, '')
    ]

    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        # Inject char_changes as a batch right after the meta event from the spell checker
        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
            if not _meta_sent and event.get('type') == 'meta' and char_changes:
                _meta_sent = True
                yield {
                    'type': 'batch',
                    'changes': char_changes,
                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
                    'progress': {'current': 0, 'total': len(entries)},
                }
        return

    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    # LLM path: emit char_changes first (before meta) so they appear in the UI
    if char_changes:
        yield {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }

    model = model or OLLAMA_REVIEW_MODEL

    # Separate reviewable from skipped entries (original indices kept for merge-back)
    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    # meta event
    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]  # start from copies; patched in place below
    total_duration_ms = 0
    reviewed_count = 0

    # Process in batches
    for batch_start in range(0, total_to_review, batch_size):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        # Compact table rows for the prompt — short keys keep the prompt small.
        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d — sending %d entries to %s",
                    batch_start // batch_size, len(batch_entries), model)

        t0 = time.time()
        # One non-streaming chat call per batch; generous timeout.
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,  # qwen3: disable chain-of-thought
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
        batch_ms = int((time.time() - t0) * 1000)
        total_duration_ms += batch_ms

        logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content))
        logger.debug("LLM review streaming raw (first 500): %.500s", content)

        corrected = _parse_llm_json_array(content)
        logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected))
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Merge back (guard against the model returning fewer rows than sent)
        for batch_idx, (orig_idx, _) in enumerate(batch_items):
            if batch_idx < len(batch_corrected):
                all_corrected[orig_idx] = batch_corrected[batch_idx]

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        # Yield batch result
        yield {
            "type": "batch",
            "batch_index": batch_start // batch_size,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    # Complete event
    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }
-
-
-def _sanitize_for_json(text: str) -> str:
- """Remove or escape control characters that break JSON parsing.
-
- Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid
- JSON whitespace. Removes all other ASCII control characters (0x00-0x1f)
- that are only valid inside JSON strings when properly escaped.
- """
- # Replace literal control chars (except \\t \\n \\r) with a space
- return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
-
-
-def _parse_llm_json_array(text: str) -> List[Dict]:
- """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
- # Strip qwen3 ... blocks (present even with think=False on some builds)
- text = _re.sub(r'.*?', '', text, flags=_re.DOTALL)
- # Strip markdown code fences
- text = _re.sub(r'```json\s*', '', text)
- text = _re.sub(r'```\s*', '', text)
- # Sanitize control characters before JSON parsing
- text = _sanitize_for_json(text)
- # Find first [ ... last ]
- match = _re.search(r'\[.*\]', text, _re.DOTALL)
- if match:
- try:
- return _json.loads(match.group())
- except (ValueError, _json.JSONDecodeError) as e:
- logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200])
- else:
- logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200])
- return []
+from cv_cell_grid import _cells_to_vocab_entries # noqa: F401
diff --git a/klausur-service/backend/cv_vocab_types.py b/klausur-service/backend/cv_vocab_types.py
new file mode 100644
index 0000000..74a6b9c
--- /dev/null
+++ b/klausur-service/backend/cv_vocab_types.py
@@ -0,0 +1,156 @@
+"""
+Shared types, constants, and availability guards for the CV vocabulary pipeline.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import json
+import logging
+import os
+import re # noqa: F401 — re-exported for downstream modules
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+import numpy as np # noqa: F401
+
+logger = logging.getLogger(__name__)
+
+# --- Availability Guards ---
+
+try:
+ import cv2 # noqa: F401
+ CV2_AVAILABLE = True
+except ImportError:
+ cv2 = None # type: ignore[assignment]
+ CV2_AVAILABLE = False
+ logger.warning("OpenCV not available — CV pipeline disabled")
+
+try:
+ import pytesseract # noqa: F401
+ from PIL import Image # noqa: F401
+ TESSERACT_AVAILABLE = True
+except ImportError:
+ pytesseract = None # type: ignore[assignment]
+ Image = None # type: ignore[assignment,misc]
+ TESSERACT_AVAILABLE = False
+ logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
+
+CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
+
# --- IPA Dictionary ---

# Optional IPA lookup resources; flags/handles are populated by the
# best-effort initialisation below and module import never fails on them.
IPA_AVAILABLE = False
_ipa_convert_american = None  # eng_to_ipa.convert when installed, else None
_britfone_dict: Dict[str, str] = {}  # word -> British IPA transcription

try:
    import eng_to_ipa as _eng_to_ipa
    _ipa_convert_american = _eng_to_ipa.convert
    IPA_AVAILABLE = True
    logger.info("eng_to_ipa available — American IPA lookup enabled")
except ImportError:
    logger.info("eng_to_ipa not installed — American IPA disabled")

# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
if os.path.exists(_britfone_path):
    try:
        with open(_britfone_path, 'r', encoding='utf-8') as f:
            _britfone_dict = json.load(f)
        IPA_AVAILABLE = True
        logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
    except Exception as e:
        # Broad catch is deliberate: a corrupt or unreadable data file
        # must not break module import; IPA simply stays disabled.
        logger.warning(f"Failed to load Britfone: {e}")
else:
    logger.info("Britfone not found — British IPA disabled")
+
# --- Language Detection Constants ---

# High-frequency function words used to score whether an OCR'd text span
# is German or English. Umlauts appear transliterated ('fuer', 'ueber')
# to match OCR output without diacritics.
GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht',
    'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird',
    'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 'noch', 'aber', 'hat', 'nur',
    'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben',
    'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'}

ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of',
    'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from',
    'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
    'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he',
    'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'}
+
+
+# --- Data Classes ---
+
@dataclass
class PageRegion:
    """A detected region on the page.

    Coordinates are absolute page positions (presumably pixels, matching
    RowGeometry — TODO confirm), with (x, y) the top-left corner.
    """
    type: str  # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom'
    x: int  # left edge
    y: int  # top edge
    width: int
    height: int
    classification_confidence: float = 1.0  # 0.0-1.0
    classification_method: str = ""  # 'content', 'position_enhanced', 'position_fallback'
+
+
@dataclass
class ColumnGeometry:
    """Geometrically detected column, prior to type classification."""
    index: int  # 0-based, left -> right
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]  # word dicts from Tesseract (text, conf, left, top, ...)
    width_ratio: float  # width / content_width (0.0-1.0)
    is_sub_column: bool = False  # True if created by _detect_sub_columns() split
+
+
@dataclass
class RowGeometry:
    """Geometrically detected row, with header/footer classification."""
    index: int  # 0-based, top -> bottom
    x: int  # absolute left (= content left_x)
    y: int  # absolute y start
    width: int  # content width
    height: int  # row height in px
    word_count: int
    words: List[Dict]  # word dicts from Tesseract (text, conf, left, top, ...)
    row_type: str = 'content'  # 'content' | 'header' | 'footer'
    gap_before: int = 0  # vertical gap in px above this row
+
+
@dataclass
class VocabRow:
    """A single vocabulary entry assembled from multi-column OCR."""
    english: str = ""
    german: str = ""
    example: str = ""  # example sentence column, when present
    source_page: str = ""  # page reference the entry came from
    confidence: float = 0.0  # OCR confidence — presumably 0.0-1.0; TODO confirm scale
    y_position: int = 0  # vertical position on the page (presumably px)
+
+
@dataclass
class PipelineResult:
    """Complete result of the CV pipeline."""
    vocabulary: List[Dict[str, Any]] = field(default_factory=list)  # extracted entries
    word_count: int = 0
    columns_detected: int = 0
    duration_seconds: float = 0.0  # total wall-clock time
    stages: Dict[str, float] = field(default_factory=dict)  # per-stage timings (presumably seconds — TODO confirm)
    error: Optional[str] = None  # set when the pipeline failed; None on success
    image_width: int = 0  # source image dimensions in px
    image_height: int = 0
+
+
@dataclass
class DocumentTypeResult:
    """Result of automatic document type detection."""
    doc_type: str  # 'vocab_table' | 'full_text' | 'generic_table'
    confidence: float  # 0.0-1.0
    pipeline: str  # 'cell_first' | 'full_page'
    skip_steps: List[str] = field(default_factory=list)  # e.g. ['columns', 'rows']
    features: Dict[str, Any] = field(default_factory=dict)  # debug info