From 9a5a35bff199cf45a6cbf229bdb7ea352cdc088a Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 8 Mar 2026 23:46:47 +0100 Subject: [PATCH] =?UTF-8?q?refactor:=20cv=5Fvocab=5Fpipeline.py=20in=206?= =?UTF-8?q?=20Module=20aufteilen=20(8163=20=E2=86=92=206=20+=20Fassade)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Monolithische 8163-Zeilen-Datei aufgeteilt in fokussierte Module: - cv_vocab_types.py (156 Z.): Dataklassen, Konstanten, IPA, Feature-Flags - cv_preprocessing.py (1166 Z.): Bild-I/O, Orientierung, Deskew, Dewarp - cv_layout.py (3036 Z.): Dokumenttyp, Spalten, Zeilen, Klassifikation - cv_ocr_engines.py (1282 Z.): OCR-Engines, Vocab-Postprocessing, Text-Cleaning - cv_cell_grid.py (1510 Z.): Cell-Grid v2+Legacy, Vocab-Konvertierung - cv_review.py (1184 Z.): LLM/Spell Review, Pipeline-Orchestrierung cv_vocab_pipeline.py ist jetzt eine Re-Export-Fassade (35 Z.) — alle bestehenden Imports bleiben unveraendert. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_cell_grid.py | 1510 ++++ klausur-service/backend/cv_layout.py | 3036 +++++++ klausur-service/backend/cv_ocr_engines.py | 1282 +++ klausur-service/backend/cv_preprocessing.py | 1166 +++ klausur-service/backend/cv_review.py | 1184 +++ klausur-service/backend/cv_vocab_pipeline.py | 8178 +----------------- klausur-service/backend/cv_vocab_types.py | 156 + 7 files changed, 8359 insertions(+), 8153 deletions(-) create mode 100644 klausur-service/backend/cv_cell_grid.py create mode 100644 klausur-service/backend/cv_layout.py create mode 100644 klausur-service/backend/cv_ocr_engines.py create mode 100644 klausur-service/backend/cv_preprocessing.py create mode 100644 klausur-service/backend/cv_review.py create mode 100644 klausur-service/backend/cv_vocab_types.py diff --git a/klausur-service/backend/cv_cell_grid.py b/klausur-service/backend/cv_cell_grid.py new file mode 100644 index 0000000..6e55509 --- /dev/null +++ 
"""
Cell-grid construction (v2 + legacy), vocab conversion, and word-grid OCR.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""

import logging
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, Generator, List, Optional, Tuple

import numpy as np

from cv_vocab_types import PageRegion, RowGeometry
# FIX: `ocr_region` (full-page/crop Tesseract OCR) and `RAPIDOCR_AVAILABLE`
# are used throughout this module (Tesseract branches, engine resolution in
# build_cell_grid_v2 / _streaming / legacy build_cell_grid) but were not
# imported after the monolith split → NameError at runtime.
# NOTE(review): confirm both names are exported by cv_ocr_engines (they
# lived next to the other OCR helpers in the original cv_vocab_pipeline.py).
from cv_ocr_engines import (
    RAPIDOCR_AVAILABLE,
    _assign_row_words_to_columns,
    _attach_example_sentences,
    _clean_cell_text,
    _clean_cell_text_lite,
    _fix_phonetic_brackets,
    _split_comma_entries,
    _words_to_reading_order_text,
    ocr_region,
    ocr_region_lighton,
    ocr_region_rapid,
    ocr_region_trocr,
)

logger = logging.getLogger(__name__)

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    from PIL import Image
except ImportError:
    Image = None  # type: ignore[assignment,misc]


# ---------------------------------------------------------------------------

def _ocr_cell_crop(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell by cropping the exact column×row intersection.

    No padding beyond cell boundaries → no neighbour bleeding.

    Args:
        row_idx / col_idx: Grid coordinates used for the cell_id.
        row: Row geometry (y/height) of the cell.
        col: Column region (x/width/type) of the cell.
        ocr_img: Binarized grayscale page (Tesseract input, density check).
        img_bgr: BGR page for Rapid/TrOCR/LightOn engines (may be None).
        img_w / img_h: Full-page dimensions for bbox_pct and clamping.
        engine_name: One of 'tesseract', 'rapid', 'trocr-*', 'lighton'.
        lang: Default Tesseract language string.
        lang_map: col_type → Tesseract language override.

    Returns:
        Cell dict with text, confidence, bbox_px/bbox_pct and engine tag;
        empty text for zero-size or pixel-empty crops.
    """
    # Display bbox: exact column × row intersection
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # Crop boundaries: add small internal padding (3px each side) to avoid
    # clipping characters near column/row edges (e.g. parentheses, descenders).
    # Stays within image bounds but may extend slightly beyond strict cell.
    # 3px is small enough to avoid neighbour content at typical scan DPI (200-300).
    _PAD = 3
    cx = max(0, disp_x - _PAD)
    cy = max(0, disp_y - _PAD)
    cx2 = min(img_w, disp_x + disp_w + _PAD)
    cy2 = min(img_h, disp_y + disp_h + _PAD)
    cw = cx2 - cx
    ch = cy2 - cy

    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': '',
        'confidence': 0.0,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': 'cell_crop_v2',
        'is_bold': False,
    }

    if cw <= 0 or ch <= 0:
        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
    if ocr_img is not None:
        crop = ocr_img[cy:cy + ch, cx:cx + cw]
        if crop.size > 0:
            # <180 on the 0-255 grayscale counts as "ink"; below 0.5% dark
            # pixels the cell cannot plausibly contain text.
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
                             row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
    cell_lang = lang_map.get(col.type, lang)
    psm = _select_psm_for_column(col.type, col.width, row.height)
    text = ''
    avg_conf = 0.0
    used_engine = 'cell_crop_v2'

    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_trocr(img_bgr, cell_region,
                                 handwritten=(engine_name == "trocr-handwritten"))
    elif engine_name == "lighton" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
        # Upscale small BGR crops for RapidOCR.
        # Cell crops typically have height 35-55px but width >300px.
        # _ensure_minimum_crop_size only scales when EITHER dim < min_dim,
        # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78).
        # For very short heights (< 80px), force 3× upscale for better OCR
        # of small characters like periods, ellipsis, and phonetic symbols.
        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
        if bgr_crop.size == 0:
            words = []
        else:
            crop_h, crop_w = bgr_crop.shape[:2]
            if crop_h < 80:
                # Force 3× upscale for short rows — small chars need more pixels
                scale = 3.0
                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
                                    interpolation=cv2.INTER_CUBIC)
            else:
                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
            up_h, up_w = bgr_up.shape[:2]
            scale_x = up_w / max(crop_w, 1)
            scale_y = up_h / max(crop_h, 1)
            was_scaled = (up_w != crop_w or up_h != crop_h)
            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
                         row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region_rapid(bgr_up, tmp_region)
            # Remap positions back to original image coords
            if words and was_scaled:
                for w in words:
                    w['left'] = int(w['left'] / scale_x) + cx
                    w['top'] = int(w['top'] / scale_y) + cy
                    w['width'] = int(w['width'] / scale_x)
                    w['height'] = int(w['height'] / scale_y)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
            crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
            upscaled = _ensure_minimum_crop_size(crop_slice)
            up_h, up_w = upscaled.shape[:2]
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
            # Remap word positions back to original image coordinates
            if words and (up_w != cw or up_h != ch):
                sx = cw / max(up_w, 1)
                sy = ch / max(up_h, 1)
                for w in words:
                    w['left'] = int(w['left'] * sx) + cx
                    w['top'] = int(w['top'] * sy) + cy
                    w['width'] = int(w['width'] * sx)
                    w['height'] = int(w['height'] * sy)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
        else:
            words = []

    # Filter low-confidence words (OCR noise)
    _MIN_WORD_CONF = 30
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Crop height as Y-tolerance keeps all words of the cell on one line.
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
                     row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
    else:
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
                     row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
        crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
        upscaled = _ensure_minimum_crop_size(crop_slice)
        up_h, up_w = upscaled.shape[:2]
        tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
        psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_crop_v2_psm7'

    # --- Noise filter ---
    if text.strip():
        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
                         row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine
    return result
# Threshold: columns narrower than this (% of image width) use single-cell
# crop OCR instead of full-page word assignment.
#
# Broad columns (>= threshold): Full-page Tesseract word assignment.
#   Better for multi-word content (sentences, IPA brackets, punctuation).
#   Examples: EN vocabulary, DE translation, example sentences.
#
# Narrow columns (< threshold): Isolated cell-crop OCR.
#   Prevents neighbour bleeding from adjacent broad columns.
#   Examples: page_ref, marker, numbering columns.
#
# 15% was empirically validated across vocab table scans with 3-5 columns.
# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width.
# The 15% boundary cleanly separates the two groups.
_NARROW_COL_THRESHOLD_PCT = 15.0

# FIX: build_cell_grid_v2's Phase 1 filters on `_MIN_WORD_CONF`, but after the
# monolith split the constant only existed as a *local* inside
# _ocr_cell_crop / _ocr_single_cell → NameError at module scope. Define it
# module-level (same value, 30) so the broad-column filter works again.
_MIN_WORD_CONF = 30


def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones.

    Drop-in replacement for build_cell_grid() — same signature & return type.

    Strategy:
      - Broad columns (>15% image width): Use pre-assigned full-page Tesseract
        words (from row.words). Handles IPA brackets, punctuation, sentence
        continuity correctly.
      - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent
        neighbour bleeding from adjacent broad columns.

    Returns:
        (cells, columns_meta) — cells sorted by (row_index, col_index);
        all-empty rows removed. Both lists empty when no usable rows/columns.
    """
    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"

    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    # Filter phantom rows (word_count=0) and artifact rows
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # Filter columns
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # Heal row gaps — use header/footer boundaries
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- Classify columns as broad vs narrow ---
    narrow_col_indices = set()
    for ci, col in enumerate(relevant_cols):
        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
        if col_pct < _NARROW_COL_THRESHOLD_PCT:
            narrow_col_indices.add(ci)

    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
                f"{len(narrow_col_indices)} narrow columns (cell-crop)")

    # --- Phase 1: Broad columns via full-page word assignment ---
    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Assign full-page words to columns for this row
        col_words = _assign_row_words_to_columns(row, relevant_cols)

        for col_idx, col in enumerate(relevant_cols):
            if col_idx not in narrow_col_indices:
                # BROAD column: use pre-assigned full-page words
                words = col_words.get(col_idx, [])
                # Filter low-confidence words
                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

                if words:
                    y_tol = max(15, row.height)
                    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
                    avg_conf = 0.0

                # Apply noise filter
                text = _clean_cell_text(text)

                cell = {
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
                    'col_index': col_idx,
                    'col_type': col.type,
                    'text': text,
                    'confidence': avg_conf,
                    'bbox_px': {
                        'x': col.x, 'y': row.y,
                        'w': col.width, 'h': row.height,
                    },
                    'bbox_pct': {
                        'x': round(col.x / img_w * 100, 2) if img_w else 0,
                        'y': round(row.y / img_h * 100, 2) if img_h else 0,
                        'w': round(col.width / img_w * 100, 2) if img_w else 0,
                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
                    },
                    'ocr_engine': 'word_lookup',
                    'is_bold': False,
                }
                cells.append(cell)

    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
    narrow_tasks = []
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            if col_idx in narrow_col_indices:
                narrow_tasks.append((row_idx, col_idx, row, col))

    if narrow_tasks:
        # Tesseract is CPU-bound per process call → more workers pay off;
        # heavier engines (TrOCR/LightOn/Rapid) get fewer to limit memory.
        max_workers = 4 if engine_name == "tesseract" else 2
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {
                pool.submit(
                    _ocr_cell_crop,
                    ri, ci, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ): (ri, ci)
                for ri, ci, row, col in narrow_tasks
            }
            for future in as_completed(futures):
                try:
                    cell = future.result()
                    cells.append(cell)
                except Exception as e:
                    ri, ci = futures[future]
                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")

    # Sort cells by (row_index, col_index)
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

    # Bold detection disabled: cell-level stroke-width analysis cannot
    # distinguish bold from non-bold when cells contain mixed formatting
    # (e.g. "cookie ['kuki]" — bold word + non-bold phonetics).
    # TODO: word-level bold detection would require per-word bounding boxes.

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name} (hybrid)")

    return cells, columns_meta
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 — yields each cell as OCR'd.

    Yields:
        (cell_dict, columns_meta, total_cells)
    """
    # Resolve engine — default to Tesseract for cell-first OCR.
    # Tesseract excels at isolated text crops (binarized, upscaled).
    # RapidOCR is optimized for full-page scene-text and produces artifacts
    # on small cell crops (extra chars, missing punctuation, garbled IPA).
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"
    else:
        if ocr_engine == "rapid":
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        # covers "auto" and any unknown engine string as well
        engine_name = "tesseract"

    # Keep only real content rows that actually carry detected words.
    rows = [r for r in row_geometries
            if r.row_type == 'content' and r.word_count > 0]
    if not rows:
        return

    excluded_types = {'column_ignore', 'header', 'footer', 'margin_top',
                      'margin_bottom', 'margin_left', 'margin_right'}
    cols = [c for c in column_regions if c.type not in excluded_types]
    if not cols:
        return

    rows = [r for r in rows if not _is_artifact_row(r)]
    if not rows:
        return

    # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2)
    rows.sort(key=lambda r: r.y)
    headers = [r for r in row_geometries if r.row_type == 'header']
    footers = [r for r in row_geometries if r.row_type == 'footer']
    top_bound = (max(r.y + r.height for r in headers)
                 if headers else rows[0].y)
    bottom_bound = (min(r.y for r in footers)
                    if footers else rows[-1].y + rows[-1].height)

    _heal_row_gaps(rows, top_bound=top_bound, bottom_bound=bottom_bound)

    cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': i, 'type': c.type, 'x': c.x, 'width': c.width}
        for i, c in enumerate(cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(rows) * len(cols)

    for ri, row in enumerate(rows):
        for ci, col in enumerate(cols):
            cell = _ocr_cell_crop(
                ri, ci, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells


# ---------------------------------------------------------------------------
# Narrow-column OCR helpers (Proposal B) — DEPRECATED (kept for legacy build_cell_grid)
# ---------------------------------------------------------------------------

def _compute_cell_padding(col_width: int, img_w: int) -> int:
    """Adaptive padding for OCR crops based on column width.

    Narrow columns (page_ref, marker) need more surrounding context so
    Tesseract can segment characters correctly. Wide columns keep the
    minimal 4 px padding to avoid pulling in neighbours.
    """
    # Unknown page width → treat the column as wide (minimal padding).
    pct = (col_width / img_w * 100) if img_w > 0 else 100
    if pct >= 15:
        return 4
    if pct >= 10:
        return 8
    if pct >= 5:
        return max(12, col_width // 4)
    return max(20, col_width // 2)
+ """ + col_pct = col_width / img_w * 100 if img_w > 0 else 100 + if col_pct < 5: + return max(20, col_width // 2) + if col_pct < 10: + return max(12, col_width // 4) + if col_pct < 15: + return 8 + return 4 + + +def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150, + max_scale: int = 3) -> np.ndarray: + """Upscale tiny crops so Tesseract gets enough pixel data. + + If either dimension is below *min_dim*, the crop is bicubic-upscaled + so the smallest dimension reaches *min_dim* (capped at *max_scale* ×). + """ + h, w = crop.shape[:2] + if h >= min_dim and w >= min_dim: + return crop + scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1))) + if scale <= 1.0: + return crop + new_w = int(w * scale) + new_h = int(h * scale) + return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + + +def _select_psm_for_column(col_type: str, col_width: int, + row_height: int) -> int: + """Choose the best Tesseract PSM for a given column geometry. + + - page_ref columns are almost always single short tokens → PSM 8 + - Very narrow or short cells → PSM 7 (single text line) + - Everything else → PSM 6 (uniform block) + """ + if col_type in ('page_ref', 'marker'): + return 8 # single word + if col_width < 100 or row_height < 30: + return 7 # single line + return 6 # uniform block + + +def _ocr_single_cell( + row_idx: int, + col_idx: int, + row: RowGeometry, + col: PageRegion, + ocr_img: np.ndarray, + img_bgr: Optional[np.ndarray], + img_w: int, + img_h: int, + use_rapid: bool, + engine_name: str, + lang: str, + lang_map: Dict[str, str], + preassigned_words: Optional[List[Dict]] = None, +) -> Dict[str, Any]: + """Populate a single cell (column x row intersection) via word lookup.""" + # Display bbox: exact column × row intersection (no padding) + disp_x = col.x + disp_y = row.y + disp_w = col.width + disp_h = row.height + + # OCR crop: adaptive padding — narrow columns get more context + pad = _compute_cell_padding(col.width, img_w) + cell_x = 
def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection) via word lookup.

    Primary source is *preassigned_words* (full-page Tesseract words already
    assigned to this cell); three OCR fallbacks run for still-empty cells:
    padded cell-crop OCR, PSM-7 re-OCR, and a RapidOCR row-strip pass for
    narrow columns.

    FIX: the bbox_pct divisions are now guarded with `if img_w/img_h else 0`
    (consistent with _ocr_cell_crop / build_cell_grid_v2) — the unguarded
    version raised ZeroDivisionError for degenerate page dimensions even
    though `is_narrow` below already anticipates img_w == 0.
    """
    # Display bbox: exact column × row intersection (no padding)
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # OCR crop: adaptive padding — narrow columns get more context
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    # mirrors _NARROW_COL_THRESHOLD_PCT (15% of page width)
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False

    if disp_w <= 0 or disp_h <= 0:
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': '',
            'confidence': 0.0,
            'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
            'bbox_pct': {
                'x': round(col.x / img_w * 100, 2) if img_w else 0,
                'y': round(row.y / img_h * 100, 2) if img_h else 0,
                'w': round(col.width / img_w * 100, 2) if img_w else 0,
                'h': round(row.height / img_h * 100, 2) if img_h else 0,
            },
            'ocr_engine': 'word_lookup',
        }

    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

    # Filter low-confidence words (OCR noise from images/artifacts).
    # Tesseract gives low confidence to misread image edges, borders,
    # and other non-text elements.
    _MIN_WORD_CONF = 30
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Use row height as Y-tolerance so all words within a single row
        # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
        # across two lines due to slight vertical offset).
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0

    # --- FALLBACK: Cell-OCR for empty cells ---
    # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
    # Re-run OCR on the cell crop to catch what word-lookup missed.
    # To avoid wasting time on truly empty cells, check pixel density first:
    # only run Tesseract if the cell crop contains enough dark pixels to
    # plausibly contain text.
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                # Threshold: pixels darker than 180 (on 0-255 grayscale).
                # Use 0.5% to catch even small text like "Ei" (2 chars)
                # in an otherwise empty cell.
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005
    if _run_fallback:
        # For narrow columns, upscale the crop before OCR
        if is_narrow and ocr_img is not None:
            _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            _upscaled = _ensure_minimum_crop_size(_crop_slice)
            if _upscaled is not _crop_slice:
                # Build a temporary full-size image with the upscaled crop
                # placed at origin so ocr_region can crop it cleanly.
                _up_h, _up_w = _upscaled.shape[:2]
                _tmp_region = PageRegion(
                    type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(_upscaled, _tmp_region,
                                            lang=cell_lang, psm=_cell_psm)
                # Remap word positions back to original image coordinates
                _sx = cell_w / max(_up_w, 1)
                _sy = cell_h / max(_up_h, 1)
                for _fw in (fallback_words or []):
                    _fw['left'] = int(_fw['left'] * _sx) + cell_x
                    _fw['top'] = int(_fw['top'] * _sy) + cell_y
                    _fw['width'] = int(_fw['width'] * _sx)
                    _fw['height'] = int(_fw['height'] * _sy)
            else:
                # No upscaling needed, use adaptive PSM
                cell_region = PageRegion(
                    type=col.type, x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)
        else:
            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
            if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
            elif engine_name == "lighton" and img_bgr is not None:
                fallback_words = ocr_region_lighton(img_bgr, cell_region)
            elif use_rapid and img_bgr is not None:
                fallback_words = ocr_region_rapid(img_bgr, cell_region)
            else:
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = ocr_region(ocr_img, cell_region,
                                            lang=cell_lang, psm=_cell_psm)

        if fallback_words:
            # Apply same confidence filter to fallback words
            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if fallback_words:
            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
            fb_y_tol = max(10, int(fb_avg_h * 0.5))
            fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
            if fb_text.strip():
                text = fb_text
                avg_conf = round(
                    sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                )
                used_engine = 'cell_ocr_fallback'

    # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
    if not text.strip() and _run_fallback and not use_rapid:
        _fb_region = PageRegion(
            type=col.type, x=cell_x, y=cell_y,
            width=cell_w, height=cell_h,
        )
        cell_lang = lang_map.get(col.type, lang)
        psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_ocr_psm7'

    # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
    # If a narrow cell is still empty, OCR the entire row strip with
    # RapidOCR (which handles small text better) and assign words by
    # X-position overlap with this column.
    if not text.strip() and is_narrow and img_bgr is not None:
        row_region = PageRegion(
            type='_row_strip', x=0, y=row.y,
            width=img_w, height=row.height,
        )
        strip_words = ocr_region_rapid(img_bgr, row_region)
        if strip_words:
            # Filter to words overlapping this column's X-range
            col_left = col.x
            col_right = col.x + col.width
            col_words = []
            for sw in strip_words:
                sw_left = sw.get('left', 0)
                sw_right = sw_left + sw.get('width', 0)
                overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                if overlap > sw.get('width', 1) * 0.3:
                    col_words.append(sw)
            if col_words:
                col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if col_words:
                rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                if rs_text.strip():
                    text = rs_text
                    avg_conf = round(
                        sum(w['conf'] for w in col_words) / len(col_words), 1
                    )
                    used_engine = 'row_strip_rapid'

    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0

    return {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': text,
        'confidence': avg_conf,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': used_engine,
    }
def _is_artifact_row(row: RowGeometry) -> bool:
    """Return True if this row contains only scan artifacts, not real text.

    Artifact rows (scanner shadows, noise) typically produce only single-character
    detections. A real content row always has at least one token with 2+ characters.
    """
    if not row.word_count:
        return True
    # Real content ⇔ at least one stripped token of length 2+.
    return not any(len(w.get('text', '').strip()) > 1 for w in row.words)


def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height to fill vertical gaps caused by removed adjacent rows.

    After filtering out empty or artifact rows, remaining content rows may have
    gaps between them where the removed rows used to be. This function mutates
    each row to extend upward/downward to the midpoint of such gaps so that
    OCR crops cover the full available content area.

    The first row always extends to top_bound; the last row to bottom_bound.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # Snapshot (top, bottom) spans before any mutation so midpoints are
    # computed from the original geometry.
    spans = [(r.y, r.y + r.height) for r in rows]
    last = n - 1

    for i, row in enumerate(rows):
        top, bottom = spans[i]

        # New top: midpoint of the gap to the previous row (or top_bound).
        if i == 0:
            healed_top = top_bound
        else:
            prev_bottom = spans[i - 1][1]
            delta = top - prev_bottom
            healed_top = (prev_bottom + delta // 2) if delta > 1 else top

        # New bottom: midpoint of the gap to the next row (or bottom_bound).
        if i == last:
            healed_bottom = bottom_bound
        else:
            next_top = spans[i + 1][0]
            delta = next_top - bottom
            healed_bottom = (bottom + delta // 2) if delta > 1 else bottom

        row.y = healed_top
        row.height = max(5, healed_bottom - healed_top)

    logger.debug(
        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Generic Cell-Grid: Columns × Rows → cells with OCR text.

    This is the layout-agnostic foundation. Every column (except column_ignore)
    is intersected with every content row to produce numbered cells.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'.
        img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR).

    Returns:
        (cells, columns_meta) where cells is a list of cell dicts and
        columns_meta describes the columns used.
    """
    # NOTE(review): `RAPIDOCR_AVAILABLE`, `ocr_region` and `_ocr_single_cell`
    # are referenced below but do not appear in this module's visible import
    # block — confirm they are defined/imported elsewhere in cv_cell_grid.py,
    # otherwise these code paths raise NameError at runtime (refactor split).

    # Resolve engine choice
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    # Filter phantom rows: rows with no Tesseract words assigned are
    # inter-line whitespace gaps that would produce garbage OCR.
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    # Use columns only — skip ignore, header, footer, page_ref
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    # Filter artifact rows: rows whose detected words are all single characters
    # are caused by scanner shadows or noise, not real text.
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
    # to fill the space so OCR crops are not artificially narrow.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Sort columns left-to-right
    relevant_cols.sort(key=lambda c: c.x)

    # Build columns_meta
    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    # Choose OCR language per column type (Tesseract only)
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Pre-assign each word to exactly one column (nearest center)
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            cells.append(cell)

    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
    # Collect cells that are still empty but have visible pixels.
    # Instead of calling Tesseract once per cell (expensive), crop an entire
    # column strip and run OCR once, then assign words to cells by Y position.
    empty_by_col: Dict[int, List[int]] = {}  # col_idx → [cell list indices]
    for ci, cell in enumerate(cells):
        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
            bpx = cell['bbox_px']
            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
            if w > 0 and h > 0 and ocr_img is not None:
                crop = ocr_img[y:y + h, x:x + w]
                if crop.size > 0:
                    # >0.5% dark pixels means the crop has ink worth re-OCRing.
                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                    if dark_ratio > 0.005:
                        empty_by_col.setdefault(cell['col_index'], []).append(ci)

    for col_idx, cell_indices in empty_by_col.items():
        if len(cell_indices) < 3:
            continue  # Not worth batching for < 3 cells

        # Find the column strip bounding box (union of all empty cell bboxes)
        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
        col_x = cells[cell_indices[0]]['bbox_px']['x']
        col_w = cells[cell_indices[0]]['bbox_px']['w']

        strip_region = PageRegion(
            type=relevant_cols[col_idx].type,
            x=col_x, y=min_y,
            width=col_w, height=max_y_h - min_y,
        )
        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)

        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        # Drop low-confidence words before assigning them to cells.
        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
        if not strip_words:
            continue

        # Assign words to cells by Y overlap
        for ci in cell_indices:
            cell_y = cells[ci]['bbox_px']['y']
            cell_h = cells[ci]['bbox_px']['h']
            cell_mid_y = cell_y + cell_h / 2

            # A word belongs to a cell when its vertical center lies within
            # 0.8 × cell height of the cell center (tolerates slight skew).
            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if matched_words:
                matched_words.sort(key=lambda w: w['left'])
                batch_text = ' '.join(w['text'] for w in matched_words)
                batch_text = _clean_cell_text(batch_text)
                if batch_text.strip():
                    cells[ci]['text'] = batch_text
                    cells[ci]['confidence'] = round(
                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
                    )
                    cells[ci]['ocr_engine'] = 'batch_column_ocr'

        batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
        if batch_filled > 0:
            logger.info(
                f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
                f"empty cells in column {col_idx}"
            )

    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
+ """ + # Resolve engine choice (same as build_cell_grid) + use_rapid = False + if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): + engine_name = ocr_engine + elif ocr_engine == "auto": + use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None + engine_name = "rapid" if use_rapid else "tesseract" + elif ocr_engine == "rapid": + if not RAPIDOCR_AVAILABLE: + logger.warning("RapidOCR requested but not available, falling back to Tesseract") + else: + use_rapid = True + engine_name = "rapid" if use_rapid else "tesseract" + else: + engine_name = "tesseract" + + content_rows = [r for r in row_geometries if r.row_type == 'content'] + if not content_rows: + return + + # Filter phantom rows: rows with no Tesseract words assigned are + # inter-line whitespace gaps that would produce garbage OCR. + before = len(content_rows) + content_rows = [r for r in content_rows if r.word_count > 0] + skipped = before - len(content_rows) + if skipped > 0: + logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)") + if not content_rows: + return + + _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} + relevant_cols = [c for c in column_regions if c.type not in _skip_types] + if not relevant_cols: + return + + # Filter artifact rows + heal gaps (same logic as build_cell_grid) + before_art = len(content_rows) + content_rows = [r for r in content_rows if not _is_artifact_row(r)] + artifact_skipped = before_art - len(content_rows) + if artifact_skipped > 0: + logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows") + if not content_rows: + return + _heal_row_gaps( + content_rows, + top_bound=min(c.y for c in relevant_cols), + bottom_bound=max(c.y + c.height for c in relevant_cols), + ) + + relevant_cols.sort(key=lambda c: c.x) + + columns_meta = [ + { + 'index': col_idx, + 'type': col.type, + 'x': col.x, + 'width': col.width, + } + for col_idx, col in 
enumerate(relevant_cols) + ] + + lang_map = { + 'column_en': 'eng', + 'column_de': 'deu', + 'column_example': 'eng+deu', + } + + total_cells = len(content_rows) * len(relevant_cols) + + for row_idx, row in enumerate(content_rows): + # Pre-assign each word to exactly one column (nearest center) + col_words = _assign_row_words_to_columns(row, relevant_cols) + for col_idx, col in enumerate(relevant_cols): + cell = _ocr_single_cell( + row_idx, col_idx, row, col, + ocr_img, img_bgr, img_w, img_h, + use_rapid, engine_name, lang, lang_map, + preassigned_words=col_words[col_idx], + ) + yield cell, columns_meta, total_cells + + +def _cells_to_vocab_entries( + cells: List[Dict[str, Any]], + columns_meta: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Map generic cells to vocab entries with english/german/example fields. + + Groups cells by row_index, maps col_type → field name, and produces + one entry per row (only rows with at least one non-empty field). + """ + # Determine image dimensions from first cell (for row-level bbox) + col_type_to_field = { + 'column_en': 'english', + 'column_de': 'german', + 'column_example': 'example', + 'page_ref': 'source_page', + 'column_marker': 'marker', + } + bbox_key_map = { + 'column_en': 'bbox_en', + 'column_de': 'bbox_de', + 'column_example': 'bbox_ex', + 'page_ref': 'bbox_ref', + 'column_marker': 'bbox_marker', + } + + # Group cells by row_index + rows: Dict[int, List[Dict]] = {} + for cell in cells: + ri = cell['row_index'] + rows.setdefault(ri, []).append(cell) + + entries: List[Dict[str, Any]] = [] + for row_idx in sorted(rows.keys()): + row_cells = rows[row_idx] + entry: Dict[str, Any] = { + 'row_index': row_idx, + 'english': '', + 'german': '', + 'example': '', + 'source_page': '', + 'marker': '', + 'confidence': 0.0, + 'bbox': None, + 'bbox_en': None, + 'bbox_de': None, + 'bbox_ex': None, + 'bbox_ref': None, + 'bbox_marker': None, + 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '', + } + + 
confidences = [] + for cell in row_cells: + col_type = cell['col_type'] + field = col_type_to_field.get(col_type) + if field: + entry[field] = cell['text'] + bbox_field = bbox_key_map.get(col_type) + if bbox_field: + entry[bbox_field] = cell['bbox_pct'] + if cell['confidence'] > 0: + confidences.append(cell['confidence']) + + # Compute row-level bbox as union of all cell bboxes + all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')] + if all_bboxes: + min_x = min(b['x'] for b in all_bboxes) + min_y = min(b['y'] for b in all_bboxes) + max_x2 = max(b['x'] + b['w'] for b in all_bboxes) + max_y2 = max(b['y'] + b['h'] for b in all_bboxes) + entry['bbox'] = { + 'x': round(min_x, 2), + 'y': round(min_y, 2), + 'w': round(max_x2 - min_x, 2), + 'h': round(max_y2 - min_y, 2), + } + + entry['confidence'] = round( + sum(confidences) / len(confidences), 1 + ) if confidences else 0.0 + + # Only include if at least one mapped field has text + has_content = any( + entry.get(f) + for f in col_type_to_field.values() + ) + if has_content: + entries.append(entry) + + return entries + + +# Regex: line starts with phonetic bracket content only (no real word before it) +_PHONETIC_ONLY_RE = re.compile( + r'''^\s*[\[\('"]*[^\]]*[\])\s]*$''' +) + + +def _is_phonetic_only_text(text: str) -> bool: + """Check if text consists only of phonetic transcription. 
+ + Phonetic-only patterns: + ['mani serva] → True + [dɑːns] → True + ["a:mand] → True + almond ['a:mand] → False (has real word before bracket) + Mandel → False + """ + t = text.strip() + if not t: + return False + # Must contain at least one bracket + if '[' not in t and ']' not in t: + return False + # Remove all bracket content and surrounding punctuation/whitespace + without_brackets = re.sub(r"\[.*?\]", '', t) + without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets) + # If nothing meaningful remains, it's phonetic-only + alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets)) + return len(alpha_remaining) < 2 + + +def _merge_phonetic_continuation_rows( + entries: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Merge rows that contain only phonetic transcription into previous entry. + + In dictionary pages, phonetic transcription sometimes wraps to the next + row. E.g.: + Row 28: EN="it's a money-saver" DE="es spart Kosten" + Row 29: EN="['mani serva]" DE="" + + Row 29 is phonetic-only → merge into row 28's EN field. 
+ """ + if len(entries) < 2: + return entries + + merged: List[Dict[str, Any]] = [] + for entry in entries: + en = (entry.get('english') or '').strip() + de = (entry.get('german') or '').strip() + ex = (entry.get('example') or '').strip() + + # Check if this entry is phonetic-only (EN has only phonetics, DE empty) + if merged and _is_phonetic_only_text(en) and not de: + prev = merged[-1] + prev_en = (prev.get('english') or '').strip() + # Append phonetic to previous entry's EN + if prev_en: + prev['english'] = prev_en + ' ' + en + else: + prev['english'] = en + # If there was an example, append to previous too + if ex: + prev_ex = (prev.get('example') or '').strip() + prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex + logger.debug( + f"Merged phonetic row {entry.get('row_index')} " + f"into previous entry: {prev['english']!r}" + ) + continue + + merged.append(entry) + + return merged + + +def _merge_continuation_rows( + entries: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Merge multi-line vocabulary entries where text wraps to the next row. + + A row is a continuation of the previous entry when: + - EN has text, but DE is empty + - EN starts with a lowercase letter (not a new vocab entry) + - Previous entry's EN does NOT end with a sentence terminator (.!?) + - The continuation text has fewer than 4 words (not an example sentence) + - The row was not already merged as phonetic + + Example: + Row 5: EN="to put up" DE="aufstellen" + Row 6: EN="with sth." DE="" + → Merged: EN="to put up with sth." 
DE="aufstellen" + """ + if len(entries) < 2: + return entries + + merged: List[Dict[str, Any]] = [] + for entry in entries: + en = (entry.get('english') or '').strip() + de = (entry.get('german') or '').strip() + + if merged and en and not de: + # Check: not phonetic (already handled) + if _is_phonetic_only_text(en): + merged.append(entry) + continue + + # Check: starts with lowercase + first_alpha = next((c for c in en if c.isalpha()), '') + starts_lower = first_alpha and first_alpha.islower() + + # Check: fewer than 4 words (not an example sentence) + word_count = len(en.split()) + is_short = word_count < 4 + + # Check: previous entry doesn't end with sentence terminator + prev = merged[-1] + prev_en = (prev.get('english') or '').strip() + prev_ends_sentence = prev_en and prev_en[-1] in '.!?' + + if starts_lower and is_short and not prev_ends_sentence: + # Merge into previous entry + prev['english'] = (prev_en + ' ' + en).strip() + # Merge example if present + ex = (entry.get('example') or '').strip() + if ex: + prev_ex = (prev.get('example') or '').strip() + prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex + logger.debug( + f"Merged continuation row {entry.get('row_index')} " + f"into previous entry: {prev['english']!r}" + ) + continue + + merged.append(entry) + + return merged + + +def build_word_grid( + ocr_img: np.ndarray, + column_regions: List[PageRegion], + row_geometries: List[RowGeometry], + img_w: int, + img_h: int, + lang: str = "eng+deu", + ocr_engine: str = "auto", + img_bgr: Optional[np.ndarray] = None, + pronunciation: str = "british", +) -> List[Dict[str, Any]]: + """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing. + + Wrapper around build_cell_grid() that adds vocabulary-specific logic: + - Maps cells to english/german/example entries + - Applies character confusion fixes, IPA lookup, comma splitting, etc. + - Falls back to returning raw cells if no vocab columns detected. 
def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.

    Wrapper around build_cell_grid() that adds vocabulary-specific logic:
    - Maps cells to english/german/example entries
    - Applies character confusion fixes, IPA lookup, comma splitting, etc.
    - Falls back to returning raw cells if no vocab columns detected.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w, img_h: Image dimensions.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', or 'auto'.
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        List of entry dicts with english/german/example text and bbox info (percent).
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )

    if not cells:
        return []

    # Check if vocab layout is present
    col_types = {c['type'] for c in columns_meta}
    if not (col_types & {'column_en', 'column_de'}):
        logger.info("build_word_grid: no vocab columns — returning raw cells")
        return cells

    # Vocab mapping: cells → entries
    entries = _cells_to_vocab_entries(cells, columns_meta)

    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)

    # 0a. Merge phonetic-only continuation rows into previous entry
    entries = _merge_phonetic_continuation_rows(entries)

    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
    entries = _merge_continuation_rows(entries)

    # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
    # llm_review_entries_streaming so changes are visible to the user in Step 6.

    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)

    # 3. Split comma-separated word forms (break, broke, broken → 3 entries)
    entries = _split_comma_entries(entries)

    # 4. Attach example sentences (rows without DE → examples for preceding entry)
    entries = _attach_example_sentences(entries)

    engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
    # NOTE(review): this log line prints len(entries) twice ("X entries from
    # N raw → X after post-processing"); the leading count was presumably
    # meant to be the raw count — consider rewording.
    logger.info(f"build_word_grid: {len(entries)} entries from "
                f"{n_raw} raw → {len(entries)} after post-processing "
                f"(engine={engine_name})")

    return entries

diff --git a/klausur-service/backend/cv_layout.py b/klausur-service/backend/cv_layout.py
new file mode 100644
index 0000000..47713a1
--- /dev/null
+++ b/klausur-service/backend/cv_layout.py
@@ -0,0 +1,3036 @@
"""
Document type detection, layout analysis, column/row geometry, and classification.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""

import logging
import re
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import (
    ColumnGeometry,
    DocumentTypeResult,
    ENGLISH_FUNCTION_WORDS,
    GERMAN_FUNCTION_WORDS,
    PageRegion,
    RowGeometry,
)

logger = logging.getLogger(__name__)

# cv2 / pytesseract are optional at import time; callers must handle None.
try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    import pytesseract
    from PIL import Image
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]
+ """ + if ocr_img is None or ocr_img.size == 0: + return DocumentTypeResult( + doc_type='full_text', confidence=0.5, pipeline='full_page', + skip_steps=['columns', 'rows'], + features={'error': 'empty image'}, + ) + + h, w = ocr_img.shape[:2] + + # --- 1. Vertical projection profile → detect column gaps --- + # Sum dark pixels along each column (x-axis). Gaps = valleys in the profile. + # Invert: dark pixels on white background → high values = text. + vert_proj = np.sum(ocr_img < 128, axis=0).astype(float) + + # Smooth the profile to avoid noise spikes + kernel_size = max(3, w // 100) + if kernel_size % 2 == 0: + kernel_size += 1 + vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same') + + # Find significant vertical gaps (columns of near-zero text density) + # A gap must be at least 1% of image width and have < 5% of max density + max_density = max(vert_smooth.max(), 1) + gap_threshold = max_density * 0.05 + min_gap_width = max(5, w // 100) + + in_gap = False + gap_count = 0 + gap_start = 0 + vert_gaps = [] + + for x in range(w): + if vert_smooth[x] < gap_threshold: + if not in_gap: + in_gap = True + gap_start = x + else: + if in_gap: + gap_width = x - gap_start + if gap_width >= min_gap_width: + gap_count += 1 + vert_gaps.append((gap_start, x, gap_width)) + in_gap = False + + # Filter out margin gaps (within 10% of image edges) + margin_threshold = w * 0.10 + internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold] + internal_gap_count = len(internal_gaps) + + # --- 2. 
Horizontal projection profile → detect row gaps --- + horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float) + h_kernel = max(3, h // 200) + if h_kernel % 2 == 0: + h_kernel += 1 + horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same') + + h_max = max(horiz_smooth.max(), 1) + h_gap_threshold = h_max * 0.05 + min_row_gap = max(3, h // 200) + + row_gap_count = 0 + in_gap = False + for y in range(h): + if horiz_smooth[y] < h_gap_threshold: + if not in_gap: + in_gap = True + gap_start = y + else: + if in_gap: + if y - gap_start >= min_row_gap: + row_gap_count += 1 + in_gap = False + + # --- 3. Text density distribution (4×4 grid) --- + grid_rows, grid_cols = 4, 4 + cell_h, cell_w = h // grid_rows, w // grid_cols + densities = [] + for gr in range(grid_rows): + for gc in range(grid_cols): + cell = ocr_img[gr * cell_h:(gr + 1) * cell_h, + gc * cell_w:(gc + 1) * cell_w] + if cell.size > 0: + d = float(np.count_nonzero(cell < 128)) / cell.size + densities.append(d) + + density_std = float(np.std(densities)) if densities else 0 + density_mean = float(np.mean(densities)) if densities else 0 + + features = { + 'vertical_gaps': gap_count, + 'internal_vertical_gaps': internal_gap_count, + 'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]], + 'row_gaps': row_gap_count, + 'density_mean': round(density_mean, 4), + 'density_std': round(density_std, 4), + 'image_size': (w, h), + } + + # --- 4. Decision tree --- + # Use internal_gap_count (excludes margin gaps) for column detection. 
+ if internal_gap_count >= 2 and row_gap_count >= 5: + # Multiple internal vertical gaps + many row gaps → table + confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005) + return DocumentTypeResult( + doc_type='vocab_table', + confidence=round(confidence, 2), + pipeline='cell_first', + skip_steps=[], + features=features, + ) + elif internal_gap_count >= 1 and row_gap_count >= 3: + # Some internal structure, likely a table + confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01) + return DocumentTypeResult( + doc_type='generic_table', + confidence=round(confidence, 2), + pipeline='cell_first', + skip_steps=[], + features=features, + ) + elif internal_gap_count == 0: + # No internal column gaps → full text (regardless of density) + confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15) + return DocumentTypeResult( + doc_type='full_text', + confidence=round(confidence, 2), + pipeline='full_page', + skip_steps=['columns', 'rows'], + features=features, + ) + else: + # Ambiguous — default to vocab_table (most common use case) + return DocumentTypeResult( + doc_type='vocab_table', + confidence=0.5, + pipeline='cell_first', + skip_steps=[], + features=features, + ) + + +# ============================================================================= +# Stage 4: Dual Image Preparation +# ============================================================================= + +def create_ocr_image(img: np.ndarray) -> np.ndarray: + """Create a binarized image optimized for Tesseract OCR. + + Steps: Grayscale → Background normalization → Adaptive threshold → Denoise. + + Args: + img: BGR image. + + Returns: + Binary image (white text on black background inverted to black on white). 
+ """ + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + # Background normalization: divide by blurred version + bg = cv2.GaussianBlur(gray, (51, 51), 0) + normalized = cv2.divide(gray, bg, scale=255) + + # Adaptive binarization + binary = cv2.adaptiveThreshold( + normalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, 31, 10 + ) + + # Light denoise + denoised = cv2.medianBlur(binary, 3) + + return denoised + + +def create_layout_image(img: np.ndarray) -> np.ndarray: + """Create a CLAHE-enhanced grayscale image for layout analysis. + + Args: + img: BGR image. + + Returns: + Enhanced grayscale image. + """ + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) + enhanced = clahe.apply(gray) + return enhanced + + +# ============================================================================= +# Stage 5: Layout Analysis (Projection Profiles) +# ============================================================================= + +def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray: + """Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask.""" + out = mask.copy() + n = len(out) + i = 0 + while i < n: + if out[i]: + start = i + while i < n and out[i]: + i += 1 + if (i - start) < min_width: + out[start:i] = False + else: + i += 1 + return out + + +def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]: + """Find the bounding box of actual text content (excluding page margins). + + Scan artefacts (thin black lines at page edges) are filtered out by + discarding contiguous projection runs narrower than 1 % of the image + dimension (min 5 px). + + Returns: + Tuple of (left_x, right_x, top_y, bottom_y). 
+ """ + h, w = inv.shape[:2] + threshold = 0.005 + + # --- Horizontal projection for top/bottom --- + h_proj = np.sum(inv, axis=1).astype(float) / (w * 255) + h_mask = h_proj > threshold + min_h_run = max(5, h // 100) + h_mask = _filter_narrow_runs(h_mask, min_h_run) + + top_y = 0 + for y in range(h): + if h_mask[y]: + top_y = max(0, y - 5) + break + + bottom_y = h + for y in range(h - 1, 0, -1): + if h_mask[y]: + bottom_y = min(h, y + 5) + break + + # --- Vertical projection for left/right margins --- + v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float) + v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj + v_mask = v_proj_norm > threshold + min_v_run = max(5, w // 100) + v_mask = _filter_narrow_runs(v_mask, min_v_run) + + left_x = 0 + for x in range(w): + if v_mask[x]: + left_x = max(0, x - 2) + break + + right_x = w + for x in range(w - 1, 0, -1): + if v_mask[x]: + right_x = min(w, x + 2) + break + + return left_x, right_x, top_y, bottom_y + + +def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]: + """Detect columns, header, and footer using projection profiles. + + Uses content-bounds detection to exclude page margins before searching + for column separators within the actual text area. + + Args: + layout_img: CLAHE-enhanced grayscale image. + ocr_img: Binarized image for text density analysis. + + Returns: + List of PageRegion objects describing detected regions. 
+ """ + h, w = ocr_img.shape[:2] + + # Invert: black text on white → white text on black for projection + inv = cv2.bitwise_not(ocr_img) + + # --- Find actual content bounds (exclude page margins) --- + left_x, right_x, top_y, bottom_y = _find_content_bounds(inv) + content_w = right_x - left_x + content_h = bottom_y - top_y + + logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), " + f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image") + + if content_w < w * 0.3 or content_h < h * 0.3: + # Fallback if detection seems wrong + left_x, right_x = 0, w + top_y, bottom_y = 0, h + content_w, content_h = w, h + + # --- Vertical projection within content area to find column separators --- + content_strip = inv[top_y:bottom_y, left_x:right_x] + v_proj = np.sum(content_strip, axis=0).astype(float) + v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj + + # Smooth the projection profile + kernel_size = max(5, content_w // 50) + if kernel_size % 2 == 0: + kernel_size += 1 + v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') + + # Debug: log projection profile statistics + p_mean = float(np.mean(v_proj_smooth)) + p_median = float(np.median(v_proj_smooth)) + p_min = float(np.min(v_proj_smooth)) + p_max = float(np.max(v_proj_smooth)) + logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, " + f"mean={p_mean:.4f}, median={p_median:.4f}") + + # Find valleys using multiple threshold strategies + # Strategy 1: relative to median (catches clear separators) + # Strategy 2: local minima approach (catches subtle gaps) + threshold = max(p_median * 0.3, p_mean * 0.2) + logger.info(f"Layout: valley threshold={threshold:.4f}") + + in_valley = v_proj_smooth < threshold + + # Find contiguous valley regions + all_valleys = [] + start = None + for x in range(len(v_proj_smooth)): + if in_valley[x] and start is None: + start = x + elif not in_valley[x] and start is not None: + valley_width = x 
- start + valley_depth = float(np.min(v_proj_smooth[start:x])) + # Valley must be at least 3px wide + if valley_width >= 3: + all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth)) + start = None + + logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — " + f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}") + + # Filter: valleys must be inside the content area (not at edges) + inner_margin = int(content_w * 0.08) + valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin] + + # If no valleys found with strict threshold, try local minima approach + if len(valleys) < 2: + logger.info("Layout: trying local minima approach for column detection") + # Divide content into 20 segments, find the 2 lowest + seg_count = 20 + seg_width = content_w // seg_count + seg_scores = [] + for i in range(seg_count): + sx = i * seg_width + ex = min((i + 1) * seg_width, content_w) + seg_mean = float(np.mean(v_proj_smooth[sx:ex])) + seg_scores.append((i, sx, ex, seg_mean)) + + seg_scores.sort(key=lambda s: s[3]) + logger.info(f"Layout: segment scores (lowest 5): " + f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}") + + # Find two lowest non-adjacent segments that create reasonable columns + candidate_valleys = [] + for seg_idx, sx, ex, seg_mean in seg_scores: + # Must not be at the edges + if seg_idx <= 1 or seg_idx >= seg_count - 2: + continue + # Must be significantly lower than overall mean + if seg_mean < p_mean * 0.6: + center = (sx + ex) // 2 + candidate_valleys.append((sx, ex, center, ex - sx, seg_mean)) + + if len(candidate_valleys) >= 2: + # Pick the best pair: non-adjacent, creating reasonable column widths + candidate_valleys.sort(key=lambda v: v[2]) + best_pair = None + best_score = float('inf') + for i in range(len(candidate_valleys)): + for j in range(i + 1, len(candidate_valleys)): + c1 = candidate_valleys[i][2] + c2 = candidate_valleys[j][2] + # 
Must be at least 20% apart + if (c2 - c1) < content_w * 0.2: + continue + col1 = c1 + col2 = c2 - c1 + col3 = content_w - c2 + # Each column at least 15% + if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12: + continue + parts = sorted([col1, col2, col3]) + score = parts[2] - parts[0] + if score < best_score: + best_score = score + best_pair = (candidate_valleys[i], candidate_valleys[j]) + + if best_pair: + valleys = list(best_pair) + logger.info(f"Layout: local minima found 2 valleys: " + f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}") + + logger.info(f"Layout: final {len(valleys)} valleys: " + f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}") + + regions = [] + + if len(valleys) >= 2: + # 3-column layout detected + valleys.sort(key=lambda v: v[2]) + + if len(valleys) == 2: + sep1_center = valleys[0][2] + sep2_center = valleys[1][2] + else: + # Pick the two valleys that best divide into 3 parts + # Prefer wider valleys (more likely true separators) + best_pair = None + best_score = float('inf') + for i in range(len(valleys)): + for j in range(i + 1, len(valleys)): + c1, c2 = valleys[i][2], valleys[j][2] + # Each column should be at least 15% of content width + col1 = c1 + col2 = c2 - c1 + col3 = content_w - c2 + if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15: + continue + # Score: lower is better (more even distribution) + parts = sorted([col1, col2, col3]) + score = parts[2] - parts[0] + # Bonus for wider valleys (subtract valley width) + score -= (valleys[i][3] + valleys[j][3]) * 0.5 + if score < best_score: + best_score = score + best_pair = (c1, c2) + if best_pair: + sep1_center, sep2_center = best_pair + else: + sep1_center = valleys[0][2] + sep2_center = valleys[1][2] + + # Convert from content-relative to absolute coordinates + abs_sep1 = sep1_center + left_x + abs_sep2 = sep2_center + left_x + + logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} " + 
                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")

        # Emit three regions: EN word column, DE translation column, example column.
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep1, y=top_y,
            width=abs_sep2 - abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_example', x=abs_sep2, y=top_y,
            width=w - abs_sep2, height=content_h
        ))

    elif len(valleys) == 1:
        # 2-column layout
        abs_sep = valleys[0][2] + left_x

        logger.info(f"Layout: 2 columns at separator x={abs_sep}")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep, y=top_y,
            width=w - abs_sep, height=content_h
        ))

    else:
        # No columns detected — run full-page OCR as single column
        logger.warning("Layout: no column separators found, using full page")
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=w, height=content_h
        ))

    # Add header/footer info (gap-based detection with fallback)
    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

    # Summary logging only — which region types made it into the result.
    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
    col_count = len([r for r in regions if r.type.startswith('column')])
    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

    return regions


# =============================================================================
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
# =============================================================================

# --- Phase A: Geometry Detection ---

# NOTE(review): ColumnGeometry is referenced below (annotations and constructors)
# but is not among this module's visible imports — only PageRegion and
# RowGeometry are imported from cv_vocab_types. Confirm ColumnGeometry is
# imported or defined earlier in this file, otherwise the module fails at
# import time when this def is evaluated.
def _detect_columns_by_clustering(
    word_dicts: List[Dict],
    left_edges: List[int],
    edge_word_indices: List[int],
    content_w: int,
    content_h: int,
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used when the primary gap-based algorithm finds fewer than 2 gaps.
    """
    # Cluster tolerance: word left-edges within this many pixels belong to the
    # same alignment cluster.
    tolerance = max(10, int(content_w * 0.01))
    # NOTE(review): indexes sorted_pairs[0] below — assumes left_edges is
    # non-empty (the visible caller only calls this after detecting >= 5 words).
    sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])

    # Single left-to-right sweep: group consecutive edges within `tolerance`.
    clusters = []
    cluster_widxs = []
    cur_edges = [sorted_pairs[0][0]]
    cur_widxs = [sorted_pairs[0][1]]
    for edge, widx in sorted_pairs[1:]:
        if edge - cur_edges[-1] <= tolerance:
            cur_edges.append(edge)
            cur_widxs.append(widx)
        else:
            clusters.append(cur_edges)
            cluster_widxs.append(cur_widxs)
            cur_edges = [edge]
            cur_widxs = [widx]
    clusters.append(cur_edges)
    cluster_widxs.append(cur_widxs)

    # A cluster qualifies as "primary" if its words span >= 30% of the content
    # height; "secondary" needs only 15% coverage but at least 5 words.
    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5

    cluster_infos = []
    for c_edges, c_widxs in zip(clusters, cluster_widxs):
        if len(c_edges) < 2:
            continue
        y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
        y_span = max(y_positions) - min(y_positions)
        y_coverage = y_span / content_h if content_h > 0 else 0.0
        cluster_infos.append({
            'mean_x': int(np.mean(c_edges)),
            'count': len(c_edges),
            'min_edge': min(c_edges),
            'max_edge': max(c_edges),
            'y_min': min(y_positions),
            'y_max': max(y_positions),
            'y_coverage': y_coverage,
        })

    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    # id()-based set: cluster dicts are not hashable, so membership is tracked
    # by object identity.
    primary_set = set(id(c) for c in primary)
    secondary = [c for c in cluster_infos
                 if id(c) not in primary_set
                 and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
                 and c['count'] >= MIN_WORDS_SECONDARY]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])

    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    # Merge clusters closer than ~6% of the content width into one column start.
    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
            # Fold s into the previous cluster: count-weighted mean position,
            # union of edge extents.
            prev = merged[-1]
            total = prev['count'] + s['count']
            avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
            prev['mean_x'] = avg_x
            prev['count'] = total
            prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
            prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
        else:
            merged.append(s.copy())

    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")

    # Column starts: cluster min_edge converted to absolute x, padded left by a
    # small margin so the first characters are not clipped.
    margin_px = max(6, int(content_w * 0.003))
    return _build_geometries_from_starts(
        [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
    )


def _detect_sub_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    top_y: int = 0,
    header_y: Optional[int] = None,
    footer_y: Optional[int] = None,
    _edge_tolerance: int = 8,
    _min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
    """Split columns that contain internal sub-columns based on left-edge alignment.

    For each column, clusters word left-edges into alignment bins (within
    ``_edge_tolerance`` px). The leftmost bin whose word count reaches
    ``_min_col_start_ratio`` of the column total is treated as the true column
    start. Any words to the left of that bin form a sub-column, provided they
    number >= 2 and < 35 % of total.

    Word ``left`` values are relative to the content ROI (offset by *left_x*),
    while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
    bridges the two coordinate systems.

    If *header_y* / *footer_y* are provided (absolute y-coordinates), words
    in header/footer regions are excluded from alignment clustering to avoid
    polluting the bins with page numbers or chapter titles. Word ``top``
    values are relative to *top_y*.

    Returns a new list of ColumnGeometry — potentially longer than the input.
    """
    if content_w <= 0:
        return geometries

    result: List[ColumnGeometry] = []
    for geo in geometries:
        # Only consider wide-enough columns with enough words
        if geo.width_ratio < 0.15 or geo.word_count < 5:
            result.append(geo)
            continue

        # Collect left-edges of confident words, excluding header/footer
        # Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y)
        min_top_rel = (header_y - top_y) if header_y is not None else None
        max_top_rel = (footer_y - top_y) if footer_y is not None else None

        confident = [w for w in geo.words
                     if w.get('conf', 0) >= 30
                     and (min_top_rel is None or w['top'] >= min_top_rel)
                     and (max_top_rel is None or w['top'] <= max_top_rel)]
        if len(confident) < 3:
            result.append(geo)
            continue

        # --- Cluster left-edges into alignment bins ---
        sorted_edges = sorted(w['left'] for w in confident)
        bins: List[Tuple[int, int, int, int]] = []  # (center, count, min_edge, max_edge)
        cur = [sorted_edges[0]]
        for i in range(1, len(sorted_edges)):
            if sorted_edges[i] - cur[-1] <= _edge_tolerance:
                cur.append(sorted_edges[i])
            else:
                bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
                cur = [sorted_edges[i]]
        bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))

        # --- Find the leftmost bin qualifying as a real column start ---
        total = len(confident)
        min_count = max(3, int(total * _min_col_start_ratio))
        col_start_bin = None
        for b in bins:
            if b[1] >= min_count:
                col_start_bin = b
                break

        if col_start_bin is None:
            result.append(geo)
            continue

        # Words to the left of the column-start bin are sub-column candidates
        split_threshold = col_start_bin[2] - _edge_tolerance
        sub_words = [w for w in geo.words if w['left'] < split_threshold]
        main_words = [w for w in geo.words if w['left'] >= split_threshold]

        # Count only body words (excluding header/footer) for the threshold check
        # so that header/footer words don't artificially trigger a split.
        sub_body = [w for w in sub_words
                    if (min_top_rel is None or w['top'] >= min_top_rel)
                    and (max_top_rel is None or w['top'] <= max_top_rel)]
        if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
            result.append(geo)
            continue

        # --- Build two sub-column geometries ---
        # Word 'left' values are relative to left_x; geo.x is absolute.
        # Convert the split position from relative to absolute coordinates.
        # Split midway between the rightmost sub-column word and the main
        # column's start bin.
        max_sub_left = max(w['left'] for w in sub_words)
        split_rel = (max_sub_left + col_start_bin[2]) // 2
        split_abs = split_rel + left_x

        sub_x = geo.x
        sub_width = split_abs - geo.x
        main_x = split_abs
        main_width = (geo.x + geo.width) - split_abs

        if sub_width <= 0 or main_width <= 0:
            result.append(geo)
            continue

        sub_geo = ColumnGeometry(
            index=0,
            x=sub_x,
            y=geo.y,
            width=sub_width,
            height=geo.height,
            word_count=len(sub_words),
            words=sub_words,
            width_ratio=sub_width / content_w if content_w > 0 else 0.0,
            is_sub_column=True,
        )
        main_geo = ColumnGeometry(
            index=0,
            x=main_x,
            y=geo.y,
            width=main_width,
            height=geo.height,
            word_count=len(main_words),
            words=main_words,
            width_ratio=main_width / content_w if content_w > 0 else 0.0,
            is_sub_column=True,
        )

        result.append(sub_geo)
        result.append(main_geo)

        logger.info(
            f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
            f"(rel={split_rel}), sub={len(sub_words)} words, "
            f"main={len(main_words)} words, "
            f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
        )

    # Re-index by left-to-right order
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result


def _split_broad_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    _broad_threshold: float = 0.35,
    _min_gap_px: int = 15,
    _min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
    """Split overly broad columns that contain two language blocks (EN+DE).

    Uses word-coverage gap analysis: builds a per-pixel coverage array from the
    words inside each broad column, finds the largest horizontal gap, and splits
    the column at that gap.

    Args:
        geometries: Column geometries from _detect_sub_columns.
        content_w: Width of the content area in pixels.
        left_x: Left edge of content ROI in absolute image coordinates.
        _broad_threshold: Minimum width_ratio to consider a column "broad".
        _min_gap_px: Minimum gap width (pixels) to trigger a split.
        _min_words_per_split: Both halves must have at least this many words.

    Returns:
        Updated list of ColumnGeometry (possibly with more columns).
    """
    result: List[ColumnGeometry] = []

    logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
                f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")

    for geo in geometries:
        # Narrow or sparsely populated columns pass through untouched.
        if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
            result.append(geo)
            continue

        # Build word-coverage array (per pixel within column)
        col_left_rel = geo.x - left_x  # column left in content-relative coords
        coverage = np.zeros(geo.width, dtype=np.float32)

        for wd in geo.words:
            # wd['left'] is relative to left_x (content ROI)
            wl = wd['left'] - col_left_rel
            wr = wl + wd.get('width', 0)
            wl = max(0, int(wl))
            wr = min(geo.width, int(wr))
            if wr > wl:
                coverage[wl:wr] += 1.0

        # Light smoothing (kernel=3px) to avoid noise
        if len(coverage) > 3:
            kernel = np.ones(3, dtype=np.float32) / 3.0
            coverage = np.convolve(coverage, kernel, mode='same')

        # Normalise to [0, 1]
        cmax = coverage.max()
        if cmax > 0:
            coverage /= cmax

        # Find INTERNAL gaps where coverage < 0.5
        # Exclude edge gaps (touching pixel 0 or geo.width) — those are margins.
        low_mask = coverage < 0.5
        all_gaps = []
        _gs = None
        for px in range(len(low_mask)):
            if low_mask[px]:
                if _gs is None:
                    _gs = px
            else:
                if _gs is not None:
                    all_gaps.append((_gs, px, px - _gs))
                    _gs = None
        if _gs is not None:
            # Gap running to the right edge of the column.
            all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))

        # Filter: only internal gaps (not touching column edges)
        _edge_margin = 10  # pixels from edge to ignore
        internal_gaps = [g for g in all_gaps
                         if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
        best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None

        logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
                    f"{[g for g in all_gaps if g[2] >= 5]}, "
                    f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
                    f"best={best_gap}")

        if best_gap is None or best_gap[2] < _min_gap_px:
            result.append(geo)
            continue

        gap_center = (best_gap[0] + best_gap[1]) // 2

        # Split words by midpoint relative to gap
        left_words = []
        right_words = []
        for wd in geo.words:
            wl = wd['left'] - col_left_rel
            mid = wl + wd.get('width', 0) / 2.0
            if mid < gap_center:
                left_words.append(wd)
            else:
                right_words.append(wd)

        if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
            result.append(geo)
            continue

        # Build two new ColumnGeometry objects
        split_x_abs = geo.x + gap_center
        left_w = gap_center
        right_w = geo.width - gap_center

        left_geo = ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=left_w,
            height=geo.height,
            word_count=len(left_words),
            words=left_words,
            width_ratio=left_w / content_w if content_w else 0,
            is_sub_column=True,
        )
        right_geo = ColumnGeometry(
            index=0,
            x=split_x_abs,
            y=geo.y,
            width=right_w,
            height=geo.height,
            word_count=len(right_words),
            words=right_words,
            width_ratio=right_w / content_w if content_w else 0,
            is_sub_column=True,
        )

        logger.info(
            f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
            f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
            f"left={len(left_words)} words (w={left_w}), "
            f"right={len(right_words)} words (w={right_w})"
        )

        result.append(left_geo)
        result.append(right_geo)

    # Re-index left-to-right
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result


def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
    inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
    """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs.

    Each column spans from its start to the next column's start (the last one
    runs to *right_x*). Words are re-assigned to columns by their content-ROI
    relative left edge.
    """
    geometries = []
    for i, (start_x, count) in enumerate(col_starts):
        if i + 1 < len(col_starts):
            col_width = col_starts[i + 1][0] - start_x
        else:
            col_width = right_x - start_x

        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [w for w in word_dicts
                     if col_left_rel <= w['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)


def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
    """Detect column geometry using whitespace-gap analysis with word validation.

    Phase A of the two-phase column detection. Uses vertical projection
    profiles to find whitespace gaps between columns, then validates that
    no gap cuts through a word bounding box.

    Falls back to clustering-based detection if fewer than 2 gaps are found.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
        or None if detection fails entirely.
    """
    h, w = ocr_img.shape[:2]

    # --- Step 1: Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    # Sanity fallback: if detected content covers < 30% of either dimension,
    # the bounds are likely wrong — use the full image instead.
    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Step 2: Get word bounding boxes from Tesseract ---
    # Crop from left_x to full image width (not right_x) so words at the right
    # edge of the last column are included even if they extend past the detected
    # content boundary (right_x).
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    # NOTE(review): pytesseract is not among this module's visible imports —
    # confirm it is imported earlier in the file.
    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
        return None

    # Keep only confident, non-empty words. Coordinates are relative to the
    # content ROI (i.e. offset by left_x / top_y from the full image).
    word_dicts = []
    left_edges = []
    edge_word_indices = []
    n_words = len(data['text'])
    for i in range(n_words):
        # Tesseract reports conf as str or int; non-numeric values become -1.
        conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
        text = str(data['text'][i]).strip()
        if conf < 30 or not text:
            continue
        lx = int(data['left'][i])
        ty = int(data['top'][i])
        bw = int(data['width'][i])
        bh = int(data['height'][i])
        left_edges.append(lx)
        edge_word_indices.append(len(word_dicts))
        word_dicts.append({
            'text': text, 'conf': conf,
            'left': lx, 'top': ty, 'width': bw, 'height': bh,
        })

    if len(left_edges) < 5:
        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
        return None

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

    # --- Step 2b: Segment by sub-headers ---
    # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
    # text bands that pollute the vertical projection. We detect large
    # horizontal gaps (= whitespace rows separating sections) and use only
    # the tallest content segment for the projection. This makes column
    # detection immune to sub-headers, illustrations, and section dividers.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    h_proj_row = np.sum(content_strip, axis=1).astype(float)
    h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row

    # Find horizontal gaps (near-empty rows)
    H_GAP_THRESH = 0.02  # rows with <2% ink density are "empty"
    h_in_gap = h_proj_row_norm < H_GAP_THRESH
    H_MIN_GAP = max(5, content_h // 200)  # min gap height ~5-7px

    h_gaps: List[Tuple[int, int]] = []
    h_gap_start = None
    for y_idx in range(len(h_in_gap)):
        if h_in_gap[y_idx]:
            if h_gap_start is None:
                h_gap_start = y_idx
        else:
            if h_gap_start is not None:
                if y_idx - h_gap_start >= H_MIN_GAP:
                    h_gaps.append((h_gap_start, y_idx))
                h_gap_start = None
    if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
        h_gaps.append((h_gap_start, len(h_in_gap)))

    # Identify "large" gaps (significantly bigger than median) that indicate
    # section boundaries (sub-headers, chapter titles).
    if len(h_gaps) >= 3:
        gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
        median_gap_h = gap_sizes[len(gap_sizes) // 2]
        large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
        large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
    else:
        large_gaps = h_gaps

    # Build content segments between large gaps and pick the tallest
    # Boundaries alternate: [seg_start, gap_start, gap_end, ..., content_h],
    # so consecutive index pairs (0,1), (2,3), ... delimit content segments.
    seg_boundaries = [0]
    for gs, ge in large_gaps:
        seg_boundaries.append(gs)
        seg_boundaries.append(ge)
    seg_boundaries.append(content_h)

    segments = []
    for i in range(0, len(seg_boundaries) - 1, 2):
        seg_top = seg_boundaries[i]
        seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
        seg_height = seg_bot - seg_top
        if seg_height > 20:  # ignore tiny fragments
            segments.append((seg_top, seg_bot, seg_height))

    if segments:
        segments.sort(key=lambda s: s[2], reverse=True)
        best_seg = segments[0]
        proj_strip = content_strip[best_seg[0]:best_seg[1], :]
        effective_h = best_seg[2]
        if len(segments) > 1:
            logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
                        f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
                        f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
    else:
        proj_strip = content_strip
        effective_h = content_h

    # --- Step 3: Vertical projection profile ---
    v_proj = np.sum(proj_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps
    kernel_size = max(5, content_w // 80)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for symmetry
    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # --- Step 4: Find whitespace gaps ---
    # Threshold: areas with very little ink density are gaps
    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.005)

    in_gap = v_smooth < gap_threshold
    MIN_GAP_WIDTH = max(8, content_w // 200)  # min ~8px or 0.5% of content width

    # Collect contiguous gap regions
    raw_gaps = []  # (start_x_rel, end_x_rel) relative to content ROI
    gap_start = None
    for x in range(len(in_gap)):
        if in_gap[x]:
            if gap_start is None:
                gap_start = x
        else:
            if gap_start is not None:
                gap_width = x - gap_start
                if gap_width >= MIN_GAP_WIDTH:
                    raw_gaps.append((gap_start, x))
                gap_start = None
    # Handle gap at the right edge
    if gap_start is not None:
        gap_width = len(in_gap) - gap_start
        if gap_width >= MIN_GAP_WIDTH:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_width={MIN_GAP_WIDTH}px): "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")

    # --- Step 5: Validate gaps against word bounding boxes ---
    # When using a segment for projection, only validate against words
    # inside that segment — words from sub-headers or other sections
    # would incorrectly overlap with real column gaps.
    if segments and len(segments) > 1:
        seg_top_abs = best_seg[0]  # relative to content strip
        seg_bot_abs = best_seg[1]
        segment_words = [wd for wd in word_dicts
                         if wd['top'] >= seg_top_abs
                         and wd['top'] + wd['height'] <= seg_bot_abs]
        logger.info(f"ColumnGeometry: filtering words to segment: "
                    f"{len(segment_words)}/{len(word_dicts)} words")
    else:
        segment_words = word_dicts

    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        # Check if any word overlaps with this gap region
        overlapping = False
        for wd in segment_words:
            word_left = wd['left']
            word_right = wd['left'] + wd['width']
            if word_left < gap_end_rel and word_right > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid the overlapping word(s)
            # Find the tightest word boundaries within the gap region
            min_word_left = content_w
            max_word_right = 0
            for wd in segment_words:
                word_left = wd['left']
                word_right = wd['left'] + wd['width']
                if word_left < gap_end_rel and word_right > gap_start_rel:
                    min_word_left = min(min_word_left, word_left)
                    max_word_right = max(max_word_right, word_right)

            # Try gap before the overlapping words
            if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
                validated_gaps.append((gap_start_rel, min_word_left))
                logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
            # Try gap after the overlapping words
            elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
                validated_gaps.append((max_word_right, gap_end_rel))
                logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
            else:
                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

    # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
    # When pixel-based projection fails (e.g. due to illustrations or colored
    # bands), use word bounding boxes to find clear vertical gaps. This is
    # immune to decorative graphics that Tesseract doesn't recognise as words.
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
        word_coverage = np.zeros(content_w, dtype=np.int32)
        for wd in segment_words:
            wl = max(0, wd['left'])
            wr = min(wd['left'] + wd['width'], content_w)
            if wr > wl:
                word_coverage[wl:wr] += 1

        # Smooth slightly to bridge tiny 1-2px noise gaps between words
        wc_kernel = max(3, content_w // 300)
        if wc_kernel % 2 == 0:
            wc_kernel += 1
        wc_smooth = np.convolve(word_coverage.astype(float),
                                np.ones(wc_kernel) / wc_kernel, mode='same')

        wc_in_gap = wc_smooth < 0.5  # effectively zero word coverage
        WC_MIN_GAP = max(4, content_w // 300)

        wc_gaps: List[Tuple[int, int]] = []
        wc_gap_start = None
        for x in range(len(wc_in_gap)):
            if wc_in_gap[x]:
                if wc_gap_start is None:
                    wc_gap_start = x
            else:
                if wc_gap_start is not None:
                    if x - wc_gap_start >= WC_MIN_GAP:
                        wc_gaps.append((wc_gap_start, x))
                    wc_gap_start = None
        if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP:
            wc_gaps.append((wc_gap_start, len(wc_in_gap)))

        logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
                    f"(min_width={WC_MIN_GAP}px): "
                    f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")

        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
        return _detect_columns_by_clustering(
            word_dicts, left_edges, edge_word_indices,
            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
        )

    # --- Step 7: Derive column boundaries from gaps ---
    # Sort gaps by position
    validated_gaps.sort(key=lambda g: g[0])

    # Identify margin gaps (first and last) vs interior gaps
    # A margin gap touches the edge of the content area (within 2% tolerance)
    edge_tolerance = max(10, int(content_w * 0.02))

    is_left_margin = validated_gaps[0][0] <= edge_tolerance
    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance

    # Interior gaps define column boundaries
    # Column starts at the end of a gap, ends at the start of the next gap
    col_starts = []

    if is_left_margin:
        # First column starts after the left margin gap
        first_gap_end = validated_gaps[0][1]
        interior_gaps = validated_gaps[1:]
    else:
        # No left margin gap — first column starts at content left edge
        first_gap_end = 0
        interior_gaps = validated_gaps[:]

    if is_right_margin:
        # Last gap is right margin — don't use it as column start
        interior_gaps_for_boundaries = interior_gaps[:-1]
        right_boundary = validated_gaps[-1][0]  # last column ends at right margin gap start
    else:
        interior_gaps_for_boundaries = interior_gaps
        right_boundary = content_w

    # First column
    col_starts.append(left_x + first_gap_end)

    # Columns between interior gaps
    for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
        col_starts.append(left_x + gap_end_rel)

    # Count words per column region (for logging)
    col_start_counts = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            next_start = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            # The page margin contains only white space — extending the OCR
            # crop to the image edge is safe and prevents text near the right
            # border from being cut off.
            next_start = w

        col_left_rel = start_x - left_x
        col_right_rel = next_start - left_x
        n_words_in_col = sum(1 for w in word_dicts
                             if col_left_rel <= w['left'] < col_right_rel)
        col_start_counts.append((start_x, n_words_in_col))

    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
                f"{col_start_counts}")

    # --- Step 8: Build ColumnGeometry objects ---
    # Determine right edge for each column
    all_boundaries = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            end_x = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            end_x = w
        all_boundaries.append((start_x, end_x))

    geometries = []
    for i, (start_x, end_x) in enumerate(all_boundaries):
        col_width = end_x - start_x
        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [w for w in word_dicts
                     if col_left_rel <= w['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    # --- Step 9: Filter phantom narrow columns ---
    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
    # columns (< 3% of content width) with zero or no words. These are not
    # real columns — remove them and close the gap between neighbors.
    min_real_col_w = max(20, int(content_w * 0.03))
    filtered_geoms = [g for g in geometries
                      if not (g.word_count < 3 and g.width < min_real_col_w)]
    if len(filtered_geoms) < len(geometries):
        n_removed = len(geometries) - len(filtered_geoms)
        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
                    f"(width < {min_real_col_w}px and words < 3)")
        # Extend each remaining column to close gaps with its right neighbor
        for i, g in enumerate(filtered_geoms):
            if i + 1 < len(filtered_geoms):
                g.width = filtered_geoms[i + 1].x - g.x
            else:
                g.width = w - g.x
            g.index = i
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [w for w in word_dicts
                       if col_left_rel <= w['left'] < col_right_rel]
            g.word_count = len(g.words)
        geometries = filtered_geoms
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)


def expand_narrow_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int,
    word_dicts: List[Dict],
) -> List[ColumnGeometry]:
    """Expand narrow columns into adjacent whitespace gaps.

    Narrow columns (marker, page_ref, < 10% content width) often lose
    content at image edges due to residual shear. This expands them toward
    the neighbouring column, but never past 40% of the gap or past the
    nearest word in the neighbour.

    Must be called AFTER _detect_sub_columns() so that sub-column splits
    (which create the narrowest columns) have already happened.
+ """ + _NARROW_THRESHOLD_PCT = 10.0 + _MIN_WORD_MARGIN = 4 + + if len(geometries) < 2: + return geometries + + logger.info("ExpandNarrowCols: input %d cols: %s", + len(geometries), + [(i, g.x, g.width, round(g.width / content_w * 100, 1)) + for i, g in enumerate(geometries)]) + + for i, g in enumerate(geometries): + col_pct = g.width / content_w * 100 if content_w > 0 else 100 + if col_pct >= _NARROW_THRESHOLD_PCT: + continue + + expanded = False + orig_pct = col_pct + + # --- try expanding to the LEFT --- + if i > 0: + left_nb = geometries[i - 1] + # Gap can be 0 if sub-column split created adjacent columns. + # In that case, look at where the neighbor's rightmost words + # actually are — there may be unused space we can claim. + nb_words_right = [wd['left'] + wd.get('width', 0) + for wd in left_nb.words] + if nb_words_right: + rightmost_word_abs = left_x + max(nb_words_right) + safe_left_abs = rightmost_word_abs + _MIN_WORD_MARGIN + else: + # No words in neighbor → we can take up to neighbor's start + safe_left_abs = left_nb.x + _MIN_WORD_MARGIN + + if safe_left_abs < g.x: + g.width += (g.x - safe_left_abs) + g.x = safe_left_abs + expanded = True + + # --- try expanding to the RIGHT --- + if i + 1 < len(geometries): + right_nb = geometries[i + 1] + nb_words_left = [wd['left'] for wd in right_nb.words] + if nb_words_left: + leftmost_word_abs = left_x + min(nb_words_left) + safe_right_abs = leftmost_word_abs - _MIN_WORD_MARGIN + else: + safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN + + cur_right = g.x + g.width + if safe_right_abs > cur_right: + g.width = safe_right_abs - g.x + expanded = True + + if expanded: + col_left_rel = g.x - left_x + col_right_rel = col_left_rel + g.width + g.words = [wd for wd in word_dicts + if col_left_rel <= wd['left'] < col_right_rel] + g.word_count = len(g.words) + g.width_ratio = g.width / content_w if content_w > 0 else 0.0 + logger.info( + "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d", + i, 
orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count) + + # --- Shrink overlapping neighbors to match new boundaries --- + # Left neighbor: its right edge must not exceed our new left edge + if i > 0: + left_nb = geometries[i - 1] + nb_right = left_nb.x + left_nb.width + if nb_right > g.x: + left_nb.width = g.x - left_nb.x + if left_nb.width < 0: + left_nb.width = 0 + left_nb.width_ratio = left_nb.width / content_w if content_w > 0 else 0.0 + # Re-assign words + nb_left_rel = left_nb.x - left_x + nb_right_rel = nb_left_rel + left_nb.width + left_nb.words = [wd for wd in word_dicts + if nb_left_rel <= wd['left'] < nb_right_rel] + left_nb.word_count = len(left_nb.words) + + # Right neighbor: its left edge must not be before our new right edge + if i + 1 < len(geometries): + right_nb = geometries[i + 1] + my_right = g.x + g.width + if right_nb.x < my_right: + old_right_edge = right_nb.x + right_nb.width + right_nb.x = my_right + right_nb.width = old_right_edge - right_nb.x + if right_nb.width < 0: + right_nb.width = 0 + right_nb.width_ratio = right_nb.width / content_w if content_w > 0 else 0.0 + # Re-assign words + nb_left_rel = right_nb.x - left_x + nb_right_rel = nb_left_rel + right_nb.width + right_nb.words = [wd for wd in word_dicts + if nb_left_rel <= wd['left'] < nb_right_rel] + right_nb.word_count = len(right_nb.words) + + return geometries + + +# ============================================================================= +# Row Geometry Detection (horizontal whitespace-gap analysis) +# ============================================================================= + +def detect_row_geometry( + inv: np.ndarray, + word_dicts: List[Dict], + left_x: int, right_x: int, + top_y: int, bottom_y: int, +) -> List['RowGeometry']: + """Detect row geometry using horizontal whitespace-gap analysis. + + Mirrors the vertical gap approach used for columns, but operates on + horizontal projection profiles to find gaps between text lines. 
    Also classifies header/footer rows based on gap size.

    Args:
        inv: Inverted binarized image (white text on black bg, full page).
        word_dicts: Word bounding boxes from Tesseract (relative to content ROI).
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects sorted top to bottom.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile (text-only, images masked out) ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]

    # Build a word-coverage mask so that image regions (high ink density but no
    # Tesseract words) are ignored. Only pixels within/near word bounding boxes
    # contribute to the projection. This prevents large illustrations from
    # merging multiple vocabulary rows into one.
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255

    masked_strip = cv2.bitwise_and(content_strip, word_mask)
    # Per-scanline ink sum, normalized to [0, 1] by the max possible ink.
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

    # --- Step 2: Smoothing + threshold ---
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1  # box kernel must be odd for a symmetric smooth
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Gap threshold is relative to the typical (median) non-zero ink density.
    median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = []  # (start_y_rel, end_y_rel) relative to content ROI
    gap_start = None
    for y in range(len(in_gap)):
        if in_gap[y]:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    # Close a gap that runs to the bottom of the content area.
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        overlapping = False
        for wd in word_dicts:
            word_top = wd['top']
            word_bottom = wd['top'] + wd['height']
            if word_top < gap_end_rel and word_bottom > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid overlapping words
            min_word_top = content_h
            max_word_bottom = 0
            for wd in word_dicts:
                word_top = wd['top']
                word_bottom = wd['top'] + wd['height']
                if word_top < gap_end_rel and word_bottom > gap_start_rel:
                    min_word_top = min(min_word_top, word_top)
                    max_word_bottom = max(max_word_bottom, word_bottom)

            # Keep the word-free part of the gap (above or below the words),
            # if it is still tall enough to count as a gap.
            if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
                validated_gaps.append((gap_start_rel, min_word_top))
            elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
                validated_gaps.append((max_word_bottom, gap_end_rel))
            else:
                logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    HEADER_FOOTER_ZONE = 0.15
    GAP_MULTIPLIER = 2.0

    gap_sizes = [g[1] - g[0] for g in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    header_boundary_rel = None  # y below which is header
    footer_boundary_rel = None  # y above which is footer

    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    # Find largest gap in header zone
    best_header_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]):
                best_header_gap = (gs, ge)

    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    # Find largest gap in footer zone
    best_footer_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]):
                best_footer_gap = (gs, ge)

    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between gaps
    row_boundaries = []  # (start_y_rel, end_y_rel)

    # Top of content to first gap
    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
        row_boundaries.append((0, validated_gaps[0][0]))

    # Between gaps
    for i in range(len(validated_gaps) - 1):
        row_start = validated_gaps[i][1]
        row_end = validated_gaps[i + 1][0]
        if row_end - row_start > 0:
            row_boundaries.append((row_start, row_end))

    # Last gap to bottom of content
    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Determine row type
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # Collect words in this row (a word belongs to the row containing
        # its vertical center).
        row_words = [w for w in word_dicts
                     if w['top'] + w['height'] / 2 >= row_start_rel
                     and w['top'] + w['height'] / 2 < row_end_rel]

        # Gap before this row
        gap_before = 0
        if idx == 0 and validated_gaps[0][0] > 0:
            gap_before = validated_gaps[0][0]
        elif idx > 0:
            # Find the gap just before this row boundary
            for gs, ge in validated_gaps:
                if ge == row_start_rel:
                    gap_before = ge - gs
                    break

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    # Derive precise row boundaries from word vertical centers. Detects
    # section breaks (headings, paragraphs) and builds per-section grids.
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows


def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    1. Group words into line clusters (by Y proximity).
    2. For each cluster compute center_y (median of word vertical centers)
       and letter_height (median of word heights).
    3. Compute the pitch (distance between consecutive centers).
    4. Detect section breaks where the gap is >1.8× the median pitch
       (headings, sub-headings, paragraph breaks).
    5. Within each section, use the local pitch to place row boundaries
       at the midpoints between consecutive centers.
    6. Validate that ≥85% of words land in a grid row; otherwise fall back.

    Header/footer rows from the gap-based detection are preserved.
+ """ + content_rows = [r for r in rows if r.row_type == 'content'] + non_content = [r for r in rows if r.row_type != 'content'] + + if len(content_rows) < 5: + return rows + + # --- Step A: Group ALL words into line clusters --- + # Collect words that belong to content rows (deduplicated) + content_words: List[Dict] = [] + seen_keys: set = set() + for r in content_rows: + for w in r.words: + key = (w['left'], w['top'], w['width'], w['height']) + if key not in seen_keys: + seen_keys.add(key) + content_words.append(w) + + if len(content_words) < 5: + return rows + + # Compute median word height (excluding outliers like tall brackets/IPA) + word_heights = sorted(w['height'] for w in content_words) + median_wh = word_heights[len(word_heights) // 2] + + # Compute median gap-based row height — this is the actual line height + # as detected by the horizontal projection. We use 40% of this as + # grouping tolerance. This is much more reliable than using word height + # alone, because words on the same line can have very different heights + # (e.g. lowercase vs uppercase, brackets, phonetic symbols). + gap_row_heights = sorted(r.height for r in content_rows) + median_row_h = gap_row_heights[len(gap_row_heights) // 2] + + # Tolerance: 40% of row height. Words on the same line should have + # centers within this range. Even if a word's bbox is taller/shorter, + # its center should stay within half a row height of the line center. 
+ y_tol = max(10, int(median_row_h * 0.4)) + + # Sort by center_y, then group by proximity + words_by_center = sorted(content_words, + key=lambda w: (w['top'] + w['height'] / 2, w['left'])) + line_clusters: List[List[Dict]] = [] + current_line: List[Dict] = [words_by_center[0]] + current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2 + + for w in words_by_center[1:]: + w_center = w['top'] + w['height'] / 2 + if abs(w_center - current_center) <= y_tol: + current_line.append(w) + else: + current_line.sort(key=lambda w: w['left']) + line_clusters.append(current_line) + current_line = [w] + current_center = w_center + + if current_line: + current_line.sort(key=lambda w: w['left']) + line_clusters.append(current_line) + + if len(line_clusters) < 3: + return rows + + # --- Step B: Compute center_y per cluster --- + # center_y = median of (word_top + word_height/2) across all words in cluster + # letter_h = median of word heights, but excluding outlier-height words + # (>2× median) so that tall brackets/IPA don't skew the height + cluster_info: List[Dict] = [] + for cl_words in line_clusters: + centers = [w['top'] + w['height'] / 2 for w in cl_words] + # Filter outlier heights for letter_h computation + normal_heights = [w['height'] for w in cl_words + if w['height'] <= median_wh * 2.0] + if not normal_heights: + normal_heights = [w['height'] for w in cl_words] + center_y = float(np.median(centers)) + letter_h = float(np.median(normal_heights)) + cluster_info.append({ + 'center_y_rel': center_y, # relative to content ROI + 'center_y_abs': center_y + top_y, # absolute + 'letter_h': letter_h, + 'words': cl_words, + }) + + cluster_info.sort(key=lambda c: c['center_y_rel']) + + # --- Step B2: Merge clusters that are too close together --- + # Even with center-based grouping, some edge cases can produce + # spurious clusters. Merge any pair whose centers are closer + # than 30% of the row height (they're definitely the same text line). 
+ merge_threshold = max(8, median_row_h * 0.3) + merged: List[Dict] = [cluster_info[0]] + for cl in cluster_info[1:]: + prev = merged[-1] + if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold: + # Merge: combine words, recompute center + combined_words = prev['words'] + cl['words'] + centers = [w['top'] + w['height'] / 2 for w in combined_words] + normal_heights = [w['height'] for w in combined_words + if w['height'] <= median_wh * 2.0] + if not normal_heights: + normal_heights = [w['height'] for w in combined_words] + prev['center_y_rel'] = float(np.median(centers)) + prev['center_y_abs'] = prev['center_y_rel'] + top_y + prev['letter_h'] = float(np.median(normal_heights)) + prev['words'] = combined_words + else: + merged.append(cl) + + cluster_info = merged + + if len(cluster_info) < 3: + return rows + + # --- Step C: Compute pitches and detect section breaks --- + pitches: List[float] = [] + for i in range(1, len(cluster_info)): + pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel'] + pitches.append(pitch) + + if not pitches: + return rows + + median_pitch = float(np.median(pitches)) + if median_pitch <= 5: + return rows + + # A section break is where the gap between line centers is much larger + # than the normal pitch (sub-headings, section titles, etc.) 
+ BREAK_FACTOR = 1.8 + + # --- Step D: Build sections (groups of consecutive lines with normal spacing) --- + sections: List[List[Dict]] = [] + current_section: List[Dict] = [cluster_info[0]] + + for i in range(1, len(cluster_info)): + gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel'] + if gap > median_pitch * BREAK_FACTOR: + sections.append(current_section) + current_section = [cluster_info[i]] + else: + current_section.append(cluster_info[i]) + + if current_section: + sections.append(current_section) + + # --- Step E: Build row boundaries per section --- + grid_rows: List[RowGeometry] = [] + + for section in sections: + if not section: + continue + + if len(section) == 1: + # Single-line section (likely a heading) + cl = section[0] + half_h = max(cl['letter_h'], median_pitch * 0.4) + row_top = cl['center_y_abs'] - half_h + row_bot = cl['center_y_abs'] + half_h + grid_rows.append(RowGeometry( + index=0, + x=left_x, + y=round(row_top), + width=content_w, + height=round(row_bot - row_top), + word_count=len(cl['words']), + words=cl['words'], + row_type='content', + gap_before=0, + )) + continue + + # Compute local pitch for this section + local_pitches = [] + for i in range(1, len(section)): + local_pitches.append( + section[i]['center_y_rel'] - section[i - 1]['center_y_rel'] + ) + local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch + + # Row boundaries are placed at midpoints between consecutive centers. 
+ # First row: top = center - local_pitch/2 + # Last row: bottom = center + local_pitch/2 + for i, cl in enumerate(section): + if i == 0: + row_top = cl['center_y_abs'] - local_pitch / 2 + else: + # Midpoint between this center and previous center + prev_center = section[i - 1]['center_y_abs'] + row_top = (prev_center + cl['center_y_abs']) / 2 + + if i == len(section) - 1: + row_bot = cl['center_y_abs'] + local_pitch / 2 + else: + next_center = section[i + 1]['center_y_abs'] + row_bot = (cl['center_y_abs'] + next_center) / 2 + + # Clamp to reasonable bounds + row_top = max(top_y, row_top) + row_bot = min(top_y + content_h, row_bot) + + if row_bot - row_top < 5: + continue + + grid_rows.append(RowGeometry( + index=0, + x=left_x, + y=round(row_top), + width=content_w, + height=round(row_bot - row_top), + word_count=len(cl['words']), + words=cl['words'], + row_type='content', + gap_before=0, + )) + + if not grid_rows: + return rows + + # --- Step F: Re-assign words to grid rows --- + # Words may have shifted slightly; assign each word to the row whose + # center is closest to the word's vertical center. 
+ for gr in grid_rows: + gr.words = [] + + for w in content_words: + w_center = w['top'] + top_y + w['height'] / 2 + best_row = None + best_dist = float('inf') + for gr in grid_rows: + row_center = gr.y + gr.height / 2 + dist = abs(w_center - row_center) + if dist < best_dist: + best_dist = dist + best_row = gr + if best_row is not None and best_dist < median_pitch: + best_row.words.append(w) + + for gr in grid_rows: + gr.word_count = len(gr.words) + + # --- Step G: Validate --- + words_placed = sum(gr.word_count for gr in grid_rows) + if len(content_words) > 0: + match_ratio = words_placed / len(content_words) + if match_ratio < 0.85: + logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} " + f"of words, keeping gap-based rows") + return rows + + # Remove empty grid rows (no words assigned) + grid_rows = [gr for gr in grid_rows if gr.word_count > 0] + + # --- Step H: Merge header/footer + re-index --- + result = list(non_content) + grid_rows + result.sort(key=lambda r: r.y) + for i, r in enumerate(result): + r.index = i + + row_heights = [gr.height for gr in grid_rows] + min_h = min(row_heights) if row_heights else 0 + max_h = max(row_heights) if row_heights else 0 + logger.info(f"RowGrid: word-center grid applied " + f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, " + f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, " + f"{len(sections)} sections, " + f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], " + f"was {len(content_rows)} gap-based rows)") + + return result + + +def _build_rows_from_word_grouping( + word_dicts: List[Dict], + left_x: int, right_x: int, + top_y: int, bottom_y: int, + content_w: int, content_h: int, +) -> List['RowGeometry']: + """Fallback: build rows by grouping words by Y position. + + Uses _group_words_into_lines() with a generous tolerance. + No header/footer detection in fallback mode. 
+ """ + if not word_dicts: + return [] + + y_tolerance = max(20, content_h // 100) + lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance) + + rows = [] + for idx, line_words in enumerate(lines): + if not line_words: + continue + min_top = min(w['top'] for w in line_words) + max_bottom = max(w['top'] + w['height'] for w in line_words) + row_height = max_bottom - min_top + + rows.append(RowGeometry( + index=idx, + x=left_x, + y=top_y + min_top, + width=content_w, + height=row_height, + word_count=len(line_words), + words=line_words, + row_type='content', + gap_before=0, + )) + + logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping") + return rows + + +# --- Phase B: Content-Based Classification --- + +def _score_language(words: List[Dict]) -> Dict[str, float]: + """Score the language of a column's words. + + Analyzes function words, umlauts, and capitalization patterns + to determine whether text is English or German. + + Args: + words: List of word dicts with 'text' and 'conf' keys. + + Returns: + Dict with 'eng' and 'deu' scores (0.0-1.0). 
+ """ + if not words: + return {'eng': 0.0, 'deu': 0.0} + + # Only consider words with decent confidence + good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0] + if not good_words: + return {'eng': 0.0, 'deu': 0.0} + + total = len(good_words) + en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS) + de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS) + + # Check for umlauts (strong German signal) + raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40] + umlaut_count = sum(1 for t in raw_texts + for c in t if c in 'äöüÄÖÜß') + + # German capitalization: nouns are capitalized mid-sentence + # Count words that start with uppercase but aren't at position 0 + cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2) + + en_score = en_hits / total if total > 0 else 0.0 + de_score = de_hits / total if total > 0 else 0.0 + + # Boost German score for umlauts + if umlaut_count > 0: + de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5)) + + # Boost German score for high capitalization ratio (typical for German nouns) + if total > 5: + cap_ratio = cap_words / total + if cap_ratio > 0.3: + de_score = min(1.0, de_score + 0.1) + + return {'eng': round(en_score, 3), 'deu': round(de_score, 3)} + + +def _score_role(geom: ColumnGeometry) -> Dict[str, float]: + """Score the role of a column based on its geometry and content patterns. + + Args: + geom: ColumnGeometry with words and dimensions. + + Returns: + Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'. 
+ """ + scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0} + + if not geom.words: + return scores + + texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40] + if not texts: + return scores + + avg_word_len = sum(len(t) for t in texts) / len(texts) + has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,')) + digit_words = sum(1 for t in texts if any(c.isdigit() for c in t)) + digit_ratio = digit_words / len(texts) if texts else 0.0 + + # Reference: narrow + mostly numbers/page references + if geom.width_ratio < 0.12: + scores['reference'] = 0.5 + if digit_ratio > 0.4: + scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5) + + # Marker: narrow + few short entries + if geom.width_ratio < 0.06 and geom.word_count <= 15: + scores['marker'] = 0.7 + if avg_word_len < 4: + scores['marker'] = 0.9 + # Very narrow non-edge column → strong marker regardless of word count + if geom.width_ratio < 0.04 and geom.index > 0: + scores['marker'] = max(scores['marker'], 0.9) + + # Sentence: longer words + punctuation present + if geom.width_ratio > 0.15 and has_punctuation > 2: + scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts)) + if avg_word_len > 4: + scores['sentence'] = min(1.0, scores['sentence'] + 0.2) + + # Vocabulary: medium width + medium word length + if 0.10 < geom.width_ratio < 0.45: + scores['vocabulary'] = 0.4 + if 3 < avg_word_len < 8: + scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3) + + return {k: round(v, 3) for k, v in scores.items()} + + +def _build_margin_regions( + all_regions: List[PageRegion], + left_x: int, + right_x: int, + img_w: int, + top_y: int, + content_h: int, +) -> List[PageRegion]: + """Create margin_left / margin_right PageRegions from content bounds. + + Margins represent the space between the image edge and the first/last + content column. They are used downstream for faithful page + reconstruction but are skipped during OCR. 
+ """ + margins: List[PageRegion] = [] + # Minimum gap (px) to create a margin region + _min_gap = 5 + + if left_x > _min_gap: + margins.append(PageRegion( + type='margin_left', x=0, y=top_y, + width=left_x, height=content_h, + classification_confidence=1.0, + classification_method='content_bounds', + )) + + # Right margin: from end of last content column to image edge + non_margin = [r for r in all_regions + if r.type not in ('margin_left', 'margin_right', 'header', 'footer', + 'margin_top', 'margin_bottom')] + if non_margin: + last_col_end = max(r.x + r.width for r in non_margin) + else: + last_col_end = right_x + if img_w - last_col_end > _min_gap: + margins.append(PageRegion( + type='margin_right', x=last_col_end, y=top_y, + width=img_w - last_col_end, height=content_h, + classification_confidence=1.0, + classification_method='content_bounds', + )) + + if margins: + logger.info(f"Margins: {[(m.type, m.x, m.width) for m in margins]} " + f"(left_x={left_x}, right_x={right_x}, img_w={img_w})") + + return margins + + +def positional_column_regions( + geometries: List[ColumnGeometry], + content_w: int, + content_h: int, + left_x: int, +) -> List[PageRegion]: + """Classify columns by position only (no language scoring). + + Structural columns (page_ref, column_marker) are identified by geometry. + Remaining content columns are labelled left→right as column_en, column_de, + column_example. The names are purely positional – no language analysis. 
+ """ + structural: List[PageRegion] = [] + content_cols: List[ColumnGeometry] = [] + + for g in geometries: + rel_x = g.x - left_x + # page_ref: narrow column in the leftmost 20% region + if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20: + structural.append(PageRegion( + type='page_ref', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + # column_marker: very narrow, few words + elif g.width_ratio < 0.06 and g.word_count <= 15: + structural.append(PageRegion( + type='column_marker', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + # empty or near-empty narrow column → treat as margin/structural + elif g.word_count <= 2 and g.width_ratio < 0.15: + structural.append(PageRegion( + type='column_marker', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.85, + classification_method='positional', + )) + else: + content_cols.append(g) + + # Single content column → plain text page + if len(content_cols) == 1: + g = content_cols[0] + return structural + [PageRegion( + type='column_text', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.9, + classification_method='positional', + )] + + # No content columns + if not content_cols: + return structural + + # Sort content columns left→right and assign positional labels + content_cols.sort(key=lambda g: g.x) + + # With exactly 2 content columns: if the left one is very wide (>35%), + # it likely contains EN+DE combined, so the right one is examples. 
+ if (len(content_cols) == 2 + and content_cols[0].width_ratio > 0.35 + and content_cols[1].width_ratio > 0.20): + labels = ['column_en', 'column_example'] + else: + labels = ['column_en', 'column_de', 'column_example'] + + regions = list(structural) + for i, g in enumerate(content_cols): + label = labels[i] if i < len(labels) else 'column_example' + regions.append(PageRegion( + type=label, x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='positional', + )) + + logger.info(f"PositionalColumns: {len(structural)} structural, " + f"{len(content_cols)} content → " + f"{[r.type for r in regions]}") + return regions + + +def classify_column_types(geometries: List[ColumnGeometry], + content_w: int, + top_y: int, + img_w: int, + img_h: int, + bottom_y: int, + left_x: int = 0, + right_x: int = 0, + inv: Optional[np.ndarray] = None) -> List[PageRegion]: + """Classify column types using a 3-level fallback chain. + + Level 1: Content-based (language + role scoring) + Level 2: Position + language (old rules enhanced with language detection) + Level 3: Pure position (exact old code, no regression) + + Args: + geometries: List of ColumnGeometry from Phase A. + content_w: Total content width. + top_y: Top Y of content area. + img_w: Full image width. + img_h: Full image height. + bottom_y: Bottom Y of content area. + left_x: Left content bound (from _find_content_bounds). + right_x: Right content bound (from _find_content_bounds). + + Returns: + List of PageRegion with types, confidence, and method. 
+ """ + content_h = bottom_y - top_y + + def _with_margins(result: List[PageRegion]) -> List[PageRegion]: + """Append margin_left / margin_right regions to *result*.""" + margins = _build_margin_regions(result, left_x, right_x, img_w, top_y, content_h) + return result + margins + + # Special case: single column → plain text page + if len(geometries) == 1: + geom = geometries[0] + return _with_margins([PageRegion( + type='column_text', x=geom.x, y=geom.y, + width=geom.width, height=geom.height, + classification_confidence=0.9, + classification_method='content', + )]) + + # --- Pre-filter: first/last columns with very few words → column_ignore --- + # Sub-columns from _detect_sub_columns() are exempt: they intentionally + # have few words (page refs, markers) and should not be discarded. + ignore_regions = [] + active_geometries = [] + for idx, g in enumerate(geometries): + if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column: + ignore_regions.append(PageRegion( + type='column_ignore', x=g.x, y=g.y, + width=g.width, height=content_h, + classification_confidence=0.95, + classification_method='content', + )) + logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)") + else: + active_geometries.append(g) + + # Re-index active geometries for classification + for new_idx, g in enumerate(active_geometries): + g.index = new_idx + geometries = active_geometries + + # Handle edge case: all columns ignored or only 1 left + if len(geometries) == 0: + return _with_margins(ignore_regions) + if len(geometries) == 1: + geom = geometries[0] + ignore_regions.append(PageRegion( + type='column_text', x=geom.x, y=geom.y, + width=geom.width, height=geom.height, + classification_confidence=0.9, + classification_method='content', + )) + return _with_margins(ignore_regions) + + # --- Score all columns --- + lang_scores = [_score_language(g.words) for g in geometries] + role_scores = [_score_role(g) 
def _classify_by_content(geometries: List[ColumnGeometry],
                         lang_scores: List[Dict[str, float]],
                         role_scores: List[Dict[str, float]],
                         content_w: int,
                         content_h: int) -> Optional[List[PageRegion]]:
    """Level 1: Classify columns purely by content analysis.

    Requires clear language signals to distinguish EN/DE columns.
    Returns None if language signals are too weak (the caller then falls
    through to the position-based Levels 2/3).
    """
    regions = []
    assigned = set()

    # Step 1: Assign structural roles first (reference, marker)
    # left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref
    left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0

    for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
        is_left_side = geom.x < left_20_threshold
        has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
            regions.append(PageRegion(
                type='page_ref', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['reference'],
                classification_method='content',
            ))
            assigned.add(i)
        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['marker'],
                classification_method='content',
            ))
            assigned.add(i)
        elif geom.width_ratio < 0.05 and not is_left_side:
            # Narrow column on the right side → marker, not page_ref
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.8,
                classification_method='content',
            ))
            assigned.add(i)

    # Step 2: Among remaining columns, find EN and DE by language scores
    remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
                 for i in range(len(geometries)) if i not in assigned]

    if len(remaining) < 2:
        # Not enough columns for EN/DE pair
        if len(remaining) == 1:
            i, geom, ls, rs = remaining[0]
            regions.append(PageRegion(
                type='column_text', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.6,
                classification_method='content',
            ))
        regions.sort(key=lambda r: r.x)
        return regions

    # Check if we have enough language signal
    en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
    de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]

    # Position tiebreaker: when language signals are weak, use left=EN, right=DE
    if (not en_candidates or not de_candidates) and len(remaining) >= 2:
        max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
        max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
        if max_eng < 0.15 and max_deu < 0.15:
            # Both signals weak — fall back to positional: left=EN, right=DE
            sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
            best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
            best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
            logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
            en_conf = 0.4
            de_conf = 0.4

            regions.append(PageRegion(
                type='column_en', x=best_en[1].x, y=best_en[1].y,
                width=best_en[1].width, height=content_h,
                classification_confidence=en_conf,
                classification_method='content',
            ))
            assigned.add(best_en[0])

            regions.append(PageRegion(
                type='column_de', x=best_de[1].x, y=best_de[1].y,
                width=best_de[1].width, height=content_h,
                classification_confidence=de_conf,
                classification_method='content',
            ))
            assigned.add(best_de[0])

            # Assign remaining as example
            for i, geom, ls, rs in remaining:
                if i not in assigned:
                    regions.append(PageRegion(
                        type='column_example', x=geom.x, y=geom.y,
                        width=geom.width, height=content_h,
                        classification_confidence=0.4,
                        classification_method='content',
                    ))
            regions.sort(key=lambda r: r.x)
            return regions

    if not en_candidates or not de_candidates:
        # Language signals too weak for content-based classification
        logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
        return None

    # Pick the best EN and DE candidates
    best_en = max(en_candidates, key=lambda x: x[2]['eng'])
    best_de = max(de_candidates, key=lambda x: x[2]['deu'])

    # Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
    # Example sentences contain English function words ("the", "a", "is") which inflate
    # the eng score of the Example column. When the best EN candidate sits to the RIGHT
    # of the DE column and there is another EN candidate to the LEFT, prefer the left one
    # — it is almost certainly the real vocabulary column.
    if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
        left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
        if left_of_de:
            alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
            logger.info(
                f"ClassifyColumns: Level 1 position fix — best EN col {best_en[0]} "
                f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
                f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
            best_en = alt_en

    if best_en[0] == best_de[0]:
        # Same column scored highest for both — ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
        return None

    en_conf = best_en[2]['eng']
    de_conf = best_de[2]['deu']

    regions.append(PageRegion(
        type='column_en', x=best_en[1].x, y=best_en[1].y,
        width=best_en[1].width, height=content_h,
        classification_confidence=round(en_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_en[0])

    regions.append(PageRegion(
        type='column_de', x=best_de[1].x, y=best_de[1].y,
        width=best_de[1].width, height=content_h,
        classification_confidence=round(de_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_de[0])

    # Step 3: Remaining columns → example or text based on role scores
    # (both branches currently emit 'column_example'; they differ only in
    # the confidence value attached)
    for i, geom, ls, rs in remaining:
        if i in assigned:
            continue
        if rs['sentence'] > 0.4:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=round(rs['sentence'], 2),
                classification_method='content',
            ))
        else:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.5,
                classification_method='content',
            ))

    regions.sort(key=lambda r: r.x)
    return regions
def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                   lang_scores: List[Dict[str, float]],
                                   content_w: int,
                                   content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: Position-based rules enhanced with language confirmation.

    Applies the legacy positional heuristics, but consults the language
    scores when assigning the EN/DE pair and swaps the two columns when
    the scores clearly contradict the positional default (left=EN).
    """
    def make_region(rtype: str, geom, conf: float) -> PageRegion:
        # All Level-2 regions share the same method tag and content height.
        return PageRegion(
            type=rtype, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        )

    regions: List[PageRegion] = []
    pending = list(range(len(geometries)))
    first_x = geometries[0].x if geometries else 0
    left_20_threshold = first_x + content_w * 0.20

    # Rule 1: leftmost narrow column without a strong language signal → page_ref.
    head = geometries[0]
    head_ls = lang_scores[0]
    if (head.width_ratio < 0.12 and head.x < left_20_threshold
            and not (head_ls['eng'] > 0.3 or head_ls['deu'] > 0.3)):
        regions.append(make_region('page_ref', head, 0.8))
        pending.remove(0)

    # Rule 2: narrow columns with few words → marker.
    for idx in pending[:]:
        if geometries[idx].width_ratio < 0.06 and geometries[idx].word_count <= 15:
            regions.append(make_region('column_marker', geometries[idx], 0.7))
            pending.remove(idx)

    # Rule 3: with 3+ columns left, the rightmost is the example column.
    if len(pending) >= 3:
        rightmost = pending.pop()
        regions.append(make_region('column_example', geometries[rightmost], 0.7))

    # Rule 4: first two remaining → EN/DE; swap when language scores disagree.
    if len(pending) >= 2:
        en_idx, de_idx = pending[0], pending[1]
        conf = 0.7
        ls_first, ls_second = lang_scores[en_idx], lang_scores[de_idx]
        if ls_first['deu'] > ls_first['eng'] and ls_second['eng'] > ls_second['deu']:
            en_idx, de_idx = de_idx, en_idx
            conf = 0.85
            logger.info("ClassifyColumns: Level 2 swapped EN/DE based on language scores")
        regions.append(make_region('column_en', geometries[en_idx], conf))
        regions.append(make_region('column_de', geometries[de_idx], conf))
        pending = pending[2:]
    elif len(pending) == 1:
        regions.append(make_region('column_en', geometries[pending[0]], 0.5))
        pending = []

    # Anything still untyped → example column.
    for idx in pending:
        regions.append(make_region('column_example', geometries[idx], 0.5))

    regions.sort(key=lambda r: r.x)
    return regions
+ """ + regions = [] + untyped = list(range(len(geometries))) + first_x = geometries[0].x if geometries else 0 + left_20_threshold = first_x + content_w * 0.20 + + # Rule 1: Leftmost narrow column → page_ref (only if in left 20%) + g0 = geometries[0] + if g0.width_ratio < 0.12 and g0.x < left_20_threshold: + regions.append(PageRegion( + type='page_ref', x=g0.x, y=g0.y, + width=g0.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped.remove(0) + + # Rule 2: Narrow + few words → marker + for i in list(untyped): + geom = geometries[i] + if geom.width_ratio < 0.06 and geom.word_count <= 15: + regions.append(PageRegion( + type='column_marker', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped.remove(i) + + # Rule 3: Rightmost remaining → example (if 3+) + if len(untyped) >= 3: + last_idx = untyped[-1] + geom = geometries[last_idx] + regions.append(PageRegion( + type='column_example', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped.remove(last_idx) + + # Rule 4: First remaining → EN, second → DE + if len(untyped) >= 2: + en_idx = untyped[0] + de_idx = untyped[1] + regions.append(PageRegion( + type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y, + width=geometries[en_idx].width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + regions.append(PageRegion( + type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y, + width=geometries[de_idx].width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped = untyped[2:] + elif len(untyped) == 1: + idx = untyped[0] + geom = geometries[idx] + regions.append(PageRegion( + type='column_en', x=geom.x, y=geom.y, + width=geom.width, 
height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + untyped = [] + + for idx in untyped: + geom = geometries[idx] + regions.append(PageRegion( + type='column_example', x=geom.x, y=geom.y, + width=geom.width, height=content_h, + classification_confidence=1.0, + classification_method='position_fallback', + )) + + regions.sort(key=lambda r: r.x) + return regions + + +def _detect_header_footer_gaps( + inv: np.ndarray, + img_w: int, + img_h: int, +) -> Tuple[Optional[int], Optional[int]]: + """Detect header/footer boundaries via horizontal projection gap analysis. + + Scans the full-page inverted image for large horizontal gaps in the top/bottom + 20% that separate header/footer content from the main body. + + Returns: + (header_y, footer_y) — absolute y-coordinates. + header_y = bottom edge of header region (None if no header detected). + footer_y = top edge of footer region (None if no footer detected). + """ + HEADER_FOOTER_ZONE = 0.20 + GAP_MULTIPLIER = 2.0 + + # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding + actual_h = min(inv.shape[0], img_h) + roi = inv[:actual_h, :] + h_proj = np.sum(roi, axis=1).astype(float) + proj_w = roi.shape[1] + h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj + + # Step 2: Smoothing + kernel_size = max(3, actual_h // 200) + if kernel_size % 2 == 0: + kernel_size += 1 + h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') + + # Step 3: Gap threshold + positive = h_smooth[h_smooth > 0] + median_density = float(np.median(positive)) if len(positive) > 0 else 0.01 + gap_threshold = max(median_density * 0.15, 0.003) + + in_gap = h_smooth < gap_threshold + MIN_GAP_HEIGHT = max(3, actual_h // 500) + + # Step 4: Collect contiguous gaps + raw_gaps: List[Tuple[int, int]] = [] + gap_start: Optional[int] = None + for y in range(len(in_gap)): + if in_gap[y]: + if gap_start is None: + gap_start = y + else: + if gap_start is not 
None: + gap_height = y - gap_start + if gap_height >= MIN_GAP_HEIGHT: + raw_gaps.append((gap_start, y)) + gap_start = None + if gap_start is not None: + gap_height = len(in_gap) - gap_start + if gap_height >= MIN_GAP_HEIGHT: + raw_gaps.append((gap_start, len(in_gap))) + + if not raw_gaps: + return None, None + + # Step 5: Compute median gap size and large-gap threshold + gap_sizes = [g[1] - g[0] for g in raw_gaps] + median_gap = float(np.median(gap_sizes)) + large_gap_threshold = median_gap * GAP_MULTIPLIER + + # Step 6: Find largest qualifying gap in header / footer zones + # A separator gap must have content on BOTH sides — edge-touching gaps + # (e.g. dewarp padding at bottom) are not valid separators. + EDGE_MARGIN = max(5, actual_h // 400) + header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE) + footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE)) + + header_y: Optional[int] = None + footer_y: Optional[int] = None + + best_header_size = 0 + for gs, ge in raw_gaps: + if gs <= EDGE_MARGIN: + continue # skip gaps touching the top edge + gap_mid = (gs + ge) / 2 + gap_size = ge - gs + if gap_mid < header_zone_limit and gap_size > large_gap_threshold: + if gap_size > best_header_size: + best_header_size = gap_size + header_y = ge # bottom edge of gap + + best_footer_size = 0 + for gs, ge in raw_gaps: + if ge >= actual_h - EDGE_MARGIN: + continue # skip gaps touching the bottom edge + gap_mid = (gs + ge) / 2 + gap_size = ge - gs + if gap_mid > footer_zone_start and gap_size > large_gap_threshold: + if gap_size > best_footer_size: + best_footer_size = gap_size + footer_y = gs # top edge of gap + + if header_y is not None: + logger.info(f"HeaderFooterGaps: header boundary at y={header_y} " + f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)") + if footer_y is not None: + logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} " + f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)") + + return header_y, footer_y + + +def 
_region_has_content(inv: np.ndarray, y_start: int, y_end: int, + min_density: float = 0.005) -> bool: + """Check whether a horizontal strip contains meaningful ink. + + Args: + inv: Inverted binarized image (white-on-black). + y_start: Top of the region (inclusive). + y_end: Bottom of the region (exclusive). + min_density: Fraction of white pixels required to count as content. + + Returns: + True if the region contains text/graphics, False if empty margin. + """ + if y_start >= y_end: + return False + strip = inv[y_start:y_end, :] + density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255) + return density > min_density + + +def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int, + img_w: int, img_h: int, + inv: Optional[np.ndarray] = None) -> None: + """Add header/footer/margin regions in-place. + + Uses gap-based detection when *inv* is provided, otherwise falls back + to simple top_y/bottom_y bounds. + + Region types depend on whether there is actual content (text/graphics): + - 'header' / 'footer' — region contains text (e.g. 
title, page number) + - 'margin_top' / 'margin_bottom' — region is empty page margin + """ + header_y: Optional[int] = None + footer_y: Optional[int] = None + + if inv is not None: + header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h) + + # --- Top region --- + top_boundary = header_y if header_y is not None and header_y > 10 else ( + top_y if top_y > 10 else None + ) + if top_boundary is not None: + has_content = inv is not None and _region_has_content(inv, 0, top_boundary) + rtype = 'header' if has_content else 'margin_top' + regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary)) + logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px " + f"(has_content={has_content})") + + # --- Bottom region --- + bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else ( + bottom_y if bottom_y < img_h - 10 else None + ) + if bottom_boundary is not None: + has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h) + rtype = 'footer' if has_content else 'margin_bottom' + regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w, + height=img_h - bottom_boundary)) + logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} " + f"height={img_h - bottom_boundary}px (has_content={has_content})") + + +# --- Main Entry Point --- + +def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]: + """Detect columns using two-phase approach: geometry then content classification. + + Phase A: detect_column_geometry() — clustering word positions into columns. + Phase B: classify_column_types() — content-based type assignment with fallback. + + Falls back to projection-based analyze_layout() if geometry detection fails. + + Args: + ocr_img: Binarized grayscale image for layout analysis. + dewarped_bgr: Original BGR image (for Tesseract word detection). 
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect page columns in two phases: word geometry, then positional typing.

    Phase A clusters OCR word positions into column geometries via
    detect_column_geometry(); Phase B assigns region types positionally via
    positional_column_regions(). When geometry detection fails, the function
    falls back to the projection-profile based analyze_layout().

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        List of PageRegion objects with types, confidence, and method.
    """
    img_h, img_w = ocr_img.shape[:2]

    # Phase A: cluster words into column geometries.
    geometry = detect_column_geometry(ocr_img, dewarped_bgr)
    if geometry is None:
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        return analyze_layout(create_layout_image(dewarped_bgr), ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = geometry
    content_w = right_x - left_x

    # Find header/footer boundaries first, so sub-column clustering ignores them.
    if _inv is not None:
        header_y, footer_y = _detect_header_footer_gaps(_inv, img_w, img_h)
    else:
        header_y, footer_y = None, None

    # Separate narrow sub-columns (e.g. page references) before classification,
    # then split broad columns that still contain EN+DE mixed via coverage gaps.
    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)
    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)

    # Phase B: positional classification (no language scoring).
    regions = positional_column_regions(geometries, content_w, bottom_y - top_y, left_x)

    col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    methods = set(r.classification_method for r in regions if r.classification_method)
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")

    return regions
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Group OCR words into visual lines in reading order.

    Returns a list of line strings (one per visual line in the cell).
    """
    if not words:
        return []

    rendered = []
    for line_words in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
        rendered.append(' '.join(entry['text'] for entry in line_words))
    return rendered
['Fuß-', 'boden'] → ['Fußboden'] + ['some text-', 'thing here'] → ['something here'] + """ + if len(lines) <= 1: + return lines + + result = [] + i = 0 + while i < len(lines): + line = lines[i] + # If line ends with '-' and there's a next line, rejoin + if i + 1 < len(lines) and line.rstrip().endswith('-'): + stripped = line.rstrip() + # Get the word fragment before hyphen (last word) + prefix = stripped[:-1] # remove trailing hyphen + next_line = lines[i + 1] + # Join: last word of this line + first word of next line + prefix_words = prefix.rsplit(' ', 1) + next_words = next_line.split(' ', 1) + if len(prefix_words) > 1: + joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0] + else: + joined = prefix_words[0] + next_words[0] + remainder = next_words[1] if len(next_words) > 1 else '' + if remainder: + result.append(joined + ' ' + remainder) + else: + result.append(joined) + i += 2 + else: + result.append(line) + i += 1 + return result + + +def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str: + """Join OCR words into text in correct reading order, preserving line breaks. + + Groups words into visual lines by Y-tolerance, sorts each line by X, + rejoins hyphenated words, then joins lines with newlines. 
+ """ + lines = _words_to_reading_order_lines(words, y_tolerance_px) + lines = _rejoin_hyphenated(lines) + return '\n'.join(lines) + + +# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) --- + +_rapid_engine = None +RAPIDOCR_AVAILABLE = False + +try: + from rapidocr import RapidOCR as _RapidOCRClass + from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType + RAPIDOCR_AVAILABLE = True + logger.info("RapidOCR available — can be used as alternative to Tesseract") +except ImportError: + logger.info("RapidOCR not installed — using Tesseract only") + + +def _get_rapid_engine(): + """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support.""" + global _rapid_engine + if _rapid_engine is None: + _rapid_engine = _RapidOCRClass(params={ + # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß) + "Rec.lang_type": _LangRec.LATIN, + "Rec.model_type": _ModelType.SERVER, + "Rec.ocr_version": _OCRVersion.PPOCRV5, + # Tighter detection boxes to reduce word merging + "Det.unclip_ratio": 1.3, + # Lower threshold to detect small chars (periods, ellipsis, phonetics) + "Det.box_thresh": 0.4, + # Silence verbose logging + "Global.log_level": "critical", + }) + logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)") + return _rapid_engine + + +def ocr_region_rapid( + img_bgr: np.ndarray, + region: PageRegion, +) -> List[Dict[str, Any]]: + """Run RapidOCR on a specific region, returning word dicts compatible with Tesseract format. + + Args: + img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on color/gray). + region: Region to crop and OCR. + + Returns: + List of word dicts with text, left, top, width, height, conf, region_type. 
+ """ + engine = _get_rapid_engine() + + # Crop region from BGR image + crop = img_bgr[region.y:region.y + region.height, + region.x:region.x + region.width] + + if crop.size == 0: + return [] + + result = engine(crop) + + if result is None or result.boxes is None or result.txts is None: + return [] + + words = [] + boxes = result.boxes # shape (N, 4, 2) — 4 corner points per text line + txts = result.txts # tuple of strings + scores = result.scores # tuple of floats + + for i, (box, txt, score) in enumerate(zip(boxes, txts, scores)): + if not txt or not txt.strip(): + continue + + # box is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (clockwise from top-left) + xs = [p[0] for p in box] + ys = [p[1] for p in box] + left = int(min(xs)) + top = int(min(ys)) + w = int(max(xs) - left) + h = int(max(ys) - top) + + words.append({ + 'text': txt.strip(), + 'left': left + region.x, # Absolute coords + 'top': top + region.y, + 'width': w, + 'height': h, + 'conf': int(score * 100), # 0-100 like Tesseract + 'region_type': region.type, + }) + + return words + + +def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]: + """Run TrOCR on a region. Returns line-level word dicts (same format as ocr_region_rapid). + + Uses trocr_service.get_trocr_model() + _split_into_lines() for line segmentation. + Bboxes are approximated from equal line-height distribution within the region. + Falls back to Tesseract if TrOCR is not available. 
+ """ + from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available + + if not _check_trocr_available(): + logger.warning("TrOCR not available, falling back to Tesseract") + if region.height > 0 and region.width > 0: + ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None + if ocr_img_crop is not None: + return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) + return [] + + crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width] + if crop.size == 0: + return [] + + try: + import torch + from PIL import Image as _PILImage + + processor, model = get_trocr_model(handwritten=handwritten) + if processor is None or model is None: + logger.warning("TrOCR model not loaded, falling back to Tesseract") + ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) + return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) + + pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)) + lines = _split_into_lines(pil_crop) + if not lines: + lines = [pil_crop] + + device = next(model.parameters()).device + all_text = [] + confidences = [] + for line_img in lines: + pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device) + with torch.no_grad(): + generated_ids = model.generate(pixel_values, max_length=128) + text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + if text_line: + all_text.append(text_line) + confidences.append(0.85 if len(text_line) > 3 else 0.5) + + if not all_text: + return [] + + avg_conf = int(sum(confidences) / len(confidences) * 100) + line_h = region.height // max(len(all_text), 1) + words = [] + for i, line in enumerate(all_text): + words.append({ + "text": line, + "left": region.x, + "top": region.y + i * line_h, + "width": region.width, + "height": line_h, + "conf": avg_conf, + "region_type": region.type, + }) + return words + + except Exception as e: + 
def ocr_region_lighton(img_bgr: np.ndarray, region: "PageRegion") -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region. Returns line-level word dicts (same format as ocr_region_rapid).

    Falls back to RapidOCR or Tesseract if LightOnOCR is not available.

    Args:
        img_bgr: Full-page BGR image.
        region: Region to crop and OCR.

    Returns:
        One word dict per recognized text line; bboxes are approximated by
        slicing the region into equal-height horizontal bands. [] on empty
        crops, empty model output, or failure.
    """
    # Imported lazily so the module loads even when the service is absent.
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        # (Removed an unused function-local `import io`; the module-level
        # `io` import remains available.)
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        # Approximate per-line bboxes by slicing the region into equal bands.
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        line_h = region.height // max(len(lines), 1)
        words = []
        for i, line in enumerate(lines):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,  # fixed confidence — no per-line score is read from the model
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
      "y you" → "you")
    """
    for entry in entries:
        en = entry.get('english', '') or ''
        de = entry.get('german', '') or ''
        ex = entry.get('example', '') or ''

        # Apply general rules to all fields
        for pattern, replacement in _CHAR_CONFUSION_RULES:
            en = pattern.sub(replacement, en)
            de = pattern.sub(replacement, de)
            ex = pattern.sub(replacement, ex)

        # Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I".
        # NOTE(review): _DE_INDICATORS_FOR_EN_I is defined in the (truncated)
        # constant span above — presumably a frozenset of German first-person
        # pronouns; confirm once the span is restored.
        de_lower_words = set(de.lower().replace(',', ' ').split())
        if de_lower_words & _DE_INDICATORS_FOR_EN_I:
            # Any remaining "1" in EN that looks like "I"
            en = re.sub(r'\b1\b(?![\d.,])', 'I', en)

        # Fix "y " artifact before repeated word: "y you" → "you".
        # NOTE(review): the regex drops "y " before ANY lowercase letter, not
        # only before a repeated word — confirm this aggressiveness is intended.
        en = re.sub(r'\by\s+([a-z])', r'\1', en)
        ex = re.sub(r'\by\s+([a-z])', r'\1', ex)

        entry['english'] = en.strip()
        entry['german'] = de.strip()
        entry['example'] = ex.strip()

    return entries


# --- B. Comma-Separated Word Form Splitting ---

def _is_singular_plural_pair(parts: List[str]) -> bool:
    """Detect if comma-separated parts are singular/plural forms of the same word.

    E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
    "break, broke, broken" → False (different verb forms, OK to split).

    Heuristic: exactly 2 parts that share a common prefix of >= 50% length
    (this also covers plural suffixes such as +s, +es, +en), OR the parts
    differ only by an added umlaut (a→ä, o→ö, u→ü).
    """
    if len(parts) != 2:
        return False

    a, b = parts[0].lower().strip(), parts[1].lower().strip()
    if not a or not b:
        return False

    # Common prefix heuristic: if words share >= 50% of the shorter word,
    # they are likely forms of the same word (Maus/Mäuse, child/children).
    min_len = min(len(a), len(b))
    common = 0
    for ca, cb in zip(a, b):
        if ca == cb:
            common += 1
        else:
            break
    if common >= max(2, min_len * 0.5):
        return True

    # Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
    umlaut_map = str.maketrans('aou', 'äöü')
    if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
        return True

    return False


def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Split entries with comma-separated word forms into individual entries.

    E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
    → 3 entries: break/brechen, broke/brach, broken/gebrochen

    Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse"
    because those are forms of the same vocabulary entry.

    Only splits when both EN and DE have the same number of comma-parts
    (more than one) and every part is short (≤ 3 words — word forms, not
    sentences). Two-part entries are split as well, unless they look like
    singular/plural forms of the same word (see _is_singular_plural_pair).
    """
    result: List[Dict[str, Any]] = []

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()

        # Split by comma (but not inside brackets or parentheses)
        en_parts = _split_by_comma(en)
        de_parts = _split_by_comma(de)

        # Only split if we have multiple parts and counts match
        should_split = False
        if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
            # All parts must be short (word forms, not sentences)
            if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
                # Do NOT split singular/plural pairs (2 parts that are
                # forms of the same word)
                if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts):
                    should_split = False
                else:
                    should_split = True

        if not should_split:
            result.append(entry)
            continue

        # Split into individual entries
        for k in range(len(en_parts)):
            sub = dict(entry)  # shallow copy
            sub['english'] = en_parts[k].strip()
            sub['german'] = de_parts[k].strip() if k < len(de_parts) else ''
            sub['example'] = ''  # examples get attached later
            sub['split_from_comma'] = True
            result.append(sub)

    # Re-number
    for i, e in enumerate(result):
        e['row_index'] = i

    return result


def _split_by_comma(text: str) -> List[str]:
    """Split text by commas, but not inside brackets [...] or parens (...)."""
    if ',' not in text:
        return [text]

    parts = []
    depth_bracket = 0
    depth_paren = 0
    current = []

    for ch in text:
        if ch == '[':
            depth_bracket += 1
        elif ch == ']':
            depth_bracket = max(0, depth_bracket - 1)
        elif ch == '(':
            depth_paren += 1
        elif ch == ')':
            depth_paren = max(0, depth_paren - 1)
        elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
            parts.append(''.join(current).strip())
            current = []
            continue
        current.append(ch)

    if current:
        parts.append(''.join(current).strip())

    # Filter empty parts
    return [p for p in parts if p]


# --- C. Example Sentence Attachment ---

def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
    """Find the vocab entry whose English word(s) best match the example sentence.

    Returns index into vocab_entries, or -1 if no match found.
    Uses word stem overlap: "a broken arm" matches "broken" or "break".
    Direct word matches score 10 each; prefix-stem matches score 5 each.
    """
    if not vocab_entries or not example_text:
        return -1

    example_lower = example_text.lower()
    example_words = set(re.findall(r'[a-zäöüß]+', example_lower))

    best_idx = -1
    best_score = 0

    for i, entry in enumerate(vocab_entries):
        en = (entry.get('english', '') or '').lower()
        if not en:
            continue

        # Extract vocab words (split on space, comma, newline)
        vocab_words = set(re.findall(r'[a-zäöüß]+', en))

        # Score: how many vocab words appear in the example?
        # Also check if example words share a common stem (first 4 chars)
        direct_matches = vocab_words & example_words
        score = len(direct_matches) * 10

        # Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
        if score == 0:
            for vw in vocab_words:
                if len(vw) < 3:
                    continue
                stem = vw[:4] if len(vw) >= 4 else vw[:3]
                for ew in example_words:
                    if len(ew) >= len(stem) and ew[:len(stem)] == stem:
                        score += 5
                        break

        if score > best_score:
            best_score = score
            best_idx = i

    return best_idx if best_score > 0 else -1


def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach rows with EN text but no DE translation as examples to matching vocab entries.

    Vocabulary worksheets often have:
    Row 1: break, broke, broken / brechen, brach, gebrochen
    Row 2: a broken arm (no DE → example for "broken")
    Row 3: a broken plate (no DE → example for "broken")
    Row 4: egg / Ei (has DE → new vocab entry)

    Rules (deterministic, generic):
    - A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars)
    - Find the best matching vocab entry by checking which entry's English words
      appear in the example sentence (semantic matching via word overlap)
    - Fall back to the nearest preceding entry if no word match found
    - Multiple examples get joined with " | "
    """
    if not entries:
        return entries

    # Separate into vocab entries (have DE) and example candidates (no DE)
    vocab_entries: List[Dict[str, Any]] = []
    examples_for: Dict[int, List[str]] = {}  # vocab_index → list of example texts

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        # NOTE(review): ex is computed but never used in this function.
        ex = (entry.get('example', '') or '').strip()

        # Treat single-char DE as OCR noise, not real translation.
        # "Ei" (2 chars) is a valid German word, so threshold is 1.
        has_de = len(de) > 1
        has_en = bool(en)

        # Heuristic: a row without DE is an "example sentence" only if
        # the EN text looks like a sentence (>= 4 words, or contains
        # typical sentence punctuation). Short EN text (1-3 words) is
        # more likely a vocab entry whose DE was missed by OCR.
        _looks_like_sentence = (
            len(en.split()) >= 4
            or en.rstrip().endswith(('.', '!', '?'))
        )
        # Also requires at least one preceding vocab entry to attach to.
        is_example_candidate = (
            has_en and not has_de and _looks_like_sentence and vocab_entries
        )

        if is_example_candidate:
            # This is an example sentence — find best matching vocab entry
            example_text = en

            match_idx = _find_best_vocab_match(en, vocab_entries)
            if match_idx < 0:
                # No word match → fall back to last entry
                match_idx = len(vocab_entries) - 1

            if match_idx not in examples_for:
                examples_for[match_idx] = []
            examples_for[match_idx].append(example_text)
        else:
            vocab_entries.append(entry)

    # Attach examples to their matched vocab entries
    for idx, example_list in examples_for.items():
        if 0 <= idx < len(vocab_entries):
            entry = vocab_entries[idx]
            existing_ex = (entry.get('example', '') or '').strip()
            new_examples = ' | '.join(example_list)
            entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples

    # Re-number
    for i, e in enumerate(vocab_entries):
        e['row_index'] = i

    return vocab_entries


# --- D. Phonetic Bracket IPA Replacement ---

# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)

# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')

# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
_MIN_WORD_CONF = 30


def _ipa_from_britfone(word: str) -> Optional[str]:
    """British IPA from the Britfone dictionary (MIT).

    Returns None when the dictionary is unavailable or the word is unknown.
    Expects an already-lowercased word.
    """
    if _britfone_dict:
        return _britfone_dict.get(word)
    return None


def _ipa_from_cmu(word: str) -> Optional[str]:
    """American IPA via eng_to_ipa/CMU (MIT).

    eng_to_ipa marks unknown words with '*', which is treated as a miss.
    Returns None when the converter is unavailable or the word is unknown.
    """
    if _ipa_convert_american:
        result = _ipa_convert_american(word)
        if result and '*' not in result:
            return result
    return None


def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up IPA for a word using the selected pronunciation dictionary.

    Args:
        word: English word to look up.
        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).

    Returns:
        IPA string or None if not found in any available source.
    """
    word_lower = word.lower().strip()
    if not word_lower:
        return None

    # Preferred source first, the other as fallback. Any unknown
    # `pronunciation` value behaves like 'british' (Britfone preferred),
    # which matches the previous "try any available source" order.
    if pronunciation == 'american':
        sources = (_ipa_from_cmu, _ipa_from_britfone)
    else:
        sources = (_ipa_from_britfone, _ipa_from_cmu)

    for source in sources:
        ipa = source(word_lower)
        if ipa:
            return ipa
    return None


def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Replace OCR'd phonetic transcriptions with dictionary IPA.

    Detects patterns like "dance [du:ns]" and replaces with correct IPA:
    - British: "dance [dˈɑːns]" (Britfone, MIT)
    - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    Only replaces if the word before brackets is found in the dictionary.
    Mutates `entries` in place and returns the same list.
    """
    if not IPA_AVAILABLE:
        return entries

    # IPA phonetics only appear in the ENGLISH field of vocab tables.
    # German and example fields contain meaningful parenthetical content:
    # german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
    # example: "(sich beschweren)", "(brauchen)", "(jammern)"
    # These must NEVER be processed as phonetic transcriptions.
    replaced_count = 0
    for entry in entries:
        text = entry.get('english', '') or ''
        # Cheap pre-check: skip entries with no opening bracket at all.
        if not any(ch in text for ch in '[{('):
            continue
        new_text = _replace_phonetics_in_text(text, pronunciation)
        if new_text != text:
            logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'")
            replaced_count += 1
            entry['english'] = new_text

    if replaced_count:
        logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
    return entries


# Grammar particles that appear in brackets after English words:
# cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({
    # English prepositions/particles commonly in vocab tables
    'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
    # English grammar abbreviations used in vocab tables
    'sth', 'sb', 'adj', 'adv',
})


def _is_grammar_bracket_content(content: str) -> bool:
    """Return True if bracket content is grammar info in the ENGLISH field.

    Grammar info: cross (with), complain (about/of), agree (on/with)
    NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]

    Since we only process the English field, we only need to recognize
    English grammar particles. Everything else is (garbled) IPA.
    """
    if not content:
        return False

    # Split on / for patterns like (about/of), (on/with)
    tokens = [t.strip().lower() for t in content.split('/') if t.strip()]
    if not tokens:
        return False

    # ALL tokens must be known grammar words
    return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)


def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Only English-field text is passed here; legitimate parenthetical content
    such as grammar particles "(with)" and already-correct IPA is preserved.
    """
    if not IPA_AVAILABLE:
        return text

    def replacer(match):
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)

        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match

        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)

        if ipa:
            # Word has IPA → bracket content is phonetic (garbled or correct).
            # Exception: grammar particles like cross (with) — keep those.
            if _is_grammar_bracket_content(bracket_content):
                return full_match
            logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
            return f"{word} [{ipa}]"

        # No IPA for this word — keep as-is
        return full_match

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    # Second pass: strip remaining orphan brackets that are garbled IPA.
    # These have no word before them (the main regex requires \b word \s* bracket).
    # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
    # Kept: English grammar parens "(about/of)" and correct IPA "[dˈɑːns]".
    # (German parens like "(sich beschweren)" never reach this function —
    # only the English field is processed; see _fix_phonetic_brackets.)
    def _strip_orphan_bracket(m):
        content = m.group(1).strip()
        # Keep grammar info: (with), (about/of)
        if _is_grammar_bracket_content(content):
            return m.group(0)
        # Keep correct IPA (contains Unicode IPA characters)
        if any(ch in _IPA_CHARS for ch in content):
            return m.group(0)
        logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
        return ''

    text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
    text = text.strip()

    return text


def _assign_row_words_to_columns(
    row: RowGeometry,
    columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
    """Assign each word in a row to exactly one column.

    Uses a two-pass strategy:
    1. Overlap: assign each word to the column with the largest horizontal
       overlap (robust for narrow columns like page_ref).
    2. Fallback: midpoint-range containment of the word center, then nearest
       column center as a last resort.

    This prevents long sentences in wide columns (e.g. example) from having
    their rightmost words stolen by an adjacent column.

    Args:
        row: Row with words (relative coordinates).
        columns: Sorted list of columns (absolute coordinates).

    Returns:
        Dict mapping col_index → list of words assigned to that column.
    """
    result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))}

    if not row.words or not columns:
        return result

    left_x = row.x  # content ROI left (absolute)

    # Build non-overlapping column assignment ranges using midpoints.
    # For adjacent columns, the boundary is the midpoint between them.
    # This prevents words near column borders from being assigned to
    # the wrong column (e.g. "We" at the start of an example sentence
    # being stolen by the preceding DE column).
    n = len(columns)
    col_ranges_rel = []  # (assign_left, assign_right) per column
    for ci, col in enumerate(columns):
        col_left_rel = col.x - left_x
        col_right_rel = col_left_rel + col.width

        # Left boundary: midpoint to previous column, or 0
        if ci == 0:
            assign_left = 0
        else:
            prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
            assign_left = (prev_right + col_left_rel) / 2

        # Right boundary: midpoint to next column, or infinity (row width)
        if ci == n - 1:
            assign_right = row.width + 100  # generous for last column
        else:
            next_left = columns[ci + 1].x - left_x
            assign_right = (col_right_rel + next_left) / 2

        col_ranges_rel.append((assign_left, assign_right))

    for w in row.words:
        w_left = w['left']
        w_right = w_left + w['width']
        w_center_x = w_left + w['width'] / 2

        # Primary: overlap-based matching — assign to column with most overlap.
        # This is more robust than center-based for narrow columns (page_ref)
        # where the last character's center may fall into the next column.
        best_col = -1
        best_overlap = 0
        for ci, col in enumerate(columns):
            col_left_rel = col.x - left_x
            col_right_rel = col_left_rel + col.width
            overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
            if overlap > best_overlap:
                best_overlap = overlap
                best_col = ci

        if best_col >= 0 and best_overlap > 0:
            result[best_col].append(w)
        else:
            # Fallback: center-based range matching
            assigned = False
            for ci, (al, ar) in enumerate(col_ranges_rel):
                if al <= w_center_x < ar:
                    result[ci].append(w)
                    assigned = True
                    break

            if not assigned:
                # Last resort: nearest column center
                best_col = 0
                col_left_0 = columns[0].x - left_x
                best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
                for ci in range(1, n):
                    col_left = columns[ci].x - left_x
                    dist = abs(w_center_x - (col_left + columns[ci].width / 2))
                    if dist < best_dist:
                        best_dist = dist
                        best_col = ci
                result[best_col].append(w)

    return result


# Regex: at least 2 consecutive letters (Latin + umlauts + accents)
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')

# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
# that do NOT appear here are treated as trailing OCR noise.
# Dictionary of legitimate short words; everything is lowercase, length 1-3.
# Built from space-separated word lists and split once at import time.
_COMMON_SHORT_WORDS: set = set(
    (
        # EN 1-2 letter
        "a i am an as at be by do go he if in is it me my no of oh ok on "
        "or so to up us we "
        # EN 3 letter
        "ace act add age ago aid aim air all and ant any ape arc are ark "
        "arm art ask ate axe bad bag ban bar bat bay bed bee bet big bin "
        "bit bow box boy bud bug bun bus but buy cab can cap car cat cop "
        "cow cry cub cup cut dad dam day den dew did die dig dim dip dog "
        "dot dry due dug dye ear eat eel egg elm end era eve ewe eye fan "
        "far fat fax fed fee few fig fin fir fit fix fly foe fog for fox "
        "fry fun fur gag gap gas get god got gum gun gut guy gym had ham "
        "has hat hay hen her hid him hip his hit hog hop hot how hue hug "
        "hum hut ice icy ill imp ink inn ion its ivy jam jar jaw jay jet "
        "jig job jog joy jug key kid kin kit lab lad lag lap law lay led "
        "leg let lid lie lip lit log lot low mad man map mat maw may men "
        "met mid mix mob mog mom mop mow mrs mud mug mum nag nap net new "
        "nod nor not now nun nut oak oar oat odd off oft oil old one opt "
        "orb ore our out owe owl own pad pal pan pat paw pay pea peg pen "
        "per pet pie pig pin pit ply pod pop pot pro pry pub pug pun pup "
        "put rag ram ran rap rat raw ray red ref rib rid rig rim rip rob "
        "rod roe rot row rub rug rum run rut rye sac sad sag sap sat saw "
        "say sea set sew she shy sin sip sir sis sit six ski sky sly sob "
        "sod son sop sot sow soy spa spy sty sub sue sum sun sup tab tad "
        "tag tan tap tar tax tea ten the tie tin tip toe ton too top tow "
        "toy try tub tug two urn use van vat vet via vie vim vow wag war "
        "was wax way web wed wet who why wig win wit woe wok won woo wow "
        "yam yap yaw yea yes yet yew you zap zip zoo "
        # DE 2-3 letter
        "ab da du ei er es ja ob um zu "
        "als alt auf aus bei bin bis das dem den der des die dir ehe ein "
        "eng gar gib gut hat her ich ihm ihr ins ist mal man mir mit nah "
        "neu nie nur nun ort rad rat rot ruf ruh sei sie tag tal tat tee "
        "tor tun tut uns vom von vor war was weg wem wen wer wie wir wut "
        "zum zur"
    ).split()
)

# Known abbreviations found in EN/DE textbooks and dictionaries.
# Stored WITHOUT trailing period (the noise filter strips periods).
# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
+_KNOWN_ABBREVIATIONS: set = { + # EN dictionary meta-words + 'sth', 'sb', 'smth', 'smb', 'sbd', + # EN general + 'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp', + 'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap', + # EN references / textbook + 'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr', + 'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff', + 'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs', + 'ans', 'wb', 'tb', 'vocab', + # EN parts of speech / grammar + 'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj', + 'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger', + 'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans', + 'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut', + 'attr', 'pred', 'comp', 'superl', 'pos', 'neg', + 'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml', + 'syn', 'ant', 'opp', 'var', 'orig', + # EN titles + 'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr', + # EN pronunciation + 'br', 'am', 'brit', 'amer', + # EN units + 'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml', + # DE general + 'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg', + 'bes', 'insb', 'insbes', 'bspw', 'ca', + 'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr', + 'inkl', 'exkl', 'zzgl', 'abzgl', + # DE references + 'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde', + 'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap', + 's', 'sp', 'zit', 'zs', 'vlg', + # DE grammar + 'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj', + 'praet', 'imp', 'part', 'mask', 'fem', 'neutr', + 'trennb', 'untrennb', 'ugs', 'geh', 'pej', + # DE regional + 'nordd', 'österr', 'schweiz', + # Linguistic + 'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym', + 'deriv', 'pref', 'suf', 'suff', 'dim', 'coll', + 'count', 'uncount', 'indef', 'def', 'poss', 'demon', +} + + +def _is_noise_tail_token(token: str) -> bool: + """Check if a token at the END of cell text is trailing OCR noise. 
+ + Trailing fragments are very common OCR artifacts from image edges, + borders, and neighbouring cells. This is more aggressive than a + general word filter: any short token that isn't in the dictionary + of common EN/DE words is considered noise. + + Examples of noise: "Es)", "3", "ee", "B" + Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]" + """ + t = token.strip() + if not t: + return True + + # Keep ellipsis + if t in ('...', '…'): + return False + + # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc. + if t.startswith('[') or t.startswith('["') or t.startswith("['"): + return False + if t.endswith(']'): + return False + + # Pure non-alpha → noise ("3", ")", "|") + alpha_chars = _RE_ALPHA.findall(t) + if not alpha_chars: + return True + + # Extract only alpha characters for dictionary lookup + cleaned = ''.join(alpha_chars) + + # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep + if cleaned.lower() in _KNOWN_ABBREVIATIONS: + return False + + # Strip normal trailing punctuation before checking for internal noise. + stripped_punct = re.sub(r'[.,;:!?]+$', '', t) # "cupcakes." → "cupcakes" + t_check = stripped_punct if stripped_punct else t + + # Check for legitimate punctuation patterns vs. real noise. + # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir", + # "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen" + # Noise: "3d", "B|", "x7" + # Strategy: strip common dictionary punctuation (parens, hyphens, slashes), + # THEN check if residual contains only alpha characters. + t_inner = t_check + # Remove all parentheses, hyphens, slashes, and dots — these are normal + # in dictionary entries: "(Salat-)Gurke", "Tanz(veranstaltung)", + # "(zer)brechen", "wir/uns", "e.g." + t_inner = re.sub(r'[()\-/.,;:!?]', '', t_inner) + # Now check: does the inner form still have non-alpha noise? 
+ inner_alpha = ''.join(_RE_ALPHA.findall(t_inner)) + has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False + + # Long alpha words (4+ chars) without internal noise are likely real + if len(cleaned) >= 4 and not has_internal_noise: + return False + + # Short words: check dictionary (uses only alpha chars) + if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise: + return False + + # Default: short or suspicious → noise + return True + + +def _is_garbage_text(text: str) -> bool: + """Check if entire cell text is OCR garbage from image areas. + + Garbage text = no recognizable dictionary word. Catches + "(ci]oeu", "uanoaain." etc. + """ + words = _RE_REAL_WORD.findall(text) + if not words: + # Check if any token is a known abbreviation (e.g. "e.g.") + alpha_only = ''.join(_RE_ALPHA.findall(text)).lower() + if alpha_only in _KNOWN_ABBREVIATIONS: + return False + return True + + for w in words: + wl = w.lower() + # Known short word or abbreviation → not garbage + if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS: + return False + # Long word (>= 4 chars): check vowel/consonant ratio. + # Real EN/DE words have 20-60% vowels. Garbage like "uanoaain" + # or "cioeu" has unusual ratios (too many or too few vowels). + if len(wl) >= 4: + vowels = sum(1 for c in wl if c in 'aeiouäöü') + ratio = vowels / len(wl) + if 0.15 <= ratio <= 0.65: + return False # plausible vowel ratio → real word + + return True + + +def _clean_cell_text(text: str) -> str: + """Remove OCR noise from cell text. Generic filters: + + 1. If the entire text has no real alphabetic word (>= 2 letters), clear. + 2. If the entire text is garbage (no dictionary word), clear. + 3. Strip trailing noise tokens from the end of the text. + """ + stripped = text.strip() + if not stripped: + return '' + + # --- Filter 1: No real word at all --- + if not _RE_REAL_WORD.search(stripped): + # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e." 
+ alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower() + if alpha_only not in _KNOWN_ABBREVIATIONS: + return '' + + # --- Filter 2: Entire text is garbage --- + if _is_garbage_text(stripped): + return '' + + # --- Filter 3: Strip trailing noise tokens --- + tokens = stripped.split() + while tokens and _is_noise_tail_token(tokens[-1]): + tokens.pop() + if not tokens: + return '' + + return ' '.join(tokens) + + +def _clean_cell_text_lite(text: str) -> str: + """Simplified noise filter for cell-first OCR (isolated cell crops). + + Since each cell is OCR'd in isolation (no neighbour content visible), + trailing-noise stripping is unnecessary. Only 2 filters remain: + + 1. No real alphabetic word (>= 2 letters) and not a known abbreviation → empty. + 2. Entire text is garbage (no dictionary word) → empty. + """ + stripped = text.strip() + if not stripped: + return '' + + # --- Filter 1: No real word at all --- + if not _RE_REAL_WORD.search(stripped): + alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower() + if alpha_only not in _KNOWN_ABBREVIATIONS: + return '' + + # --- Filter 2: Entire text is garbage --- + if _is_garbage_text(stripped): + return '' + + return stripped + + +# --------------------------------------------------------------------------- +# Bold detection via stroke-width analysis (relative / page-level) +# --------------------------------------------------------------------------- + +def _measure_stroke_width(gray_crop: np.ndarray) -> float: + """Measure mean stroke width in a binarised cell crop. + + Returns a DPI-normalised value (mean stroke width as % of crop height), + or 0.0 if measurement is not possible. 
+ """ + if gray_crop is None or gray_crop.size == 0: + return 0.0 + h, w = gray_crop.shape[:2] + if h < 10 or w < 10: + return 0.0 + + # Binarise: text = white (255), background = black (0) + _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU) + if cv2.countNonZero(bw) < 20: + return 0.0 + + # Distance transform: value at each white pixel = distance to nearest black + dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3) + + # Skeleton via morphological thinning + kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) + thin = bw.copy() + for _ in range(max(1, min(h, w) // 6)): + eroded = cv2.erode(thin, kernel) + if cv2.countNonZero(eroded) < 5: + break + thin = eroded + + skeleton_pts = thin > 0 + if not np.any(skeleton_pts): + return 0.0 + mean_stroke = float(np.mean(dist[skeleton_pts])) + return mean_stroke / max(h, 1) * 100 # normalised: % of cell height + + +def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray], + img_w: int, img_h: int) -> None: + """Two-pass bold detection: measure all cells, then compare against median. + + Cells with stroke width > 1.4× the page median are marked as bold. + This adapts automatically to font, DPI and scan quality. + Modifies cells in-place (sets 'is_bold' key). 
+    """
+    if ocr_img is None:
+        return
+
+    # Pass 1: measure stroke width for every cell with text
+    metrics: List[float] = []        # only successful measurements (sw > 0)
+    cell_strokes: List[float] = []   # one entry per cell, parallel to `cells`
+    for cell in cells:
+        sw = 0.0
+        if cell.get('text', '').strip():
+            bp = cell['bbox_px']
+            # Clamp the crop to image bounds before measuring.
+            y1 = max(0, bp['y'])
+            y2 = min(img_h, bp['y'] + bp['h'])
+            x1 = max(0, bp['x'])
+            x2 = min(img_w, bp['x'] + bp['w'])
+            if y2 > y1 and x2 > x1:
+                sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2])
+        cell_strokes.append(sw)
+        if sw > 0:
+            metrics.append(sw)
+
+    if len(metrics) < 3:
+        # Too few cells to compare — leave all as non-bold
+        # NOTE(review): on the early returns no cell gets an 'is_bold' key at
+        # all; downstream code presumably uses cell.get('is_bold') — confirm.
+        return
+
+    median_sw = float(np.median(metrics))
+    if median_sw <= 0:
+        return
+
+    # Pass 2: cells significantly above median → bold
+    for cell, sw in zip(cells, cell_strokes):
+        cell['is_bold'] = sw > 0 and (sw / median_sw) > 1.4
+
+
+# ---------------------------------------------------------------------------
diff --git a/klausur-service/backend/cv_preprocessing.py b/klausur-service/backend/cv_preprocessing.py
new file mode 100644
index 0000000..133d47f
--- /dev/null
+++ b/klausur-service/backend/cv_preprocessing.py
@@ -0,0 +1,1166 @@
+"""
+Image I/O, orientation detection, deskew, and dewarp for the CV vocabulary pipeline.
+
+License: Apache 2.0 (commercially usable)
+PRIVACY: All processing happens locally.
+"""
+
+import logging
+import time
+from collections import defaultdict
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+    CV2_AVAILABLE,
+    TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+# Guarded imports — mirror cv_vocab_types guards
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    import pytesseract
+    from PIL import Image
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+
+
+def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
+    """Render a PDF page to a high-resolution numpy array (BGR).
+
+    Args:
+        pdf_data: Raw PDF bytes.
+        page_number: 0-indexed page number.
+        zoom: Zoom factor applied to the 72-DPI PDF base (3.0 → 216 DPI).
+
+    Returns:
+        numpy array in BGR format.
+
+    Raises:
+        ValueError: If ``page_number`` is beyond the document's page count.
+    """
+    import fitz  # PyMuPDF — imported lazily so the module loads without it
+
+    pdf_doc = fitz.open(stream=pdf_data, filetype="pdf")
+    # NOTE(review): pdf_doc is not closed if the checks/rendering below raise —
+    # consider try/finally (or `with fitz.open(...)`) in a follow-up.
+    if page_number >= pdf_doc.page_count:
+        raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)")
+
+    page = pdf_doc[page_number]
+    mat = fitz.Matrix(zoom, zoom)
+    pix = page.get_pixmap(matrix=mat)
+
+    # Convert to numpy BGR — pix.n is the channel count of the pixmap.
+    img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
+    if pix.n == 4:  # RGBA
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
+    elif pix.n == 3:  # RGB
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
+    else:  # Grayscale
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
+
+    pdf_doc.close()
+    return img_bgr
+
+
+def render_image_high_res(image_data: bytes) -> np.ndarray:
+    """Load an image (PNG/JPEG) into a numpy array (BGR).
+
+    Args:
+        image_data: Raw image bytes.
+
+    Returns:
+        numpy array in BGR format.
+
+    Raises:
+        ValueError: If the bytes cannot be decoded as an image.
+    """
+    img_array = np.frombuffer(image_data, dtype=np.uint8)
+    img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+    if img_bgr is None:
+        raise ValueError("Could not decode image data")
+    return img_bgr
+
+
+# =============================================================================
+# Stage 1b: Orientation Detection (0°/90°/180°/270°)
+# =============================================================================
+
+def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
+    """Detect page orientation via Tesseract OSD and rotate if needed.
+
+    Handles upside-down scans (180°) common with book scanners where
+    every other page is flipped due to the scanner hinge.
+
+    Returns:
+        (corrected_image, rotation_degrees) — rotation is 0, 90, 180, or 270.
+    """
+    # pytesseract is an optional dependency (guarded import at module top).
+    if pytesseract is None:
+        return img_bgr, 0
+
+    try:
+        # Tesseract OSD needs a grayscale or RGB image
+        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+        pil_img = Image.fromarray(gray)
+
+        osd = pytesseract.image_to_osd(pil_img, output_type=pytesseract.Output.DICT)
+        rotate = osd.get("rotate", 0)
+        confidence = osd.get("orientation_conf", 0.0)
+
+        logger.info(f"OSD: orientation={rotate}° confidence={confidence:.1f}")
+
+        # Low-confidence OSD results are ignored rather than risking a wrong flip.
+        if rotate == 0 or confidence < 1.0:
+            return img_bgr, 0
+
+        # Apply rotation
+        # NOTE(review): Tesseract's OSD `rotate` value is commonly documented as
+        # the *clockwise* rotation needed to upright the text, which would map
+        # rotate==90 → ROTATE_90_CLOCKWISE — the opposite of the mapping below.
+        # 180° (the stated main use case) is unaffected either way; verify the
+        # 90°/270° branches against a rotated test scan before relying on them.
+        if rotate == 180:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_180)
+        elif rotate == 90:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_COUNTERCLOCKWISE)
+        elif rotate == 270:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_CLOCKWISE)
+        else:
+            return img_bgr, 0
+
+        logger.info(f"OSD: rotated {rotate}° to fix orientation")
+        return corrected, rotate
+
+    except Exception as e:
+        # Best-effort: OSD failure must never break the pipeline.
+        logger.warning(f"OSD orientation detection failed: {e}")
+        return img_bgr, 0
+
+
+# =============================================================================
+# Stage 2: Deskew (Rotation Correction)
+# =============================================================================
+
+def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
+    """Correct rotation using Hough Line detection.
+
+    Args:
+        img: BGR image.
+
+    Returns:
+        Tuple of (corrected image, detected angle in degrees).
+    """
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    # Binarize for line detection
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+    # Detect lines
+    lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
+                            minLineLength=img.shape[1] // 4, maxLineGap=20)
+
+    if lines is None or len(lines) < 3:
+        return img, 0.0
+
+    # Compute angles of near-horizontal lines
+    angles = []
+    for line in lines:
+        x1, y1, x2, y2 = line[0]
+        angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
+        if abs(angle) < 15:  # Only near-horizontal
+            angles.append(angle)
+
+    if not angles:
+        return img, 0.0
+
+    # Median is robust against the occasional diagonal stroke detected as a line.
+    median_angle = float(np.median(angles))
+
+    # Limit correction to ±5°
+    if abs(median_angle) > 5.0:
+        median_angle = 5.0 * np.sign(median_angle)
+
+    # Below 0.1° the correction is not worth the interpolation blur.
+    if abs(median_angle) < 0.1:
+        return img, 0.0
+
+    # Rotate
+    h, w = img.shape[:2]
+    center = (w // 2, h // 2)
+    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
+    corrected = cv2.warpAffine(img, M, (w, h),
+                               flags=cv2.INTER_LINEAR,
+                               borderMode=cv2.BORDER_REPLICATE)
+
+    logger.info(f"Deskew: corrected {median_angle:.2f}° rotation")
+    return corrected, median_angle
+
+
+def deskew_image_by_word_alignment(
+    image_data: bytes,
+    lang: str = "eng+deu",
+    downscale_factor: float = 0.5,
+) -> Tuple[bytes, float]:
+    """Correct rotation by fitting a line through left-most word starts per text line.
+
+    More robust than Hough-based deskew for vocabulary worksheets where text lines
+    have consistent left-alignment. Runs a quick Tesseract pass on a downscaled
+    copy to find word positions, computes the dominant left-edge column, fits a
+    line through those points and rotates the full-resolution image.
+
+    Args:
+        image_data: Raw image bytes (PNG/JPEG).
+        lang: Tesseract language string for the quick pass.
+        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).
+            Must be > 0 (used as a divisor when scaling back up).
+
+    Returns:
+        Tuple of (rotated image as PNG bytes, detected angle in degrees).
+    """
+    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
+        return image_data, 0.0
+
+    # 1. Decode image
+    img_array = np.frombuffer(image_data, dtype=np.uint8)
+    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+    if img is None:
+        logger.warning("deskew_by_word_alignment: could not decode image")
+        return image_data, 0.0
+
+    orig_h, orig_w = img.shape[:2]
+
+    # 2. Downscale for fast Tesseract pass
+    small_w = int(orig_w * downscale_factor)
+    small_h = int(orig_h * downscale_factor)
+    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)
+
+    # 3. Quick Tesseract — word-level positions
+    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
+    try:
+        data = pytesseract.image_to_data(
+            pil_small, lang=lang, config="--psm 6 --oem 3",
+            output_type=pytesseract.Output.DICT,
+        )
+    except Exception as e:
+        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
+        return image_data, 0.0
+
+    # 4. Per text-line, find the left-most word start
+    # Group by (block_num, par_num, line_num)
+    line_groups: Dict[tuple, list] = defaultdict(list)
+    for i in range(len(data["text"])):
+        text = (data["text"][i] or "").strip()
+        conf = int(data["conf"][i])
+        if not text or conf < 20:
+            continue
+        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
+        line_groups[key].append(i)
+
+    # Too few lines → angle estimate would be unreliable; leave image untouched.
+    if len(line_groups) < 5:
+        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
+        return image_data, 0.0
+
+    # For each line, pick the word with smallest 'left' → compute (left_x, center_y)
+    # Scale back to original resolution
+    scale = 1.0 / downscale_factor
+    points = []  # list of (x, y) in original-image coords
+    for key, indices in line_groups.items():
+        best_idx = min(indices, key=lambda i: data["left"][i])
+        lx = data["left"][best_idx] * scale
+        top = data["top"][best_idx] * scale
+        h = data["height"][best_idx] * scale
+        cy = top + h / 2.0
+        points.append((lx, cy))
+
+    # 5. Find dominant left-edge column + compute angle
+    # Only lines starting near the page's dominant left margin participate;
+    # indented lines (e.g. wrapped continuations) would bias the fit.
+    xs = np.array([p[0] for p in points])
+    ys = np.array([p[1] for p in points])
+    median_x = float(np.median(xs))
+    tolerance = orig_w * 0.03  # 3% of image width
+
+    mask = np.abs(xs - median_x) <= tolerance
+    filtered_xs = xs[mask]
+    filtered_ys = ys[mask]
+
+    if len(filtered_xs) < 5:
+        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
+        return image_data, 0.0
+
+    # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a)
+    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
+    slope = coeffs[0]  # dx/dy
+    angle_rad = np.arctan(slope)
+    angle_deg = float(np.degrees(angle_rad))
+
+    # Clamp to ±5°
+    angle_deg = max(-5.0, min(5.0, angle_deg))
+
+    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points "
+                f"(total lines: {len(line_groups)})")
+
+    if abs(angle_deg) < 0.05:
+        return image_data, 0.0
+
+    # 6. Rotate full-res image
+    center = (orig_w // 2, orig_h // 2)
+    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
+    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
+                             flags=cv2.INTER_LINEAR,
+                             borderMode=cv2.BORDER_REPLICATE)
+
+    # Encode back to PNG
+    success, png_buf = cv2.imencode(".png", rotated)
+    if not success:
+        logger.warning("deskew_by_word_alignment: PNG encoding failed")
+        return image_data, 0.0
+
+    return png_buf.tobytes(), angle_deg
+
+
+def _projection_gradient_score(profile: np.ndarray) -> float:
+    """Score a projection profile by the L2-norm of its first derivative.
+
+    Higher score = sharper transitions between text-lines and gaps,
+    i.e. better row/column alignment.
+    """
+    diff = np.diff(profile)
+    return float(np.sum(diff * diff))
+
+
+def deskew_image_iterative(
+    img: np.ndarray,
+    coarse_range: float = 5.0,
+    coarse_step: float = 0.1,
+    fine_range: float = 0.15,
+    fine_step: float = 0.02,
+) -> Tuple[np.ndarray, float, Dict[str, Any]]:
+    """Iterative deskew using vertical-edge projection optimisation.
+
+    The key insight: at the correct rotation angle, vertical features
+    (word left-edges, column borders) become truly vertical, producing
+    the sharpest peaks in the vertical projection of vertical edges.
+
+    Method:
+    1. Detect vertical edges via Sobel-X on the central crop.
+    2. Coarse sweep: rotate edge image, compute vertical projection
+       gradient score. The angle where vertical edges align best wins.
+    3. Fine sweep: refine around the coarse winner.
+
+    Args:
+        img: BGR image (full resolution).
+        coarse_range: half-range in degrees for the coarse sweep.
+        coarse_step: step size in degrees for the coarse sweep.
+        fine_range: half-range around the coarse winner for the fine sweep.
+        fine_step: step size in degrees for the fine sweep.
+
+    Returns:
+        (rotated_bgr, angle_degrees, debug_dict)
+    """
+    h, w = img.shape[:2]
+    debug: Dict[str, Any] = {}
+
+    # --- Grayscale + vertical edge detection ---
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Central crop (15%-85% height, 10%-90% width) to avoid page margins
+    y_lo, y_hi = int(h * 0.15), int(h * 0.85)
+    x_lo, x_hi = int(w * 0.10), int(w * 0.90)
+    gray_crop = gray[y_lo:y_hi, x_lo:x_hi]
+
+    # Sobel-X → absolute vertical edges
+    sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
+    edges = np.abs(sobel_x)
+    # Normalise to 0-255 for consistent scoring
+    edge_max = edges.max()
+    if edge_max > 0:
+        edges = (edges / edge_max * 255).astype(np.uint8)
+    else:
+        # Completely flat crop (blank page) — nothing to align on.
+        return img, 0.0, {"error": "no edges detected"}
+
+    crop_h, crop_w = edges.shape[:2]
+    crop_center = (crop_w // 2, crop_h // 2)
+
+    # Trim margin after rotation to avoid border artifacts
+    trim_y = max(4, int(crop_h * 0.03))
+    trim_x = max(4, int(crop_w * 0.03))
+
+    def _sweep_edges(angles: np.ndarray) -> list:
+        """Score each angle by vertical projection gradient of vertical edges."""
+        results = []
+        for angle in angles:
+            if abs(angle) < 1e-6:
+                # Skip the warp for the zero angle — identical to input.
+                rotated = edges
+            else:
+                M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
+                rotated = cv2.warpAffine(edges, M, (crop_w, crop_h),
+                                         flags=cv2.INTER_NEAREST,
+                                         borderMode=cv2.BORDER_REPLICATE)
+            # Trim borders to avoid edge artifacts
+            trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
+            v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
+            score = _projection_gradient_score(v_profile)
+            results.append((float(angle), score))
+        return results
+
+    # --- Phase 1: coarse sweep ---
+    # The +coarse_step*0.5 nudge makes arange include the upper endpoint.
+    coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
+    coarse_results = _sweep_edges(coarse_angles)
+    best_coarse = max(coarse_results, key=lambda x: x[1])
+    best_coarse_angle, best_coarse_score = best_coarse
+
+    debug["coarse_best_angle"] = round(best_coarse_angle, 2)
+    debug["coarse_best_score"] = round(best_coarse_score, 1)
+    debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]
+
+    # --- Phase 2: fine sweep around coarse winner ---
+    fine_lo = best_coarse_angle - fine_range
+    fine_hi = best_coarse_angle + fine_range
+    fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
+    fine_results = _sweep_edges(fine_angles)
+    best_fine = max(fine_results, key=lambda x: x[1])
+    best_fine_angle, best_fine_score = best_fine
+
+    debug["fine_best_angle"] = round(best_fine_angle, 2)
+    debug["fine_best_score"] = round(best_fine_score, 1)
+    debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]
+
+    final_angle = best_fine_angle
+
+    # Clamp to ±5°
+    final_angle = max(-5.0, min(5.0, final_angle))
+
+    logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}° fine={best_fine_angle:.2f}° -> {final_angle:.2f}°")
+
+    if abs(final_angle) < 0.05:
+        return img, 0.0, debug
+
+    # --- Rotate full-res image ---
+    center = (w // 2, h // 2)
+    M = cv2.getRotationMatrix2D(center, final_angle, 1.0)
+    rotated = cv2.warpAffine(img, M, (w, h),
+                             flags=cv2.INTER_LINEAR,
+                             borderMode=cv2.BORDER_REPLICATE)
+
+    return rotated, final_angle, debug
+
+
+def _measure_textline_slope(img: np.ndarray) -> float:
+    """Measure residual text-line slope via Tesseract word-position regression.
+
+    Groups Tesseract words by (block, par, line), fits a linear regression
+    per line (y = slope * x + b), and returns the trimmed-mean slope in
+    degrees. Positive = text rises to the right, negative = falls.
+
+    This is the most direct measurement of remaining rotation after deskew.
+    """
+    # Aliased to avoid shadowing if a local `math` ever appears in this scope.
+    import math as _math
+
+    if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
+        return 0.0
+
+    h, w = img.shape[:2]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    data = pytesseract.image_to_data(
+        Image.fromarray(gray),
+        output_type=pytesseract.Output.DICT,
+        config="--psm 6",
+    )
+
+    # Group word centres by text line
+    lines: Dict[tuple, list] = {}
+    for i in range(len(data["text"])):
+        txt = (data["text"][i] or "").strip()
+        # Short or low-confidence words give unreliable centre positions.
+        if len(txt) < 2 or int(data["conf"][i]) < 30:
+            continue
+        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
+        cx = data["left"][i] + data["width"][i] / 2.0
+        cy = data["top"][i] + data["height"][i] / 2.0
+        lines.setdefault(key, []).append((cx, cy))
+
+    # Per-line linear regression → slope angle
+    slopes: list = []
+    for pts in lines.values():
+        if len(pts) < 3:
+            continue
+        pts.sort(key=lambda p: p[0])
+        xs = np.array([p[0] for p in pts], dtype=np.float64)
+        ys = np.array([p[1] for p in pts], dtype=np.float64)
+        if xs[-1] - xs[0] < w * 0.15:
+            continue  # skip short lines
+        # Least-squares fit of y = slope*x + b over the word centres.
+        A = np.vstack([xs, np.ones_like(xs)]).T
+        result = np.linalg.lstsq(A, ys, rcond=None)
+        slope = result[0][0]
+        slopes.append(_math.degrees(_math.atan(slope)))
+
+    if len(slopes) < 3:
+        return 0.0
+
+    # Trimmed mean (drop 10% extremes on each side)
+    slopes.sort()
+    trim = max(1, len(slopes) // 10)
+    trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes
+    if not trimmed:
+        return 0.0
+
+    return sum(trimmed) / len(trimmed)
+
+
+def deskew_two_pass(
+    img: np.ndarray,
+    coarse_range: float = 5.0,
+) -> Tuple[np.ndarray, float, Dict[str, Any]]:
+    """Two-pass deskew: iterative projection + word-alignment residual check.
+
+    Pass 1: ``deskew_image_iterative()`` (vertical-edge projection, wide range).
+    Pass 2: ``deskew_image_by_word_alignment()`` on the already-corrected image
+    to detect and fix residual skew that the projection method missed.
+
+    The two corrections are summed. If the residual from Pass 2 is below
+    0.3° it is ignored (already good enough).
+
+    Returns:
+        (corrected_bgr, total_angle_degrees, debug_dict)
+    """
+    debug: Dict[str, Any] = {}
+
+    # --- Pass 1: iterative projection ---
+    corrected, angle1, dbg1 = deskew_image_iterative(
+        img.copy(), coarse_range=coarse_range,
+    )
+    debug["pass1_angle"] = round(angle1, 3)
+    debug["pass1_method"] = "iterative"
+    debug["pass1_debug"] = dbg1
+
+    # --- Pass 2: word-alignment residual check on corrected image ---
+    angle2 = 0.0
+    try:
+        # Encode the corrected image to PNG bytes for word-alignment
+        # (that function takes bytes, not an ndarray).
+        ok, buf = cv2.imencode(".png", corrected)
+        if ok:
+            corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
+            if abs(angle2) >= 0.3:
+                # Significant residual — decode and use the second correction
+                arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
+                corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
+                if corrected2 is not None:
+                    corrected = corrected2
+                    logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° applied "
+                                f"(total={angle1 + angle2:.2f}°)")
+                else:
+                    angle2 = 0.0
+            else:
+                logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° < 0.3° — skipped")
+                angle2 = 0.0
+    except Exception as e:
+        # Best-effort: pass 2 must never undo pass 1.
+        logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
+        angle2 = 0.0
+
+    # --- Pass 3: Tesseract text-line regression residual check ---
+    # The most reliable final check: measure actual text-line slopes
+    # using Tesseract word positions and linear regression per line.
+    angle3 = 0.0
+    try:
+        residual = _measure_textline_slope(corrected)
+        debug["pass3_raw"] = round(residual, 3)
+        # Same 0.3° significance threshold as pass 2.
+        if abs(residual) >= 0.3:
+            h3, w3 = corrected.shape[:2]
+            center3 = (w3 // 2, h3 // 2)
+            M3 = cv2.getRotationMatrix2D(center3, residual, 1.0)
+            corrected = cv2.warpAffine(
+                corrected, M3, (w3, h3),
+                flags=cv2.INTER_LINEAR,
+                borderMode=cv2.BORDER_REPLICATE,
+            )
+            angle3 = residual
+            logger.info(
+                "deskew_two_pass: pass3 text-line residual=%.2f° applied",
+                residual,
+            )
+        else:
+            logger.info(
+                "deskew_two_pass: pass3 text-line residual=%.2f° < 0.3° — skipped",
+                residual,
+            )
+    except Exception as e:
+        logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)
+
+    # Report the summed correction of all three passes.
+    total_angle = angle1 + angle2 + angle3
+    debug["pass2_angle"] = round(angle2, 3)
+    debug["pass2_method"] = "word_alignment"
+    debug["pass3_angle"] = round(angle3, 3)
+    debug["pass3_method"] = "textline_regression"
+    debug["total_angle"] = round(total_angle, 3)
+
+    logger.info(
+        "deskew_two_pass: pass1=%.2f° + pass2=%.2f° + pass3=%.2f° = %.2f°",
+        angle1, angle2, angle3, total_angle,
+    )
+
+    return corrected, total_angle, debug
+
+
+# =============================================================================
+# Stage 3: Dewarp (Book Curvature Correction)
+# =============================================================================
+
+def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
+    """Detect the vertical shear angle of the page.
+
+    After deskew (horizontal lines aligned), vertical features like column
+    edges may still be tilted. This measures that tilt by tracking the
+    strongest vertical edge across horizontal strips.
+
+    The result is a shear angle in degrees: the angular difference between
+    true vertical and the detected column edge.
+
+    Returns:
+        Dict with keys: method, shear_degrees, confidence.
+    """
+    h, w = img.shape[:2]
+    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Vertical Sobel to find vertical edges
+    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
+    # NOTE(review): float64 magnitudes above 255 wrap (modulo) when cast to
+    # uint8 — cv2.convertScaleAbs or prior normalisation would be safer;
+    # verify whether the wrap-around affects the Otsu threshold below.
+    abs_sobel = np.abs(sobel_x).astype(np.uint8)
+
+    # Binarize with Otsu
+    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+    # Track the strongest left-margin edge in 20 horizontal strips.
+    num_strips = 20
+    strip_h = h // num_strips
+    edge_positions = []  # (y_center, x_position)
+
+    for i in range(num_strips):
+        y_start = i * strip_h
+        y_end = min((i + 1) * strip_h, h)
+        strip = binary[y_start:y_end, :]
+
+        # Project vertically (sum along y-axis)
+        projection = np.sum(strip, axis=0).astype(np.float64)
+        if projection.max() == 0:
+            continue
+
+        # Find the strongest vertical edge in left 40% of image
+        search_w = int(w * 0.4)
+        left_proj = projection[:search_w]
+        if left_proj.max() == 0:
+            continue
+
+        # Smooth and find peak
+        kernel_size = max(3, w // 100)
+        if kernel_size % 2 == 0:
+            kernel_size += 1
+        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
+        x_pos = float(np.argmax(smoothed))
+        y_center = (y_start + y_end) / 2.0
+        edge_positions.append((y_center, x_pos))
+
+    # Need a majority of strips to contribute before trusting the fit.
+    if len(edge_positions) < 8:
+        return result
+
+    ys = np.array([p[0] for p in edge_positions])
+    xs = np.array([p[1] for p in edge_positions])
+
+    # Remove outliers (> 2 std from median)
+    median_x = np.median(xs)
+    std_x = max(np.std(xs), 1.0)
+    mask = np.abs(xs - median_x) < 2 * std_x
+    ys = ys[mask]
+    xs = xs[mask]
+
+    if len(ys) < 6:
+        return result
+
+    # Fit straight line: x = slope * y + intercept
+    # The slope tells us the tilt of the vertical edge
+    straight_coeffs = np.polyfit(ys, xs, 1)
+    slope = straight_coeffs[0]  # dx/dy in pixels
+    fitted = np.polyval(straight_coeffs, ys)
+    residuals = xs - fitted
+    rmse = float(np.sqrt(np.mean(residuals ** 2)))
+
+    # Convert slope to angle: arctan(dx/dy) in degrees
+    import math
+    shear_degrees = math.degrees(math.atan(slope))
+
+    # Confidence grows with point count, shrinks with fit error (RMSE).
+    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(float(confidence), 2)
+
+    return result
+
+
+def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
+    """Detect shear angle by maximising variance of horizontal text-line projections.
+
+    Principle: horizontal text lines produce a row-projection profile with sharp
+    peaks (high variance) when the image is correctly aligned. Any residual shear
+    smears the peaks and reduces variance. We sweep ±3° and pick the angle whose
+    corrected projection has the highest variance.
+
+    Works best on pages with clear horizontal banding (vocabulary tables, prose).
+    Complements _detect_shear_angle() which needs strong vertical edges.
+
+    Returns:
+        Dict with keys: method, shear_degrees, confidence.
+    """
+    import math
+    result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}
+
+    h, w = img.shape[:2]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Otsu binarisation
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+    # Work at half resolution for speed
+    small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
+    sh, sw = small.shape
+
+    # 2-pass angle sweep for 10x better precision:
+    # Pass 1: Coarse sweep ±3° in 0.5° steps (13 values)
+    # Pass 2: Fine sweep ±0.5° around coarse best in 0.05° steps (21 values)
+
+    def _sweep_variance(angles_list):
+        # Applies a SHEAR (not rotation, despite the `rotated` name) for
+        # each candidate angle and scores the row-projection variance.
+        results = []
+        for angle_deg in angles_list:
+            if abs(angle_deg) < 0.001:
+                rotated = small
+            else:
+                shear_tan = math.tan(math.radians(angle_deg))
+                M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
+                rotated = cv2.warpAffine(small, M, (sw, sh),
+                                         flags=cv2.INTER_NEAREST,
+                                         borderMode=cv2.BORDER_CONSTANT)
+            profile = np.sum(rotated, axis=1).astype(float)
+            results.append((angle_deg, float(np.var(profile))))
+        return results
+
+    # Pass 1: coarse
+    coarse_angles = [a * 0.5 for a in range(-6, 7)]  # 13 values
+    coarse_results = _sweep_variance(coarse_angles)
+    coarse_best = max(coarse_results, key=lambda x: x[1])
+
+    # Pass 2: fine around coarse best
+    fine_center = coarse_best[0]
+    fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)]  # 21 values
+    fine_results = _sweep_variance(fine_angles)
+    fine_best = max(fine_results, key=lambda x: x[1])
+
+    best_angle = fine_best[0]
+    best_variance = fine_best[1]
+    variances = coarse_results + fine_results
+
+    # Confidence: how much sharper is the best angle vs. the mean?
+    all_mean = sum(v for _, v in variances) / len(variances)
+    if all_mean > 0 and best_variance > all_mean:
+        confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
+    else:
+        confidence = 0.0
+
+    result["shear_degrees"] = round(best_angle, 3)
+    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    return result
+
+
+def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
+    """Detect shear using Hough transform on printed table / ruled lines.
+
+    Vocabulary worksheets have near-horizontal printed table borders. After
+    deskew these should be exactly horizontal; any residual tilt equals the
+    vertical shear angle (with inverted sign).
+
+    The sign convention: a horizontal line tilting +α degrees (left end lower)
+    means the page has vertical shear of -α degrees (left column edge drifts
+    to the left going downward).
+
+    Returns:
+        Dict with keys: method, shear_degrees, confidence.
+    """
+    result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}
+
+    h, w = img.shape[:2]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
+
+    # Thresholds scale with page width so the detector is DPI-independent.
+    min_len = int(w * 0.15)
+    lines = cv2.HoughLinesP(
+        edges, rho=1, theta=np.pi / 360,
+        threshold=int(w * 0.08),
+        minLineLength=min_len,
+        maxLineGap=20,
+    )
+
+    if lines is None or len(lines) < 3:
+        return result
+
+    # Keep only near-horizontal segments; weight each by its length.
+    horizontal_angles: List[Tuple[float, float]] = []
+    for line in lines:
+        x1, y1, x2, y2 = line[0]
+        if x1 == x2:
+            continue
+        angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
+        if abs(angle) <= 5.0:
+            length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
+            horizontal_angles.append((angle, length))
+
+    if len(horizontal_angles) < 3:
+        return result
+
+    # Weighted median
+    # Sort angles, accumulate the length weights, and pick the angle where
+    # the cumulative weight crosses half the total — robust to short noise
+    # segments that a plain median over segments would overweight.
+    angles_arr = np.array([a for a, _ in horizontal_angles])
+    weights_arr = np.array([l for _, l in horizontal_angles])
+    sorted_idx = np.argsort(angles_arr)
+    s_angles = angles_arr[sorted_idx]
+    s_weights = weights_arr[sorted_idx]
+    cum = np.cumsum(s_weights)
+    mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0))
+    median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)])
+
+    # Confidence = fraction of segments within 1° of the weighted median.
+    agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0)
+    confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85
+
+    # Sign inversion: horizontal line tilt is complementary to vertical shear
+    shear_degrees = -median_angle
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    return result
+
+
+def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
+    """Detect shear by measuring text-line straightness (Method D).
+
+    Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
+    bounding boxes, groups them into vertical columns by X-proximity,
+    and measures how the left-edge X position drifts with Y (vertical
+    position). The drift dx/dy is the tangent of the shear angle.
+
+    This directly measures vertical shear (column tilt) rather than
+    horizontal text-line slope, which is already corrected by deskew.
+
+    Returns:
+        Dict with keys: method, shear_degrees, confidence.
+    """
+    import math
+    result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}
+
+    h, w = img.shape[:2]
+    # Downscale 50% for speed
+    scale = 0.5
+    small = cv2.resize(img, (int(w * scale), int(h * scale)),
+                       interpolation=cv2.INTER_AREA)
+    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
+    pil_img = Image.fromarray(gray)
+
+    try:
+        data = pytesseract.image_to_data(
+            pil_img, lang='eng+deu', config='--psm 11 --oem 3',
+            output_type=pytesseract.Output.DICT,
+        )
+    except Exception:
+        # Best-effort detector: a Tesseract failure just means zero confidence.
+        return result
+
+    # Collect word left-edges (x) and vertical centres (y)
+    words = []
+    for i in range(len(data['text'])):
+        text = data['text'][i].strip()
+        conf = int(data['conf'][i])
+        if not text or conf < 20 or len(text) < 2:
+            continue
+        left_x = float(data['left'][i])
+        cy = data['top'][i] + data['height'][i] / 2.0
+        word_w = float(data['width'][i])
+        words.append((left_x, cy, word_w))
+
+    if len(words) < 15:
+        return result
+
+    # --- Group words into vertical columns by left-edge X proximity ---
+    # Sort by x, then cluster words whose left-edges are within x_tol
+    avg_w = sum(ww for _, _, ww in words) / len(words)
+    x_tol = max(avg_w * 0.4, 8)  # tolerance for "same column"
+
+    # (lambda parameter `w` shadows the image-width local — intentional scope,
+    # only inside the key function)
+    words_by_x = sorted(words, key=lambda w: w[0])
+    columns: List[List[Tuple[float, float]]] = []  # each: [(left_x, cy), ...]
+    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
+    cur_x = words_by_x[0][0]
+
+    for lx, cy, _ in words_by_x[1:]:
+        if abs(lx - cur_x) <= x_tol:
+            cur_col.append((lx, cy))
+            # Update running x as median of cluster
+            # (exponential moving average, 80/20 — tracks slow drift)
+            cur_x = cur_x * 0.8 + lx * 0.2
+        else:
+            # Close the current cluster; keep it only if it has enough members.
+            if len(cur_col) >= 5:
+                columns.append(cur_col)
+            cur_col = [(lx, cy)]
+            cur_x = lx
+    if len(cur_col) >= 5:
+        columns.append(cur_col)
+
+    if len(columns) < 2:
+        return result
+
+    # --- For each column, measure X-drift as a function of Y ---
+    # Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle)
+    drifts = []
+    for col in columns:
+        ys = np.array([p[1] for p in col])
+        xs = np.array([p[0] for p in col])
+        y_range = ys.max() - ys.min()
+        if y_range < h * scale * 0.3:
+            continue  # column must span at least 30% of image height
+        # Linear regression: x = a*y + b
+        coeffs = np.polyfit(ys, xs, 1)
+        drifts.append(coeffs[0])  # dx/dy
+
+    if len(drifts) < 2:
+        return result
+
+    # Median dx/dy → shear angle
+    # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
+    median_drift = float(np.median(drifts))
+    shear_degrees = math.degrees(math.atan(median_drift))
+
+    # Confidence from column count + drift consistency
+    drift_std = float(np.std(drifts))
+    consistency = max(0.0, 1.0 - drift_std * 50)  # tighter penalty for drift variance
+    count_factor = min(1.0, len(drifts) / 4.0)
+    confidence = count_factor * 0.5 + consistency * 0.5
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
+                "shear=%.3f°, conf=%.2f",
+                len(columns), len(drifts), median_drift,
+                shear_degrees, confidence)
+    return result
+
+
+def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
+    """Check whether the dewarp correction actually improved alignment.
+
+    Compares horizontal projection variance before and after correction.
+    Higher variance means sharper text-line peaks, which indicates better
+    horizontal alignment.
+
+    Returns True if the correction improved the image, False if it should
+    be discarded.
+    """
+    def _h_proj_variance(img: np.ndarray) -> float:
+        # Row-projection variance at half resolution (speed) on the
+        # Otsu-binarised image — same metric used by the projection detector.
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        _, binary = cv2.threshold(gray, 0, 255,
+                                  cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+        small = cv2.resize(binary, (binary.shape[1] // 2, binary.shape[0] // 2),
+                           interpolation=cv2.INTER_AREA)
+        profile = np.sum(small, axis=1).astype(float)
+        return float(np.var(profile))
+
+    var_before = _h_proj_variance(original)
+    var_after = _h_proj_variance(corrected)
+
+    # Correction must improve variance (even by a tiny margin)
+    return var_after > var_before
+
+
+def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
+    """Apply a vertical shear correction to an image.
+
+    Shifts each row horizontally proportional to its distance from the
+    vertical center. This corrects the tilt of vertical features (columns)
+    without affecting horizontal alignment (text lines).
+
+    Args:
+        img: BGR image.
+        shear_degrees: Shear angle in degrees. Positive = shift top-right/bottom-left.
+
+    Returns:
+        Corrected image.
+    """
+    import math
+    h, w = img.shape[:2]
+    shear_tan = math.tan(math.radians(shear_degrees))
+
+    # Affine matrix: shift x by shear_tan * (y - h/2)
+    # [1  shear_tan  -h/2*shear_tan]
+    # [0  1           0            ]
+    # The -h/2 offset centres the shear so the middle row stays fixed.
+    M = np.float32([
+        [1, shear_tan, -h / 2.0 * shear_tan],
+        [0, 1, 0],
+    ])
+
+    corrected = cv2.warpAffine(img, M, (w, h),
+                               flags=cv2.INTER_LINEAR,
+                               borderMode=cv2.BORDER_REPLICATE)
+    return corrected
+
+
+def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
+    """Combine multiple shear detections into a single weighted estimate (v2).
+
+    Ensemble v2 changes vs v1:
+    - Minimum confidence threshold is ``_MIN_CONF`` (0.35; see the inline
+      comment below for the history of this value)
+    - text_lines method gets 1.5× weight boost (most reliable detector)
+    - Outlier filter at 1° from weighted mean
+
+    Returns:
+        (shear_degrees, ensemble_confidence, methods_used_str)
+    """
+    # Confidence threshold — lowered from 0.5 to 0.35 to catch subtle shear
+    # that individual methods detect with moderate confidence.
+    _MIN_CONF = 0.35
+
+    # text_lines gets a weight boost as the most content-aware method
+    _METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
+
+    accepted = []
+    for d in detections:
+        if d["confidence"] < _MIN_CONF:
+            continue
+        boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
+        effective_conf = d["confidence"] * boost
+        accepted.append((d["shear_degrees"], effective_conf, d["method"]))
+
+    if not accepted:
+        return 0.0, 0.0, "none"
+
+    # Single detector above threshold — use it directly (cap boosted conf at 1).
+    if len(accepted) == 1:
+        deg, conf, method = accepted[0]
+        return deg, min(conf, 1.0), method
+
+    # First pass: weighted mean
+    total_w = sum(c for _, c, _ in accepted)
+    w_mean = sum(d * c for d, c, _ in accepted) / total_w
+
+    # Outlier filter: keep results within 1° of weighted mean
+    filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
+    if not filtered:
+        filtered = accepted  # fallback: keep all
+
+    # Second pass: weighted mean on filtered results
+    total_w2 = sum(c for _, c, _ in filtered)
+    final_deg = sum(d * c for d, c, _ in filtered) / total_w2
+
+    # Ensemble confidence: average of individual confidences, boosted when
+    # methods agree (all within 0.5° of each other)
+    avg_conf = total_w2 / len(filtered)
+    spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
+    agreement_bonus = 0.15 if spread < 0.5 else 0.0
+    ensemble_conf = min(1.0, avg_conf + agreement_bonus)
+
+    methods_str = "+".join(m for _, _, m in filtered)
+    return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
+
+
+def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
+
def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Correct vertical shear after deskew (v2 with quality gate).

    After deskew aligns horizontal text lines, vertical features (column
    edges) may still be tilted. This detects the tilt angle using an ensemble
    of four complementary methods and applies an affine shear correction.

    Methods (all run in ~150ms total):
        A. _detect_shear_angle()         — vertical edge profile (~50ms)
        B. _detect_shear_by_projection() — horizontal text-line variance (~30ms)
        C. _detect_shear_by_hough()      — Hough lines on table borders (~20ms)
        D. _detect_shear_by_text_lines() — text-line straightness (~50ms)

    Quality gate: after correction, horizontal projection variance is compared
    before vs after. If correction worsened alignment, it is discarded.

    Args:
        img: BGR image (already deskewed).
        use_ensemble: If False, fall back to single-method behaviour (method A only).

    Returns:
        Tuple of (corrected_image, dewarp_info).
        dewarp_info keys: method, shear_degrees, confidence, detections.
    """
    # Shared "leave image untouched" result; its 'detections' slot is
    # filled in on every early-exit path below.
    no_correction = {
        "method": "none",
        "shear_degrees": 0.0,
        "confidence": 0.0,
        "detections": [],
    }

    if not CV2_AVAILABLE:
        return img, no_correction

    t0 = time.time()

    if use_ensemble:
        det_a = _detect_shear_angle(img)
        det_b = _detect_shear_by_projection(img)
        det_c = _detect_shear_by_hough(img)
        det_d = _detect_shear_by_text_lines(img)
        detections = [det_a, det_b, det_c, det_d]
        shear_deg, confidence, method = _ensemble_shear(detections)
    else:
        # Single-method mode: method A only, no fusion.
        det_a = _detect_shear_angle(img)
        detections = [det_a]
        shear_deg = det_a["shear_degrees"]
        confidence = det_a["confidence"]
        method = det_a["method"]

    duration = time.time() - t0

    # The len(detections) guards keep this log line valid in
    # single-method mode, where only detection A exists.
    logger.info(
        "dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | "
        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
        shear_deg, confidence, method, duration,
        detections[0]["shear_degrees"], detections[0]["confidence"],
        detections[1]["shear_degrees"] if len(detections) > 1 else 0.0,
        detections[1]["confidence"] if len(detections) > 1 else 0.0,
        detections[2]["shear_degrees"] if len(detections) > 2 else 0.0,
        detections[2]["confidence"] if len(detections) > 2 else 0.0,
        detections[3]["shear_degrees"] if len(detections) > 3 else 0.0,
        detections[3]["confidence"] if len(detections) > 3 else 0.0,
    )

    # Always include individual detections (even when no correction applied)
    _all_detections = [
        {"method": d["method"], "shear_degrees": d["shear_degrees"],
         "confidence": d["confidence"]}
        for d in detections
    ]

    # Thresholds: very small shear (<0.08°) is truly irrelevant for OCR.
    # For ensemble confidence, require at least 0.4 (lowered from 0.5 to
    # catch moderate-confidence detections from multiple agreeing methods).
    if abs(shear_deg) < 0.08 or confidence < 0.4:
        no_correction["detections"] = _all_detections
        return img, no_correction

    # Apply correction (negate the detected shear to straighten)
    corrected = _apply_shear(img, -shear_deg)

    # Quality gate: verify the correction actually improved alignment.
    # For small corrections (< 0.5°), the projection variance change can be
    # negligible, so we skip the quality gate — the cost of a tiny wrong
    # correction is much less than the cost of leaving 0.4° uncorrected
    # (which shifts content ~25px at image edges on tall scans).
    if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
        logger.info("dewarp: quality gate REJECTED correction (%.3f°) — "
                    "projection variance did not improve", shear_deg)
        no_correction["detections"] = _all_detections
        return img, no_correction

    info = {
        "method": method,
        "shear_degrees": shear_deg,
        "confidence": confidence,
        "detections": _all_detections,
    }

    return corrected, info
+ shear_degrees: Shear angle in degrees to correct. + + Returns: + Corrected image. + """ + if abs(shear_degrees) < 0.001: + return img + return _apply_shear(img, -shear_degrees) + diff --git a/klausur-service/backend/cv_review.py b/klausur-service/backend/cv_review.py new file mode 100644 index 0000000..b3e0bc6 --- /dev/null +++ b/klausur-service/backend/cv_review.py @@ -0,0 +1,1184 @@ +""" +Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration. + +Lizenz: Apache 2.0 (kommerziell nutzbar) +DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. +""" + +import json +import logging +import os +import re +import time +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np + +from cv_vocab_types import ( + CV_PIPELINE_AVAILABLE, + PageRegion, + PipelineResult, + VocabRow, +) +from cv_preprocessing import ( + deskew_image, + dewarp_image, + render_image_high_res, + render_pdf_high_res, +) +from cv_layout import ( + analyze_layout, + create_layout_image, + create_ocr_image, +) +from cv_ocr_engines import ( + _fix_character_confusion, +) + +logger = logging.getLogger(__name__) + +try: + import cv2 +except ImportError: + cv2 = None # type: ignore[assignment] + +try: + import pytesseract + from PIL import Image +except ImportError: + pytesseract = None # type: ignore[assignment] + Image = None # type: ignore[assignment,misc] + + +# ============================================================================= +# Stage 6: Multi-Pass OCR +# ============================================================================= + +def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str, + psm: int, fallback_psm: Optional[int] = None, + min_confidence: float = 40.0) -> List[Dict[str, Any]]: + """Run Tesseract OCR on a specific region with given PSM. + + Args: + ocr_img: Binarized full-page image. + region: Region to crop and OCR. + lang: Tesseract language string. + psm: Page Segmentation Mode. 
+ fallback_psm: If confidence too low, retry with this PSM per line. + min_confidence: Minimum average confidence before fallback. + + Returns: + List of word dicts with text, position, confidence. + """ + # Crop region + crop = ocr_img[region.y:region.y + region.height, + region.x:region.x + region.width] + + if crop.size == 0: + return [] + + # Convert to PIL for pytesseract + pil_img = Image.fromarray(crop) + + # Run Tesseract with specified PSM + config = f'--psm {psm} --oem 3' + try: + data = pytesseract.image_to_data(pil_img, lang=lang, config=config, + output_type=pytesseract.Output.DICT) + except Exception as e: + logger.warning(f"Tesseract failed for region {region.type}: {e}") + return [] + + words = [] + for i in range(len(data['text'])): + text = data['text'][i].strip() + conf = int(data['conf'][i]) + if not text or conf < 10: + continue + words.append({ + 'text': text, + 'left': data['left'][i] + region.x, # Absolute coords + 'top': data['top'][i] + region.y, + 'width': data['width'][i], + 'height': data['height'][i], + 'conf': conf, + 'region_type': region.type, + }) + + # Check average confidence + if words and fallback_psm is not None: + avg_conf = sum(w['conf'] for w in words) / len(words) + if avg_conf < min_confidence: + logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, " + f"trying fallback PSM {fallback_psm}") + words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm) + + return words + + +def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion, + lang: str, psm: int) -> List[Dict[str, Any]]: + """OCR a region line by line (fallback for low-confidence regions). + + Splits the region into horizontal strips based on text density, + then OCRs each strip individually with the given PSM. 
+ """ + crop = ocr_img[region.y:region.y + region.height, + region.x:region.x + region.width] + + if crop.size == 0: + return [] + + # Find text lines via horizontal projection + inv = cv2.bitwise_not(crop) + h_proj = np.sum(inv, axis=1) + threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0 + + # Find line boundaries + lines = [] + in_text = False + line_start = 0 + for y in range(len(h_proj)): + if h_proj[y] > threshold and not in_text: + line_start = y + in_text = True + elif h_proj[y] <= threshold and in_text: + if y - line_start > 5: # Minimum line height + lines.append((line_start, y)) + in_text = False + if in_text and len(h_proj) - line_start > 5: + lines.append((line_start, len(h_proj))) + + all_words = [] + config = f'--psm {psm} --oem 3' + + for line_y_start, line_y_end in lines: + # Add small padding + pad = 3 + y1 = max(0, line_y_start - pad) + y2 = min(crop.shape[0], line_y_end + pad) + line_crop = crop[y1:y2, :] + + if line_crop.size == 0: + continue + + pil_img = Image.fromarray(line_crop) + try: + data = pytesseract.image_to_data(pil_img, lang=lang, config=config, + output_type=pytesseract.Output.DICT) + except Exception: + continue + + for i in range(len(data['text'])): + text = data['text'][i].strip() + conf = int(data['conf'][i]) + if not text or conf < 10: + continue + all_words.append({ + 'text': text, + 'left': data['left'][i] + region.x, + 'top': data['top'][i] + region.y + y1, + 'width': data['width'][i], + 'height': data['height'][i], + 'conf': conf, + 'region_type': region.type, + }) + + return all_words + + +def run_multi_pass_ocr(ocr_img: np.ndarray, + regions: List[PageRegion], + lang: str = "eng+deu") -> Dict[str, List[Dict]]: + """Run OCR on each detected region with optimized settings. + + Args: + ocr_img: Binarized full-page image. + regions: Detected page regions. + lang: Default language. + + Returns: + Dict mapping region type to list of word dicts. 
+ """ + results: Dict[str, List[Dict]] = {} + + _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} + for region in regions: + if region.type in _ocr_skip: + continue # Skip non-content regions + + if region.type == 'column_en': + words = ocr_region(ocr_img, region, lang='eng', psm=4) + elif region.type == 'column_de': + words = ocr_region(ocr_img, region, lang='deu', psm=4) + elif region.type == 'column_example': + words = ocr_region(ocr_img, region, lang=lang, psm=6, + fallback_psm=7, min_confidence=40.0) + else: + words = ocr_region(ocr_img, region, lang=lang, psm=6) + + results[region.type] = words + logger.info(f"OCR {region.type}: {len(words)} words") + + return results + + +# ============================================================================= +# Stage 7: Line Alignment → Vocabulary Entries +# ============================================================================= + +def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]: + """Group words by Y position into lines, sorted by X within each line.""" + if not words: + return [] + + sorted_words = sorted(words, key=lambda w: (w['top'], w['left'])) + lines: List[List[Dict]] = [] + current_line: List[Dict] = [sorted_words[0]] + current_y = sorted_words[0]['top'] + + for word in sorted_words[1:]: + if abs(word['top'] - current_y) <= y_tolerance_px: + current_line.append(word) + else: + current_line.sort(key=lambda w: w['left']) + lines.append(current_line) + current_line = [word] + current_y = word['top'] + + if current_line: + current_line.sort(key=lambda w: w['left']) + lines.append(current_line) + + return lines + + +def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]], + regions: List[PageRegion], + y_tolerance_px: int = 25) -> List[VocabRow]: + """Align OCR results from different columns into vocabulary rows. 
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    Uses Y-coordinate matching to pair English words, German translations,
    and example sentences that appear on the same line. The English column
    is the primary reference: each EN line becomes one row; the nearest
    DE/example line within *y_tolerance_px* is attached to it.

    Args:
        ocr_results: Dict mapping region type to word lists.
        regions: Detected regions (for reference).
        y_tolerance_px: Max Y-distance to consider words on the same row.

    Returns:
        List of VocabRow objects, sorted by page Y position.
    """
    # If no vocabulary columns detected (e.g. plain text page), return empty
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    # Group words into lines per column
    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    def line_y_center(line: List[Dict]) -> float:
        # Mean vertical center of all words in the line.
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    # Build EN entries as the primary reference
    vocab_rows: List[VocabRow] = []

    for en_line in en_lines:
        en_y = line_y_center(en_line)
        en_text = line_text(en_line)
        en_conf = line_confidence(en_line)

        # Skip very short or likely header content
        if len(en_text.strip()) < 2:
            continue

        # Find matching DE line: nearest line center within tolerance.
        de_text = ""
        de_conf = 0.0
        best_de_dist = float('inf')
        best_de_idx = -1
        for idx, de_line in enumerate(de_lines):
            dist = abs(line_y_center(de_line) - en_y)
            if dist < y_tolerance_px and dist < best_de_dist:
                best_de_dist = dist
                best_de_idx = idx

        if best_de_idx >= 0:
            de_text = line_text(de_lines[best_de_idx])
            de_conf = line_confidence(de_lines[best_de_idx])

        # Find matching example line (same nearest-within-tolerance rule).
        ex_text = ""
        ex_conf = 0.0
        best_ex_dist = float('inf')
        best_ex_idx = -1
        for idx, ex_line in enumerate(ex_lines):
            dist = abs(line_y_center(ex_line) - en_y)
            if dist < y_tolerance_px and dist < best_ex_dist:
                best_ex_dist = dist
                best_ex_idx = idx

        if best_ex_idx >= 0:
            ex_text = line_text(ex_lines[best_ex_idx])
            ex_conf = line_confidence(ex_lines[best_ex_idx])

        # Average confidence only over the columns that actually matched.
        avg_conf = en_conf
        conf_count = 1
        if de_conf > 0:
            avg_conf += de_conf
            conf_count += 1
        if ex_conf > 0:
            avg_conf += ex_conf
            conf_count += 1

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=avg_conf / conf_count,
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in example column:
    # If an example line has no matching EN/DE, append to previous entry
    matched_ex_ys = set()
    for row in vocab_rows:
        if row.example:
            matched_ex_ys.add(row.y_position)

    for ex_line in ex_lines:
        ex_y = line_y_center(ex_line)
        # Check if already matched
        already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
        if already_matched:
            continue

        # Find nearest previous vocab row. Continuation lines sit BELOW
        # their row (hence 0 < dist) and may be up to 3x tolerance away.
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row

        if best_row:
            continuation = line_text(ex_line).strip()
            if continuation:
                best_row.example = (best_row.example + " " + continuation).strip()

    # Sort by Y position
    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows
regions to Qwen-VL for correction. + + Default: disabled. Enable per parameter. + + Args: + img: Original BGR image. + vocab_rows: Current vocabulary rows. + confidence_threshold: Rows below this get LLM correction. + enabled: Whether to actually run LLM correction. + + Returns: + Corrected vocabulary rows. + """ + if not enabled: + return vocab_rows + + # TODO: Implement Qwen-VL correction for low-confidence entries + # For each row with confidence < threshold: + # 1. Crop the relevant region from img + # 2. Send crop + OCR text to Qwen-VL + # 3. Replace text if LLM provides a confident correction + logger.info(f"LLM post-correction skipped (not yet implemented)") + return vocab_rows + + +# ============================================================================= +# Orchestrator +# ============================================================================= + +async def run_cv_pipeline( + pdf_data: Optional[bytes] = None, + image_data: Optional[bytes] = None, + page_number: int = 0, + zoom: float = 3.0, + enable_dewarp: bool = True, + enable_llm_correction: bool = False, + lang: str = "eng+deu", +) -> PipelineResult: + """Run the complete CV document reconstruction pipeline. + + Args: + pdf_data: Raw PDF bytes (mutually exclusive with image_data). + image_data: Raw image bytes (mutually exclusive with pdf_data). + page_number: 0-indexed page number (for PDF). + zoom: PDF rendering zoom factor. + enable_dewarp: Whether to run dewarp stage. + enable_llm_correction: Whether to run LLM post-correction. + lang: Tesseract language string. + + Returns: + PipelineResult with vocabulary and timing info. 
async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Stages: render → deskew → dewarp (optional) → dual image prep →
    layout analysis → multi-pass OCR → line alignment → optional LLM
    correction. Per-stage durations are recorded in result.stages.

    Args:
        pdf_data: Raw PDF bytes (mutually exclusive with image_data).
        image_data: Raw image bytes (mutually exclusive with pdf_data).
        page_number: 0-indexed page number (for PDF).
        zoom: PDF rendering zoom factor.
        enable_dewarp: Whether to run dewarp stage.
        enable_llm_correction: Whether to run LLM post-correction.
        lang: Tesseract language string.

    Returns:
        PipelineResult with vocabulary and timing info. Errors are
        captured into result.error rather than raised.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    total_start = time.time()

    try:
        # Stage 1: Render
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s")

        # Stage 3: Dewarp (shear correction; skipped when disabled)
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation (binarized OCR image + layout image)
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)

        # Stage 5: Layout analysis
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)

        # Stage 8: Optional LLM correction
        if enable_llm_correction:
            t = time.time()
            vocab_rows = await llm_post_correct(img, vocab_rows)
            result.stages['llm_correction'] = round(time.time() - t, 2)

        # Convert to output format
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german  # Skip empty rows
        ]

        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        # Errors are reported via result.error; the partial timing info is kept.
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)

    return result


# ---------------------------------------------------------------------------
# LLM-based OCR Correction (Step 6)
# ---------------------------------------------------------------------------

import httpx
import os
import json as _json
import re as _re

# Ollama endpoint and review model are environment-configurable.
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)

# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]"
_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]')
+_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])') + + +def _entry_needs_review(entry: Dict) -> bool: + """Check if an entry should be sent to the LLM for review. + + Sends all non-empty entries that don't have IPA phonetic transcriptions. + The LLM prompt and _is_spurious_change() guard against unwanted changes. + """ + en = entry.get("english", "") or "" + de = entry.get("german", "") or "" + + # Skip completely empty entries + if not en.strip() and not de.strip(): + return False + # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them + if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de): + return False + return True + + +def _build_llm_prompt(table_lines: List[Dict]) -> str: + """Build the LLM correction prompt for a batch of entries.""" + return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch). + +DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden. + +NUR diese Korrekturen sind erlaubt: +- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball" +- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old" +- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin" +- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See" +- Ziffer 6 statt G oder g: "6eld" → "Geld" +- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help" + +ABSOLUT VERBOTEN — aendere NIEMALS: +- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst +- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN +- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst +- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest +- Eigennamen: Ben, London, China, Africa, Shakespeare usw. +- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw. 
+- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren +- Beispielsaetze in der ex-Spalte — NIEMALS aendern + +Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false. + +Antworte NUR mit dem JSON-Array. Kein Text davor oder danach. +Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge). + +/no_think + +Eingabe: +{_json.dumps(table_lines, ensure_ascii=False, indent=2)}""" + + +def _is_spurious_change(old_val: str, new_val: str) -> bool: + """Detect LLM changes that are likely wrong and should be discarded. + + Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are + legitimate OCR corrections. Everything else is rejected. + + Filters out: + - Case-only changes + - Changes that don't contain any digit→letter fix + - Completely different words (LLM translating or hallucinating) + - Additions or removals of whole words (count changed) + """ + if not old_val or not new_val: + return False + + # Case-only change — never a real OCR error + if old_val.lower() == new_val.lower(): + return True + + # If the word count changed significantly, the LLM rewrote rather than fixed + old_words = old_val.split() + new_words = new_val.split() + if abs(len(old_words) - len(new_words)) > 1: + return True + + # Core rule: a legitimate correction replaces a digit with the corresponding + # letter. If the change doesn't include such a substitution, reject it. + # Build a set of (old_char, new_char) pairs that differ between old and new. + # Use character-level diff heuristic: if lengths are close, zip and compare. 
+ # Map of characters that OCR commonly misreads → set of correct replacements + _OCR_CHAR_MAP = { + # Digits mistaken for letters + '0': set('oOgG'), + '1': set('lLiI'), + '5': set('sS'), + '6': set('gG'), + '8': set('bB'), + # Non-letter symbols mistaken for letters + '|': set('lLiI1'), # pipe → lowercase l, capital I, or digit 1 + 'l': set('iI|1'), # lowercase l → capital I (and reverse) + } + has_valid_fix = False + if len(old_val) == len(new_val): + for oc, nc in zip(old_val, new_val): + if oc != nc: + if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]: + has_valid_fix = True + elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]: + # Reverse check (e.g. l→I where new is the "correct" char) + has_valid_fix = True + else: + # Length changed by 1: accept if old had a suspicious char sequence + _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]') + if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val): + has_valid_fix = True + + if not has_valid_fix: + return True # Reject — looks like translation or hallucination + + return False + + +def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]: + """Compare original entries with LLM-corrected ones, return (changes, corrected_entries).""" + changes = [] + entries_out = [] + for i, orig in enumerate(originals): + if i < len(corrected): + c = corrected[i] + entry = dict(orig) + for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]: + new_val = c.get(key, "").strip() + old_val = (orig.get(field_name, "") or "").strip() + if new_val and new_val != old_val: + # Filter spurious LLM changes + if _is_spurious_change(old_val, new_val): + continue + changes.append({ + "row_index": orig.get("row_index", i), + "field": field_name, + "old": old_val, + "new": new_val, + }) + entry[field_name] = new_val + entry["llm_corrected"] = True + entries_out.append(entry) + else: + entries_out.append(dict(orig)) + return changes, entries_out + + +# ─── 
# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────

REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")  # "spell" (default) | "llm"

try:
    from spellchecker import SpellChecker as _SpellChecker
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
except ImportError:
    _SPELL_AVAILABLE = False
    logger.warning("pyspellchecker not installed — falling back to LLM review")

# ─── Page-Ref Normalization ───────────────────────────────────────────────────
# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60"
_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE)


def _normalize_page_ref(text: str) -> str:
    """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'."""
    if not text:
        return text
    return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)


# Suspicious OCR chars → ordered list of most-likely correct replacements
_SPELL_SUBS: Dict[str, List[str]] = {
    '0': ['O', 'o'],
    '1': ['l', 'I'],
    '5': ['S', 's'],
    '6': ['G', 'g'],
    '8': ['B', 'b'],
    '|': ['I', 'l', '1'],
}
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())

# Tokenizer: word tokens (letters + pipe) alternating with separators
_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')


def _spell_dict_knows(word: str) -> bool:
    """True if word is known in EN or DE dictionary."""
    if not _SPELL_AVAILABLE:
        return False
    w = word.lower()
    return bool(_en_spell.known([w])) or bool(_de_spell.known([w]))


def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
    """Return corrected form of token, or None if no fix needed/possible.

    *field* is 'english' or 'german' — used to pick the right dictionary
    for general spell correction (step 3 below).
    """
    has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)

    # 1. Already known word → no fix needed
    if _spell_dict_knows(token):
        return None

    # 2. Digit/pipe substitution (existing logic)
    if has_suspicious:
        # Standalone pipe → capital I
        if token == '|':
            return 'I'
        # Dictionary-backed single-char substitution
        for i, ch in enumerate(token):
            if ch not in _SPELL_SUBS:
                continue
            for replacement in _SPELL_SUBS[ch]:
                candidate = token[:i] + replacement + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate
        # Structural rule: suspicious char at position 0 + rest is all lowercase letters
        first = token[0]
        if first in _SPELL_SUBS and len(token) >= 2:
            rest = token[1:]
            if rest.isalpha() and rest.islower():
                candidate = _SPELL_SUBS[first][0] + rest
                if not candidate[0].isdigit():
                    return candidate

    # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u)
    #    Try single-char umlaut substitutions and check against dictionary.
    if len(token) >= 3 and token.isalpha() and field == "german":
        _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü',
                        'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'}
        for i, ch in enumerate(token):
            if ch in _UMLAUT_SUBS:
                candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:]
                if _spell_dict_knows(candidate):
                    return candidate

    # 4. General spell correction for unknown words (no digits/pipes)
    #    e.g. "beautful" → "beautiful"
    if not has_suspicious and len(token) >= 3 and token.isalpha():
        spell = _en_spell if field == "english" else _de_spell if field == "german" else None
        if spell is not None:
            correction = spell.correction(token.lower())
            if correction and correction != token.lower():
                # Preserve original capitalisation pattern
                if token[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if _spell_dict_knows(correction):
                    return correction
    return None


def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
    """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).

    *field* is 'english' or 'german' — forwarded to _spell_fix_token for
    dictionary selection.
    """
    if not text:
        return text, False
    has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
    # If no suspicious chars AND no alpha chars that could be misspelled, skip
    if not has_suspicious and not any(c.isalpha() for c in text):
        return text, False
    # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
    fixed = _re.sub(r'(?<!\S)\|(?=[.,])', '1', text)
    # NOTE(review): the remainder of this function was lost to angle-bracket
    # stripping in the patch text. Reconstructed from _SPELL_TOKEN_RE and the
    # _spell_fix_token contract — confirm against the original source file.
    changed = fixed != text
    parts: List[str] = []
    for m in _SPELL_TOKEN_RE.finditer(fixed):
        token, sep = m.group(1), m.group(2)
        repl = _spell_fix_token(token, field=field)
        if repl is not None and repl != token:
            changed = True
            parts.append(repl)
        else:
            parts.append(token)
        parts.append(sep)
    return ''.join(parts), changed


# NOTE(review): this def line was also eaten by the tag-stripping damage; the
# signature below is inferred from the call sites in llm_review_entries /
# spell_review_entries_streaming and the returned dict shape.
def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Rule-based OCR correction: spell-checker + structural heuristics.

    Deterministic — never translates, never touches IPA, never hallucinates.
    """
    t0 = time.time()
    changes: List[Dict] = []
    all_corrected: List[Dict] = []
    for i, entry in enumerate(entries):
        e = dict(entry)
        # Page-ref normalization (always, regardless of review status)
        old_ref = (e.get("source_page") or "").strip()
        if old_ref:
            new_ref = _normalize_page_ref(old_ref)
            if new_ref != old_ref:
                changes.append({
                    "row_index": e.get("row_index", i),
                    "field": "source_page",
                    "old": old_ref,
                    "new": new_ref,
                })
                e["source_page"] = new_ref
                e["llm_corrected"] = True
        if not _entry_needs_review(e):
            all_corrected.append(e)
            continue
        for field_name in ("english", "german", "example"):
            old_val = (e.get(field_name) or "").strip()
            if not old_val:
                continue
            # example field is mixed-language — try German first (for umlauts)
            lang = "german" if field_name in ("german", "example") else "english"
            new_val, was_changed = _spell_fix_field(old_val, field=lang)
            if was_changed and new_val != old_val:
                changes.append({
                    "row_index": e.get("row_index", i),
                    "field": field_name,
                    "old": old_val,
                    "new": new_val,
                })
                e[field_name] = new_val
                e["llm_corrected"] = True
        all_corrected.append(e)
    duration_ms = int((time.time() - t0) * 1000)
    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": 0,
        "model_used": "spell-checker",
        "duration_ms": duration_ms,
    }


async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Async generator yielding SSE-compatible events for spell-checker review."""
    total = len(entries)
    yield {
        "type": "meta",
        "total_entries": total,
        "to_review": total,
        "skipped": 0,
        "model": "spell-checker",
        "batch_size": batch_size,
    }
    result = spell_review_entries_sync(entries)
    changes = result["changes"]
    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)],
        "changes": changes,
        "duration_ms": result["duration_ms"],
        "progress": {"current": total, "total": total},
    }
    yield {
        "type": "complete",
        "changes": changes,
        "model_used": "spell-checker",
        "duration_ms": result["duration_ms"],
        "total_entries": total,
        "reviewed": total,
        "skipped": 0,
        "corrections_found": len(changes),
        "entries_corrected": result["entries_corrected"],
    }

# ─── End Spell-Checker ────────────────────────────────────────────────────────


async def llm_review_entries(
    entries: List[Dict],
    model: Optional[str] = None,
) -> Dict:
    """OCR error correction.

    Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm).
    """
    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        return spell_review_entries_sync(entries)
    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # Filter: only entries that need review
    reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)]

    if not reviewable:
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [e for _, e in reviewable]
    table_lines = [
        {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
        for e in review_entries
    ]

    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, len(entries) - len(reviewable))
    logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))

    prompt = _build_llm_prompt(table_lines)

    t0 = time.time()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(
            f"{_OLLAMA_URL}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "think": False,  # qwen3: disable chain-of-thought (Ollama >=0.6)
                "options": {"temperature": 0.1, "num_predict": 8192},
            },
        )
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - t0) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
    logger.debug("LLM review raw response (first 500): %.500s", content)

    corrected = _parse_llm_json_array(content)
    logger.info("LLM review: parsed %d corrected entries, applying diff...",
                len(corrected))
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Merge corrected entries back into the full list
    all_corrected = [dict(e) for e in entries]
    for batch_idx, (orig_idx, _) in enumerate(reviewable):
        if batch_idx < len(corrected_entries):
            all_corrected[orig_idx] = corrected_entries[batch_idx]

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": len(entries) - len(reviewable),
        "model_used": model,
        "duration_ms": duration_ms,
    }


async def llm_review_entries_streaming(
    entries: List[Dict],
    model: Optional[str] = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.

    Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
    visible in the UI — this is the only place the fix now runs (removed from Step 1
    of build_vocab_pipeline_streaming).
    """
    # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
    _CONF_FIELDS = ('english', 'german', 'example')
    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
    _fix_character_confusion(entries)  # modifies in-place, returns same list
    char_changes = [
        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
        for i in range(len(entries))
        for f in _CONF_FIELDS
        if originals[i][f] != entries[i].get(f, '')
    ]

    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        # Inject char_changes as a batch right after the meta event from the spell checker
        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
            if not _meta_sent and event.get('type') == 'meta' and char_changes:
                _meta_sent = True
                yield {
                    'type': 'batch',
                    'changes': char_changes,
                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
                    'progress': {'current': 0, 'total': len(entries)},
                }
        return

    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    # LLM path: emit char_changes first (before meta) so they appear in the UI
    if char_changes:
        yield {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }

    model = model or OLLAMA_REVIEW_MODEL

    # Separate reviewable from skipped entries
    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    # meta event
    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]
    total_duration_ms = 0
    reviewed_count = 0

    # Process in batches
    for batch_start in range(0, total_to_review, batch_size):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d — sending %d entries to %s",
                    batch_start // batch_size, len(batch_entries), model)

        t0 = time.time()
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,  # qwen3: disable chain-of-thought
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
        batch_ms = int((time.time() - t0) * 1000)
        total_duration_ms += batch_ms

        logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content))
        logger.debug("LLM review streaming raw (first 500): %.500s", content)

        corrected = _parse_llm_json_array(content)
        logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected))
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Merge back
        for batch_idx, (orig_idx, _) in enumerate(batch_items):
            if batch_idx < len(batch_corrected):
                all_corrected[orig_idx] = batch_corrected[batch_idx]

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        # Yield batch result
        yield {
            "type": "batch",
            "batch_index": batch_start // batch_size,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    # Complete event
    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }


def _sanitize_for_json(text: str) -> str:
    """Remove or escape control characters that break JSON parsing.

    Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid
    JSON whitespace. Removes all other ASCII control characters (0x00-0x1f)
    that are only valid inside JSON strings when properly escaped.
    """
    # Replace literal control chars (except \t \n \r) with a space
    return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)


def _parse_llm_json_array(text: str) -> List[Dict]:
    """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
    # Strip qwen3 <think> ... </think> blocks (present even with think=False on some builds)
    # NOTE(review): this regex was garbled by tag-stripping in the patch text;
    # reconstructed from the surrounding comment — confirm against the original.
    text = _re.sub(r'<think>.*?</think>', '', text, flags=_re.DOTALL)
    # Strip markdown code fences
    text = _re.sub(r'```json\s*', '', text)
    text = _re.sub(r'```\s*', '', text)
    # Sanitize control characters before JSON parsing
    text = _sanitize_for_json(text)
    # Find first [ ... last ]
    match = _re.search(r'\[.*\]', text, _re.DOTALL)
    if match:
        try:
            return _json.loads(match.group())
        except (ValueError, _json.JSONDecodeError) as e:
            logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200])
    else:
        logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200])
    return []
-Uses classical Computer Vision techniques for high-quality OCR: -- High-resolution PDF rendering (432 DPI) -- Deskew (rotation correction via Hough Lines) -- Dewarp (book curvature correction) — pass-through initially -- Dual image preparation (binarized for OCR, CLAHE for layout) -- Projection-profile layout analysis (column/row detection) -- Multi-pass Tesseract OCR with region-specific PSM settings -- Y-coordinate line alignment for vocabulary matching -- Optional LLM post-correction for low-confidence regions +Re-export facade — all logic lives in the sub-modules: + + cv_vocab_types Dataklassen, Konstanten, IPA, Feature-Flags + cv_preprocessing Bild-I/O, Orientierung, Deskew, Dewarp + cv_layout Dokumenttyp, Spalten, Zeilen, Klassifikation + cv_ocr_engines OCR-Engines, Vocab-Postprocessing, Text-Cleaning + cv_cell_grid Cell-Grid (v2 + Legacy), Vocab-Konvertierung + cv_review LLM/Spell Review, Pipeline-Orchestrierung Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. 
""" -import io -import logging -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from dataclasses import dataclass, field -from typing import Any, Dict, Generator, List, Optional, Tuple - -import numpy as np - -logger = logging.getLogger(__name__) - -# --- Availability Guards --- - -try: - import cv2 - CV2_AVAILABLE = True -except ImportError: - cv2 = None - CV2_AVAILABLE = False - logger.warning("OpenCV not available — CV pipeline disabled") - -try: - import pytesseract - from PIL import Image - TESSERACT_AVAILABLE = True -except ImportError: - pytesseract = None - Image = None - TESSERACT_AVAILABLE = False - logger.warning("pytesseract/Pillow not available — CV pipeline disabled") - -CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE - -# --- IPA Dictionary --- - -import json -import os -import re - -IPA_AVAILABLE = False -_ipa_convert_american = None -_britfone_dict: Dict[str, str] = {} - -try: - import eng_to_ipa as _eng_to_ipa - _ipa_convert_american = _eng_to_ipa.convert - IPA_AVAILABLE = True - logger.info("eng_to_ipa available — American IPA lookup enabled") -except ImportError: - logger.info("eng_to_ipa not installed — American IPA disabled") - -# Load Britfone dictionary (MIT license, ~15k British English IPA entries) -_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json') -if os.path.exists(_britfone_path): - try: - with open(_britfone_path, 'r', encoding='utf-8') as f: - _britfone_dict = json.load(f) - IPA_AVAILABLE = True - logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries") - except Exception as e: - logger.warning(f"Failed to load Britfone: {e}") -else: - logger.info("Britfone not found — British IPA disabled") - -# --- Language Detection Constants --- - -GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht', - 'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird', - 'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 
'noch', 'aber', 'hat', 'nur', - 'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben', - 'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'} - -ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of', - 'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from', - 'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', - 'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he', - 'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'} - - -# --- Data Classes --- - -@dataclass -class PageRegion: - """A detected region on the page.""" - type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom' - x: int - y: int - width: int - height: int - classification_confidence: float = 1.0 # 0.0-1.0 - classification_method: str = "" # 'content', 'position_enhanced', 'position_fallback' - - -@dataclass -class ColumnGeometry: - """Geometrisch erkannte Spalte vor Typ-Klassifikation.""" - index: int # 0-basiert, links->rechts - x: int - y: int - width: int - height: int - word_count: int - words: List[Dict] # Wort-Dicts aus Tesseract (text, conf, left, top, ...) 
- width_ratio: float # width / content_width (0.0-1.0) - is_sub_column: bool = False # True if created by _detect_sub_columns() split - - -@dataclass -class RowGeometry: - """Geometrisch erkannte Zeile mit Kopf-/Fusszeilen-Klassifikation.""" - index: int # 0-basiert, oben→unten - x: int # absolute left (= content left_x) - y: int # absolute y start - width: int # content width - height: int # Zeilenhoehe in px - word_count: int - words: List[Dict] - row_type: str = 'content' # 'content' | 'header' | 'footer' - gap_before: int = 0 # Gap in px ueber dieser Zeile - - -@dataclass -class VocabRow: - """A single vocabulary entry assembled from multi-column OCR.""" - english: str = "" - german: str = "" - example: str = "" - source_page: str = "" - confidence: float = 0.0 - y_position: int = 0 - - -@dataclass -class PipelineResult: - """Complete result of the CV pipeline.""" - vocabulary: List[Dict[str, Any]] = field(default_factory=list) - word_count: int = 0 - columns_detected: int = 0 - duration_seconds: float = 0.0 - stages: Dict[str, float] = field(default_factory=dict) - error: Optional[str] = None - image_width: int = 0 - image_height: int = 0 - - -@dataclass -class DocumentTypeResult: - """Result of automatic document type detection.""" - doc_type: str # 'vocab_table' | 'full_text' | 'generic_table' - confidence: float # 0.0-1.0 - pipeline: str # 'cell_first' | 'full_page' - skip_steps: List[str] = field(default_factory=list) # e.g. ['columns', 'rows'] - features: Dict[str, Any] = field(default_factory=dict) # debug info - - -# ============================================================================= -# Stage 1: High-Resolution PDF Rendering -# ============================================================================= - -def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray: - """Render a PDF page to a high-resolution numpy array (BGR). - - Args: - pdf_data: Raw PDF bytes. - page_number: 0-indexed page number. 
- zoom: Zoom factor (3.0 = 432 DPI). - - Returns: - numpy array in BGR format. - """ - import fitz # PyMuPDF - - pdf_doc = fitz.open(stream=pdf_data, filetype="pdf") - if page_number >= pdf_doc.page_count: - raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)") - - page = pdf_doc[page_number] - mat = fitz.Matrix(zoom, zoom) - pix = page.get_pixmap(matrix=mat) - - # Convert to numpy BGR - img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n) - if pix.n == 4: # RGBA - img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR) - elif pix.n == 3: # RGB - img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR) - else: # Grayscale - img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR) - - pdf_doc.close() - return img_bgr - - -def render_image_high_res(image_data: bytes) -> np.ndarray: - """Load an image (PNG/JPEG) into a numpy array (BGR). - - Args: - image_data: Raw image bytes. - - Returns: - numpy array in BGR format. - """ - img_array = np.frombuffer(image_data, dtype=np.uint8) - img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR) - if img_bgr is None: - raise ValueError("Could not decode image data") - return img_bgr - - -# ============================================================================= -# Stage 1b: Orientation Detection (0°/90°/180°/270°) -# ============================================================================= - -def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]: - """Detect page orientation via Tesseract OSD and rotate if needed. - - Handles upside-down scans (180°) common with book scanners where - every other page is flipped due to the scanner hinge. - - Returns: - (corrected_image, rotation_degrees) — rotation is 0, 90, 180, or 270. 
- """ - if pytesseract is None: - return img_bgr, 0 - - try: - # Tesseract OSD needs a grayscale or RGB image - gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) - pil_img = Image.fromarray(gray) - - osd = pytesseract.image_to_osd(pil_img, output_type=pytesseract.Output.DICT) - rotate = osd.get("rotate", 0) - confidence = osd.get("orientation_conf", 0.0) - - logger.info(f"OSD: orientation={rotate}° confidence={confidence:.1f}") - - if rotate == 0 or confidence < 1.0: - return img_bgr, 0 - - # Apply rotation - if rotate == 180: - corrected = cv2.rotate(img_bgr, cv2.ROTATE_180) - elif rotate == 90: - corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_COUNTERCLOCKWISE) - elif rotate == 270: - corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_CLOCKWISE) - else: - return img_bgr, 0 - - logger.info(f"OSD: rotated {rotate}° to fix orientation") - return corrected, rotate - - except Exception as e: - logger.warning(f"OSD orientation detection failed: {e}") - return img_bgr, 0 - - -# ============================================================================= -# Stage 2: Deskew (Rotation Correction) -# ============================================================================= - -def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]: - """Correct rotation using Hough Line detection. - - Args: - img: BGR image. - - Returns: - Tuple of (corrected image, detected angle in degrees). 
- """ - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # Binarize for line detection - _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - - # Detect lines - lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100, - minLineLength=img.shape[1] // 4, maxLineGap=20) - - if lines is None or len(lines) < 3: - return img, 0.0 - - # Compute angles of near-horizontal lines - angles = [] - for line in lines: - x1, y1, x2, y2 = line[0] - angle = np.degrees(np.arctan2(y2 - y1, x2 - x1)) - if abs(angle) < 15: # Only near-horizontal - angles.append(angle) - - if not angles: - return img, 0.0 - - median_angle = float(np.median(angles)) - - # Limit correction to ±5° - if abs(median_angle) > 5.0: - median_angle = 5.0 * np.sign(median_angle) - - if abs(median_angle) < 0.1: - return img, 0.0 - - # Rotate - h, w = img.shape[:2] - center = (w // 2, h // 2) - M = cv2.getRotationMatrix2D(center, median_angle, 1.0) - corrected = cv2.warpAffine(img, M, (w, h), - flags=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_REPLICATE) - - logger.info(f"Deskew: corrected {median_angle:.2f}° rotation") - return corrected, median_angle - - -def deskew_image_by_word_alignment( - image_data: bytes, - lang: str = "eng+deu", - downscale_factor: float = 0.5, -) -> Tuple[bytes, float]: - """Correct rotation by fitting a line through left-most word starts per text line. - - More robust than Hough-based deskew for vocabulary worksheets where text lines - have consistent left-alignment. Runs a quick Tesseract pass on a downscaled - copy to find word positions, computes the dominant left-edge column, fits a - line through those points and rotates the full-resolution image. - - Args: - image_data: Raw image bytes (PNG/JPEG). - lang: Tesseract language string for the quick pass. - downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%). - - Returns: - Tuple of (rotated image as PNG bytes, detected angle in degrees). 
- """ - if not CV2_AVAILABLE or not TESSERACT_AVAILABLE: - return image_data, 0.0 - - # 1. Decode image - img_array = np.frombuffer(image_data, dtype=np.uint8) - img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) - if img is None: - logger.warning("deskew_by_word_alignment: could not decode image") - return image_data, 0.0 - - orig_h, orig_w = img.shape[:2] - - # 2. Downscale for fast Tesseract pass - small_w = int(orig_w * downscale_factor) - small_h = int(orig_h * downscale_factor) - small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA) - - # 3. Quick Tesseract — word-level positions - pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB)) - try: - data = pytesseract.image_to_data( - pil_small, lang=lang, config="--psm 6 --oem 3", - output_type=pytesseract.Output.DICT, - ) - except Exception as e: - logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}") - return image_data, 0.0 - - # 4. Per text-line, find the left-most word start - # Group by (block_num, par_num, line_num) - from collections import defaultdict - line_groups: Dict[tuple, list] = defaultdict(list) - for i in range(len(data["text"])): - text = (data["text"][i] or "").strip() - conf = int(data["conf"][i]) - if not text or conf < 20: - continue - key = (data["block_num"][i], data["par_num"][i], data["line_num"][i]) - line_groups[key].append(i) - - if len(line_groups) < 5: - logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping") - return image_data, 0.0 - - # For each line, pick the word with smallest 'left' → compute (left_x, center_y) - # Scale back to original resolution - scale = 1.0 / downscale_factor - points = [] # list of (x, y) in original-image coords - for key, indices in line_groups.items(): - best_idx = min(indices, key=lambda i: data["left"][i]) - lx = data["left"][best_idx] * scale - top = data["top"][best_idx] * scale - h = data["height"][best_idx] * scale - cy = top + h / 2.0 - points.append((lx, cy)) - - # 5. 
Find dominant left-edge column + compute angle - xs = np.array([p[0] for p in points]) - ys = np.array([p[1] for p in points]) - median_x = float(np.median(xs)) - tolerance = orig_w * 0.03 # 3% of image width - - mask = np.abs(xs - median_x) <= tolerance - filtered_xs = xs[mask] - filtered_ys = ys[mask] - - if len(filtered_xs) < 5: - logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping") - return image_data, 0.0 - - # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a) - coeffs = np.polyfit(filtered_ys, filtered_xs, 1) - slope = coeffs[0] # dx/dy - angle_rad = np.arctan(slope) - angle_deg = float(np.degrees(angle_rad)) - - # Clamp to ±5° - angle_deg = max(-5.0, min(5.0, angle_deg)) - - logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points " - f"(total lines: {len(line_groups)})") - - if abs(angle_deg) < 0.05: - return image_data, 0.0 - - # 6. Rotate full-res image - center = (orig_w // 2, orig_h // 2) - M = cv2.getRotationMatrix2D(center, angle_deg, 1.0) - rotated = cv2.warpAffine(img, M, (orig_w, orig_h), - flags=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_REPLICATE) - - # Encode back to PNG - success, png_buf = cv2.imencode(".png", rotated) - if not success: - logger.warning("deskew_by_word_alignment: PNG encoding failed") - return image_data, 0.0 - - return png_buf.tobytes(), angle_deg - - -def _projection_gradient_score(profile: np.ndarray) -> float: - """Score a projection profile by the L2-norm of its first derivative. - - Higher score = sharper transitions between text-lines and gaps, - i.e. better row/column alignment. - """ - diff = np.diff(profile) - return float(np.sum(diff * diff)) - - -def deskew_image_iterative( - img: np.ndarray, - coarse_range: float = 5.0, - coarse_step: float = 0.1, - fine_range: float = 0.15, - fine_step: float = 0.02, -) -> Tuple[np.ndarray, float, Dict[str, Any]]: - """Iterative deskew using vertical-edge projection optimisation. 
- - The key insight: at the correct rotation angle, vertical features - (word left-edges, column borders) become truly vertical, producing - the sharpest peaks in the vertical projection of vertical edges. - - Method: - 1. Detect vertical edges via Sobel-X on the central crop. - 2. Coarse sweep: rotate edge image, compute vertical projection - gradient score. The angle where vertical edges align best wins. - 3. Fine sweep: refine around the coarse winner. - - Args: - img: BGR image (full resolution). - coarse_range: half-range in degrees for the coarse sweep. - coarse_step: step size in degrees for the coarse sweep. - fine_range: half-range around the coarse winner for the fine sweep. - fine_step: step size in degrees for the fine sweep. - - Returns: - (rotated_bgr, angle_degrees, debug_dict) - """ - h, w = img.shape[:2] - debug: Dict[str, Any] = {} - - # --- Grayscale + vertical edge detection --- - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - # Central crop (15%-85% height, 10%-90% width) to avoid page margins - y_lo, y_hi = int(h * 0.15), int(h * 0.85) - x_lo, x_hi = int(w * 0.10), int(w * 0.90) - gray_crop = gray[y_lo:y_hi, x_lo:x_hi] - - # Sobel-X → absolute vertical edges - sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3) - edges = np.abs(sobel_x) - # Normalise to 0-255 for consistent scoring - edge_max = edges.max() - if edge_max > 0: - edges = (edges / edge_max * 255).astype(np.uint8) - else: - return img, 0.0, {"error": "no edges detected"} - - crop_h, crop_w = edges.shape[:2] - crop_center = (crop_w // 2, crop_h // 2) - - # Trim margin after rotation to avoid border artifacts - trim_y = max(4, int(crop_h * 0.03)) - trim_x = max(4, int(crop_w * 0.03)) - - def _sweep_edges(angles: np.ndarray) -> list: - """Score each angle by vertical projection gradient of vertical edges.""" - results = [] - for angle in angles: - if abs(angle) < 1e-6: - rotated = edges - else: - M = cv2.getRotationMatrix2D(crop_center, angle, 1.0) - rotated = 
cv2.warpAffine(edges, M, (crop_w, crop_h), - flags=cv2.INTER_NEAREST, - borderMode=cv2.BORDER_REPLICATE) - # Trim borders to avoid edge artifacts - trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x] - v_profile = np.sum(trimmed, axis=0, dtype=np.float64) - score = _projection_gradient_score(v_profile) - results.append((float(angle), score)) - return results - - # --- Phase 1: coarse sweep --- - coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step) - coarse_results = _sweep_edges(coarse_angles) - best_coarse = max(coarse_results, key=lambda x: x[1]) - best_coarse_angle, best_coarse_score = best_coarse - - debug["coarse_best_angle"] = round(best_coarse_angle, 2) - debug["coarse_best_score"] = round(best_coarse_score, 1) - debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results] - - # --- Phase 2: fine sweep around coarse winner --- - fine_lo = best_coarse_angle - fine_range - fine_hi = best_coarse_angle + fine_range - fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step) - fine_results = _sweep_edges(fine_angles) - best_fine = max(fine_results, key=lambda x: x[1]) - best_fine_angle, best_fine_score = best_fine - - debug["fine_best_angle"] = round(best_fine_angle, 2) - debug["fine_best_score"] = round(best_fine_score, 1) - debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results] - - final_angle = best_fine_angle - - # Clamp to ±5° - final_angle = max(-5.0, min(5.0, final_angle)) - - logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}° fine={best_fine_angle:.2f}° -> {final_angle:.2f}°") - - if abs(final_angle) < 0.05: - return img, 0.0, debug - - # --- Rotate full-res image --- - center = (w // 2, h // 2) - M = cv2.getRotationMatrix2D(center, final_angle, 1.0) - rotated = cv2.warpAffine(img, M, (w, h), - flags=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_REPLICATE) - - return rotated, final_angle, debug - - -def _measure_textline_slope(img: np.ndarray) -> float: - 
"""Measure residual text-line slope via Tesseract word-position regression. - - Groups Tesseract words by (block, par, line), fits a linear regression - per line (y = slope * x + b), and returns the trimmed-mean slope in - degrees. Positive = text rises to the right, negative = falls. - - This is the most direct measurement of remaining rotation after deskew. - """ - import math as _math - - if not TESSERACT_AVAILABLE or not CV2_AVAILABLE: - return 0.0 - - h, w = img.shape[:2] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - data = pytesseract.image_to_data( - Image.fromarray(gray), - output_type=pytesseract.Output.DICT, - config="--psm 6", - ) - - # Group word centres by text line - lines: Dict[tuple, list] = {} - for i in range(len(data["text"])): - txt = (data["text"][i] or "").strip() - if len(txt) < 2 or int(data["conf"][i]) < 30: - continue - key = (data["block_num"][i], data["par_num"][i], data["line_num"][i]) - cx = data["left"][i] + data["width"][i] / 2.0 - cy = data["top"][i] + data["height"][i] / 2.0 - lines.setdefault(key, []).append((cx, cy)) - - # Per-line linear regression → slope angle - slopes: list = [] - for pts in lines.values(): - if len(pts) < 3: - continue - pts.sort(key=lambda p: p[0]) - xs = np.array([p[0] for p in pts], dtype=np.float64) - ys = np.array([p[1] for p in pts], dtype=np.float64) - if xs[-1] - xs[0] < w * 0.15: - continue # skip short lines - A = np.vstack([xs, np.ones_like(xs)]).T - result = np.linalg.lstsq(A, ys, rcond=None) - slope = result[0][0] - slopes.append(_math.degrees(_math.atan(slope))) - - if len(slopes) < 3: - return 0.0 - - # Trimmed mean (drop 10% extremes on each side) - slopes.sort() - trim = max(1, len(slopes) // 10) - trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes - if not trimmed: - return 0.0 - - return sum(trimmed) / len(trimmed) - - -def deskew_two_pass( - img: np.ndarray, - coarse_range: float = 5.0, -) -> Tuple[np.ndarray, float, Dict[str, Any]]: - """Two-pass deskew: iterative 
projection + word-alignment residual check. - - Pass 1: ``deskew_image_iterative()`` (vertical-edge projection, wide range). - Pass 2: ``deskew_image_by_word_alignment()`` on the already-corrected image - to detect and fix residual skew that the projection method missed. - - The two corrections are summed. If the residual from Pass 2 is below - 0.3° it is ignored (already good enough). - - Returns: - (corrected_bgr, total_angle_degrees, debug_dict) - """ - debug: Dict[str, Any] = {} - - # --- Pass 1: iterative projection --- - corrected, angle1, dbg1 = deskew_image_iterative( - img.copy(), coarse_range=coarse_range, - ) - debug["pass1_angle"] = round(angle1, 3) - debug["pass1_method"] = "iterative" - debug["pass1_debug"] = dbg1 - - # --- Pass 2: word-alignment residual check on corrected image --- - angle2 = 0.0 - try: - # Encode the corrected image to PNG bytes for word-alignment - ok, buf = cv2.imencode(".png", corrected) - if ok: - corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes()) - if abs(angle2) >= 0.3: - # Significant residual — decode and use the second correction - arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8) - corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR) - if corrected2 is not None: - corrected = corrected2 - logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° applied " - f"(total={angle1 + angle2:.2f}°)") - else: - angle2 = 0.0 - else: - logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}° < 0.3° — skipped") - angle2 = 0.0 - except Exception as e: - logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}") - angle2 = 0.0 - - # --- Pass 3: Tesseract text-line regression residual check --- - # The most reliable final check: measure actual text-line slopes - # using Tesseract word positions and linear regression per line. 
- angle3 = 0.0 - try: - residual = _measure_textline_slope(corrected) - debug["pass3_raw"] = round(residual, 3) - if abs(residual) >= 0.3: - h3, w3 = corrected.shape[:2] - center3 = (w3 // 2, h3 // 2) - M3 = cv2.getRotationMatrix2D(center3, residual, 1.0) - corrected = cv2.warpAffine( - corrected, M3, (w3, h3), - flags=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_REPLICATE, - ) - angle3 = residual - logger.info( - "deskew_two_pass: pass3 text-line residual=%.2f° applied", - residual, - ) - else: - logger.info( - "deskew_two_pass: pass3 text-line residual=%.2f° < 0.3° — skipped", - residual, - ) - except Exception as e: - logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e) - - total_angle = angle1 + angle2 + angle3 - debug["pass2_angle"] = round(angle2, 3) - debug["pass2_method"] = "word_alignment" - debug["pass3_angle"] = round(angle3, 3) - debug["pass3_method"] = "textline_regression" - debug["total_angle"] = round(total_angle, 3) - - logger.info( - "deskew_two_pass: pass1=%.2f° + pass2=%.2f° + pass3=%.2f° = %.2f°", - angle1, angle2, angle3, total_angle, - ) - - return corrected, total_angle, debug - - -# ============================================================================= -# Stage 3: Dewarp (Book Curvature Correction) -# ============================================================================= - -def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]: - """Detect the vertical shear angle of the page. - - After deskew (horizontal lines aligned), vertical features like column - edges may still be tilted. This measures that tilt by tracking the - strongest vertical edge across horizontal strips. - - The result is a shear angle in degrees: the angular difference between - true vertical and the detected column edge. - - Returns: - Dict with keys: method, shear_degrees, confidence. 
- """ - h, w = img.shape[:2] - result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0} - - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - # Vertical Sobel to find vertical edges - sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3) - abs_sobel = np.abs(sobel_x).astype(np.uint8) - - # Binarize with Otsu - _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - - num_strips = 20 - strip_h = h // num_strips - edge_positions = [] # (y_center, x_position) - - for i in range(num_strips): - y_start = i * strip_h - y_end = min((i + 1) * strip_h, h) - strip = binary[y_start:y_end, :] - - # Project vertically (sum along y-axis) - projection = np.sum(strip, axis=0).astype(np.float64) - if projection.max() == 0: - continue - - # Find the strongest vertical edge in left 40% of image - search_w = int(w * 0.4) - left_proj = projection[:search_w] - if left_proj.max() == 0: - continue - - # Smooth and find peak - kernel_size = max(3, w // 100) - if kernel_size % 2 == 0: - kernel_size += 1 - smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten() - x_pos = float(np.argmax(smoothed)) - y_center = (y_start + y_end) / 2.0 - edge_positions.append((y_center, x_pos)) - - if len(edge_positions) < 8: - return result - - ys = np.array([p[0] for p in edge_positions]) - xs = np.array([p[1] for p in edge_positions]) - - # Remove outliers (> 2 std from median) - median_x = np.median(xs) - std_x = max(np.std(xs), 1.0) - mask = np.abs(xs - median_x) < 2 * std_x - ys = ys[mask] - xs = xs[mask] - - if len(ys) < 6: - return result - - # Fit straight line: x = slope * y + intercept - # The slope tells us the tilt of the vertical edge - straight_coeffs = np.polyfit(ys, xs, 1) - slope = straight_coeffs[0] # dx/dy in pixels - fitted = np.polyval(straight_coeffs, ys) - residuals = xs - fitted - rmse = float(np.sqrt(np.mean(residuals ** 2))) - - # Convert slope to angle: arctan(dx/dy) in degrees - import math - shear_degrees 
= math.degrees(math.atan(slope)) - - confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0) - - result["shear_degrees"] = round(shear_degrees, 3) - result["confidence"] = round(float(confidence), 2) - - return result - - -def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]: - """Detect shear angle by maximising variance of horizontal text-line projections. - - Principle: horizontal text lines produce a row-projection profile with sharp - peaks (high variance) when the image is correctly aligned. Any residual shear - smears the peaks and reduces variance. We sweep ±3° and pick the angle whose - corrected projection has the highest variance. - - Works best on pages with clear horizontal banding (vocabulary tables, prose). - Complements _detect_shear_angle() which needs strong vertical edges. - - Returns: - Dict with keys: method, shear_degrees, confidence. - """ - import math - result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0} - - h, w = img.shape[:2] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - # Otsu binarisation - _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - - # Work at half resolution for speed - small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA) - sh, sw = small.shape - - # 2-pass angle sweep for 10x better precision: - # Pass 1: Coarse sweep ±3° in 0.5° steps (13 values) - # Pass 2: Fine sweep ±0.5° around coarse best in 0.05° steps (21 values) - - def _sweep_variance(angles_list): - results = [] - for angle_deg in angles_list: - if abs(angle_deg) < 0.001: - rotated = small - else: - shear_tan = math.tan(math.radians(angle_deg)) - M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]]) - rotated = cv2.warpAffine(small, M, (sw, sh), - flags=cv2.INTER_NEAREST, - borderMode=cv2.BORDER_CONSTANT) - profile = np.sum(rotated, axis=1).astype(float) - results.append((angle_deg, float(np.var(profile)))) - return results - - # Pass 1: coarse - 
coarse_angles = [a * 0.5 for a in range(-6, 7)] # 13 values - coarse_results = _sweep_variance(coarse_angles) - coarse_best = max(coarse_results, key=lambda x: x[1]) - - # Pass 2: fine around coarse best - fine_center = coarse_best[0] - fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)] # 21 values - fine_results = _sweep_variance(fine_angles) - fine_best = max(fine_results, key=lambda x: x[1]) - - best_angle = fine_best[0] - best_variance = fine_best[1] - variances = coarse_results + fine_results - - # Confidence: how much sharper is the best angle vs. the mean? - all_mean = sum(v for _, v in variances) / len(variances) - if all_mean > 0 and best_variance > all_mean: - confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6) - else: - confidence = 0.0 - - result["shear_degrees"] = round(best_angle, 3) - result["confidence"] = round(max(0.0, min(1.0, confidence)), 2) - return result - - -def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]: - """Detect shear using Hough transform on printed table / ruled lines. - - Vocabulary worksheets have near-horizontal printed table borders. After - deskew these should be exactly horizontal; any residual tilt equals the - vertical shear angle (with inverted sign). - - The sign convention: a horizontal line tilting +α degrees (left end lower) - means the page has vertical shear of -α degrees (left column edge drifts - to the left going downward). - - Returns: - Dict with keys: method, shear_degrees, confidence. 
- """ - result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0} - - h, w = img.shape[:2] - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - edges = cv2.Canny(gray, 50, 150, apertureSize=3) - - min_len = int(w * 0.15) - lines = cv2.HoughLinesP( - edges, rho=1, theta=np.pi / 360, - threshold=int(w * 0.08), - minLineLength=min_len, - maxLineGap=20, - ) - - if lines is None or len(lines) < 3: - return result - - horizontal_angles: List[Tuple[float, float]] = [] - for line in lines: - x1, y1, x2, y2 = line[0] - if x1 == x2: - continue - angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1))) - if abs(angle) <= 5.0: - length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)) - horizontal_angles.append((angle, length)) - - if len(horizontal_angles) < 3: - return result - - # Weighted median - angles_arr = np.array([a for a, _ in horizontal_angles]) - weights_arr = np.array([l for _, l in horizontal_angles]) - sorted_idx = np.argsort(angles_arr) - s_angles = angles_arr[sorted_idx] - s_weights = weights_arr[sorted_idx] - cum = np.cumsum(s_weights) - mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0)) - median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)]) - - agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0) - confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85 - - # Sign inversion: horizontal line tilt is complementary to vertical shear - shear_degrees = -median_angle - - result["shear_degrees"] = round(shear_degrees, 3) - result["confidence"] = round(max(0.0, min(1.0, confidence)), 2) - return result - - -def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]: - """Detect shear by measuring text-line straightness (Method D). - - Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word - bounding boxes, groups them into vertical columns by X-proximity, - and measures how the left-edge X position drifts with Y (vertical - position). The drift dx/dy is the tangent of the shear angle. 
- - This directly measures vertical shear (column tilt) rather than - horizontal text-line slope, which is already corrected by deskew. - - Returns: - Dict with keys: method, shear_degrees, confidence. - """ - import math - result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0} - - h, w = img.shape[:2] - # Downscale 50% for speed - scale = 0.5 - small = cv2.resize(img, (int(w * scale), int(h * scale)), - interpolation=cv2.INTER_AREA) - gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY) - pil_img = Image.fromarray(gray) - - try: - data = pytesseract.image_to_data( - pil_img, lang='eng+deu', config='--psm 11 --oem 3', - output_type=pytesseract.Output.DICT, - ) - except Exception: - return result - - # Collect word left-edges (x) and vertical centres (y) - words = [] - for i in range(len(data['text'])): - text = data['text'][i].strip() - conf = int(data['conf'][i]) - if not text or conf < 20 or len(text) < 2: - continue - left_x = float(data['left'][i]) - cy = data['top'][i] + data['height'][i] / 2.0 - word_w = float(data['width'][i]) - words.append((left_x, cy, word_w)) - - if len(words) < 15: - return result - - # --- Group words into vertical columns by left-edge X proximity --- - # Sort by x, then cluster words whose left-edges are within x_tol - avg_w = sum(ww for _, _, ww in words) / len(words) - x_tol = max(avg_w * 0.4, 8) # tolerance for "same column" - - words_by_x = sorted(words, key=lambda w: w[0]) - columns: List[List[Tuple[float, float]]] = [] # each: [(left_x, cy), ...] 
- cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])] - cur_x = words_by_x[0][0] - - for lx, cy, _ in words_by_x[1:]: - if abs(lx - cur_x) <= x_tol: - cur_col.append((lx, cy)) - # Update running x as median of cluster - cur_x = cur_x * 0.8 + lx * 0.2 - else: - if len(cur_col) >= 5: - columns.append(cur_col) - cur_col = [(lx, cy)] - cur_x = lx - if len(cur_col) >= 5: - columns.append(cur_col) - - if len(columns) < 2: - return result - - # --- For each column, measure X-drift as a function of Y --- - # Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle) - drifts = [] - for col in columns: - ys = np.array([p[1] for p in col]) - xs = np.array([p[0] for p in col]) - y_range = ys.max() - ys.min() - if y_range < h * scale * 0.3: - continue # column must span at least 30% of image height - # Linear regression: x = a*y + b - coeffs = np.polyfit(ys, xs, 1) - drifts.append(coeffs[0]) # dx/dy - - if len(drifts) < 2: - return result - - # Median dx/dy → shear angle - # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right - median_drift = float(np.median(drifts)) - shear_degrees = math.degrees(math.atan(median_drift)) - - # Confidence from column count + drift consistency - drift_std = float(np.std(drifts)) - consistency = max(0.0, 1.0 - drift_std * 50) # tighter penalty for drift variance - count_factor = min(1.0, len(drifts) / 4.0) - confidence = count_factor * 0.5 + consistency * 0.5 - - result["shear_degrees"] = round(shear_degrees, 3) - result["confidence"] = round(max(0.0, min(1.0, confidence)), 2) - logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, " - "shear=%.3f°, conf=%.2f", - len(columns), len(drifts), median_drift, - shear_degrees, confidence) - return result - - -def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool: - """Check whether the dewarp correction actually improved alignment. - - Compares horizontal projection variance before and after correction. 
- Higher variance means sharper text-line peaks, which indicates better - horizontal alignment. - - Returns True if the correction improved the image, False if it should - be discarded. - """ - def _h_proj_variance(img: np.ndarray) -> float: - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - _, binary = cv2.threshold(gray, 0, 255, - cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - small = cv2.resize(binary, (binary.shape[1] // 2, binary.shape[0] // 2), - interpolation=cv2.INTER_AREA) - profile = np.sum(small, axis=1).astype(float) - return float(np.var(profile)) - - var_before = _h_proj_variance(original) - var_after = _h_proj_variance(corrected) - - # Correction must improve variance (even by a tiny margin) - return var_after > var_before - - -def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray: - """Apply a vertical shear correction to an image. - - Shifts each row horizontally proportional to its distance from the - vertical center. This corrects the tilt of vertical features (columns) - without affecting horizontal alignment (text lines). - - Args: - img: BGR image. - shear_degrees: Shear angle in degrees. Positive = shift top-right/bottom-left. - - Returns: - Corrected image. - """ - import math - h, w = img.shape[:2] - shear_tan = math.tan(math.radians(shear_degrees)) - - # Affine matrix: shift x by shear_tan * (y - h/2) - # [1 shear_tan -h/2*shear_tan] - # [0 1 0 ] - M = np.float32([ - [1, shear_tan, -h / 2.0 * shear_tan], - [0, 1, 0], - ]) - - corrected = cv2.warpAffine(img, M, (w, h), - flags=cv2.INTER_LINEAR, - borderMode=cv2.BORDER_REPLICATE) - return corrected - - -def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]: - """Combine multiple shear detections into a single weighted estimate (v2). 
- - Ensemble v2 changes vs v1: - - Minimum confidence raised to 0.5 (was 0.3) - - text_lines method gets 1.5× weight boost (most reliable detector) - - Outlier filter at 1° from weighted mean - - Returns: - (shear_degrees, ensemble_confidence, methods_used_str) - """ - # Confidence threshold — lowered from 0.5 to 0.35 to catch subtle shear - # that individual methods detect with moderate confidence. - _MIN_CONF = 0.35 - - # text_lines gets a weight boost as the most content-aware method - _METHOD_WEIGHT_BOOST = {"text_lines": 1.5} - - accepted = [] - for d in detections: - if d["confidence"] < _MIN_CONF: - continue - boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0) - effective_conf = d["confidence"] * boost - accepted.append((d["shear_degrees"], effective_conf, d["method"])) - - if not accepted: - return 0.0, 0.0, "none" - - if len(accepted) == 1: - deg, conf, method = accepted[0] - return deg, min(conf, 1.0), method - - # First pass: weighted mean - total_w = sum(c for _, c, _ in accepted) - w_mean = sum(d * c for d, c, _ in accepted) / total_w - - # Outlier filter: keep results within 1° of weighted mean - filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0] - if not filtered: - filtered = accepted # fallback: keep all - - # Second pass: weighted mean on filtered results - total_w2 = sum(c for _, c, _ in filtered) - final_deg = sum(d * c for d, c, _ in filtered) / total_w2 - - # Ensemble confidence: average of individual confidences, boosted when - # methods agree (all within 0.5° of each other) - avg_conf = total_w2 / len(filtered) - spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered) - agreement_bonus = 0.15 if spread < 0.5 else 0.0 - ensemble_conf = min(1.0, avg_conf + agreement_bonus) - - methods_str = "+".join(m for _, _, m in filtered) - return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str - - -def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]: - 
"""Correct vertical shear after deskew (v2 with quality gate). - - After deskew aligns horizontal text lines, vertical features (column - edges) may still be tilted. This detects the tilt angle using an ensemble - of four complementary methods and applies an affine shear correction. - - Methods (all run in ~150ms total): - A. _detect_shear_angle() — vertical edge profile (~50ms) - B. _detect_shear_by_projection() — horizontal text-line variance (~30ms) - C. _detect_shear_by_hough() — Hough lines on table borders (~20ms) - D. _detect_shear_by_text_lines() — text-line straightness (~50ms) - - Quality gate: after correction, horizontal projection variance is compared - before vs after. If correction worsened alignment, it is discarded. - - Args: - img: BGR image (already deskewed). - use_ensemble: If False, fall back to single-method behaviour (method A only). - - Returns: - Tuple of (corrected_image, dewarp_info). - dewarp_info keys: method, shear_degrees, confidence, detections. - """ - no_correction = { - "method": "none", - "shear_degrees": 0.0, - "confidence": 0.0, - "detections": [], - } - - if not CV2_AVAILABLE: - return img, no_correction - - t0 = time.time() - - if use_ensemble: - det_a = _detect_shear_angle(img) - det_b = _detect_shear_by_projection(img) - det_c = _detect_shear_by_hough(img) - det_d = _detect_shear_by_text_lines(img) - detections = [det_a, det_b, det_c, det_d] - shear_deg, confidence, method = _ensemble_shear(detections) - else: - det_a = _detect_shear_angle(img) - detections = [det_a] - shear_deg = det_a["shear_degrees"] - confidence = det_a["confidence"] - method = det_a["method"] - - duration = time.time() - t0 - - logger.info( - "dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | " - "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f", - shear_deg, confidence, method, duration, - detections[0]["shear_degrees"], detections[0]["confidence"], - detections[1]["shear_degrees"] if len(detections) > 1 else 0.0, - 
detections[1]["confidence"] if len(detections) > 1 else 0.0, - detections[2]["shear_degrees"] if len(detections) > 2 else 0.0, - detections[2]["confidence"] if len(detections) > 2 else 0.0, - detections[3]["shear_degrees"] if len(detections) > 3 else 0.0, - detections[3]["confidence"] if len(detections) > 3 else 0.0, - ) - - # Always include individual detections (even when no correction applied) - _all_detections = [ - {"method": d["method"], "shear_degrees": d["shear_degrees"], - "confidence": d["confidence"]} - for d in detections - ] - - # Thresholds: very small shear (<0.08°) is truly irrelevant for OCR. - # For ensemble confidence, require at least 0.4 (lowered from 0.5 to - # catch moderate-confidence detections from multiple agreeing methods). - if abs(shear_deg) < 0.08 or confidence < 0.4: - no_correction["detections"] = _all_detections - return img, no_correction - - # Apply correction (negate the detected shear to straighten) - corrected = _apply_shear(img, -shear_deg) - - # Quality gate: verify the correction actually improved alignment. - # For small corrections (< 0.5°), the projection variance change can be - # negligible, so we skip the quality gate — the cost of a tiny wrong - # correction is much less than the cost of leaving 0.4° uncorrected - # (which shifts content ~25px at image edges on tall scans). - if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected): - logger.info("dewarp: quality gate REJECTED correction (%.3f°) — " - "projection variance did not improve", shear_deg) - no_correction["detections"] = _all_detections - return img, no_correction - - info = { - "method": method, - "shear_degrees": shear_deg, - "confidence": confidence, - "detections": _all_detections, - } - - return corrected, info - - -def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray: - """Apply shear correction with a manual angle. - - Args: - img: BGR image (deskewed, before dewarp). 
- shear_degrees: Shear angle in degrees to correct. - - Returns: - Corrected image. - """ - if abs(shear_degrees) < 0.001: - return img - return _apply_shear(img, -shear_degrees) - - -# ============================================================================= -# Document Type Detection -# ============================================================================= - -def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult: - """Detect whether the page is a vocab table, generic table, or full text. - - Uses projection profiles and text density analysis — no OCR required. - Runs in < 2 seconds. - - Args: - ocr_img: Binarized grayscale image (for projection profiles). - img_bgr: BGR color image. - - Returns: - DocumentTypeResult with doc_type, confidence, pipeline, skip_steps. - """ - if ocr_img is None or ocr_img.size == 0: - return DocumentTypeResult( - doc_type='full_text', confidence=0.5, pipeline='full_page', - skip_steps=['columns', 'rows'], - features={'error': 'empty image'}, - ) - - h, w = ocr_img.shape[:2] - - # --- 1. Vertical projection profile → detect column gaps --- - # Sum dark pixels along each column (x-axis). Gaps = valleys in the profile. - # Invert: dark pixels on white background → high values = text. 
- vert_proj = np.sum(ocr_img < 128, axis=0).astype(float) - - # Smooth the profile to avoid noise spikes - kernel_size = max(3, w // 100) - if kernel_size % 2 == 0: - kernel_size += 1 - vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same') - - # Find significant vertical gaps (columns of near-zero text density) - # A gap must be at least 1% of image width and have < 5% of max density - max_density = max(vert_smooth.max(), 1) - gap_threshold = max_density * 0.05 - min_gap_width = max(5, w // 100) - - in_gap = False - gap_count = 0 - gap_start = 0 - vert_gaps = [] - - for x in range(w): - if vert_smooth[x] < gap_threshold: - if not in_gap: - in_gap = True - gap_start = x - else: - if in_gap: - gap_width = x - gap_start - if gap_width >= min_gap_width: - gap_count += 1 - vert_gaps.append((gap_start, x, gap_width)) - in_gap = False - - # Filter out margin gaps (within 10% of image edges) - margin_threshold = w * 0.10 - internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold] - internal_gap_count = len(internal_gaps) - - # --- 2. Horizontal projection profile → detect row gaps --- - horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float) - h_kernel = max(3, h // 200) - if h_kernel % 2 == 0: - h_kernel += 1 - horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same') - - h_max = max(horiz_smooth.max(), 1) - h_gap_threshold = h_max * 0.05 - min_row_gap = max(3, h // 200) - - row_gap_count = 0 - in_gap = False - for y in range(h): - if horiz_smooth[y] < h_gap_threshold: - if not in_gap: - in_gap = True - gap_start = y - else: - if in_gap: - if y - gap_start >= min_row_gap: - row_gap_count += 1 - in_gap = False - - # --- 3. 
Text density distribution (4×4 grid) --- - grid_rows, grid_cols = 4, 4 - cell_h, cell_w = h // grid_rows, w // grid_cols - densities = [] - for gr in range(grid_rows): - for gc in range(grid_cols): - cell = ocr_img[gr * cell_h:(gr + 1) * cell_h, - gc * cell_w:(gc + 1) * cell_w] - if cell.size > 0: - d = float(np.count_nonzero(cell < 128)) / cell.size - densities.append(d) - - density_std = float(np.std(densities)) if densities else 0 - density_mean = float(np.mean(densities)) if densities else 0 - - features = { - 'vertical_gaps': gap_count, - 'internal_vertical_gaps': internal_gap_count, - 'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]], - 'row_gaps': row_gap_count, - 'density_mean': round(density_mean, 4), - 'density_std': round(density_std, 4), - 'image_size': (w, h), - } - - # --- 4. Decision tree --- - # Use internal_gap_count (excludes margin gaps) for column detection. - if internal_gap_count >= 2 and row_gap_count >= 5: - # Multiple internal vertical gaps + many row gaps → table - confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005) - return DocumentTypeResult( - doc_type='vocab_table', - confidence=round(confidence, 2), - pipeline='cell_first', - skip_steps=[], - features=features, - ) - elif internal_gap_count >= 1 and row_gap_count >= 3: - # Some internal structure, likely a table - confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01) - return DocumentTypeResult( - doc_type='generic_table', - confidence=round(confidence, 2), - pipeline='cell_first', - skip_steps=[], - features=features, - ) - elif internal_gap_count == 0: - # No internal column gaps → full text (regardless of density) - confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15) - return DocumentTypeResult( - doc_type='full_text', - confidence=round(confidence, 2), - pipeline='full_page', - skip_steps=['columns', 'rows'], - features=features, - ) - else: - # Ambiguous — default to vocab_table (most common use 
case) - return DocumentTypeResult( - doc_type='vocab_table', - confidence=0.5, - pipeline='cell_first', - skip_steps=[], - features=features, - ) - - -# ============================================================================= -# Stage 4: Dual Image Preparation -# ============================================================================= - -def create_ocr_image(img: np.ndarray) -> np.ndarray: - """Create a binarized image optimized for Tesseract OCR. - - Steps: Grayscale → Background normalization → Adaptive threshold → Denoise. - - Args: - img: BGR image. - - Returns: - Binary image (white text on black background inverted to black on white). - """ - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - - # Background normalization: divide by blurred version - bg = cv2.GaussianBlur(gray, (51, 51), 0) - normalized = cv2.divide(gray, bg, scale=255) - - # Adaptive binarization - binary = cv2.adaptiveThreshold( - normalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, - cv2.THRESH_BINARY, 31, 10 - ) - - # Light denoise - denoised = cv2.medianBlur(binary, 3) - - return denoised - - -def create_layout_image(img: np.ndarray) -> np.ndarray: - """Create a CLAHE-enhanced grayscale image for layout analysis. - - Args: - img: BGR image. - - Returns: - Enhanced grayscale image. 
- """ - gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) - enhanced = clahe.apply(gray) - return enhanced - - -# ============================================================================= -# Stage 5: Layout Analysis (Projection Profiles) -# ============================================================================= - -def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray: - """Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask.""" - out = mask.copy() - n = len(out) - i = 0 - while i < n: - if out[i]: - start = i - while i < n and out[i]: - i += 1 - if (i - start) < min_width: - out[start:i] = False - else: - i += 1 - return out - - -def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]: - """Find the bounding box of actual text content (excluding page margins). - - Scan artefacts (thin black lines at page edges) are filtered out by - discarding contiguous projection runs narrower than 1 % of the image - dimension (min 5 px). - - Returns: - Tuple of (left_x, right_x, top_y, bottom_y). 
- """ - h, w = inv.shape[:2] - threshold = 0.005 - - # --- Horizontal projection for top/bottom --- - h_proj = np.sum(inv, axis=1).astype(float) / (w * 255) - h_mask = h_proj > threshold - min_h_run = max(5, h // 100) - h_mask = _filter_narrow_runs(h_mask, min_h_run) - - top_y = 0 - for y in range(h): - if h_mask[y]: - top_y = max(0, y - 5) - break - - bottom_y = h - for y in range(h - 1, 0, -1): - if h_mask[y]: - bottom_y = min(h, y + 5) - break - - # --- Vertical projection for left/right margins --- - v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float) - v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj - v_mask = v_proj_norm > threshold - min_v_run = max(5, w // 100) - v_mask = _filter_narrow_runs(v_mask, min_v_run) - - left_x = 0 - for x in range(w): - if v_mask[x]: - left_x = max(0, x - 2) - break - - right_x = w - for x in range(w - 1, 0, -1): - if v_mask[x]: - right_x = min(w, x + 2) - break - - return left_x, right_x, top_y, bottom_y - - -def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]: - """Detect columns, header, and footer using projection profiles. - - Uses content-bounds detection to exclude page margins before searching - for column separators within the actual text area. - - Args: - layout_img: CLAHE-enhanced grayscale image. - ocr_img: Binarized image for text density analysis. - - Returns: - List of PageRegion objects describing detected regions. 
- """ - h, w = ocr_img.shape[:2] - - # Invert: black text on white → white text on black for projection - inv = cv2.bitwise_not(ocr_img) - - # --- Find actual content bounds (exclude page margins) --- - left_x, right_x, top_y, bottom_y = _find_content_bounds(inv) - content_w = right_x - left_x - content_h = bottom_y - top_y - - logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), " - f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image") - - if content_w < w * 0.3 or content_h < h * 0.3: - # Fallback if detection seems wrong - left_x, right_x = 0, w - top_y, bottom_y = 0, h - content_w, content_h = w, h - - # --- Vertical projection within content area to find column separators --- - content_strip = inv[top_y:bottom_y, left_x:right_x] - v_proj = np.sum(content_strip, axis=0).astype(float) - v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj - - # Smooth the projection profile - kernel_size = max(5, content_w // 50) - if kernel_size % 2 == 0: - kernel_size += 1 - v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') - - # Debug: log projection profile statistics - p_mean = float(np.mean(v_proj_smooth)) - p_median = float(np.median(v_proj_smooth)) - p_min = float(np.min(v_proj_smooth)) - p_max = float(np.max(v_proj_smooth)) - logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, " - f"mean={p_mean:.4f}, median={p_median:.4f}") - - # Find valleys using multiple threshold strategies - # Strategy 1: relative to median (catches clear separators) - # Strategy 2: local minima approach (catches subtle gaps) - threshold = max(p_median * 0.3, p_mean * 0.2) - logger.info(f"Layout: valley threshold={threshold:.4f}") - - in_valley = v_proj_smooth < threshold - - # Find contiguous valley regions - all_valleys = [] - start = None - for x in range(len(v_proj_smooth)): - if in_valley[x] and start is None: - start = x - elif not in_valley[x] and start is not None: - valley_width = x 
- start - valley_depth = float(np.min(v_proj_smooth[start:x])) - # Valley must be at least 3px wide - if valley_width >= 3: - all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth)) - start = None - - logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — " - f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}") - - # Filter: valleys must be inside the content area (not at edges) - inner_margin = int(content_w * 0.08) - valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin] - - # If no valleys found with strict threshold, try local minima approach - if len(valleys) < 2: - logger.info("Layout: trying local minima approach for column detection") - # Divide content into 20 segments, find the 2 lowest - seg_count = 20 - seg_width = content_w // seg_count - seg_scores = [] - for i in range(seg_count): - sx = i * seg_width - ex = min((i + 1) * seg_width, content_w) - seg_mean = float(np.mean(v_proj_smooth[sx:ex])) - seg_scores.append((i, sx, ex, seg_mean)) - - seg_scores.sort(key=lambda s: s[3]) - logger.info(f"Layout: segment scores (lowest 5): " - f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}") - - # Find two lowest non-adjacent segments that create reasonable columns - candidate_valleys = [] - for seg_idx, sx, ex, seg_mean in seg_scores: - # Must not be at the edges - if seg_idx <= 1 or seg_idx >= seg_count - 2: - continue - # Must be significantly lower than overall mean - if seg_mean < p_mean * 0.6: - center = (sx + ex) // 2 - candidate_valleys.append((sx, ex, center, ex - sx, seg_mean)) - - if len(candidate_valleys) >= 2: - # Pick the best pair: non-adjacent, creating reasonable column widths - candidate_valleys.sort(key=lambda v: v[2]) - best_pair = None - best_score = float('inf') - for i in range(len(candidate_valleys)): - for j in range(i + 1, len(candidate_valleys)): - c1 = candidate_valleys[i][2] - c2 = candidate_valleys[j][2] - # 
Must be at least 20% apart - if (c2 - c1) < content_w * 0.2: - continue - col1 = c1 - col2 = c2 - c1 - col3 = content_w - c2 - # Each column at least 15% - if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12: - continue - parts = sorted([col1, col2, col3]) - score = parts[2] - parts[0] - if score < best_score: - best_score = score - best_pair = (candidate_valleys[i], candidate_valleys[j]) - - if best_pair: - valleys = list(best_pair) - logger.info(f"Layout: local minima found 2 valleys: " - f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}") - - logger.info(f"Layout: final {len(valleys)} valleys: " - f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}") - - regions = [] - - if len(valleys) >= 2: - # 3-column layout detected - valleys.sort(key=lambda v: v[2]) - - if len(valleys) == 2: - sep1_center = valleys[0][2] - sep2_center = valleys[1][2] - else: - # Pick the two valleys that best divide into 3 parts - # Prefer wider valleys (more likely true separators) - best_pair = None - best_score = float('inf') - for i in range(len(valleys)): - for j in range(i + 1, len(valleys)): - c1, c2 = valleys[i][2], valleys[j][2] - # Each column should be at least 15% of content width - col1 = c1 - col2 = c2 - c1 - col3 = content_w - c2 - if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15: - continue - # Score: lower is better (more even distribution) - parts = sorted([col1, col2, col3]) - score = parts[2] - parts[0] - # Bonus for wider valleys (subtract valley width) - score -= (valleys[i][3] + valleys[j][3]) * 0.5 - if score < best_score: - best_score = score - best_pair = (c1, c2) - if best_pair: - sep1_center, sep2_center = best_pair - else: - sep1_center = valleys[0][2] - sep2_center = valleys[1][2] - - # Convert from content-relative to absolute coordinates - abs_sep1 = sep1_center + left_x - abs_sep2 = sep2_center + left_x - - logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} " - 
f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})") - - regions.append(PageRegion( - type='column_en', x=0, y=top_y, - width=abs_sep1, height=content_h - )) - regions.append(PageRegion( - type='column_de', x=abs_sep1, y=top_y, - width=abs_sep2 - abs_sep1, height=content_h - )) - regions.append(PageRegion( - type='column_example', x=abs_sep2, y=top_y, - width=w - abs_sep2, height=content_h - )) - - elif len(valleys) == 1: - # 2-column layout - abs_sep = valleys[0][2] + left_x - - logger.info(f"Layout: 2 columns at separator x={abs_sep}") - - regions.append(PageRegion( - type='column_en', x=0, y=top_y, - width=abs_sep, height=content_h - )) - regions.append(PageRegion( - type='column_de', x=abs_sep, y=top_y, - width=w - abs_sep, height=content_h - )) - - else: - # No columns detected — run full-page OCR as single column - logger.warning("Layout: no column separators found, using full page") - regions.append(PageRegion( - type='column_en', x=0, y=top_y, - width=w, height=content_h - )) - - # Add header/footer info (gap-based detection with fallback) - _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv) - - top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none') - bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none') - col_count = len([r for r in regions if r.type.startswith('column')]) - logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}") - - return regions - - -# ============================================================================= -# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection) -# ============================================================================= - -# --- Phase A: Geometry Detection --- - -def _detect_columns_by_clustering( - word_dicts: List[Dict], - left_edges: List[int], - edge_word_indices: List[int], - content_w: int, - content_h: int, - left_x: int, - right_x: int, - top_y: int, - bottom_y: int, 
- inv: Optional[np.ndarray] = None, -) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]: - """Fallback: detect columns by clustering left-aligned word positions. - - Used when the primary gap-based algorithm finds fewer than 2 gaps. - """ - tolerance = max(10, int(content_w * 0.01)) - sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0]) - - clusters = [] - cluster_widxs = [] - cur_edges = [sorted_pairs[0][0]] - cur_widxs = [sorted_pairs[0][1]] - for edge, widx in sorted_pairs[1:]: - if edge - cur_edges[-1] <= tolerance: - cur_edges.append(edge) - cur_widxs.append(widx) - else: - clusters.append(cur_edges) - cluster_widxs.append(cur_widxs) - cur_edges = [edge] - cur_widxs = [widx] - clusters.append(cur_edges) - cluster_widxs.append(cur_widxs) - - MIN_Y_COVERAGE_PRIMARY = 0.30 - MIN_Y_COVERAGE_SECONDARY = 0.15 - MIN_WORDS_SECONDARY = 5 - - cluster_infos = [] - for c_edges, c_widxs in zip(clusters, cluster_widxs): - if len(c_edges) < 2: - continue - y_positions = [word_dicts[idx]['top'] for idx in c_widxs] - y_span = max(y_positions) - min(y_positions) - y_coverage = y_span / content_h if content_h > 0 else 0.0 - cluster_infos.append({ - 'mean_x': int(np.mean(c_edges)), - 'count': len(c_edges), - 'min_edge': min(c_edges), - 'max_edge': max(c_edges), - 'y_min': min(y_positions), - 'y_max': max(y_positions), - 'y_coverage': y_coverage, - }) - - primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY] - primary_set = set(id(c) for c in primary) - secondary = [c for c in cluster_infos - if id(c) not in primary_set - and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY - and c['count'] >= MIN_WORDS_SECONDARY] - significant = sorted(primary + secondary, key=lambda c: c['mean_x']) - - if len(significant) < 3: - logger.info("ColumnGeometry clustering fallback: < 3 significant clusters") - return None - - merge_distance = max(30, int(content_w * 0.06)) - merged = [significant[0].copy()] 
- for s in significant[1:]: - if s['mean_x'] - merged[-1]['mean_x'] < merge_distance: - prev = merged[-1] - total = prev['count'] + s['count'] - avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total - prev['mean_x'] = avg_x - prev['count'] = total - prev['min_edge'] = min(prev['min_edge'], s['min_edge']) - prev['max_edge'] = max(prev['max_edge'], s['max_edge']) - else: - merged.append(s.copy()) - - if len(merged) < 3: - logger.info("ColumnGeometry clustering fallback: < 3 merged clusters") - return None - - logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering") - - margin_px = max(6, int(content_w * 0.003)) - return _build_geometries_from_starts( - [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged], - word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv, - ) - - -def _detect_sub_columns( - geometries: List[ColumnGeometry], - content_w: int, - left_x: int = 0, - top_y: int = 0, - header_y: Optional[int] = None, - footer_y: Optional[int] = None, - _edge_tolerance: int = 8, - _min_col_start_ratio: float = 0.10, -) -> List[ColumnGeometry]: - """Split columns that contain internal sub-columns based on left-edge alignment. - - For each column, clusters word left-edges into alignment bins (within - ``_edge_tolerance`` px). The leftmost bin whose word count reaches - ``_min_col_start_ratio`` of the column total is treated as the true column - start. Any words to the left of that bin form a sub-column, provided they - number >= 2 and < 35 % of total. - - Word ``left`` values are relative to the content ROI (offset by *left_x*), - while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x* - bridges the two coordinate systems. - - If *header_y* / *footer_y* are provided (absolute y-coordinates), words - in header/footer regions are excluded from alignment clustering to avoid - polluting the bins with page numbers or chapter titles. 
Word ``top`` - values are relative to *top_y*. - - Returns a new list of ColumnGeometry — potentially longer than the input. - """ - if content_w <= 0: - return geometries - - result: List[ColumnGeometry] = [] - for geo in geometries: - # Only consider wide-enough columns with enough words - if geo.width_ratio < 0.15 or geo.word_count < 5: - result.append(geo) - continue - - # Collect left-edges of confident words, excluding header/footer - # Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y) - min_top_rel = (header_y - top_y) if header_y is not None else None - max_top_rel = (footer_y - top_y) if footer_y is not None else None - - confident = [w for w in geo.words - if w.get('conf', 0) >= 30 - and (min_top_rel is None or w['top'] >= min_top_rel) - and (max_top_rel is None or w['top'] <= max_top_rel)] - if len(confident) < 3: - result.append(geo) - continue - - # --- Cluster left-edges into alignment bins --- - sorted_edges = sorted(w['left'] for w in confident) - bins: List[Tuple[int, int, int, int]] = [] # (center, count, min_edge, max_edge) - cur = [sorted_edges[0]] - for i in range(1, len(sorted_edges)): - if sorted_edges[i] - cur[-1] <= _edge_tolerance: - cur.append(sorted_edges[i]) - else: - bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur))) - cur = [sorted_edges[i]] - bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur))) - - # --- Find the leftmost bin qualifying as a real column start --- - total = len(confident) - min_count = max(3, int(total * _min_col_start_ratio)) - col_start_bin = None - for b in bins: - if b[1] >= min_count: - col_start_bin = b - break - - if col_start_bin is None: - result.append(geo) - continue - - # Words to the left of the column-start bin are sub-column candidates - split_threshold = col_start_bin[2] - _edge_tolerance - sub_words = [w for w in geo.words if w['left'] < split_threshold] - main_words = [w for w in geo.words if w['left'] >= split_threshold] - - # Count 
only body words (excluding header/footer) for the threshold check - # so that header/footer words don't artificially trigger a split. - sub_body = [w for w in sub_words - if (min_top_rel is None or w['top'] >= min_top_rel) - and (max_top_rel is None or w['top'] <= max_top_rel)] - if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35: - result.append(geo) - continue - - # --- Build two sub-column geometries --- - # Word 'left' values are relative to left_x; geo.x is absolute. - # Convert the split position from relative to absolute coordinates. - max_sub_left = max(w['left'] for w in sub_words) - split_rel = (max_sub_left + col_start_bin[2]) // 2 - split_abs = split_rel + left_x - - sub_x = geo.x - sub_width = split_abs - geo.x - main_x = split_abs - main_width = (geo.x + geo.width) - split_abs - - if sub_width <= 0 or main_width <= 0: - result.append(geo) - continue - - sub_geo = ColumnGeometry( - index=0, - x=sub_x, - y=geo.y, - width=sub_width, - height=geo.height, - word_count=len(sub_words), - words=sub_words, - width_ratio=sub_width / content_w if content_w > 0 else 0.0, - is_sub_column=True, - ) - main_geo = ColumnGeometry( - index=0, - x=main_x, - y=geo.y, - width=main_width, - height=geo.height, - word_count=len(main_words), - words=main_words, - width_ratio=main_width / content_w if content_w > 0 else 0.0, - is_sub_column=True, - ) - - result.append(sub_geo) - result.append(main_geo) - - logger.info( - f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} " - f"(rel={split_rel}), sub={len(sub_words)} words, " - f"main={len(main_words)} words, " - f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})" - ) - - # Re-index by left-to-right order - result.sort(key=lambda g: g.x) - for i, g in enumerate(result): - g.index = i - - return result - - -def _split_broad_columns( - geometries: List[ColumnGeometry], - content_w: int, - left_x: int = 0, - _broad_threshold: float = 0.35, - _min_gap_px: int = 15, - _min_words_per_split: int = 
5, -) -> List[ColumnGeometry]: - """Split overly broad columns that contain two language blocks (EN+DE). - - Uses word-coverage gap analysis: builds a per-pixel coverage array from the - words inside each broad column, finds the largest horizontal gap, and splits - the column at that gap. - - Args: - geometries: Column geometries from _detect_sub_columns. - content_w: Width of the content area in pixels. - left_x: Left edge of content ROI in absolute image coordinates. - _broad_threshold: Minimum width_ratio to consider a column "broad". - _min_gap_px: Minimum gap width (pixels) to trigger a split. - _min_words_per_split: Both halves must have at least this many words. - - Returns: - Updated list of ColumnGeometry (possibly with more columns). - """ - result: List[ColumnGeometry] = [] - - logger.info(f"SplitBroadCols: input {len(geometries)} cols: " - f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}") - - for geo in geometries: - if geo.width_ratio <= _broad_threshold or len(geo.words) < 10: - result.append(geo) - continue - - # Build word-coverage array (per pixel within column) - col_left_rel = geo.x - left_x # column left in content-relative coords - coverage = np.zeros(geo.width, dtype=np.float32) - - for wd in geo.words: - # wd['left'] is relative to left_x (content ROI) - wl = wd['left'] - col_left_rel - wr = wl + wd.get('width', 0) - wl = max(0, int(wl)) - wr = min(geo.width, int(wr)) - if wr > wl: - coverage[wl:wr] += 1.0 - - # Light smoothing (kernel=3px) to avoid noise - if len(coverage) > 3: - kernel = np.ones(3, dtype=np.float32) / 3.0 - coverage = np.convolve(coverage, kernel, mode='same') - - # Normalise to [0, 1] - cmax = coverage.max() - if cmax > 0: - coverage /= cmax - - # Find INTERNAL gaps where coverage < 0.5 - # Exclude edge gaps (touching pixel 0 or geo.width) — those are margins. 
- low_mask = coverage < 0.5 - all_gaps = [] - _gs = None - for px in range(len(low_mask)): - if low_mask[px]: - if _gs is None: - _gs = px - else: - if _gs is not None: - all_gaps.append((_gs, px, px - _gs)) - _gs = None - if _gs is not None: - all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs)) - - # Filter: only internal gaps (not touching column edges) - _edge_margin = 10 # pixels from edge to ignore - internal_gaps = [g for g in all_gaps - if g[0] > _edge_margin and g[1] < geo.width - _edge_margin] - best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None - - logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): " - f"{[g for g in all_gaps if g[2] >= 5]}, " - f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, " - f"best={best_gap}") - - if best_gap is None or best_gap[2] < _min_gap_px: - result.append(geo) - continue - - gap_center = (best_gap[0] + best_gap[1]) // 2 - - # Split words by midpoint relative to gap - left_words = [] - right_words = [] - for wd in geo.words: - wl = wd['left'] - col_left_rel - mid = wl + wd.get('width', 0) / 2.0 - if mid < gap_center: - left_words.append(wd) - else: - right_words.append(wd) - - if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split: - result.append(geo) - continue - - # Build two new ColumnGeometry objects - split_x_abs = geo.x + gap_center - left_w = gap_center - right_w = geo.width - gap_center - - left_geo = ColumnGeometry( - index=0, - x=geo.x, - y=geo.y, - width=left_w, - height=geo.height, - word_count=len(left_words), - words=left_words, - width_ratio=left_w / content_w if content_w else 0, - is_sub_column=True, - ) - right_geo = ColumnGeometry( - index=0, - x=split_x_abs, - y=geo.y, - width=right_w, - height=geo.height, - word_count=len(right_words), - words=right_words, - width_ratio=right_w / content_w if content_w else 0, - is_sub_column=True, - ) - - logger.info( - f"SplitBroadCols: col {geo.index} SPLIT at 
gap_center={gap_center} " - f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), " - f"left={len(left_words)} words (w={left_w}), " - f"right={len(right_words)} words (w={right_w})" - ) - - result.append(left_geo) - result.append(right_geo) - - # Re-index left-to-right - result.sort(key=lambda g: g.x) - for i, g in enumerate(result): - g.index = i - - return result - - -def _build_geometries_from_starts( - col_starts: List[Tuple[int, int]], - word_dicts: List[Dict], - left_x: int, - right_x: int, - top_y: int, - bottom_y: int, - content_w: int, - content_h: int, - inv: Optional[np.ndarray] = None, -) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]: - """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs.""" - geometries = [] - for i, (start_x, count) in enumerate(col_starts): - if i + 1 < len(col_starts): - col_width = col_starts[i + 1][0] - start_x - else: - col_width = right_x - start_x - - col_left_rel = start_x - left_x - col_right_rel = col_left_rel + col_width - col_words = [w for w in word_dicts - if col_left_rel <= w['left'] < col_right_rel] - - geometries.append(ColumnGeometry( - index=i, - x=start_x, - y=top_y, - width=col_width, - height=content_h, - word_count=len(col_words), - words=col_words, - width_ratio=col_width / content_w if content_w > 0 else 0.0, - )) - - logger.info(f"ColumnGeometry: {len(geometries)} columns: " - f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") - return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv) - - -def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]: - """Detect column geometry using whitespace-gap analysis with word validation. - - Phase A of the two-phase column detection. Uses vertical projection - profiles to find whitespace gaps between columns, then validates that - no gap cuts through a word bounding box. 
- - Falls back to clustering-based detection if fewer than 2 gaps are found. - - Args: - ocr_img: Binarized grayscale image for layout analysis. - dewarped_bgr: Original BGR image (for Tesseract word detection). - - Returns: - Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv) - or None if detection fails entirely. - """ - h, w = ocr_img.shape[:2] - - # --- Step 1: Find content bounds --- - inv = cv2.bitwise_not(ocr_img) - left_x, right_x, top_y, bottom_y = _find_content_bounds(inv) - content_w = right_x - left_x - content_h = bottom_y - top_y - - if content_w < w * 0.3 or content_h < h * 0.3: - left_x, right_x = 0, w - top_y, bottom_y = 0, h - content_w, content_h = w, h - - logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), " - f"y=[{top_y}..{bottom_y}] ({content_h}px)") - - # --- Step 2: Get word bounding boxes from Tesseract --- - # Crop from left_x to full image width (not right_x) so words at the right - # edge of the last column are included even if they extend past the detected - # content boundary (right_x). 
- content_roi = dewarped_bgr[top_y:bottom_y, left_x:w] - pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB)) - - try: - data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT) - except Exception as e: - logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}") - return None - - word_dicts = [] - left_edges = [] - edge_word_indices = [] - n_words = len(data['text']) - for i in range(n_words): - conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1 - text = str(data['text'][i]).strip() - if conf < 30 or not text: - continue - lx = int(data['left'][i]) - ty = int(data['top'][i]) - bw = int(data['width'][i]) - bh = int(data['height'][i]) - left_edges.append(lx) - edge_word_indices.append(len(word_dicts)) - word_dicts.append({ - 'text': text, 'conf': conf, - 'left': lx, 'top': ty, 'width': bw, 'height': bh, - }) - - if len(left_edges) < 5: - logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected") - return None - - logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area") - - # --- Step 2b: Segment by sub-headers --- - # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width - # text bands that pollute the vertical projection. We detect large - # horizontal gaps (= whitespace rows separating sections) and use only - # the tallest content segment for the projection. This makes column - # detection immune to sub-headers, illustrations, and section dividers. 
- content_strip = inv[top_y:bottom_y, left_x:right_x] - h_proj_row = np.sum(content_strip, axis=1).astype(float) - h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row - - # Find horizontal gaps (near-empty rows) - H_GAP_THRESH = 0.02 # rows with <2% ink density are "empty" - h_in_gap = h_proj_row_norm < H_GAP_THRESH - H_MIN_GAP = max(5, content_h // 200) # min gap height ~5-7px - - h_gaps: List[Tuple[int, int]] = [] - h_gap_start = None - for y_idx in range(len(h_in_gap)): - if h_in_gap[y_idx]: - if h_gap_start is None: - h_gap_start = y_idx - else: - if h_gap_start is not None: - if y_idx - h_gap_start >= H_MIN_GAP: - h_gaps.append((h_gap_start, y_idx)) - h_gap_start = None - if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP: - h_gaps.append((h_gap_start, len(h_in_gap))) - - # Identify "large" gaps (significantly bigger than median) that indicate - # section boundaries (sub-headers, chapter titles). - if len(h_gaps) >= 3: - gap_sizes = sorted(g[1] - g[0] for g in h_gaps) - median_gap_h = gap_sizes[len(gap_sizes) // 2] - large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3) - large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh] - else: - large_gaps = h_gaps - - # Build content segments between large gaps and pick the tallest - seg_boundaries = [0] - for gs, ge in large_gaps: - seg_boundaries.append(gs) - seg_boundaries.append(ge) - seg_boundaries.append(content_h) - - segments = [] - for i in range(0, len(seg_boundaries) - 1, 2): - seg_top = seg_boundaries[i] - seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h - seg_height = seg_bot - seg_top - if seg_height > 20: # ignore tiny fragments - segments.append((seg_top, seg_bot, seg_height)) - - if segments: - segments.sort(key=lambda s: s[2], reverse=True) - best_seg = segments[0] - proj_strip = content_strip[best_seg[0]:best_seg[1], :] - effective_h = best_seg[2] - if len(segments) > 1: - 
logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} " - f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} " - f"({effective_h}px, {effective_h*100/content_h:.0f}%)") - else: - proj_strip = content_strip - effective_h = content_h - - # --- Step 3: Vertical projection profile --- - v_proj = np.sum(proj_strip, axis=0).astype(float) - v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj - - # Smooth the projection to avoid noise-induced micro-gaps - kernel_size = max(5, content_w // 80) - if kernel_size % 2 == 0: - kernel_size += 1 # keep odd for symmetry - v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') - - # --- Step 4: Find whitespace gaps --- - # Threshold: areas with very little ink density are gaps - median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01 - gap_threshold = max(median_density * 0.15, 0.005) - - in_gap = v_smooth < gap_threshold - MIN_GAP_WIDTH = max(8, content_w // 200) # min ~8px or 0.5% of content width - - # Collect contiguous gap regions - raw_gaps = [] # (start_x_rel, end_x_rel) relative to content ROI - gap_start = None - for x in range(len(in_gap)): - if in_gap[x]: - if gap_start is None: - gap_start = x - else: - if gap_start is not None: - gap_width = x - gap_start - if gap_width >= MIN_GAP_WIDTH: - raw_gaps.append((gap_start, x)) - gap_start = None - # Handle gap at the right edge - if gap_start is not None: - gap_width = len(in_gap) - gap_start - if gap_width >= MIN_GAP_WIDTH: - raw_gaps.append((gap_start, len(in_gap))) - - logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, " - f"min_width={MIN_GAP_WIDTH}px): " - f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}") - - # --- Step 5: Validate gaps against word bounding boxes --- - # When using a segment for projection, only validate against words - # inside that segment — words from sub-headers or other 
sections - # would incorrectly overlap with real column gaps. - if segments and len(segments) > 1: - seg_top_abs = best_seg[0] # relative to content strip - seg_bot_abs = best_seg[1] - segment_words = [wd for wd in word_dicts - if wd['top'] >= seg_top_abs - and wd['top'] + wd['height'] <= seg_bot_abs] - logger.info(f"ColumnGeometry: filtering words to segment: " - f"{len(segment_words)}/{len(word_dicts)} words") - else: - segment_words = word_dicts - - validated_gaps = [] - for gap_start_rel, gap_end_rel in raw_gaps: - # Check if any word overlaps with this gap region - overlapping = False - for wd in segment_words: - word_left = wd['left'] - word_right = wd['left'] + wd['width'] - if word_left < gap_end_rel and word_right > gap_start_rel: - overlapping = True - break - - if not overlapping: - validated_gaps.append((gap_start_rel, gap_end_rel)) - else: - # Try to shift the gap to avoid the overlapping word(s) - # Find the tightest word boundaries within the gap region - min_word_left = content_w - max_word_right = 0 - for wd in segment_words: - word_left = wd['left'] - word_right = wd['left'] + wd['width'] - if word_left < gap_end_rel and word_right > gap_start_rel: - min_word_left = min(min_word_left, word_left) - max_word_right = max(max_word_right, word_right) - - # Try gap before the overlapping words - if min_word_left - gap_start_rel >= MIN_GAP_WIDTH: - validated_gaps.append((gap_start_rel, min_word_left)) - logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}") - # Try gap after the overlapping words - elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH: - validated_gaps.append((max_word_right, gap_end_rel)) - logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}") - else: - logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] " - f"discarded (word overlap, no room to shift)") - - logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: " - f"{[(g[0]+left_x, 
g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}") - - # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) --- - # When pixel-based projection fails (e.g. due to illustrations or colored - # bands), use word bounding boxes to find clear vertical gaps. This is - # immune to decorative graphics that Tesseract doesn't recognise as words. - if len(validated_gaps) < 2: - logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps") - word_coverage = np.zeros(content_w, dtype=np.int32) - for wd in segment_words: - wl = max(0, wd['left']) - wr = min(wd['left'] + wd['width'], content_w) - if wr > wl: - word_coverage[wl:wr] += 1 - - # Smooth slightly to bridge tiny 1-2px noise gaps between words - wc_kernel = max(3, content_w // 300) - if wc_kernel % 2 == 0: - wc_kernel += 1 - wc_smooth = np.convolve(word_coverage.astype(float), - np.ones(wc_kernel) / wc_kernel, mode='same') - - wc_in_gap = wc_smooth < 0.5 # effectively zero word coverage - WC_MIN_GAP = max(4, content_w // 300) - - wc_gaps: List[Tuple[int, int]] = [] - wc_gap_start = None - for x in range(len(wc_in_gap)): - if wc_in_gap[x]: - if wc_gap_start is None: - wc_gap_start = x - else: - if wc_gap_start is not None: - if x - wc_gap_start >= WC_MIN_GAP: - wc_gaps.append((wc_gap_start, x)) - wc_gap_start = None - if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP: - wc_gaps.append((wc_gap_start, len(wc_in_gap))) - - logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found " - f"(min_width={WC_MIN_GAP}px): " - f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}") - - if len(wc_gaps) >= 2: - validated_gaps = wc_gaps - - # --- Step 6: Fallback to clustering if too few gaps --- - if len(validated_gaps) < 2: - logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering") - return _detect_columns_by_clustering( - word_dicts, left_edges, edge_word_indices, - content_w, content_h, left_x, right_x, top_y, bottom_y, inv, - ) - - # --- Step 
7: Derive column boundaries from gaps --- - # Sort gaps by position - validated_gaps.sort(key=lambda g: g[0]) - - # Identify margin gaps (first and last) vs interior gaps - # A margin gap touches the edge of the content area (within 2% tolerance) - edge_tolerance = max(10, int(content_w * 0.02)) - - is_left_margin = validated_gaps[0][0] <= edge_tolerance - is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance - - # Interior gaps define column boundaries - # Column starts at the end of a gap, ends at the start of the next gap - col_starts = [] - - if is_left_margin: - # First column starts after the left margin gap - first_gap_end = validated_gaps[0][1] - interior_gaps = validated_gaps[1:] - else: - # No left margin gap — first column starts at content left edge - first_gap_end = 0 - interior_gaps = validated_gaps[:] - - if is_right_margin: - # Last gap is right margin — don't use it as column start - interior_gaps_for_boundaries = interior_gaps[:-1] - right_boundary = validated_gaps[-1][0] # last column ends at right margin gap start - else: - interior_gaps_for_boundaries = interior_gaps - right_boundary = content_w - - # First column - col_starts.append(left_x + first_gap_end) - - # Columns between interior gaps - for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries: - col_starts.append(left_x + gap_end_rel) - - # Count words per column region (for logging) - col_start_counts = [] - for i, start_x in enumerate(col_starts): - if i + 1 < len(col_starts): - next_start = col_starts[i + 1] - else: - # Rightmost column always extends to full image width (w). - # The page margin contains only white space — extending the OCR - # crop to the image edge is safe and prevents text near the right - # border from being cut off. 
- next_start = w - - col_left_rel = start_x - left_x - col_right_rel = next_start - left_x - n_words_in_col = sum(1 for w in word_dicts - if col_left_rel <= w['left'] < col_right_rel) - col_start_counts.append((start_x, n_words_in_col)) - - logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps " - f"(left_margin={is_left_margin}, right_margin={is_right_margin}): " - f"{col_start_counts}") - - # --- Step 8: Build ColumnGeometry objects --- - # Determine right edge for each column - all_boundaries = [] - for i, start_x in enumerate(col_starts): - if i + 1 < len(col_starts): - end_x = col_starts[i + 1] - else: - # Rightmost column always extends to full image width (w). - end_x = w - all_boundaries.append((start_x, end_x)) - - geometries = [] - for i, (start_x, end_x) in enumerate(all_boundaries): - col_width = end_x - start_x - col_left_rel = start_x - left_x - col_right_rel = col_left_rel + col_width - col_words = [w for w in word_dicts - if col_left_rel <= w['left'] < col_right_rel] - - geometries.append(ColumnGeometry( - index=i, - x=start_x, - y=top_y, - width=col_width, - height=content_h, - word_count=len(col_words), - words=col_words, - width_ratio=col_width / content_w if content_w > 0 else 0.0, - )) - - logger.info(f"ColumnGeometry: {len(geometries)} columns: " - f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") - - # --- Step 9: Filter phantom narrow columns --- - # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow - # columns (< 3% of content width) with zero or no words. These are not - # real columns — remove them and close the gap between neighbors. 
- min_real_col_w = max(20, int(content_w * 0.03)) - filtered_geoms = [g for g in geometries - if not (g.word_count < 3 and g.width < min_real_col_w)] - if len(filtered_geoms) < len(geometries): - n_removed = len(geometries) - len(filtered_geoms) - logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) " - f"(width < {min_real_col_w}px and words < 3)") - # Extend each remaining column to close gaps with its right neighbor - for i, g in enumerate(filtered_geoms): - if i + 1 < len(filtered_geoms): - g.width = filtered_geoms[i + 1].x - g.x - else: - g.width = w - g.x - g.index = i - col_left_rel = g.x - left_x - col_right_rel = col_left_rel + g.width - g.words = [w for w in word_dicts - if col_left_rel <= w['left'] < col_right_rel] - g.word_count = len(g.words) - geometries = filtered_geoms - logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: " - f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}") - - return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv) - - -def expand_narrow_columns( - geometries: List[ColumnGeometry], - content_w: int, - left_x: int, - word_dicts: List[Dict], -) -> List[ColumnGeometry]: - """Expand narrow columns into adjacent whitespace gaps. - - Narrow columns (marker, page_ref, < 10% content width) often lose - content at image edges due to residual shear. This expands them toward - the neighbouring column, but never past 40% of the gap or past the - nearest word in the neighbour. - - Must be called AFTER _detect_sub_columns() so that sub-column splits - (which create the narrowest columns) have already happened. 
- """ - _NARROW_THRESHOLD_PCT = 10.0 - _MIN_WORD_MARGIN = 4 - - if len(geometries) < 2: - return geometries - - logger.info("ExpandNarrowCols: input %d cols: %s", - len(geometries), - [(i, g.x, g.width, round(g.width / content_w * 100, 1)) - for i, g in enumerate(geometries)]) - - for i, g in enumerate(geometries): - col_pct = g.width / content_w * 100 if content_w > 0 else 100 - if col_pct >= _NARROW_THRESHOLD_PCT: - continue - - expanded = False - orig_pct = col_pct - - # --- try expanding to the LEFT --- - if i > 0: - left_nb = geometries[i - 1] - # Gap can be 0 if sub-column split created adjacent columns. - # In that case, look at where the neighbor's rightmost words - # actually are — there may be unused space we can claim. - nb_words_right = [wd['left'] + wd.get('width', 0) - for wd in left_nb.words] - if nb_words_right: - rightmost_word_abs = left_x + max(nb_words_right) - safe_left_abs = rightmost_word_abs + _MIN_WORD_MARGIN - else: - # No words in neighbor → we can take up to neighbor's start - safe_left_abs = left_nb.x + _MIN_WORD_MARGIN - - if safe_left_abs < g.x: - g.width += (g.x - safe_left_abs) - g.x = safe_left_abs - expanded = True - - # --- try expanding to the RIGHT --- - if i + 1 < len(geometries): - right_nb = geometries[i + 1] - nb_words_left = [wd['left'] for wd in right_nb.words] - if nb_words_left: - leftmost_word_abs = left_x + min(nb_words_left) - safe_right_abs = leftmost_word_abs - _MIN_WORD_MARGIN - else: - safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN - - cur_right = g.x + g.width - if safe_right_abs > cur_right: - g.width = safe_right_abs - g.x - expanded = True - - if expanded: - col_left_rel = g.x - left_x - col_right_rel = col_left_rel + g.width - g.words = [wd for wd in word_dicts - if col_left_rel <= wd['left'] < col_right_rel] - g.word_count = len(g.words) - g.width_ratio = g.width / content_w if content_w > 0 else 0.0 - logger.info( - "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d", - i, 
orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count) - - # --- Shrink overlapping neighbors to match new boundaries --- - # Left neighbor: its right edge must not exceed our new left edge - if i > 0: - left_nb = geometries[i - 1] - nb_right = left_nb.x + left_nb.width - if nb_right > g.x: - left_nb.width = g.x - left_nb.x - if left_nb.width < 0: - left_nb.width = 0 - left_nb.width_ratio = left_nb.width / content_w if content_w > 0 else 0.0 - # Re-assign words - nb_left_rel = left_nb.x - left_x - nb_right_rel = nb_left_rel + left_nb.width - left_nb.words = [wd for wd in word_dicts - if nb_left_rel <= wd['left'] < nb_right_rel] - left_nb.word_count = len(left_nb.words) - - # Right neighbor: its left edge must not be before our new right edge - if i + 1 < len(geometries): - right_nb = geometries[i + 1] - my_right = g.x + g.width - if right_nb.x < my_right: - old_right_edge = right_nb.x + right_nb.width - right_nb.x = my_right - right_nb.width = old_right_edge - right_nb.x - if right_nb.width < 0: - right_nb.width = 0 - right_nb.width_ratio = right_nb.width / content_w if content_w > 0 else 0.0 - # Re-assign words - nb_left_rel = right_nb.x - left_x - nb_right_rel = nb_left_rel + right_nb.width - right_nb.words = [wd for wd in word_dicts - if nb_left_rel <= wd['left'] < nb_right_rel] - right_nb.word_count = len(right_nb.words) - - return geometries - - -# ============================================================================= -# Row Geometry Detection (horizontal whitespace-gap analysis) -# ============================================================================= - -def detect_row_geometry( - inv: np.ndarray, - word_dicts: List[Dict], - left_x: int, right_x: int, - top_y: int, bottom_y: int, -) -> List['RowGeometry']: - """Detect row geometry using horizontal whitespace-gap analysis. - - Mirrors the vertical gap approach used for columns, but operates on - horizontal projection profiles to find gaps between text lines. 
- Also classifies header/footer rows based on gap size. - - Args: - inv: Inverted binarized image (white text on black bg, full page). - word_dicts: Word bounding boxes from Tesseract (relative to content ROI). - left_x, right_x: Absolute X bounds of the content area. - top_y, bottom_y: Absolute Y bounds of the content area. - - Returns: - List of RowGeometry objects sorted top to bottom. - """ - content_w = right_x - left_x - content_h = bottom_y - top_y - - if content_h < 10 or content_w < 10: - logger.warning("detect_row_geometry: content area too small") - return [] - - # --- Step 1: Horizontal projection profile (text-only, images masked out) --- - content_strip = inv[top_y:bottom_y, left_x:right_x] - - # Build a word-coverage mask so that image regions (high ink density but no - # Tesseract words) are ignored. Only pixels within/near word bounding boxes - # contribute to the projection. This prevents large illustrations from - # merging multiple vocabulary rows into one. - WORD_PAD_Y = max(4, content_h // 300) # small vertical padding around words - word_mask = np.zeros((content_h, content_w), dtype=np.uint8) - for wd in word_dicts: - y1 = max(0, wd['top'] - WORD_PAD_Y) - y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y) - x1 = max(0, wd['left']) - x2 = min(content_w, wd['left'] + wd['width']) - word_mask[y1:y2, x1:x2] = 255 - - masked_strip = cv2.bitwise_and(content_strip, word_mask) - h_proj = np.sum(masked_strip, axis=1).astype(float) - h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj - - # --- Step 2: Smoothing + threshold --- - kernel_size = max(3, content_h // 200) - if kernel_size % 2 == 0: - kernel_size += 1 - h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') - - median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01 - gap_threshold = max(median_density * 0.15, 0.003) - - in_gap = h_smooth < gap_threshold - MIN_GAP_HEIGHT = max(3, content_h // 500) - 
- # --- Step 3: Collect contiguous gap regions --- - raw_gaps = [] # (start_y_rel, end_y_rel) relative to content ROI - gap_start = None - for y in range(len(in_gap)): - if in_gap[y]: - if gap_start is None: - gap_start = y - else: - if gap_start is not None: - gap_height = y - gap_start - if gap_height >= MIN_GAP_HEIGHT: - raw_gaps.append((gap_start, y)) - gap_start = None - if gap_start is not None: - gap_height = len(in_gap) - gap_start - if gap_height >= MIN_GAP_HEIGHT: - raw_gaps.append((gap_start, len(in_gap))) - - logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, " - f"min_height={MIN_GAP_HEIGHT}px)") - - # --- Step 4: Validate gaps against word bounding boxes --- - validated_gaps = [] - for gap_start_rel, gap_end_rel in raw_gaps: - overlapping = False - for wd in word_dicts: - word_top = wd['top'] - word_bottom = wd['top'] + wd['height'] - if word_top < gap_end_rel and word_bottom > gap_start_rel: - overlapping = True - break - - if not overlapping: - validated_gaps.append((gap_start_rel, gap_end_rel)) - else: - # Try to shift the gap to avoid overlapping words - min_word_top = content_h - max_word_bottom = 0 - for wd in word_dicts: - word_top = wd['top'] - word_bottom = wd['top'] + wd['height'] - if word_top < gap_end_rel and word_bottom > gap_start_rel: - min_word_top = min(min_word_top, word_top) - max_word_bottom = max(max_word_bottom, word_bottom) - - if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT: - validated_gaps.append((gap_start_rel, min_word_top)) - elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT: - validated_gaps.append((max_word_bottom, gap_end_rel)) - else: - logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] " - f"discarded (word overlap, no room to shift)") - - logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation") - - # --- Fallback if too few gaps --- - if len(validated_gaps) < 2: - logger.info("RowGeometry: < 2 gaps found, falling back to word grouping") - 
return _build_rows_from_word_grouping( - word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, - ) - - validated_gaps.sort(key=lambda g: g[0]) - - # --- Step 5: Header/footer detection via gap size --- - HEADER_FOOTER_ZONE = 0.15 - GAP_MULTIPLIER = 2.0 - - gap_sizes = [g[1] - g[0] for g in validated_gaps] - median_gap = float(np.median(gap_sizes)) if gap_sizes else 0 - large_gap_threshold = median_gap * GAP_MULTIPLIER - - header_boundary_rel = None # y below which is header - footer_boundary_rel = None # y above which is footer - - header_zone_limit = int(content_h * HEADER_FOOTER_ZONE) - footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE)) - - # Find largest gap in header zone - best_header_gap = None - for gs, ge in validated_gaps: - gap_mid = (gs + ge) / 2 - gap_size = ge - gs - if gap_mid < header_zone_limit and gap_size > large_gap_threshold: - if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]): - best_header_gap = (gs, ge) - - if best_header_gap is not None: - header_boundary_rel = best_header_gap[1] - logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} " - f"(gap={best_header_gap[1] - best_header_gap[0]}px, " - f"median_gap={median_gap:.0f}px)") - - # Find largest gap in footer zone - best_footer_gap = None - for gs, ge in validated_gaps: - gap_mid = (gs + ge) / 2 - gap_size = ge - gs - if gap_mid > footer_zone_start and gap_size > large_gap_threshold: - if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]): - best_footer_gap = (gs, ge) - - if best_footer_gap is not None: - footer_boundary_rel = best_footer_gap[0] - logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} " - f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)") - - # --- Step 6: Build RowGeometry objects from gaps --- - # Rows are the spans between gaps - row_boundaries = [] # (start_y_rel, end_y_rel) - - # Top of content to first gap - if validated_gaps[0][0] > 
MIN_GAP_HEIGHT: - row_boundaries.append((0, validated_gaps[0][0])) - - # Between gaps - for i in range(len(validated_gaps) - 1): - row_start = validated_gaps[i][1] - row_end = validated_gaps[i + 1][0] - if row_end - row_start > 0: - row_boundaries.append((row_start, row_end)) - - # Last gap to bottom of content - if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT: - row_boundaries.append((validated_gaps[-1][1], content_h)) - - rows = [] - for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries): - # Determine row type - row_mid = (row_start_rel + row_end_rel) / 2 - if header_boundary_rel is not None and row_mid < header_boundary_rel: - row_type = 'header' - elif footer_boundary_rel is not None and row_mid > footer_boundary_rel: - row_type = 'footer' - else: - row_type = 'content' - - # Collect words in this row - row_words = [w for w in word_dicts - if w['top'] + w['height'] / 2 >= row_start_rel - and w['top'] + w['height'] / 2 < row_end_rel] - - # Gap before this row - gap_before = 0 - if idx == 0 and validated_gaps[0][0] > 0: - gap_before = validated_gaps[0][0] - elif idx > 0: - # Find the gap just before this row boundary - for gs, ge in validated_gaps: - if ge == row_start_rel: - gap_before = ge - gs - break - - rows.append(RowGeometry( - index=idx, - x=left_x, - y=top_y + row_start_rel, - width=content_w, - height=row_end_rel - row_start_rel, - word_count=len(row_words), - words=row_words, - row_type=row_type, - gap_before=gap_before, - )) - - # --- Step 7: Word-center grid regularization --- - # Derive precise row boundaries from word vertical centers. Detects - # section breaks (headings, paragraphs) and builds per-section grids. 
- rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y, - content_w, content_h, inv) - - type_counts = {} - for r in rows: - type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1 - logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}") - - return rows - - -def _regularize_row_grid( - rows: List['RowGeometry'], - word_dicts: List[Dict], - left_x: int, right_x: int, - top_y: int, - content_w: int, content_h: int, - inv: np.ndarray, -) -> List['RowGeometry']: - """Rebuild row boundaries from word center-lines with section-break awareness. - - Instead of overlaying a rigid grid, this derives row positions bottom-up - from the words themselves: - - 1. Group words into line clusters (by Y proximity). - 2. For each cluster compute center_y (median of word vertical centers) - and letter_height (median of word heights). - 3. Compute the pitch (distance between consecutive centers). - 4. Detect section breaks where the gap is >1.8× the median pitch - (headings, sub-headings, paragraph breaks). - 5. Within each section, use the local pitch to place row boundaries - at the midpoints between consecutive centers. - 6. Validate that ≥85% of words land in a grid row; otherwise fall back. - - Header/footer rows from the gap-based detection are preserved. 
- """ - content_rows = [r for r in rows if r.row_type == 'content'] - non_content = [r for r in rows if r.row_type != 'content'] - - if len(content_rows) < 5: - return rows - - # --- Step A: Group ALL words into line clusters --- - # Collect words that belong to content rows (deduplicated) - content_words: List[Dict] = [] - seen_keys: set = set() - for r in content_rows: - for w in r.words: - key = (w['left'], w['top'], w['width'], w['height']) - if key not in seen_keys: - seen_keys.add(key) - content_words.append(w) - - if len(content_words) < 5: - return rows - - # Compute median word height (excluding outliers like tall brackets/IPA) - word_heights = sorted(w['height'] for w in content_words) - median_wh = word_heights[len(word_heights) // 2] - - # Compute median gap-based row height — this is the actual line height - # as detected by the horizontal projection. We use 40% of this as - # grouping tolerance. This is much more reliable than using word height - # alone, because words on the same line can have very different heights - # (e.g. lowercase vs uppercase, brackets, phonetic symbols). - gap_row_heights = sorted(r.height for r in content_rows) - median_row_h = gap_row_heights[len(gap_row_heights) // 2] - - # Tolerance: 40% of row height. Words on the same line should have - # centers within this range. Even if a word's bbox is taller/shorter, - # its center should stay within half a row height of the line center. 
- y_tol = max(10, int(median_row_h * 0.4)) - - # Sort by center_y, then group by proximity - words_by_center = sorted(content_words, - key=lambda w: (w['top'] + w['height'] / 2, w['left'])) - line_clusters: List[List[Dict]] = [] - current_line: List[Dict] = [words_by_center[0]] - current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2 - - for w in words_by_center[1:]: - w_center = w['top'] + w['height'] / 2 - if abs(w_center - current_center) <= y_tol: - current_line.append(w) - else: - current_line.sort(key=lambda w: w['left']) - line_clusters.append(current_line) - current_line = [w] - current_center = w_center - - if current_line: - current_line.sort(key=lambda w: w['left']) - line_clusters.append(current_line) - - if len(line_clusters) < 3: - return rows - - # --- Step B: Compute center_y per cluster --- - # center_y = median of (word_top + word_height/2) across all words in cluster - # letter_h = median of word heights, but excluding outlier-height words - # (>2× median) so that tall brackets/IPA don't skew the height - cluster_info: List[Dict] = [] - for cl_words in line_clusters: - centers = [w['top'] + w['height'] / 2 for w in cl_words] - # Filter outlier heights for letter_h computation - normal_heights = [w['height'] for w in cl_words - if w['height'] <= median_wh * 2.0] - if not normal_heights: - normal_heights = [w['height'] for w in cl_words] - center_y = float(np.median(centers)) - letter_h = float(np.median(normal_heights)) - cluster_info.append({ - 'center_y_rel': center_y, # relative to content ROI - 'center_y_abs': center_y + top_y, # absolute - 'letter_h': letter_h, - 'words': cl_words, - }) - - cluster_info.sort(key=lambda c: c['center_y_rel']) - - # --- Step B2: Merge clusters that are too close together --- - # Even with center-based grouping, some edge cases can produce - # spurious clusters. Merge any pair whose centers are closer - # than 30% of the row height (they're definitely the same text line). 
- merge_threshold = max(8, median_row_h * 0.3) - merged: List[Dict] = [cluster_info[0]] - for cl in cluster_info[1:]: - prev = merged[-1] - if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold: - # Merge: combine words, recompute center - combined_words = prev['words'] + cl['words'] - centers = [w['top'] + w['height'] / 2 for w in combined_words] - normal_heights = [w['height'] for w in combined_words - if w['height'] <= median_wh * 2.0] - if not normal_heights: - normal_heights = [w['height'] for w in combined_words] - prev['center_y_rel'] = float(np.median(centers)) - prev['center_y_abs'] = prev['center_y_rel'] + top_y - prev['letter_h'] = float(np.median(normal_heights)) - prev['words'] = combined_words - else: - merged.append(cl) - - cluster_info = merged - - if len(cluster_info) < 3: - return rows - - # --- Step C: Compute pitches and detect section breaks --- - pitches: List[float] = [] - for i in range(1, len(cluster_info)): - pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel'] - pitches.append(pitch) - - if not pitches: - return rows - - median_pitch = float(np.median(pitches)) - if median_pitch <= 5: - return rows - - # A section break is where the gap between line centers is much larger - # than the normal pitch (sub-headings, section titles, etc.) 
- BREAK_FACTOR = 1.8 - - # --- Step D: Build sections (groups of consecutive lines with normal spacing) --- - sections: List[List[Dict]] = [] - current_section: List[Dict] = [cluster_info[0]] - - for i in range(1, len(cluster_info)): - gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel'] - if gap > median_pitch * BREAK_FACTOR: - sections.append(current_section) - current_section = [cluster_info[i]] - else: - current_section.append(cluster_info[i]) - - if current_section: - sections.append(current_section) - - # --- Step E: Build row boundaries per section --- - grid_rows: List[RowGeometry] = [] - - for section in sections: - if not section: - continue - - if len(section) == 1: - # Single-line section (likely a heading) - cl = section[0] - half_h = max(cl['letter_h'], median_pitch * 0.4) - row_top = cl['center_y_abs'] - half_h - row_bot = cl['center_y_abs'] + half_h - grid_rows.append(RowGeometry( - index=0, - x=left_x, - y=round(row_top), - width=content_w, - height=round(row_bot - row_top), - word_count=len(cl['words']), - words=cl['words'], - row_type='content', - gap_before=0, - )) - continue - - # Compute local pitch for this section - local_pitches = [] - for i in range(1, len(section)): - local_pitches.append( - section[i]['center_y_rel'] - section[i - 1]['center_y_rel'] - ) - local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch - - # Row boundaries are placed at midpoints between consecutive centers. 
- # First row: top = center - local_pitch/2 - # Last row: bottom = center + local_pitch/2 - for i, cl in enumerate(section): - if i == 0: - row_top = cl['center_y_abs'] - local_pitch / 2 - else: - # Midpoint between this center and previous center - prev_center = section[i - 1]['center_y_abs'] - row_top = (prev_center + cl['center_y_abs']) / 2 - - if i == len(section) - 1: - row_bot = cl['center_y_abs'] + local_pitch / 2 - else: - next_center = section[i + 1]['center_y_abs'] - row_bot = (cl['center_y_abs'] + next_center) / 2 - - # Clamp to reasonable bounds - row_top = max(top_y, row_top) - row_bot = min(top_y + content_h, row_bot) - - if row_bot - row_top < 5: - continue - - grid_rows.append(RowGeometry( - index=0, - x=left_x, - y=round(row_top), - width=content_w, - height=round(row_bot - row_top), - word_count=len(cl['words']), - words=cl['words'], - row_type='content', - gap_before=0, - )) - - if not grid_rows: - return rows - - # --- Step F: Re-assign words to grid rows --- - # Words may have shifted slightly; assign each word to the row whose - # center is closest to the word's vertical center. 
- for gr in grid_rows: - gr.words = [] - - for w in content_words: - w_center = w['top'] + top_y + w['height'] / 2 - best_row = None - best_dist = float('inf') - for gr in grid_rows: - row_center = gr.y + gr.height / 2 - dist = abs(w_center - row_center) - if dist < best_dist: - best_dist = dist - best_row = gr - if best_row is not None and best_dist < median_pitch: - best_row.words.append(w) - - for gr in grid_rows: - gr.word_count = len(gr.words) - - # --- Step G: Validate --- - words_placed = sum(gr.word_count for gr in grid_rows) - if len(content_words) > 0: - match_ratio = words_placed / len(content_words) - if match_ratio < 0.85: - logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} " - f"of words, keeping gap-based rows") - return rows - - # Remove empty grid rows (no words assigned) - grid_rows = [gr for gr in grid_rows if gr.word_count > 0] - - # --- Step H: Merge header/footer + re-index --- - result = list(non_content) + grid_rows - result.sort(key=lambda r: r.y) - for i, r in enumerate(result): - r.index = i - - row_heights = [gr.height for gr in grid_rows] - min_h = min(row_heights) if row_heights else 0 - max_h = max(row_heights) if row_heights else 0 - logger.info(f"RowGrid: word-center grid applied " - f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, " - f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, " - f"{len(sections)} sections, " - f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], " - f"was {len(content_rows)} gap-based rows)") - - return result - - -def _build_rows_from_word_grouping( - word_dicts: List[Dict], - left_x: int, right_x: int, - top_y: int, bottom_y: int, - content_w: int, content_h: int, -) -> List['RowGeometry']: - """Fallback: build rows by grouping words by Y position. - - Uses _group_words_into_lines() with a generous tolerance. - No header/footer detection in fallback mode. 
- """ - if not word_dicts: - return [] - - y_tolerance = max(20, content_h // 100) - lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance) - - rows = [] - for idx, line_words in enumerate(lines): - if not line_words: - continue - min_top = min(w['top'] for w in line_words) - max_bottom = max(w['top'] + w['height'] for w in line_words) - row_height = max_bottom - min_top - - rows.append(RowGeometry( - index=idx, - x=left_x, - y=top_y + min_top, - width=content_w, - height=row_height, - word_count=len(line_words), - words=line_words, - row_type='content', - gap_before=0, - )) - - logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping") - return rows - - -# --- Phase B: Content-Based Classification --- - -def _score_language(words: List[Dict]) -> Dict[str, float]: - """Score the language of a column's words. - - Analyzes function words, umlauts, and capitalization patterns - to determine whether text is English or German. - - Args: - words: List of word dicts with 'text' and 'conf' keys. - - Returns: - Dict with 'eng' and 'deu' scores (0.0-1.0). 
- """ - if not words: - return {'eng': 0.0, 'deu': 0.0} - - # Only consider words with decent confidence - good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0] - if not good_words: - return {'eng': 0.0, 'deu': 0.0} - - total = len(good_words) - en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS) - de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS) - - # Check for umlauts (strong German signal) - raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40] - umlaut_count = sum(1 for t in raw_texts - for c in t if c in 'äöüÄÖÜß') - - # German capitalization: nouns are capitalized mid-sentence - # Count words that start with uppercase but aren't at position 0 - cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2) - - en_score = en_hits / total if total > 0 else 0.0 - de_score = de_hits / total if total > 0 else 0.0 - - # Boost German score for umlauts - if umlaut_count > 0: - de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5)) - - # Boost German score for high capitalization ratio (typical for German nouns) - if total > 5: - cap_ratio = cap_words / total - if cap_ratio > 0.3: - de_score = min(1.0, de_score + 0.1) - - return {'eng': round(en_score, 3), 'deu': round(de_score, 3)} - - -def _score_role(geom: ColumnGeometry) -> Dict[str, float]: - """Score the role of a column based on its geometry and content patterns. - - Args: - geom: ColumnGeometry with words and dimensions. - - Returns: - Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'. 
- """ - scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0} - - if not geom.words: - return scores - - texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40] - if not texts: - return scores - - avg_word_len = sum(len(t) for t in texts) / len(texts) - has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,')) - digit_words = sum(1 for t in texts if any(c.isdigit() for c in t)) - digit_ratio = digit_words / len(texts) if texts else 0.0 - - # Reference: narrow + mostly numbers/page references - if geom.width_ratio < 0.12: - scores['reference'] = 0.5 - if digit_ratio > 0.4: - scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5) - - # Marker: narrow + few short entries - if geom.width_ratio < 0.06 and geom.word_count <= 15: - scores['marker'] = 0.7 - if avg_word_len < 4: - scores['marker'] = 0.9 - # Very narrow non-edge column → strong marker regardless of word count - if geom.width_ratio < 0.04 and geom.index > 0: - scores['marker'] = max(scores['marker'], 0.9) - - # Sentence: longer words + punctuation present - if geom.width_ratio > 0.15 and has_punctuation > 2: - scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts)) - if avg_word_len > 4: - scores['sentence'] = min(1.0, scores['sentence'] + 0.2) - - # Vocabulary: medium width + medium word length - if 0.10 < geom.width_ratio < 0.45: - scores['vocabulary'] = 0.4 - if 3 < avg_word_len < 8: - scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3) - - return {k: round(v, 3) for k, v in scores.items()} - - -def _build_margin_regions( - all_regions: List[PageRegion], - left_x: int, - right_x: int, - img_w: int, - top_y: int, - content_h: int, -) -> List[PageRegion]: - """Create margin_left / margin_right PageRegions from content bounds. - - Margins represent the space between the image edge and the first/last - content column. They are used downstream for faithful page - reconstruction but are skipped during OCR. 
- """ - margins: List[PageRegion] = [] - # Minimum gap (px) to create a margin region - _min_gap = 5 - - if left_x > _min_gap: - margins.append(PageRegion( - type='margin_left', x=0, y=top_y, - width=left_x, height=content_h, - classification_confidence=1.0, - classification_method='content_bounds', - )) - - # Right margin: from end of last content column to image edge - non_margin = [r for r in all_regions - if r.type not in ('margin_left', 'margin_right', 'header', 'footer', - 'margin_top', 'margin_bottom')] - if non_margin: - last_col_end = max(r.x + r.width for r in non_margin) - else: - last_col_end = right_x - if img_w - last_col_end > _min_gap: - margins.append(PageRegion( - type='margin_right', x=last_col_end, y=top_y, - width=img_w - last_col_end, height=content_h, - classification_confidence=1.0, - classification_method='content_bounds', - )) - - if margins: - logger.info(f"Margins: {[(m.type, m.x, m.width) for m in margins]} " - f"(left_x={left_x}, right_x={right_x}, img_w={img_w})") - - return margins - - -def positional_column_regions( - geometries: List[ColumnGeometry], - content_w: int, - content_h: int, - left_x: int, -) -> List[PageRegion]: - """Classify columns by position only (no language scoring). - - Structural columns (page_ref, column_marker) are identified by geometry. - Remaining content columns are labelled left→right as column_en, column_de, - column_example. The names are purely positional – no language analysis. 
- """ - structural: List[PageRegion] = [] - content_cols: List[ColumnGeometry] = [] - - for g in geometries: - rel_x = g.x - left_x - # page_ref: narrow column in the leftmost 20% region - if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20: - structural.append(PageRegion( - type='page_ref', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='positional', - )) - # column_marker: very narrow, few words - elif g.width_ratio < 0.06 and g.word_count <= 15: - structural.append(PageRegion( - type='column_marker', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='positional', - )) - # empty or near-empty narrow column → treat as margin/structural - elif g.word_count <= 2 and g.width_ratio < 0.15: - structural.append(PageRegion( - type='column_marker', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.85, - classification_method='positional', - )) - else: - content_cols.append(g) - - # Single content column → plain text page - if len(content_cols) == 1: - g = content_cols[0] - return structural + [PageRegion( - type='column_text', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.9, - classification_method='positional', - )] - - # No content columns - if not content_cols: - return structural - - # Sort content columns left→right and assign positional labels - content_cols.sort(key=lambda g: g.x) - - # With exactly 2 content columns: if the left one is very wide (>35%), - # it likely contains EN+DE combined, so the right one is examples. 
- if (len(content_cols) == 2 - and content_cols[0].width_ratio > 0.35 - and content_cols[1].width_ratio > 0.20): - labels = ['column_en', 'column_example'] - else: - labels = ['column_en', 'column_de', 'column_example'] - - regions = list(structural) - for i, g in enumerate(content_cols): - label = labels[i] if i < len(labels) else 'column_example' - regions.append(PageRegion( - type=label, x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='positional', - )) - - logger.info(f"PositionalColumns: {len(structural)} structural, " - f"{len(content_cols)} content → " - f"{[r.type for r in regions]}") - return regions - - -def classify_column_types(geometries: List[ColumnGeometry], - content_w: int, - top_y: int, - img_w: int, - img_h: int, - bottom_y: int, - left_x: int = 0, - right_x: int = 0, - inv: Optional[np.ndarray] = None) -> List[PageRegion]: - """Classify column types using a 3-level fallback chain. - - Level 1: Content-based (language + role scoring) - Level 2: Position + language (old rules enhanced with language detection) - Level 3: Pure position (exact old code, no regression) - - Args: - geometries: List of ColumnGeometry from Phase A. - content_w: Total content width. - top_y: Top Y of content area. - img_w: Full image width. - img_h: Full image height. - bottom_y: Bottom Y of content area. - left_x: Left content bound (from _find_content_bounds). - right_x: Right content bound (from _find_content_bounds). - - Returns: - List of PageRegion with types, confidence, and method. 
- """ - content_h = bottom_y - top_y - - def _with_margins(result: List[PageRegion]) -> List[PageRegion]: - """Append margin_left / margin_right regions to *result*.""" - margins = _build_margin_regions(result, left_x, right_x, img_w, top_y, content_h) - return result + margins - - # Special case: single column → plain text page - if len(geometries) == 1: - geom = geometries[0] - return _with_margins([PageRegion( - type='column_text', x=geom.x, y=geom.y, - width=geom.width, height=geom.height, - classification_confidence=0.9, - classification_method='content', - )]) - - # --- Pre-filter: first/last columns with very few words → column_ignore --- - # Sub-columns from _detect_sub_columns() are exempt: they intentionally - # have few words (page refs, markers) and should not be discarded. - ignore_regions = [] - active_geometries = [] - for idx, g in enumerate(geometries): - if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column: - ignore_regions.append(PageRegion( - type='column_ignore', x=g.x, y=g.y, - width=g.width, height=content_h, - classification_confidence=0.95, - classification_method='content', - )) - logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)") - else: - active_geometries.append(g) - - # Re-index active geometries for classification - for new_idx, g in enumerate(active_geometries): - g.index = new_idx - geometries = active_geometries - - # Handle edge case: all columns ignored or only 1 left - if len(geometries) == 0: - return _with_margins(ignore_regions) - if len(geometries) == 1: - geom = geometries[0] - ignore_regions.append(PageRegion( - type='column_text', x=geom.x, y=geom.y, - width=geom.width, height=geom.height, - classification_confidence=0.9, - classification_method='content', - )) - return _with_margins(ignore_regions) - - # --- Score all columns --- - lang_scores = [_score_language(g.words) for g in geometries] - role_scores = [_score_role(g) 
for g in geometries] - - logger.info(f"ClassifyColumns: language scores: " - f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}") - logger.info(f"ClassifyColumns: role scores: " - f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}") - - # --- Level 1: Content-based classification --- - regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h) - if regions is not None: - logger.info("ClassifyColumns: Level 1 (content-based) succeeded") - _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv) - return _with_margins(ignore_regions + regions) - - # --- Level 2: Position + language enhanced --- - regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h) - if regions is not None: - logger.info("ClassifyColumns: Level 2 (position+language) succeeded") - _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv) - return _with_margins(ignore_regions + regions) - - # --- Level 3: Pure position fallback (old code, no regression) --- - logger.info("ClassifyColumns: Level 3 (position fallback)") - regions = _classify_by_position_fallback(geometries, content_w, content_h) - _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv) - return _with_margins(ignore_regions + regions) - - -def _classify_by_content(geometries: List[ColumnGeometry], - lang_scores: List[Dict[str, float]], - role_scores: List[Dict[str, float]], - content_w: int, - content_h: int) -> Optional[List[PageRegion]]: - """Level 1: Classify columns purely by content analysis. - - Requires clear language signals to distinguish EN/DE columns. - Returns None if language signals are too weak. 
- """ - regions = [] - assigned = set() - - # Step 1: Assign structural roles first (reference, marker) - # left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref - left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0 - - for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)): - is_left_side = geom.x < left_20_threshold - has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3 - if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language: - regions.append(PageRegion( - type='page_ref', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=rs['reference'], - classification_method='content', - )) - assigned.add(i) - elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06: - regions.append(PageRegion( - type='column_marker', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=rs['marker'], - classification_method='content', - )) - assigned.add(i) - elif geom.width_ratio < 0.05 and not is_left_side: - # Narrow column on the right side → marker, not page_ref - regions.append(PageRegion( - type='column_marker', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=0.8, - classification_method='content', - )) - assigned.add(i) - - # Step 2: Among remaining columns, find EN and DE by language scores - remaining = [(i, geometries[i], lang_scores[i], role_scores[i]) - for i in range(len(geometries)) if i not in assigned] - - if len(remaining) < 2: - # Not enough columns for EN/DE pair - if len(remaining) == 1: - i, geom, ls, rs = remaining[0] - regions.append(PageRegion( - type='column_text', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=0.6, - classification_method='content', - )) - regions.sort(key=lambda r: r.x) - return regions - - # Check if we have enough language signal - en_candidates = [(i, g, ls) for i, g, ls, rs in 
remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05] - de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05] - - # Position tiebreaker: when language signals are weak, use left=EN, right=DE - if (not en_candidates or not de_candidates) and len(remaining) >= 2: - max_eng = max(ls['eng'] for _, _, ls, _ in remaining) - max_deu = max(ls['deu'] for _, _, ls, _ in remaining) - if max_eng < 0.15 and max_deu < 0.15: - # Both signals weak — fall back to positional: left=EN, right=DE - sorted_remaining = sorted(remaining, key=lambda x: x[1].x) - best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2]) - best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2]) - logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE") - en_conf = 0.4 - de_conf = 0.4 - - regions.append(PageRegion( - type='column_en', x=best_en[1].x, y=best_en[1].y, - width=best_en[1].width, height=content_h, - classification_confidence=en_conf, - classification_method='content', - )) - assigned.add(best_en[0]) - - regions.append(PageRegion( - type='column_de', x=best_de[1].x, y=best_de[1].y, - width=best_de[1].width, height=content_h, - classification_confidence=de_conf, - classification_method='content', - )) - assigned.add(best_de[0]) - - # Assign remaining as example - for i, geom, ls, rs in remaining: - if i not in assigned: - regions.append(PageRegion( - type='column_example', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=0.4, - classification_method='content', - )) - regions.sort(key=lambda r: r.x) - return regions - - if not en_candidates or not de_candidates: - # Language signals too weak for content-based classification - logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split") - return None - - # Pick the best EN and DE candidates - best_en = max(en_candidates, key=lambda x: x[2]['eng']) - 
    best_de = max(de_candidates, key=lambda x: x[2]['deu'])

    # Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
    # Example sentences contain English function words ("the", "a", "is") which inflate
    # the eng score of the Example column. When the best EN candidate sits to the RIGHT
    # of the DE column and there is another EN candidate to the LEFT, prefer the left one
    # — it is almost certainly the real vocabulary column.
    if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
        left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
        if left_of_de:
            alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
            logger.info(
                f"ClassifyColumns: Level 1 position fix — best EN col {best_en[0]} "
                f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
                f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
            best_en = alt_en

    if best_en[0] == best_de[0]:
        # Same column scored highest for both — ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
        return None

    en_conf = best_en[2]['eng']
    de_conf = best_de[2]['deu']

    regions.append(PageRegion(
        type='column_en', x=best_en[1].x, y=best_en[1].y,
        width=best_en[1].width, height=content_h,
        classification_confidence=round(en_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_en[0])

    regions.append(PageRegion(
        type='column_de', x=best_de[1].x, y=best_de[1].y,
        width=best_de[1].width, height=content_h,
        classification_confidence=round(de_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_de[0])

    # Step 3: Remaining columns → example or text based on role scores
    for i, geom, ls, rs in remaining:
        if i in assigned:
            continue
        if rs['sentence'] > 0.4:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=round(rs['sentence'], 2),
                classification_method='content',
            ))
        else:
            # No sentence signal either — still labelled example, lower confidence.
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.5,
                classification_method='content',
            ))

    regions.sort(key=lambda r: r.x)
    return regions


def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                   lang_scores: List[Dict[str, float]],
                                   content_w: int,
                                   content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: Position-based rules enhanced with language confirmation.

    Uses the old positional heuristics but confirms EN/DE assignment
    with language scores (swapping if needed).
    """
    regions = []
    untyped = list(range(len(geometries)))
    first_x = geometries[0].x if geometries else 0
    left_20_threshold = first_x + content_w * 0.20

    # Rule 1: Leftmost narrow column → page_ref (only if in left 20%, no strong language)
    g0 = geometries[0]
    ls0 = lang_scores[0]
    has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
    if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
        regions.append(PageRegion(
            type='page_ref', x=g0.x, y=g0.y,
            width=g0.width, height=content_h,
            classification_confidence=0.8,
            classification_method='position_enhanced',
        ))
        untyped.remove(0)

    # Rule 2: Narrow columns with few words → marker
    # (iterate over a copy so untyped.remove() is safe)
    for i in list(untyped):
        geom = geometries[i]
        if geom.width_ratio < 0.06 and geom.word_count <= 15:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.7,
                classification_method='position_enhanced',
            ))
            untyped.remove(i)

    # Rule 3: Rightmost remaining → column_example (if 3+ remaining)
    if len(untyped) >= 3:
        last_idx = untyped[-1]
        geom = geometries[last_idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.7,
            classification_method='position_enhanced',
        ))
        untyped.remove(last_idx)

    # Rule 4: First two remaining → EN/DE, but check language to possibly swap
    if len(untyped) >= 2:
        idx_a = untyped[0]
        idx_b = untyped[1]
        ls_a = lang_scores[idx_a]
        ls_b = lang_scores[idx_b]

        # Default: first=EN, second=DE (old behavior)
        en_idx, de_idx = idx_a, idx_b
        conf = 0.7

        # Swap if language signals clearly indicate the opposite
        if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
            en_idx, de_idx = idx_b, idx_a
            conf = 0.85
            logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores")

        regions.append(PageRegion(
            type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
            width=geometries[en_idx].width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        ))
        regions.append(PageRegion(
            type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
            width=geometries[de_idx].width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        ))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        idx = untyped[0]
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_en', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.5,
            classification_method='position_enhanced',
        ))
        untyped = []

    # Remaining → example
    for idx in untyped:
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.5,
            classification_method='position_enhanced',
        ))

    regions.sort(key=lambda r: r.x)
    return regions


def _classify_by_position_fallback(geometries: List[ColumnGeometry],
                                   content_w: int,
                                   content_h: int) -> List[PageRegion]:
    """Level 3: Pure position-based fallback (identical to old code).

    Guarantees no regression from the previous behavior.
- """ - regions = [] - untyped = list(range(len(geometries))) - first_x = geometries[0].x if geometries else 0 - left_20_threshold = first_x + content_w * 0.20 - - # Rule 1: Leftmost narrow column → page_ref (only if in left 20%) - g0 = geometries[0] - if g0.width_ratio < 0.12 and g0.x < left_20_threshold: - regions.append(PageRegion( - type='page_ref', x=g0.x, y=g0.y, - width=g0.width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - untyped.remove(0) - - # Rule 2: Narrow + few words → marker - for i in list(untyped): - geom = geometries[i] - if geom.width_ratio < 0.06 and geom.word_count <= 15: - regions.append(PageRegion( - type='column_marker', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - untyped.remove(i) - - # Rule 3: Rightmost remaining → example (if 3+) - if len(untyped) >= 3: - last_idx = untyped[-1] - geom = geometries[last_idx] - regions.append(PageRegion( - type='column_example', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - untyped.remove(last_idx) - - # Rule 4: First remaining → EN, second → DE - if len(untyped) >= 2: - en_idx = untyped[0] - de_idx = untyped[1] - regions.append(PageRegion( - type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y, - width=geometries[en_idx].width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - regions.append(PageRegion( - type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y, - width=geometries[de_idx].width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - untyped = untyped[2:] - elif len(untyped) == 1: - idx = untyped[0] - geom = geometries[idx] - regions.append(PageRegion( - type='column_en', x=geom.x, y=geom.y, - width=geom.width, 
height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - untyped = [] - - for idx in untyped: - geom = geometries[idx] - regions.append(PageRegion( - type='column_example', x=geom.x, y=geom.y, - width=geom.width, height=content_h, - classification_confidence=1.0, - classification_method='position_fallback', - )) - - regions.sort(key=lambda r: r.x) - return regions - - -def _detect_header_footer_gaps( - inv: np.ndarray, - img_w: int, - img_h: int, -) -> Tuple[Optional[int], Optional[int]]: - """Detect header/footer boundaries via horizontal projection gap analysis. - - Scans the full-page inverted image for large horizontal gaps in the top/bottom - 20% that separate header/footer content from the main body. - - Returns: - (header_y, footer_y) — absolute y-coordinates. - header_y = bottom edge of header region (None if no header detected). - footer_y = top edge of footer region (None if no footer detected). - """ - HEADER_FOOTER_ZONE = 0.20 - GAP_MULTIPLIER = 2.0 - - # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding - actual_h = min(inv.shape[0], img_h) - roi = inv[:actual_h, :] - h_proj = np.sum(roi, axis=1).astype(float) - proj_w = roi.shape[1] - h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj - - # Step 2: Smoothing - kernel_size = max(3, actual_h // 200) - if kernel_size % 2 == 0: - kernel_size += 1 - h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same') - - # Step 3: Gap threshold - positive = h_smooth[h_smooth > 0] - median_density = float(np.median(positive)) if len(positive) > 0 else 0.01 - gap_threshold = max(median_density * 0.15, 0.003) - - in_gap = h_smooth < gap_threshold - MIN_GAP_HEIGHT = max(3, actual_h // 500) - - # Step 4: Collect contiguous gaps - raw_gaps: List[Tuple[int, int]] = [] - gap_start: Optional[int] = None - for y in range(len(in_gap)): - if in_gap[y]: - if gap_start is None: - gap_start = y - else: - if gap_start is not 
None: - gap_height = y - gap_start - if gap_height >= MIN_GAP_HEIGHT: - raw_gaps.append((gap_start, y)) - gap_start = None - if gap_start is not None: - gap_height = len(in_gap) - gap_start - if gap_height >= MIN_GAP_HEIGHT: - raw_gaps.append((gap_start, len(in_gap))) - - if not raw_gaps: - return None, None - - # Step 5: Compute median gap size and large-gap threshold - gap_sizes = [g[1] - g[0] for g in raw_gaps] - median_gap = float(np.median(gap_sizes)) - large_gap_threshold = median_gap * GAP_MULTIPLIER - - # Step 6: Find largest qualifying gap in header / footer zones - # A separator gap must have content on BOTH sides — edge-touching gaps - # (e.g. dewarp padding at bottom) are not valid separators. - EDGE_MARGIN = max(5, actual_h // 400) - header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE) - footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE)) - - header_y: Optional[int] = None - footer_y: Optional[int] = None - - best_header_size = 0 - for gs, ge in raw_gaps: - if gs <= EDGE_MARGIN: - continue # skip gaps touching the top edge - gap_mid = (gs + ge) / 2 - gap_size = ge - gs - if gap_mid < header_zone_limit and gap_size > large_gap_threshold: - if gap_size > best_header_size: - best_header_size = gap_size - header_y = ge # bottom edge of gap - - best_footer_size = 0 - for gs, ge in raw_gaps: - if ge >= actual_h - EDGE_MARGIN: - continue # skip gaps touching the bottom edge - gap_mid = (gs + ge) / 2 - gap_size = ge - gs - if gap_mid > footer_zone_start and gap_size > large_gap_threshold: - if gap_size > best_footer_size: - best_footer_size = gap_size - footer_y = gs # top edge of gap - - if header_y is not None: - logger.info(f"HeaderFooterGaps: header boundary at y={header_y} " - f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)") - if footer_y is not None: - logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} " - f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)") - - return header_y, footer_y - - -def 
_region_has_content(inv: np.ndarray, y_start: int, y_end: int, - min_density: float = 0.005) -> bool: - """Check whether a horizontal strip contains meaningful ink. - - Args: - inv: Inverted binarized image (white-on-black). - y_start: Top of the region (inclusive). - y_end: Bottom of the region (exclusive). - min_density: Fraction of white pixels required to count as content. - - Returns: - True if the region contains text/graphics, False if empty margin. - """ - if y_start >= y_end: - return False - strip = inv[y_start:y_end, :] - density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255) - return density > min_density - - -def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int, - img_w: int, img_h: int, - inv: Optional[np.ndarray] = None) -> None: - """Add header/footer/margin regions in-place. - - Uses gap-based detection when *inv* is provided, otherwise falls back - to simple top_y/bottom_y bounds. - - Region types depend on whether there is actual content (text/graphics): - - 'header' / 'footer' — region contains text (e.g. 
title, page number) - - 'margin_top' / 'margin_bottom' — region is empty page margin - """ - header_y: Optional[int] = None - footer_y: Optional[int] = None - - if inv is not None: - header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h) - - # --- Top region --- - top_boundary = header_y if header_y is not None and header_y > 10 else ( - top_y if top_y > 10 else None - ) - if top_boundary is not None: - has_content = inv is not None and _region_has_content(inv, 0, top_boundary) - rtype = 'header' if has_content else 'margin_top' - regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary)) - logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px " - f"(has_content={has_content})") - - # --- Bottom region --- - bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else ( - bottom_y if bottom_y < img_h - 10 else None - ) - if bottom_boundary is not None: - has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h) - rtype = 'footer' if has_content else 'margin_bottom' - regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w, - height=img_h - bottom_boundary)) - logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} " - f"height={img_h - bottom_boundary}px (has_content={has_content})") - - -# --- Main Entry Point --- - -def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]: - """Detect columns using two-phase approach: geometry then content classification. - - Phase A: detect_column_geometry() — clustering word positions into columns. - Phase B: classify_column_types() — content-based type assignment with fallback. - - Falls back to projection-based analyze_layout() if geometry detection fails. - - Args: - ocr_img: Binarized grayscale image for layout analysis. - dewarped_bgr: Original BGR image (for Tesseract word detection). 
    Returns:
        List of PageRegion objects with types, confidence, and method.
    """
    h, w = ocr_img.shape[:2]

    # Phase A: Geometry detection
    result = detect_column_geometry(ocr_img, dewarped_bgr)

    if result is None:
        # Fallback to projection-based layout
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        layout_img = create_layout_image(dewarped_bgr)
        return analyze_layout(layout_img, ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result
    content_w = right_x - left_x

    # Detect header/footer early so sub-column clustering ignores them
    header_y, footer_y = _detect_header_footer_gaps(_inv, w, h) if _inv is not None else (None, None)

    # Split sub-columns (e.g. page references) before classification
    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)

    # Split broad columns that contain EN+DE mixed via word-coverage gaps
    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)

    # Phase B: Positional classification (no language scoring)
    content_h = bottom_y - top_y
    regions = positional_column_regions(geometries, content_w, content_h, left_x)

    col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    methods = set(r.classification_method for r in regions if r.classification_method)
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")

    return regions


# =============================================================================
# Pipeline Step 5: Word Grid from Columns × Rows
# =============================================================================

def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
"""Group OCR words into visual lines in reading order. - - Returns a list of line strings (one per visual line in the cell). - """ - if not words: - return [] - - lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px) - return [' '.join(w['text'] for w in line) for line in lines] - - -def _rejoin_hyphenated(lines: List[str]) -> List[str]: - """Rejoin words split by line-break hyphenation. - - E.g. ['Fuß-', 'boden'] → ['Fußboden'] - ['some text-', 'thing here'] → ['something here'] - """ - if len(lines) <= 1: - return lines - - result = [] - i = 0 - while i < len(lines): - line = lines[i] - # If line ends with '-' and there's a next line, rejoin - if i + 1 < len(lines) and line.rstrip().endswith('-'): - stripped = line.rstrip() - # Get the word fragment before hyphen (last word) - prefix = stripped[:-1] # remove trailing hyphen - next_line = lines[i + 1] - # Join: last word of this line + first word of next line - prefix_words = prefix.rsplit(' ', 1) - next_words = next_line.split(' ', 1) - if len(prefix_words) > 1: - joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0] - else: - joined = prefix_words[0] + next_words[0] - remainder = next_words[1] if len(next_words) > 1 else '' - if remainder: - result.append(joined + ' ' + remainder) - else: - result.append(joined) - i += 2 - else: - result.append(line) - i += 1 - return result - - -def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str: - """Join OCR words into text in correct reading order, preserving line breaks. - - Groups words into visual lines by Y-tolerance, sorts each line by X, - rejoins hyphenated words, then joins lines with newlines. 
- """ - lines = _words_to_reading_order_lines(words, y_tolerance_px) - lines = _rejoin_hyphenated(lines) - return '\n'.join(lines) - - -# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) --- - -_rapid_engine = None -RAPIDOCR_AVAILABLE = False - -try: - from rapidocr import RapidOCR as _RapidOCRClass - from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType - RAPIDOCR_AVAILABLE = True - logger.info("RapidOCR available — can be used as alternative to Tesseract") -except ImportError: - logger.info("RapidOCR not installed — using Tesseract only") - - -def _get_rapid_engine(): - """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support.""" - global _rapid_engine - if _rapid_engine is None: - _rapid_engine = _RapidOCRClass(params={ - # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß) - "Rec.lang_type": _LangRec.LATIN, - "Rec.model_type": _ModelType.SERVER, - "Rec.ocr_version": _OCRVersion.PPOCRV5, - # Tighter detection boxes to reduce word merging - "Det.unclip_ratio": 1.3, - # Lower threshold to detect small chars (periods, ellipsis, phonetics) - "Det.box_thresh": 0.4, - # Silence verbose logging - "Global.log_level": "critical", - }) - logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)") - return _rapid_engine - - -def ocr_region_rapid( - img_bgr: np.ndarray, - region: PageRegion, -) -> List[Dict[str, Any]]: - """Run RapidOCR on a specific region, returning word dicts compatible with Tesseract format. - - Args: - img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on color/gray). - region: Region to crop and OCR. - - Returns: - List of word dicts with text, left, top, width, height, conf, region_type. 
- """ - engine = _get_rapid_engine() - - # Crop region from BGR image - crop = img_bgr[region.y:region.y + region.height, - region.x:region.x + region.width] - - if crop.size == 0: - return [] - - result = engine(crop) - - if result is None or result.boxes is None or result.txts is None: - return [] - - words = [] - boxes = result.boxes # shape (N, 4, 2) — 4 corner points per text line - txts = result.txts # tuple of strings - scores = result.scores # tuple of floats - - for i, (box, txt, score) in enumerate(zip(boxes, txts, scores)): - if not txt or not txt.strip(): - continue - - # box is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (clockwise from top-left) - xs = [p[0] for p in box] - ys = [p[1] for p in box] - left = int(min(xs)) - top = int(min(ys)) - w = int(max(xs) - left) - h = int(max(ys) - top) - - words.append({ - 'text': txt.strip(), - 'left': left + region.x, # Absolute coords - 'top': top + region.y, - 'width': w, - 'height': h, - 'conf': int(score * 100), # 0-100 like Tesseract - 'region_type': region.type, - }) - - return words - - -def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]: - """Run TrOCR on a region. Returns line-level word dicts (same format as ocr_region_rapid). - - Uses trocr_service.get_trocr_model() + _split_into_lines() for line segmentation. - Bboxes are approximated from equal line-height distribution within the region. - Falls back to Tesseract if TrOCR is not available. 
- """ - from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available - - if not _check_trocr_available(): - logger.warning("TrOCR not available, falling back to Tesseract") - if region.height > 0 and region.width > 0: - ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None - if ocr_img_crop is not None: - return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) - return [] - - crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width] - if crop.size == 0: - return [] - - try: - import torch - from PIL import Image as _PILImage - - processor, model = get_trocr_model(handwritten=handwritten) - if processor is None or model is None: - logger.warning("TrOCR model not loaded, falling back to Tesseract") - ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) - return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) - - pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)) - lines = _split_into_lines(pil_crop) - if not lines: - lines = [pil_crop] - - device = next(model.parameters()).device - all_text = [] - confidences = [] - for line_img in lines: - pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device) - with torch.no_grad(): - generated_ids = model.generate(pixel_values, max_length=128) - text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() - if text_line: - all_text.append(text_line) - confidences.append(0.85 if len(text_line) > 3 else 0.5) - - if not all_text: - return [] - - avg_conf = int(sum(confidences) / len(confidences) * 100) - line_h = region.height // max(len(all_text), 1) - words = [] - for i, line in enumerate(all_text): - words.append({ - "text": line, - "left": region.x, - "top": region.y + i * line_h, - "width": region.width, - "height": line_h, - "conf": avg_conf, - "region_type": region.type, - }) - return words - - except Exception as e: - 
logger.error(f"ocr_region_trocr failed: {e}") - return [] - - -def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]: - """Run LightOnOCR-2-1B on a region. Returns line-level word dicts (same format as ocr_region_rapid). - - Falls back to RapidOCR or Tesseract if LightOnOCR is not available. - """ - from services.lighton_ocr_service import get_lighton_model, _check_lighton_available - - if not _check_lighton_available(): - logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract") - if RAPIDOCR_AVAILABLE and img_bgr is not None: - return ocr_region_rapid(img_bgr, region) - ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None - return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else [] - - crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width] - if crop.size == 0: - return [] - - try: - import io - import torch - from PIL import Image as _PILImage - - processor, model = get_lighton_model() - if processor is None or model is None: - logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract") - if RAPIDOCR_AVAILABLE and img_bgr is not None: - return ocr_region_rapid(img_bgr, region) - ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) - return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) - - pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)) - conversation = [{"role": "user", "content": [{"type": "image"}]}] - inputs = processor.apply_chat_template( - conversation, images=[pil_crop], - add_generation_prompt=True, return_tensors="pt" - ).to(model.device) - - with torch.no_grad(): - output_ids = model.generate(**inputs, max_new_tokens=1024) - - text = processor.decode(output_ids[0], skip_special_tokens=True).strip() - if not text: - return [] - - lines = [l.strip() for l in text.split("\n") if l.strip()] - line_h = region.height // max(len(lines), 1) - 
words = [] - for i, line in enumerate(lines): - words.append({ - "text": line, - "left": region.x, - "top": region.y + i * line_h, - "width": region.width, - "height": line_h, - "conf": 85, - "region_type": region.type, - }) - return words - - except Exception as e: - logger.error(f"ocr_region_lighton failed: {e}") - return [] - - -# ============================================================================= -# Post-Processing: Deterministic Quality Fixes -# ============================================================================= - -# --- A. Character Confusion Fix (I/1/l) --- - -# Common OCR confusion pairs in vocabulary context -_CHAR_CONFUSION_RULES = [ - # "1" at word start followed by lowercase → likely "I" or "l" - # Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3") - (re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant - # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number) - (re.compile(r'(? List[Dict[str, Any]]: - """Fix common OCR character confusions using context. - - Deterministic rules: - - "1" at word start → "I" or "l" based on context - - Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I" - - "y " artifact at word boundaries → remove (e.g. 
"y you" → "you") - """ - for entry in entries: - en = entry.get('english', '') or '' - de = entry.get('german', '') or '' - ex = entry.get('example', '') or '' - - # Apply general rules to all fields - for pattern, replacement in _CHAR_CONFUSION_RULES: - en = pattern.sub(replacement, en) - de = pattern.sub(replacement, de) - ex = pattern.sub(replacement, ex) - - # Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I" - de_lower_words = set(de.lower().replace(',', ' ').split()) - if de_lower_words & _DE_INDICATORS_FOR_EN_I: - # Any remaining "1" in EN that looks like "I" - en = re.sub(r'\b1\b(?![\d.,])', 'I', en) - - # Fix "y " artifact before repeated word: "y you" → "you" - en = re.sub(r'\by\s+([a-z])', r'\1', en) - ex = re.sub(r'\by\s+([a-z])', r'\1', ex) - - entry['english'] = en.strip() - entry['german'] = de.strip() - entry['example'] = ex.strip() - - return entries - - -# --- B. Comma-Separated Word Form Splitting --- - -def _is_singular_plural_pair(parts: List[str]) -> bool: - """Detect if comma-separated parts are singular/plural forms of the same word. - - E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split). - "break, broke, broken" → False (different verb forms, OK to split). - - Heuristic: exactly 2 parts that share a common prefix of >= 50% length, - OR one part is a known plural suffix of the other (e.g. +s, +es, +en). - """ - if len(parts) != 2: - return False - - a, b = parts[0].lower().strip(), parts[1].lower().strip() - if not a or not b: - return False - - # Common prefix heuristic: if words share >= 50% of the shorter word, - # they are likely forms of the same word (Maus/Mäuse, child/children). 
- min_len = min(len(a), len(b)) - common = 0 - for ca, cb in zip(a, b): - if ca == cb: - common += 1 - else: - break - if common >= max(2, min_len * 0.5): - return True - - # Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü) - umlaut_map = str.maketrans('aou', 'äöü') - if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a: - return True - - return False - - -def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Split entries with comma-separated word forms into individual entries. - - E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen" - → 3 entries: break/brechen, broke/brach, broken/gebrochen - - Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse" - because those are forms of the same vocabulary entry. - - Only splits when both EN and DE have the same number of comma-parts, - parts are short (word forms, not sentences), and at least 3 parts - (to avoid splitting pairs that likely belong together). - """ - result: List[Dict[str, Any]] = [] - - for entry in entries: - en = (entry.get('english', '') or '').strip() - de = (entry.get('german', '') or '').strip() - - # Split by comma (but not inside brackets or parentheses) - en_parts = _split_by_comma(en) - de_parts = _split_by_comma(de) - - # Only split if we have multiple parts and counts match - should_split = False - if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts): - # All parts must be short (word forms, not sentences) - if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts): - # Do NOT split singular/plural pairs (2 parts that are - # forms of the same word) - if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts): - should_split = False - else: - should_split = True - - if not should_split: - result.append(entry) - continue - - # Split into individual entries - for k in range(len(en_parts)): - sub = dict(entry) # shallow copy - sub['english'] = 
en_parts[k].strip() - sub['german'] = de_parts[k].strip() if k < len(de_parts) else '' - sub['example'] = '' # examples get attached later - sub['split_from_comma'] = True - result.append(sub) - - # Re-number - for i, e in enumerate(result): - e['row_index'] = i - - return result - - -def _split_by_comma(text: str) -> List[str]: - """Split text by commas, but not inside brackets [...] or parens (...).""" - if ',' not in text: - return [text] - - parts = [] - depth_bracket = 0 - depth_paren = 0 - current = [] - - for ch in text: - if ch == '[': - depth_bracket += 1 - elif ch == ']': - depth_bracket = max(0, depth_bracket - 1) - elif ch == '(': - depth_paren += 1 - elif ch == ')': - depth_paren = max(0, depth_paren - 1) - elif ch == ',' and depth_bracket == 0 and depth_paren == 0: - parts.append(''.join(current).strip()) - current = [] - continue - current.append(ch) - - if current: - parts.append(''.join(current).strip()) - - # Filter empty parts - return [p for p in parts if p] - - -# --- C. Example Sentence Attachment --- - -def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int: - """Find the vocab entry whose English word(s) best match the example sentence. - - Returns index into vocab_entries, or -1 if no match found. - Uses word stem overlap: "a broken arm" matches "broken" or "break". - """ - if not vocab_entries or not example_text: - return -1 - - example_lower = example_text.lower() - example_words = set(re.findall(r'[a-zäöüß]+', example_lower)) - - best_idx = -1 - best_score = 0 - - for i, entry in enumerate(vocab_entries): - en = (entry.get('english', '') or '').lower() - if not en: - continue - - # Extract vocab words (split on space, comma, newline) - vocab_words = set(re.findall(r'[a-zäöüß]+', en)) - - # Score: how many vocab words appear in the example? 
- # Also check if example words share a common stem (first 4 chars) - direct_matches = vocab_words & example_words - score = len(direct_matches) * 10 - - # Stem matching: "broken" matches "break" via shared prefix "bro"/"bre" - if score == 0: - for vw in vocab_words: - if len(vw) < 3: - continue - stem = vw[:4] if len(vw) >= 4 else vw[:3] - for ew in example_words: - if len(ew) >= len(stem) and ew[:len(stem)] == stem: - score += 5 - break - - if score > best_score: - best_score = score - best_idx = i - - return best_idx if best_score > 0 else -1 - - -def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Attach rows with EN text but no DE translation as examples to matching vocab entries. - - Vocabulary worksheets often have: - Row 1: break, broke, broken / brechen, brach, gebrochen - Row 2: a broken arm (no DE → example for "broken") - Row 3: a broken plate (no DE → example for "broken") - Row 4: egg / Ei (has DE → new vocab entry) - - Rules (deterministic, generic): - - A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars) - - Find the best matching vocab entry by checking which entry's English words - appear in the example sentence (semantic matching via word overlap) - - Fall back to the nearest preceding entry if no word match found - - Multiple examples get joined with " | " - """ - if not entries: - return entries - - # Separate into vocab entries (have DE) and example candidates (no DE) - vocab_entries: List[Dict[str, Any]] = [] - examples_for: Dict[int, List[str]] = {} # vocab_index → list of example texts - - for entry in entries: - en = (entry.get('english', '') or '').strip() - de = (entry.get('german', '') or '').strip() - ex = (entry.get('example', '') or '').strip() - - # Treat single-char DE as OCR noise, not real translation. - # "Ei" (2 chars) is a valid German word, so threshold is 1. 
- has_de = len(de) > 1 - has_en = bool(en) - - # Heuristic: a row without DE is an "example sentence" only if - # the EN text looks like a sentence (>= 4 words, or contains - # typical sentence punctuation). Short EN text (1-3 words) is - # more likely a vocab entry whose DE was missed by OCR. - _looks_like_sentence = ( - len(en.split()) >= 4 - or en.rstrip().endswith(('.', '!', '?')) - ) - is_example_candidate = ( - has_en and not has_de and _looks_like_sentence and vocab_entries - ) - - if is_example_candidate: - # This is an example sentence — find best matching vocab entry - example_text = en - - match_idx = _find_best_vocab_match(en, vocab_entries) - if match_idx < 0: - # No word match → fall back to last entry - match_idx = len(vocab_entries) - 1 - - if match_idx not in examples_for: - examples_for[match_idx] = [] - examples_for[match_idx].append(example_text) - else: - vocab_entries.append(entry) - - # Attach examples to their matched vocab entries - for idx, example_list in examples_for.items(): - if 0 <= idx < len(vocab_entries): - entry = vocab_entries[idx] - existing_ex = (entry.get('example', '') or '').strip() - new_examples = ' | '.join(example_list) - entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples - - # Re-number - for i, e in enumerate(vocab_entries): - e['row_index'] = i - - return vocab_entries - - -# --- D. Phonetic Bracket IPA Replacement --- - -# Pattern: word followed by any bracket type containing phonetic content. -# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc. -# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs. -# This intentionally matches mixed brackets (e.g. {content]) because -# Tesseract frequently misrecognizes bracket characters. 
-_PHONETIC_BRACKET_RE = re.compile( - r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]' +from cv_vocab_types import * # noqa: F401,F403 +from cv_preprocessing import * # noqa: F401,F403 +from cv_layout import * # noqa: F401,F403 +from cv_ocr_engines import * # noqa: F401,F403 +from cv_cell_grid import * # noqa: F401,F403 +from cv_review import * # noqa: F401,F403 + +# Private names used by consumers — not covered by wildcard re-exports. +from cv_preprocessing import _apply_shear # noqa: F401 +from cv_layout import ( # noqa: F401 + _detect_header_footer_gaps, + _detect_sub_columns, + _split_broad_columns, ) - -# Unicode IPA characters — used to distinguish correct IPA (from dictionary -# lookup) from garbled OCR content when stripping orphan brackets. -_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ') - -# Minimum word confidence for full-page Tesseract results (0-100). -# Words below this threshold are OCR noise (scanner shadows, borders). -_MIN_WORD_CONF = 30 - - -def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]: - """Look up IPA for a word using the selected pronunciation dictionary. - - Args: - word: English word to look up. - pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT). - - Returns: - IPA string or None if not found. 
- """ - word_lower = word.lower().strip() - if not word_lower: - return None - - if pronunciation == 'british' and _britfone_dict: - ipa = _britfone_dict.get(word_lower) - if ipa: - return ipa - # Fallback to American if not in Britfone - if _ipa_convert_american: - result = _ipa_convert_american(word_lower) - if result and '*' not in result: - return result - return None - - if pronunciation == 'american' and _ipa_convert_american: - result = _ipa_convert_american(word_lower) - if result and '*' not in result: - return result - # Fallback to Britfone if not in CMU - if _britfone_dict: - ipa = _britfone_dict.get(word_lower) - if ipa: - return ipa - return None - - # Try any available source - if _britfone_dict: - ipa = _britfone_dict.get(word_lower) - if ipa: - return ipa - if _ipa_convert_american: - result = _ipa_convert_american(word_lower) - if result and '*' not in result: - return result - - return None - - -def _fix_phonetic_brackets( - entries: List[Dict[str, Any]], - pronunciation: str = 'british', -) -> List[Dict[str, Any]]: - """Replace OCR'd phonetic transcriptions with dictionary IPA. - - Detects patterns like "dance [du:ns]" and replaces with correct IPA: - - British: "dance [dˈɑːns]" (Britfone, MIT) - - American: "dance [dæns]" (eng_to_ipa/CMU, MIT) - - Only replaces if the word before brackets is found in the dictionary. - """ - if not IPA_AVAILABLE: - return entries - - # IPA phonetics only appear in the ENGLISH field of vocab tables. - # German and example fields contain meaningful parenthetical content: - # german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)" - # example: "(sich beschweren)", "(brauchen)", "(jammern)" - # These must NEVER be processed as phonetic transcriptions. 
- replaced_count = 0 - for entry in entries: - text = entry.get('english', '') or '' - if not any(ch in text for ch in '[{('): - continue - new_text = _replace_phonetics_in_text(text, pronunciation) - if new_text != text: - logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'") - replaced_count += 1 - entry['english'] = new_text - - if replaced_count: - logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries") - return entries - - -# Grammar particles that appear in brackets after English words: -# cross (with), complain (about/of), agree (on/with), look (sth) up -# These must NOT be replaced with IPA. Only used for the English field -# (German/example fields are never processed for IPA replacement). -_GRAMMAR_BRACKET_WORDS = frozenset({ - # English prepositions/particles commonly in vocab tables - 'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by', - 'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through', - # English grammar abbreviations used in vocab tables - 'sth', 'sb', 'adj', 'adv', -}) - - -def _is_grammar_bracket_content(content: str) -> bool: - """Return True if bracket content is grammar info in the ENGLISH field. - - Grammar info: cross (with), complain (about/of), agree (on/with) - NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test] - - Since we only process the English field, we only need to recognize - English grammar particles. Everything else is (garbled) IPA. - """ - if not content: - return False - - # Split on / for patterns like (about/of), (on/with) - tokens = [t.strip().lower() for t in content.split('/') if t.strip()] - if not tokens: - return False - - # ALL tokens must be known grammar words - return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens) - - -def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str: - """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA. 
- - Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno]. - We match any bracket type and replace with dictionary IPA if found. - Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved. - """ - if not IPA_AVAILABLE: - return text - - def replacer(match): - word = match.group(1) - bracket_content = match.group(2).strip() - full_match = match.group(0) - - # Skip if bracket content looks like regular text (multiple words) - if len(bracket_content.split()) > 3: - return full_match - - # Look up IPA for the word before brackets - ipa = _lookup_ipa(word, pronunciation) - - if ipa: - # Word has IPA → bracket content is phonetic (garbled or correct). - # Exception: grammar particles like cross (with) — keep those. - if _is_grammar_bracket_content(bracket_content): - return full_match - logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'") - return f"{word} [{ipa}]" - - # No IPA for this word — keep as-is - return full_match - - text = _PHONETIC_BRACKET_RE.sub(replacer, text) - - # Second pass: strip remaining orphan brackets that are garbled IPA. - # These have no word before them (the main regex requires \b word \s* bracket). - # Examples: "[mais]", "{'mani setva]", trailing "(kros]" - # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]" - def _strip_orphan_bracket(m): - content = m.group(1).strip() - # Keep grammar info: (sich beschweren), (about/of) - if _is_grammar_bracket_content(content): - return m.group(0) - # Keep correct IPA (contains Unicode IPA characters) - if any(ch in _IPA_CHARS for ch in content): - return m.group(0) - logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'") - return '' - - text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text) - text = text.strip() - - return text - - -def _assign_row_words_to_columns( - row: RowGeometry, - columns: List[PageRegion], -) -> Dict[int, List[Dict]]: - """Assign each word in a row to exactly one column. 
- - Uses a two-pass strategy: - 1. Containment: if a word's center falls within a column's horizontal - bounds (with padding), assign it to that column. - 2. Nearest center: for words not contained by any column, fall back to - nearest column center distance. - - This prevents long sentences in wide columns (e.g. example) from having - their rightmost words stolen by an adjacent column. - - Args: - row: Row with words (relative coordinates). - columns: Sorted list of columns (absolute coordinates). - - Returns: - Dict mapping col_index → list of words assigned to that column. - """ - result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))} - - if not row.words or not columns: - return result - - left_x = row.x # content ROI left (absolute) - - # Build non-overlapping column assignment ranges using midpoints. - # For adjacent columns, the boundary is the midpoint between them. - # This prevents words near column borders from being assigned to - # the wrong column (e.g. "We" at the start of an example sentence - # being stolen by the preceding DE column). - n = len(columns) - col_ranges_rel = [] # (assign_left, assign_right) per column - for ci, col in enumerate(columns): - col_left_rel = col.x - left_x - col_right_rel = col_left_rel + col.width - - # Left boundary: midpoint to previous column, or 0 - if ci == 0: - assign_left = 0 - else: - prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width - assign_left = (prev_right + col_left_rel) / 2 - - # Right boundary: midpoint to next column, or infinity (row width) - if ci == n - 1: - assign_right = row.width + 100 # generous for last column - else: - next_left = columns[ci + 1].x - left_x - assign_right = (col_right_rel + next_left) / 2 - - col_ranges_rel.append((assign_left, assign_right)) - - for w in row.words: - w_left = w['left'] - w_right = w_left + w['width'] - w_center_x = w_left + w['width'] / 2 - - # Primary: overlap-based matching — assign to column with most overlap. 
- # This is more robust than center-based for narrow columns (page_ref) - # where the last character's center may fall into the next column. - best_col = -1 - best_overlap = 0 - for ci, col in enumerate(columns): - col_left_rel = col.x - left_x - col_right_rel = col_left_rel + col.width - overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel)) - if overlap > best_overlap: - best_overlap = overlap - best_col = ci - - if best_col >= 0 and best_overlap > 0: - result[best_col].append(w) - else: - # Fallback: center-based range matching - assigned = False - for ci, (al, ar) in enumerate(col_ranges_rel): - if al <= w_center_x < ar: - result[ci].append(w) - assigned = True - break - - if not assigned: - # Last resort: nearest column center - best_col = 0 - col_left_0 = columns[0].x - left_x - best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2)) - for ci in range(1, n): - col_left = columns[ci].x - left_x - dist = abs(w_center_x - (col_left + columns[ci].width / 2)) - if dist < best_dist: - best_dist = dist - best_col = ci - result[best_col].append(w) - - return result - - -# Regex: at least 2 consecutive letters (Latin + umlauts + accents) -_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}') -_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]') - -# Common short EN/DE words (2-3 chars). Tokens at the end of a cell -# that do NOT appear here are treated as trailing OCR noise. 
-_COMMON_SHORT_WORDS: set = { - # EN 1-2 letter - 'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he', - 'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on', - 'or', 'so', 'to', 'up', 'us', 'we', - # EN 3 letter - 'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all', - 'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art', - 'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay', - 'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy', - 'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap', - 'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad', - 'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip', - 'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel', - 'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far', - 'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit', - 'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur', - 'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut', - 'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her', - 'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how', - 'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink', - 'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet', - 'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit', - 'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let', - 'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man', - 'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob', - 'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag', - 'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut', - 'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one', - 'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad', - 'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per', - 'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot', - 'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram', - 'ran', 
'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid', - 'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub', - 'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap', - 'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin', - 'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob', - 'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty', - 'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan', - 'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip', - 'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug', - 'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim', - 'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet', - 'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo', - 'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you', - 'zap', 'zip', 'zoo', - # DE 2-3 letter - 'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu', - 'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem', - 'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar', - 'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist', - 'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun', - 'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag', - 'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von', - 'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir', - 'wut', 'zum', 'zur', -} - -# Known abbreviations found in EN/DE textbooks and dictionaries. -# Stored WITHOUT trailing period (the noise filter strips periods). -# These rescue tokens like "sth." / "sb." / "usw." from being deleted. 
-_KNOWN_ABBREVIATIONS: set = { - # EN dictionary meta-words - 'sth', 'sb', 'smth', 'smb', 'sbd', - # EN general - 'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp', - 'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap', - # EN references / textbook - 'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr', - 'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff', - 'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs', - 'ans', 'wb', 'tb', 'vocab', - # EN parts of speech / grammar - 'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj', - 'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger', - 'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans', - 'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut', - 'attr', 'pred', 'comp', 'superl', 'pos', 'neg', - 'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml', - 'syn', 'ant', 'opp', 'var', 'orig', - # EN titles - 'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr', - # EN pronunciation - 'br', 'am', 'brit', 'amer', - # EN units - 'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml', - # DE general - 'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg', - 'bes', 'insb', 'insbes', 'bspw', 'ca', - 'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr', - 'inkl', 'exkl', 'zzgl', 'abzgl', - # DE references - 'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde', - 'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap', - 's', 'sp', 'zit', 'zs', 'vlg', - # DE grammar - 'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj', - 'praet', 'imp', 'part', 'mask', 'fem', 'neutr', - 'trennb', 'untrennb', 'ugs', 'geh', 'pej', - # DE regional - 'nordd', 'österr', 'schweiz', - # Linguistic - 'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym', - 'deriv', 'pref', 'suf', 'suff', 'dim', 'coll', - 'count', 'uncount', 'indef', 'def', 'poss', 'demon', -} - - -def _is_noise_tail_token(token: str) -> bool: - """Check if a token at the END of cell text is trailing OCR noise. 
- - Trailing fragments are very common OCR artifacts from image edges, - borders, and neighbouring cells. This is more aggressive than a - general word filter: any short token that isn't in the dictionary - of common EN/DE words is considered noise. - - Examples of noise: "Es)", "3", "ee", "B" - Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]" - """ - t = token.strip() - if not t: - return True - - # Keep ellipsis - if t in ('...', '…'): - return False - - # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc. - if t.startswith('[') or t.startswith('["') or t.startswith("['"): - return False - if t.endswith(']'): - return False - - # Pure non-alpha → noise ("3", ")", "|") - alpha_chars = _RE_ALPHA.findall(t) - if not alpha_chars: - return True - - # Extract only alpha characters for dictionary lookup - cleaned = ''.join(alpha_chars) - - # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep - if cleaned.lower() in _KNOWN_ABBREVIATIONS: - return False - - # Strip normal trailing punctuation before checking for internal noise. - stripped_punct = re.sub(r'[.,;:!?]+$', '', t) # "cupcakes." → "cupcakes" - t_check = stripped_punct if stripped_punct else t - - # Check for legitimate punctuation patterns vs. real noise. - # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir", - # "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen" - # Noise: "3d", "B|", "x7" - # Strategy: strip common dictionary punctuation (parens, hyphens, slashes), - # THEN check if residual contains only alpha characters. - t_inner = t_check - # Remove all parentheses, hyphens, slashes, and dots — these are normal - # in dictionary entries: "(Salat-)Gurke", "Tanz(veranstaltung)", - # "(zer)brechen", "wir/uns", "e.g." - t_inner = re.sub(r'[()\-/.,;:!?]', '', t_inner) - # Now check: does the inner form still have non-alpha noise? 
- inner_alpha = ''.join(_RE_ALPHA.findall(t_inner)) - has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False - - # Long alpha words (4+ chars) without internal noise are likely real - if len(cleaned) >= 4 and not has_internal_noise: - return False - - # Short words: check dictionary (uses only alpha chars) - if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise: - return False - - # Default: short or suspicious → noise - return True - - -def _is_garbage_text(text: str) -> bool: - """Check if entire cell text is OCR garbage from image areas. - - Garbage text = no recognizable dictionary word. Catches - "(ci]oeu", "uanoaain." etc. - """ - words = _RE_REAL_WORD.findall(text) - if not words: - # Check if any token is a known abbreviation (e.g. "e.g.") - alpha_only = ''.join(_RE_ALPHA.findall(text)).lower() - if alpha_only in _KNOWN_ABBREVIATIONS: - return False - return True - - for w in words: - wl = w.lower() - # Known short word or abbreviation → not garbage - if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS: - return False - # Long word (>= 4 chars): check vowel/consonant ratio. - # Real EN/DE words have 20-60% vowels. Garbage like "uanoaain" - # or "cioeu" has unusual ratios (too many or too few vowels). - if len(wl) >= 4: - vowels = sum(1 for c in wl if c in 'aeiouäöü') - ratio = vowels / len(wl) - if 0.15 <= ratio <= 0.65: - return False # plausible vowel ratio → real word - - return True - - -def _clean_cell_text(text: str) -> str: - """Remove OCR noise from cell text. Generic filters: - - 1. If the entire text has no real alphabetic word (>= 2 letters), clear. - 2. If the entire text is garbage (no dictionary word), clear. - 3. Strip trailing noise tokens from the end of the text. - """ - stripped = text.strip() - if not stripped: - return '' - - # --- Filter 1: No real word at all --- - if not _RE_REAL_WORD.search(stripped): - # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e." 
- alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower() - if alpha_only not in _KNOWN_ABBREVIATIONS: - return '' - - # --- Filter 2: Entire text is garbage --- - if _is_garbage_text(stripped): - return '' - - # --- Filter 3: Strip trailing noise tokens --- - tokens = stripped.split() - while tokens and _is_noise_tail_token(tokens[-1]): - tokens.pop() - if not tokens: - return '' - - return ' '.join(tokens) - - -def _clean_cell_text_lite(text: str) -> str: - """Simplified noise filter for cell-first OCR (isolated cell crops). - - Since each cell is OCR'd in isolation (no neighbour content visible), - trailing-noise stripping is unnecessary. Only 2 filters remain: - - 1. No real alphabetic word (>= 2 letters) and not a known abbreviation → empty. - 2. Entire text is garbage (no dictionary word) → empty. - """ - stripped = text.strip() - if not stripped: - return '' - - # --- Filter 1: No real word at all --- - if not _RE_REAL_WORD.search(stripped): - alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower() - if alpha_only not in _KNOWN_ABBREVIATIONS: - return '' - - # --- Filter 2: Entire text is garbage --- - if _is_garbage_text(stripped): - return '' - - return stripped - - -# --------------------------------------------------------------------------- -# Bold detection via stroke-width analysis (relative / page-level) -# --------------------------------------------------------------------------- - -def _measure_stroke_width(gray_crop: np.ndarray) -> float: - """Measure mean stroke width in a binarised cell crop. - - Returns a DPI-normalised value (mean stroke width as % of crop height), - or 0.0 if measurement is not possible. 
- """ - if gray_crop is None or gray_crop.size == 0: - return 0.0 - h, w = gray_crop.shape[:2] - if h < 10 or w < 10: - return 0.0 - - # Binarise: text = white (255), background = black (0) - _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU) - if cv2.countNonZero(bw) < 20: - return 0.0 - - # Distance transform: value at each white pixel = distance to nearest black - dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3) - - # Skeleton via morphological thinning - kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)) - thin = bw.copy() - for _ in range(max(1, min(h, w) // 6)): - eroded = cv2.erode(thin, kernel) - if cv2.countNonZero(eroded) < 5: - break - thin = eroded - - skeleton_pts = thin > 0 - if not np.any(skeleton_pts): - return 0.0 - mean_stroke = float(np.mean(dist[skeleton_pts])) - return mean_stroke / max(h, 1) * 100 # normalised: % of cell height - - -def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray], - img_w: int, img_h: int) -> None: - """Two-pass bold detection: measure all cells, then compare against median. - - Cells with stroke width > 1.4× the page median are marked as bold. - This adapts automatically to font, DPI and scan quality. - Modifies cells in-place (sets 'is_bold' key). 
- """ - if ocr_img is None: - return - - # Pass 1: measure stroke width for every cell with text - metrics: List[float] = [] - cell_strokes: List[float] = [] - for cell in cells: - sw = 0.0 - if cell.get('text', '').strip(): - bp = cell['bbox_px'] - y1 = max(0, bp['y']) - y2 = min(img_h, bp['y'] + bp['h']) - x1 = max(0, bp['x']) - x2 = min(img_w, bp['x'] + bp['w']) - if y2 > y1 and x2 > x1: - sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2]) - cell_strokes.append(sw) - if sw > 0: - metrics.append(sw) - - if len(metrics) < 3: - # Too few cells to compare — leave all as non-bold - return - - median_sw = float(np.median(metrics)) - if median_sw <= 0: - return - - # Pass 2: cells significantly above median → bold - for cell, sw in zip(cells, cell_strokes): - cell['is_bold'] = sw > 0 and (sw / median_sw) > 1.4 - - -# --------------------------------------------------------------------------- -# Cell-First OCR (v2) — each cell cropped and OCR'd in isolation -# --------------------------------------------------------------------------- - -def _ocr_cell_crop( - row_idx: int, - col_idx: int, - row: RowGeometry, - col: PageRegion, - ocr_img: np.ndarray, - img_bgr: Optional[np.ndarray], - img_w: int, - img_h: int, - engine_name: str, - lang: str, - lang_map: Dict[str, str], -) -> Dict[str, Any]: - """OCR a single cell by cropping the exact column×row intersection. - - No padding beyond cell boundaries → no neighbour bleeding. - """ - # Display bbox: exact column × row intersection - disp_x = col.x - disp_y = row.y - disp_w = col.width - disp_h = row.height - - # Crop boundaries: add small internal padding (3px each side) to avoid - # clipping characters near column/row edges (e.g. parentheses, descenders). - # Stays within image bounds but may extend slightly beyond strict cell. - # 3px is small enough to avoid neighbour content at typical scan DPI (200-300). 
- _PAD = 3 - cx = max(0, disp_x - _PAD) - cy = max(0, disp_y - _PAD) - cx2 = min(img_w, disp_x + disp_w + _PAD) - cy2 = min(img_h, disp_y + disp_h + _PAD) - cw = cx2 - cx - ch = cy2 - cy - - empty_cell = { - 'cell_id': f"R{row_idx:02d}_C{col_idx}", - 'row_index': row_idx, - 'col_index': col_idx, - 'col_type': col.type, - 'text': '', - 'confidence': 0.0, - 'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h}, - 'bbox_pct': { - 'x': round(disp_x / img_w * 100, 2) if img_w else 0, - 'y': round(disp_y / img_h * 100, 2) if img_h else 0, - 'w': round(disp_w / img_w * 100, 2) if img_w else 0, - 'h': round(disp_h / img_h * 100, 2) if img_h else 0, - }, - 'ocr_engine': 'cell_crop_v2', - 'is_bold': False, - } - - if cw <= 0 or ch <= 0: - logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch) - return empty_cell - - # --- Pixel-density check: skip truly empty cells --- - if ocr_img is not None: - crop = ocr_img[cy:cy + ch, cx:cx + cw] - if crop.size > 0: - dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size - if dark_ratio < 0.005: - logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)", - row_idx, col_idx, dark_ratio, cw, ch) - return empty_cell - - # --- Prepare crop for OCR --- - cell_lang = lang_map.get(col.type, lang) - psm = _select_psm_for_column(col.type, col.width, row.height) - text = '' - avg_conf = 0.0 - used_engine = 'cell_crop_v2' - - if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None: - cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch) - words = ocr_region_trocr(img_bgr, cell_region, - handwritten=(engine_name == "trocr-handwritten")) - elif engine_name == "lighton" and img_bgr is not None: - cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch) - words = ocr_region_lighton(img_bgr, cell_region) - elif engine_name == "rapid" and img_bgr is not None: - # Upscale small BGR crops for RapidOCR. 
- # Cell crops typically have height 35-55px but width >300px. - # _ensure_minimum_crop_size only scales when EITHER dim < min_dim, - # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78). - # For very short heights (< 80px), force 3× upscale for better OCR - # of small characters like periods, ellipsis, and phonetic symbols. - bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw] - if bgr_crop.size == 0: - words = [] - else: - crop_h, crop_w = bgr_crop.shape[:2] - if crop_h < 80: - # Force 3× upscale for short rows — small chars need more pixels - scale = 3.0 - bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale, - interpolation=cv2.INTER_CUBIC) - else: - bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3) - up_h, up_w = bgr_up.shape[:2] - scale_x = up_w / max(crop_w, 1) - scale_y = up_h / max(crop_h, 1) - was_scaled = (up_w != crop_w or up_h != crop_h) - logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)", - row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y) - tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h) - words = ocr_region_rapid(bgr_up, tmp_region) - # Remap positions back to original image coords - if words and was_scaled: - for w in words: - w['left'] = int(w['left'] / scale_x) + cx - w['top'] = int(w['top'] / scale_y) + cy - w['width'] = int(w['width'] / scale_x) - w['height'] = int(w['height'] / scale_y) - elif words: - for w in words: - w['left'] += cx - w['top'] += cy - else: - # Tesseract: upscale tiny crops for better recognition - if ocr_img is not None: - crop_slice = ocr_img[cy:cy + ch, cx:cx + cw] - upscaled = _ensure_minimum_crop_size(crop_slice) - up_h, up_w = upscaled.shape[:2] - tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h) - words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm) - # Remap word positions back to original image coordinates - if words and (up_w != cw or up_h != ch): - sx = cw / max(up_w, 1) - sy = ch / 
max(up_h, 1) - for w in words: - w['left'] = int(w['left'] * sx) + cx - w['top'] = int(w['top'] * sy) + cy - w['width'] = int(w['width'] * sx) - w['height'] = int(w['height'] * sy) - elif words: - for w in words: - w['left'] += cx - w['top'] += cy - else: - words = [] - - # Filter low-confidence words - _MIN_WORD_CONF = 30 - if words: - words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF] - - if words: - y_tol = max(15, ch) - text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) - avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) - logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s", - row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name) - else: - logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)", - row_idx, col_idx, cw, ch, psm, engine_name) - - # --- PSM 7 fallback for still-empty Tesseract cells --- - if not text.strip() and engine_name == "tesseract" and ocr_img is not None: - crop_slice = ocr_img[cy:cy + ch, cx:cx + cw] - upscaled = _ensure_minimum_crop_size(crop_slice) - up_h, up_w = upscaled.shape[:2] - tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h) - psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7) - if psm7_words: - psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF] - if psm7_words: - p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10) - if p7_text.strip(): - text = p7_text - avg_conf = round( - sum(w['conf'] for w in psm7_words) / len(psm7_words), 1 - ) - used_engine = 'cell_crop_v2_psm7' - - # --- Noise filter --- - if text.strip(): - pre_filter = text - text = _clean_cell_text_lite(text) - if not text: - logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r", - row_idx, col_idx, pre_filter) - avg_conf = 0.0 - - result = dict(empty_cell) - result['text'] = text - result['confidence'] = avg_conf - 
result['ocr_engine'] = used_engine - return result - - -# Threshold: columns narrower than this (% of image width) use single-cell -# crop OCR instead of full-page word assignment. -# -# Broad columns (>= threshold): Full-page Tesseract word assignment. -# Better for multi-word content (sentences, IPA brackets, punctuation). -# Examples: EN vocabulary, DE translation, example sentences. -# -# Narrow columns (< threshold): Isolated cell-crop OCR. -# Prevents neighbour bleeding from adjacent broad columns. -# Examples: page_ref, marker, numbering columns. -# -# 15% was empirically validated across vocab table scans with 3-5 columns. -# Typical broad columns: 20-40% width. Typical narrow columns: 3-12% width. -# The 15% boundary cleanly separates the two groups. -_NARROW_COL_THRESHOLD_PCT = 15.0 - - -def build_cell_grid_v2( - ocr_img: np.ndarray, - column_regions: List[PageRegion], - row_geometries: List[RowGeometry], - img_w: int, - img_h: int, - lang: str = "eng+deu", - ocr_engine: str = "auto", - img_bgr: Optional[np.ndarray] = None, -) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - """Hybrid Grid: full-page OCR for broad columns, cell-crop for narrow ones. - - Drop-in replacement for build_cell_grid() — same signature & return type. - - Strategy: - - Broad columns (>15% image width): Use pre-assigned full-page Tesseract - words (from row.words). Handles IPA brackets, punctuation, sentence - continuity correctly. - - Narrow columns (<15% image width): Use isolated cell-crop OCR to prevent - neighbour bleeding from adjacent broad columns. 
- """ - engine_name = "tesseract" - if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): - engine_name = ocr_engine - elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE: - engine_name = "rapid" - - logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)") - - # Filter to content rows only - content_rows = [r for r in row_geometries if r.row_type == 'content'] - if not content_rows: - logger.warning("build_cell_grid_v2: no content rows found") - return [], [] - - # Filter phantom rows (word_count=0) and artifact rows - before = len(content_rows) - content_rows = [r for r in content_rows if r.word_count > 0] - skipped = before - len(content_rows) - if skipped > 0: - logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)") - if not content_rows: - logger.warning("build_cell_grid_v2: no content rows with words found") - return [], [] - - before_art = len(content_rows) - content_rows = [r for r in content_rows if not _is_artifact_row(r)] - artifact_skipped = before_art - len(content_rows) - if artifact_skipped > 0: - logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows") - if not content_rows: - logger.warning("build_cell_grid_v2: no content rows after artifact filtering") - return [], [] - - # Filter columns - _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', - 'margin_bottom', 'margin_left', 'margin_right'} - relevant_cols = [c for c in column_regions if c.type not in _skip_types] - if not relevant_cols: - logger.warning("build_cell_grid_v2: no usable columns found") - return [], [] - - # Heal row gaps — use header/footer boundaries - content_rows.sort(key=lambda r: r.y) - header_rows = [r for r in row_geometries if r.row_type == 'header'] - footer_rows = [r for r in row_geometries if r.row_type == 'footer'] - if header_rows: - top_bound = max(r.y + r.height for r in header_rows) - else: - top_bound = content_rows[0].y - if footer_rows: - bottom_bound = min(r.y for r 
in footer_rows) - else: - bottom_bound = content_rows[-1].y + content_rows[-1].height - - _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound) - - relevant_cols.sort(key=lambda c: c.x) - - columns_meta = [ - {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width} - for ci, c in enumerate(relevant_cols) - ] - - lang_map = { - 'column_en': 'eng', - 'column_de': 'deu', - 'column_example': 'eng+deu', - } - - # --- Classify columns as broad vs narrow --- - narrow_col_indices = set() - for ci, col in enumerate(relevant_cols): - col_pct = (col.width / img_w * 100) if img_w > 0 else 0 - if col_pct < _NARROW_COL_THRESHOLD_PCT: - narrow_col_indices.add(ci) - - broad_col_count = len(relevant_cols) - len(narrow_col_indices) - logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), " - f"{len(narrow_col_indices)} narrow columns (cell-crop)") - - # --- Phase 1: Broad columns via full-page word assignment --- - cells: List[Dict[str, Any]] = [] - - for row_idx, row in enumerate(content_rows): - # Assign full-page words to columns for this row - col_words = _assign_row_words_to_columns(row, relevant_cols) - - for col_idx, col in enumerate(relevant_cols): - if col_idx not in narrow_col_indices: - # BROAD column: use pre-assigned full-page words - words = col_words.get(col_idx, []) - # Filter low-confidence words - words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF] - - if words: - y_tol = max(15, row.height) - text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) - avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) - else: - text = '' - avg_conf = 0.0 - - # Apply noise filter - text = _clean_cell_text(text) - - cell = { - 'cell_id': f"R{row_idx:02d}_C{col_idx}", - 'row_index': row_idx, - 'col_index': col_idx, - 'col_type': col.type, - 'text': text, - 'confidence': avg_conf, - 'bbox_px': { - 'x': col.x, 'y': row.y, - 'w': col.width, 'h': row.height, - }, - 'bbox_pct': { - 'x': round(col.x / img_w * 
100, 2) if img_w else 0, - 'y': round(row.y / img_h * 100, 2) if img_h else 0, - 'w': round(col.width / img_w * 100, 2) if img_w else 0, - 'h': round(row.height / img_h * 100, 2) if img_h else 0, - }, - 'ocr_engine': 'word_lookup', - 'is_bold': False, - } - cells.append(cell) - - # --- Phase 2: Narrow columns via cell-crop OCR (parallel) --- - narrow_tasks = [] - for row_idx, row in enumerate(content_rows): - for col_idx, col in enumerate(relevant_cols): - if col_idx in narrow_col_indices: - narrow_tasks.append((row_idx, col_idx, row, col)) - - if narrow_tasks: - max_workers = 4 if engine_name == "tesseract" else 2 - with ThreadPoolExecutor(max_workers=max_workers) as pool: - futures = { - pool.submit( - _ocr_cell_crop, - ri, ci, row, col, - ocr_img, img_bgr, img_w, img_h, - engine_name, lang, lang_map, - ): (ri, ci) - for ri, ci, row, col in narrow_tasks - } - for future in as_completed(futures): - try: - cell = future.result() - cells.append(cell) - except Exception as e: - ri, ci = futures[future] - logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}") - - # Sort cells by (row_index, col_index) - cells.sort(key=lambda c: (c['row_index'], c['col_index'])) - - # Remove all-empty rows - rows_with_text: set = set() - for cell in cells: - if cell['text'].strip(): - rows_with_text.add(cell['row_index']) - before_filter = len(cells) - cells = [c for c in cells if c['row_index'] in rows_with_text] - empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1) - if empty_rows_removed > 0: - logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows") - - # Bold detection disabled: cell-level stroke-width analysis cannot - # distinguish bold from non-bold when cells contain mixed formatting - # (e.g. "cookie ['kuki]" — bold word + non-bold phonetics). - # TODO: word-level bold detection would require per-word bounding boxes. 
- - logger.info(f"build_cell_grid_v2: {len(cells)} cells from " - f"{len(content_rows)} rows × {len(relevant_cols)} columns, " - f"engine={engine_name} (hybrid)") - - return cells, columns_meta - - -def build_cell_grid_v2_streaming( - ocr_img: np.ndarray, - column_regions: List[PageRegion], - row_geometries: List[RowGeometry], - img_w: int, - img_h: int, - lang: str = "eng+deu", - ocr_engine: str = "auto", - img_bgr: Optional[np.ndarray] = None, -) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]: - """Streaming variant of build_cell_grid_v2 — yields each cell as OCR'd. - - Yields: - (cell_dict, columns_meta, total_cells) - """ - # Resolve engine — default to Tesseract for cell-first OCR. - # Tesseract excels at isolated text crops (binarized, upscaled). - # RapidOCR is optimized for full-page scene-text and produces artifacts - # on small cell crops (extra chars, missing punctuation, garbled IPA). - use_rapid = False - if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): - engine_name = ocr_engine - elif ocr_engine == "auto": - engine_name = "tesseract" - elif ocr_engine == "rapid": - if not RAPIDOCR_AVAILABLE: - logger.warning("RapidOCR requested but not available, falling back to Tesseract") - else: - use_rapid = True - engine_name = "rapid" if use_rapid else "tesseract" - else: - engine_name = "tesseract" - - content_rows = [r for r in row_geometries if r.row_type == 'content'] - if not content_rows: - return - - content_rows = [r for r in content_rows if r.word_count > 0] - if not content_rows: - return - - _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', - 'margin_bottom', 'margin_left', 'margin_right'} - relevant_cols = [c for c in column_regions if c.type not in _skip_types] - if not relevant_cols: - return - - content_rows = [r for r in content_rows if not _is_artifact_row(r)] - if not content_rows: - return - - # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2) - 
content_rows.sort(key=lambda r: r.y) - header_rows = [r for r in row_geometries if r.row_type == 'header'] - footer_rows = [r for r in row_geometries if r.row_type == 'footer'] - if header_rows: - top_bound = max(r.y + r.height for r in header_rows) - else: - top_bound = content_rows[0].y - if footer_rows: - bottom_bound = min(r.y for r in footer_rows) - else: - bottom_bound = content_rows[-1].y + content_rows[-1].height - - _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound) - - relevant_cols.sort(key=lambda c: c.x) - - columns_meta = [ - {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width} - for ci, c in enumerate(relevant_cols) - ] - - lang_map = { - 'column_en': 'eng', - 'column_de': 'deu', - 'column_example': 'eng+deu', - } - - total_cells = len(content_rows) * len(relevant_cols) - - for row_idx, row in enumerate(content_rows): - for col_idx, col in enumerate(relevant_cols): - cell = _ocr_cell_crop( - row_idx, col_idx, row, col, - ocr_img, img_bgr, img_w, img_h, - engine_name, lang, lang_map, - ) - yield cell, columns_meta, total_cells - - -# --------------------------------------------------------------------------- -# Narrow-column OCR helpers (Proposal B) — DEPRECATED (kept for legacy build_cell_grid) -# --------------------------------------------------------------------------- - -def _compute_cell_padding(col_width: int, img_w: int) -> int: - """Adaptive padding for OCR crops based on column width. - - Narrow columns (page_ref, marker) need more surrounding context so - Tesseract can segment characters correctly. Wide columns keep the - minimal 4 px padding to avoid pulling in neighbours. 
- """ - col_pct = col_width / img_w * 100 if img_w > 0 else 100 - if col_pct < 5: - return max(20, col_width // 2) - if col_pct < 10: - return max(12, col_width // 4) - if col_pct < 15: - return 8 - return 4 - - -def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150, - max_scale: int = 3) -> np.ndarray: - """Upscale tiny crops so Tesseract gets enough pixel data. - - If either dimension is below *min_dim*, the crop is bicubic-upscaled - so the smallest dimension reaches *min_dim* (capped at *max_scale* ×). - """ - h, w = crop.shape[:2] - if h >= min_dim and w >= min_dim: - return crop - scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1))) - if scale <= 1.0: - return crop - new_w = int(w * scale) - new_h = int(h * scale) - return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC) - - -def _select_psm_for_column(col_type: str, col_width: int, - row_height: int) -> int: - """Choose the best Tesseract PSM for a given column geometry. - - - page_ref columns are almost always single short tokens → PSM 8 - - Very narrow or short cells → PSM 7 (single text line) - - Everything else → PSM 6 (uniform block) - """ - if col_type in ('page_ref', 'marker'): - return 8 # single word - if col_width < 100 or row_height < 30: - return 7 # single line - return 6 # uniform block - - -def _ocr_single_cell( - row_idx: int, - col_idx: int, - row: RowGeometry, - col: PageRegion, - ocr_img: np.ndarray, - img_bgr: Optional[np.ndarray], - img_w: int, - img_h: int, - use_rapid: bool, - engine_name: str, - lang: str, - lang_map: Dict[str, str], - preassigned_words: Optional[List[Dict]] = None, -) -> Dict[str, Any]: - """Populate a single cell (column x row intersection) via word lookup.""" - # Display bbox: exact column × row intersection (no padding) - disp_x = col.x - disp_y = row.y - disp_w = col.width - disp_h = row.height - - # OCR crop: adaptive padding — narrow columns get more context - pad = _compute_cell_padding(col.width, img_w) - cell_x = 
max(0, col.x - pad) - cell_y = max(0, row.y - pad) - cell_w = min(col.width + 2 * pad, img_w - cell_x) - cell_h = min(row.height + 2 * pad, img_h - cell_y) - is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False - - if disp_w <= 0 or disp_h <= 0: - return { - 'cell_id': f"R{row_idx:02d}_C{col_idx}", - 'row_index': row_idx, - 'col_index': col_idx, - 'col_type': col.type, - 'text': '', - 'confidence': 0.0, - 'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height}, - 'bbox_pct': { - 'x': round(col.x / img_w * 100, 2), - 'y': round(row.y / img_h * 100, 2), - 'w': round(col.width / img_w * 100, 2), - 'h': round(row.height / img_h * 100, 2), - }, - 'ocr_engine': 'word_lookup', - } - - # --- PRIMARY: Word-lookup from full-page Tesseract --- - words = preassigned_words if preassigned_words is not None else [] - used_engine = 'word_lookup' - - # Filter low-confidence words (OCR noise from images/artifacts). - # Tesseract gives low confidence to misread image edges, borders, - # and other non-text elements. - _MIN_WORD_CONF = 30 - if words: - words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF] - - if words: - # Use row height as Y-tolerance so all words within a single row - # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse" - # across two lines due to slight vertical offset). - y_tol = max(15, row.height) - text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) - avg_conf = round(sum(w['conf'] for w in words) / len(words), 1) - else: - text = '' - avg_conf = 0.0 - - # --- FALLBACK: Cell-OCR for empty cells --- - # Full-page Tesseract can miss small or isolated words (e.g. "Ei"). - # Re-run OCR on the cell crop to catch what word-lookup missed. - # To avoid wasting time on truly empty cells, check pixel density first: - # only run Tesseract if the cell crop contains enough dark pixels to - # plausibly contain text. 
- _run_fallback = False - if not text.strip() and cell_w > 0 and cell_h > 0: - if ocr_img is not None: - crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] - if crop.size > 0: - # Threshold: pixels darker than 180 (on 0-255 grayscale). - # Use 0.5% to catch even small text like "Ei" (2 chars) - # in an otherwise empty cell. - dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size - _run_fallback = dark_ratio > 0.005 - if _run_fallback: - # For narrow columns, upscale the crop before OCR - if is_narrow and ocr_img is not None: - _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] - _upscaled = _ensure_minimum_crop_size(_crop_slice) - if _upscaled is not _crop_slice: - # Build a temporary full-size image with the upscaled crop - # placed at origin so ocr_region can crop it cleanly. - _up_h, _up_w = _upscaled.shape[:2] - _tmp_region = PageRegion( - type=col.type, x=0, y=0, width=_up_w, height=_up_h, - ) - _cell_psm = _select_psm_for_column(col.type, col.width, row.height) - cell_lang = lang_map.get(col.type, lang) - fallback_words = ocr_region(_upscaled, _tmp_region, - lang=cell_lang, psm=_cell_psm) - # Remap word positions back to original image coordinates - _sx = cell_w / max(_up_w, 1) - _sy = cell_h / max(_up_h, 1) - for _fw in (fallback_words or []): - _fw['left'] = int(_fw['left'] * _sx) + cell_x - _fw['top'] = int(_fw['top'] * _sy) + cell_y - _fw['width'] = int(_fw['width'] * _sx) - _fw['height'] = int(_fw['height'] * _sy) - else: - # No upscaling needed, use adaptive PSM - cell_region = PageRegion( - type=col.type, x=cell_x, y=cell_y, - width=cell_w, height=cell_h, - ) - _cell_psm = _select_psm_for_column(col.type, col.width, row.height) - cell_lang = lang_map.get(col.type, lang) - fallback_words = ocr_region(ocr_img, cell_region, - lang=cell_lang, psm=_cell_psm) - else: - cell_region = PageRegion( - type=col.type, - x=cell_x, y=cell_y, - width=cell_w, height=cell_h, - ) - if engine_name in ("trocr-printed", 
"trocr-handwritten") and img_bgr is not None: - fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten")) - elif engine_name == "lighton" and img_bgr is not None: - fallback_words = ocr_region_lighton(img_bgr, cell_region) - elif use_rapid and img_bgr is not None: - fallback_words = ocr_region_rapid(img_bgr, cell_region) - else: - _cell_psm = _select_psm_for_column(col.type, col.width, row.height) - cell_lang = lang_map.get(col.type, lang) - fallback_words = ocr_region(ocr_img, cell_region, - lang=cell_lang, psm=_cell_psm) - - if fallback_words: - # Apply same confidence filter to fallback words - fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF] - if fallback_words: - fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words) - fb_y_tol = max(10, int(fb_avg_h * 0.5)) - fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol) - if fb_text.strip(): - text = fb_text - avg_conf = round( - sum(w['conf'] for w in fallback_words) / len(fallback_words), 1 - ) - used_engine = 'cell_ocr_fallback' - - # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells --- - if not text.strip() and _run_fallback and not use_rapid: - _fb_region = PageRegion( - type=col.type, x=cell_x, y=cell_y, - width=cell_w, height=cell_h, - ) - cell_lang = lang_map.get(col.type, lang) - psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7) - if psm7_words: - psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF] - if psm7_words: - p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10) - if p7_text.strip(): - text = p7_text - avg_conf = round( - sum(w['conf'] for w in psm7_words) / len(psm7_words), 1 - ) - used_engine = 'cell_ocr_psm7' - - # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns --- - # If a narrow cell is still empty, OCR the entire row strip with - # RapidOCR (which handles small text better) and assign 
words by - # X-position overlap with this column. - if not text.strip() and is_narrow and img_bgr is not None: - row_region = PageRegion( - type='_row_strip', x=0, y=row.y, - width=img_w, height=row.height, - ) - strip_words = ocr_region_rapid(img_bgr, row_region) - if strip_words: - # Filter to words overlapping this column's X-range - col_left = col.x - col_right = col.x + col.width - col_words = [] - for sw in strip_words: - sw_left = sw.get('left', 0) - sw_right = sw_left + sw.get('width', 0) - overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left)) - if overlap > sw.get('width', 1) * 0.3: - col_words.append(sw) - if col_words: - col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF] - if col_words: - rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height) - if rs_text.strip(): - text = rs_text - avg_conf = round( - sum(w['conf'] for w in col_words) / len(col_words), 1 - ) - used_engine = 'row_strip_rapid' - - # --- NOISE FILTER: clear cells that contain only OCR artifacts --- - if text.strip(): - text = _clean_cell_text(text) - if not text: - avg_conf = 0.0 - - return { - 'cell_id': f"R{row_idx:02d}_C{col_idx}", - 'row_index': row_idx, - 'col_index': col_idx, - 'col_type': col.type, - 'text': text, - 'confidence': avg_conf, - 'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h}, - 'bbox_pct': { - 'x': round(disp_x / img_w * 100, 2), - 'y': round(disp_y / img_h * 100, 2), - 'w': round(disp_w / img_w * 100, 2), - 'h': round(disp_h / img_h * 100, 2), - }, - 'ocr_engine': used_engine, - } - - -def _is_artifact_row(row: RowGeometry) -> bool: - """Return True if this row contains only scan artifacts, not real text. - - Artifact rows (scanner shadows, noise) typically produce only single-character - detections. A real content row always has at least one token with 2+ characters. 
- """ - if row.word_count == 0: - return True - texts = [w.get('text', '').strip() for w in row.words] - return all(len(t) <= 1 for t in texts) - - -def _heal_row_gaps( - rows: List[RowGeometry], - top_bound: int, - bottom_bound: int, -) -> None: - """Expand row y/height to fill vertical gaps caused by removed adjacent rows. - - After filtering out empty or artifact rows, remaining content rows may have - gaps between them where the removed rows used to be. This function mutates - each row to extend upward/downward to the midpoint of such gaps so that - OCR crops cover the full available content area. - - The first row always extends to top_bound; the last row to bottom_bound. - """ - if not rows: - return - rows.sort(key=lambda r: r.y) - n = len(rows) - orig = [(r.y, r.y + r.height) for r in rows] # snapshot before mutation - - for i, row in enumerate(rows): - # New top: midpoint between previous row's bottom and this row's top - if i == 0: - new_top = top_bound - else: - prev_bot = orig[i - 1][1] - my_top = orig[i][0] - gap = my_top - prev_bot - new_top = prev_bot + gap // 2 if gap > 1 else my_top - - # New bottom: midpoint between this row's bottom and next row's top - if i == n - 1: - new_bottom = bottom_bound - else: - my_bot = orig[i][1] - next_top = orig[i + 1][0] - gap = next_top - my_bot - new_bottom = my_bot + gap // 2 if gap > 1 else my_bot - - row.y = new_top - row.height = max(5, new_bottom - new_top) - - logger.debug( - f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] " - f"(bounds: top={top_bound}, bottom={bottom_bound})" - ) - - -def build_cell_grid( - ocr_img: np.ndarray, - column_regions: List[PageRegion], - row_geometries: List[RowGeometry], - img_w: int, - img_h: int, - lang: str = "eng+deu", - ocr_engine: str = "auto", - img_bgr: Optional[np.ndarray] = None, -) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - """Generic Cell-Grid: Columns × Rows → cells with OCR text. 
- - This is the layout-agnostic foundation. Every column (except column_ignore) - is intersected with every content row to produce numbered cells. - - Args: - ocr_img: Binarized full-page image (for Tesseract). - column_regions: Classified columns from Step 3 (PageRegion list). - row_geometries: Rows from Step 4 (RowGeometry list). - img_w: Image width in pixels. - img_h: Image height in pixels. - lang: Default Tesseract language. - ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'. - img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR). - - Returns: - (cells, columns_meta) where cells is a list of cell dicts and - columns_meta describes the columns used. - """ - # Resolve engine choice - use_rapid = False - if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): - engine_name = ocr_engine - elif ocr_engine == "auto": - use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None - engine_name = "rapid" if use_rapid else "tesseract" - elif ocr_engine == "rapid": - if not RAPIDOCR_AVAILABLE: - logger.warning("RapidOCR requested but not available, falling back to Tesseract") - else: - use_rapid = True - engine_name = "rapid" if use_rapid else "tesseract" - else: - engine_name = "tesseract" - - logger.info(f"build_cell_grid: using OCR engine '{engine_name}'") - - # Filter to content rows only (skip header/footer) - content_rows = [r for r in row_geometries if r.row_type == 'content'] - if not content_rows: - logger.warning("build_cell_grid: no content rows found") - return [], [] - - # Filter phantom rows: rows with no Tesseract words assigned are - # inter-line whitespace gaps that would produce garbage OCR. 
- before = len(content_rows) - content_rows = [r for r in content_rows if r.word_count > 0] - skipped = before - len(content_rows) - if skipped > 0: - logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)") - if not content_rows: - logger.warning("build_cell_grid: no content rows with words found") - return [], [] - - # Use columns only — skip ignore, header, footer, page_ref - _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} - relevant_cols = [c for c in column_regions if c.type not in _skip_types] - if not relevant_cols: - logger.warning("build_cell_grid: no usable columns found") - return [], [] - - # Filter artifact rows: rows whose detected words are all single characters - # are caused by scanner shadows or noise, not real text. - before_art = len(content_rows) - content_rows = [r for r in content_rows if not _is_artifact_row(r)] - artifact_skipped = before_art - len(content_rows) - if artifact_skipped > 0: - logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)") - if not content_rows: - logger.warning("build_cell_grid: no content rows after artifact filtering") - return [], [] - - # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows - # to fill the space so OCR crops are not artificially narrow. 
- _heal_row_gaps( - content_rows, - top_bound=min(c.y for c in relevant_cols), - bottom_bound=max(c.y + c.height for c in relevant_cols), - ) - - # Sort columns left-to-right - relevant_cols.sort(key=lambda c: c.x) - - # Build columns_meta - columns_meta = [ - { - 'index': col_idx, - 'type': col.type, - 'x': col.x, - 'width': col.width, - } - for col_idx, col in enumerate(relevant_cols) - ] - - # Choose OCR language per column type (Tesseract only) - lang_map = { - 'column_en': 'eng', - 'column_de': 'deu', - 'column_example': 'eng+deu', - } - - cells: List[Dict[str, Any]] = [] - - for row_idx, row in enumerate(content_rows): - # Pre-assign each word to exactly one column (nearest center) - col_words = _assign_row_words_to_columns(row, relevant_cols) - for col_idx, col in enumerate(relevant_cols): - cell = _ocr_single_cell( - row_idx, col_idx, row, col, - ocr_img, img_bgr, img_w, img_h, - use_rapid, engine_name, lang, lang_map, - preassigned_words=col_words[col_idx], - ) - cells.append(cell) - - # --- BATCH FALLBACK: re-OCR empty cells by column strip --- - # Collect cells that are still empty but have visible pixels. - # Instead of calling Tesseract once per cell (expensive), crop an entire - # column strip and run OCR once, then assign words to cells by Y position. 
- empty_by_col: Dict[int, List[int]] = {} # col_idx → [cell list indices] - for ci, cell in enumerate(cells): - if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7': - bpx = cell['bbox_px'] - x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h'] - if w > 0 and h > 0 and ocr_img is not None: - crop = ocr_img[y:y + h, x:x + w] - if crop.size > 0: - dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size - if dark_ratio > 0.005: - empty_by_col.setdefault(cell['col_index'], []).append(ci) - - for col_idx, cell_indices in empty_by_col.items(): - if len(cell_indices) < 3: - continue # Not worth batching for < 3 cells - - # Find the column strip bounding box (union of all empty cell bboxes) - min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices) - max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices) - col_x = cells[cell_indices[0]]['bbox_px']['x'] - col_w = cells[cell_indices[0]]['bbox_px']['w'] - - strip_region = PageRegion( - type=relevant_cols[col_idx].type, - x=col_x, y=min_y, - width=col_w, height=max_y_h - min_y, - ) - strip_lang = lang_map.get(relevant_cols[col_idx].type, lang) - - if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None: - strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten")) - elif engine_name == "lighton" and img_bgr is not None: - strip_words = ocr_region_lighton(img_bgr, strip_region) - elif use_rapid and img_bgr is not None: - strip_words = ocr_region_rapid(img_bgr, strip_region) - else: - strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6) - - if not strip_words: - continue - - strip_words = [w for w in strip_words if w.get('conf', 0) >= 30] - if not strip_words: - continue - - # Assign words to cells by Y overlap - for ci in cell_indices: - cell_y = cells[ci]['bbox_px']['y'] - cell_h = cells[ci]['bbox_px']['h'] - cell_mid_y = cell_y + cell_h / 2 - - matched_words = [ - w for w 
in strip_words - if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8 - ] - if matched_words: - matched_words.sort(key=lambda w: w['left']) - batch_text = ' '.join(w['text'] for w in matched_words) - batch_text = _clean_cell_text(batch_text) - if batch_text.strip(): - cells[ci]['text'] = batch_text - cells[ci]['confidence'] = round( - sum(w['conf'] for w in matched_words) / len(matched_words), 1 - ) - cells[ci]['ocr_engine'] = 'batch_column_ocr' - - batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip()) - if batch_filled > 0: - logger.info( - f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} " - f"empty cells in column {col_idx}" - ) - - # Post-OCR: remove rows where ALL cells are empty (inter-row gaps - # that had stray Tesseract artifacts giving word_count > 0). - rows_with_text: set = set() - for cell in cells: - if cell['text'].strip(): - rows_with_text.add(cell['row_index']) - before_filter = len(cells) - cells = [c for c in cells if c['row_index'] in rows_with_text] - empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1) - if empty_rows_removed > 0: - logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR") - - logger.info(f"build_cell_grid: {len(cells)} cells from " - f"{len(content_rows)} rows × {len(relevant_cols)} columns, " - f"engine={engine_name}") - - return cells, columns_meta - - -def build_cell_grid_streaming( - ocr_img: np.ndarray, - column_regions: List[PageRegion], - row_geometries: List[RowGeometry], - img_w: int, - img_h: int, - lang: str = "eng+deu", - ocr_engine: str = "auto", - img_bgr: Optional[np.ndarray] = None, -) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]: - """Like build_cell_grid(), but yields each cell as it is OCR'd. - - Yields: - (cell_dict, columns_meta, total_cells) for each cell. 
- """ - # Resolve engine choice (same as build_cell_grid) - use_rapid = False - if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"): - engine_name = ocr_engine - elif ocr_engine == "auto": - use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None - engine_name = "rapid" if use_rapid else "tesseract" - elif ocr_engine == "rapid": - if not RAPIDOCR_AVAILABLE: - logger.warning("RapidOCR requested but not available, falling back to Tesseract") - else: - use_rapid = True - engine_name = "rapid" if use_rapid else "tesseract" - else: - engine_name = "tesseract" - - content_rows = [r for r in row_geometries if r.row_type == 'content'] - if not content_rows: - return - - # Filter phantom rows: rows with no Tesseract words assigned are - # inter-line whitespace gaps that would produce garbage OCR. - before = len(content_rows) - content_rows = [r for r in content_rows if r.word_count > 0] - skipped = before - len(content_rows) - if skipped > 0: - logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)") - if not content_rows: - return - - _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} - relevant_cols = [c for c in column_regions if c.type not in _skip_types] - if not relevant_cols: - return - - # Filter artifact rows + heal gaps (same logic as build_cell_grid) - before_art = len(content_rows) - content_rows = [r for r in content_rows if not _is_artifact_row(r)] - artifact_skipped = before_art - len(content_rows) - if artifact_skipped > 0: - logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows") - if not content_rows: - return - _heal_row_gaps( - content_rows, - top_bound=min(c.y for c in relevant_cols), - bottom_bound=max(c.y + c.height for c in relevant_cols), - ) - - relevant_cols.sort(key=lambda c: c.x) - - columns_meta = [ - { - 'index': col_idx, - 'type': col.type, - 'x': col.x, - 'width': col.width, - } - for col_idx, col in 
enumerate(relevant_cols) - ] - - lang_map = { - 'column_en': 'eng', - 'column_de': 'deu', - 'column_example': 'eng+deu', - } - - total_cells = len(content_rows) * len(relevant_cols) - - for row_idx, row in enumerate(content_rows): - # Pre-assign each word to exactly one column (nearest center) - col_words = _assign_row_words_to_columns(row, relevant_cols) - for col_idx, col in enumerate(relevant_cols): - cell = _ocr_single_cell( - row_idx, col_idx, row, col, - ocr_img, img_bgr, img_w, img_h, - use_rapid, engine_name, lang, lang_map, - preassigned_words=col_words[col_idx], - ) - yield cell, columns_meta, total_cells - - -def _cells_to_vocab_entries( - cells: List[Dict[str, Any]], - columns_meta: List[Dict[str, Any]], -) -> List[Dict[str, Any]]: - """Map generic cells to vocab entries with english/german/example fields. - - Groups cells by row_index, maps col_type → field name, and produces - one entry per row (only rows with at least one non-empty field). - """ - # Determine image dimensions from first cell (for row-level bbox) - col_type_to_field = { - 'column_en': 'english', - 'column_de': 'german', - 'column_example': 'example', - 'page_ref': 'source_page', - 'column_marker': 'marker', - } - bbox_key_map = { - 'column_en': 'bbox_en', - 'column_de': 'bbox_de', - 'column_example': 'bbox_ex', - 'page_ref': 'bbox_ref', - 'column_marker': 'bbox_marker', - } - - # Group cells by row_index - rows: Dict[int, List[Dict]] = {} - for cell in cells: - ri = cell['row_index'] - rows.setdefault(ri, []).append(cell) - - entries: List[Dict[str, Any]] = [] - for row_idx in sorted(rows.keys()): - row_cells = rows[row_idx] - entry: Dict[str, Any] = { - 'row_index': row_idx, - 'english': '', - 'german': '', - 'example': '', - 'source_page': '', - 'marker': '', - 'confidence': 0.0, - 'bbox': None, - 'bbox_en': None, - 'bbox_de': None, - 'bbox_ex': None, - 'bbox_ref': None, - 'bbox_marker': None, - 'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '', - } - - 
confidences = [] - for cell in row_cells: - col_type = cell['col_type'] - field = col_type_to_field.get(col_type) - if field: - entry[field] = cell['text'] - bbox_field = bbox_key_map.get(col_type) - if bbox_field: - entry[bbox_field] = cell['bbox_pct'] - if cell['confidence'] > 0: - confidences.append(cell['confidence']) - - # Compute row-level bbox as union of all cell bboxes - all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')] - if all_bboxes: - min_x = min(b['x'] for b in all_bboxes) - min_y = min(b['y'] for b in all_bboxes) - max_x2 = max(b['x'] + b['w'] for b in all_bboxes) - max_y2 = max(b['y'] + b['h'] for b in all_bboxes) - entry['bbox'] = { - 'x': round(min_x, 2), - 'y': round(min_y, 2), - 'w': round(max_x2 - min_x, 2), - 'h': round(max_y2 - min_y, 2), - } - - entry['confidence'] = round( - sum(confidences) / len(confidences), 1 - ) if confidences else 0.0 - - # Only include if at least one mapped field has text - has_content = any( - entry.get(f) - for f in col_type_to_field.values() - ) - if has_content: - entries.append(entry) - - return entries - - -# Regex: line starts with phonetic bracket content only (no real word before it) -_PHONETIC_ONLY_RE = re.compile( - r'''^\s*[\[\('"]*[^\]]*[\])\s]*$''' +from cv_ocr_engines import ( # noqa: F401 + _fix_character_confusion, + _fix_phonetic_brackets, ) - - -def _is_phonetic_only_text(text: str) -> bool: - """Check if text consists only of phonetic transcription. 
- - Phonetic-only patterns: - ['mani serva] → True - [dɑːns] → True - ["a:mand] → True - almond ['a:mand] → False (has real word before bracket) - Mandel → False - """ - t = text.strip() - if not t: - return False - # Must contain at least one bracket - if '[' not in t and ']' not in t: - return False - # Remove all bracket content and surrounding punctuation/whitespace - without_brackets = re.sub(r"\[.*?\]", '', t) - without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets) - # If nothing meaningful remains, it's phonetic-only - alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets)) - return len(alpha_remaining) < 2 - - -def _merge_phonetic_continuation_rows( - entries: List[Dict[str, Any]], -) -> List[Dict[str, Any]]: - """Merge rows that contain only phonetic transcription into previous entry. - - In dictionary pages, phonetic transcription sometimes wraps to the next - row. E.g.: - Row 28: EN="it's a money-saver" DE="es spart Kosten" - Row 29: EN="['mani serva]" DE="" - - Row 29 is phonetic-only → merge into row 28's EN field. 
- """ - if len(entries) < 2: - return entries - - merged: List[Dict[str, Any]] = [] - for entry in entries: - en = (entry.get('english') or '').strip() - de = (entry.get('german') or '').strip() - ex = (entry.get('example') or '').strip() - - # Check if this entry is phonetic-only (EN has only phonetics, DE empty) - if merged and _is_phonetic_only_text(en) and not de: - prev = merged[-1] - prev_en = (prev.get('english') or '').strip() - # Append phonetic to previous entry's EN - if prev_en: - prev['english'] = prev_en + ' ' + en - else: - prev['english'] = en - # If there was an example, append to previous too - if ex: - prev_ex = (prev.get('example') or '').strip() - prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex - logger.debug( - f"Merged phonetic row {entry.get('row_index')} " - f"into previous entry: {prev['english']!r}" - ) - continue - - merged.append(entry) - - return merged - - -def _merge_continuation_rows( - entries: List[Dict[str, Any]], -) -> List[Dict[str, Any]]: - """Merge multi-line vocabulary entries where text wraps to the next row. - - A row is a continuation of the previous entry when: - - EN has text, but DE is empty - - EN starts with a lowercase letter (not a new vocab entry) - - Previous entry's EN does NOT end with a sentence terminator (.!?) - - The continuation text has fewer than 4 words (not an example sentence) - - The row was not already merged as phonetic - - Example: - Row 5: EN="to put up" DE="aufstellen" - Row 6: EN="with sth." DE="" - → Merged: EN="to put up with sth." 
DE="aufstellen" - """ - if len(entries) < 2: - return entries - - merged: List[Dict[str, Any]] = [] - for entry in entries: - en = (entry.get('english') or '').strip() - de = (entry.get('german') or '').strip() - - if merged and en and not de: - # Check: not phonetic (already handled) - if _is_phonetic_only_text(en): - merged.append(entry) - continue - - # Check: starts with lowercase - first_alpha = next((c for c in en if c.isalpha()), '') - starts_lower = first_alpha and first_alpha.islower() - - # Check: fewer than 4 words (not an example sentence) - word_count = len(en.split()) - is_short = word_count < 4 - - # Check: previous entry doesn't end with sentence terminator - prev = merged[-1] - prev_en = (prev.get('english') or '').strip() - prev_ends_sentence = prev_en and prev_en[-1] in '.!?' - - if starts_lower and is_short and not prev_ends_sentence: - # Merge into previous entry - prev['english'] = (prev_en + ' ' + en).strip() - # Merge example if present - ex = (entry.get('example') or '').strip() - if ex: - prev_ex = (prev.get('example') or '').strip() - prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex - logger.debug( - f"Merged continuation row {entry.get('row_index')} " - f"into previous entry: {prev['english']!r}" - ) - continue - - merged.append(entry) - - return merged - - -def build_word_grid( - ocr_img: np.ndarray, - column_regions: List[PageRegion], - row_geometries: List[RowGeometry], - img_w: int, - img_h: int, - lang: str = "eng+deu", - ocr_engine: str = "auto", - img_bgr: Optional[np.ndarray] = None, - pronunciation: str = "british", -) -> List[Dict[str, Any]]: - """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing. - - Wrapper around build_cell_grid() that adds vocabulary-specific logic: - - Maps cells to english/german/example entries - - Applies character confusion fixes, IPA lookup, comma splitting, etc. - - Falls back to returning raw cells if no vocab columns detected. 
- - Args: - ocr_img: Binarized full-page image (for Tesseract). - column_regions: Classified columns from Step 3. - row_geometries: Rows from Step 4. - img_w, img_h: Image dimensions. - lang: Default Tesseract language. - ocr_engine: 'tesseract', 'rapid', or 'auto'. - img_bgr: BGR color image (required for RapidOCR). - pronunciation: 'british' or 'american' for IPA lookup. - - Returns: - List of entry dicts with english/german/example text and bbox info (percent). - """ - cells, columns_meta = build_cell_grid( - ocr_img, column_regions, row_geometries, img_w, img_h, - lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr, - ) - - if not cells: - return [] - - # Check if vocab layout is present - col_types = {c['type'] for c in columns_meta} - if not (col_types & {'column_en', 'column_de'}): - logger.info("build_word_grid: no vocab columns — returning raw cells") - return cells - - # Vocab mapping: cells → entries - entries = _cells_to_vocab_entries(cells, columns_meta) - - # --- Post-processing pipeline (deterministic, no LLM) --- - n_raw = len(entries) - - # 0a. Merge phonetic-only continuation rows into previous entry - entries = _merge_phonetic_continuation_rows(entries) - - # 0b. Merge multi-line continuation rows (lowercase EN, empty DE) - entries = _merge_continuation_rows(entries) - - # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in - # llm_review_entries_streaming so changes are visible to the user in Step 6. - - # 2. Replace OCR'd phonetics with dictionary IPA - entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation) - - # 3. Split comma-separated word forms (break, broke, broken → 3 entries) - entries = _split_comma_entries(entries) - - # 4. 
Attach example sentences (rows without DE → examples for preceding entry) - entries = _attach_example_sentences(entries) - - engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown' - logger.info(f"build_word_grid: {len(entries)} entries from " - f"{n_raw} raw → {len(entries)} after post-processing " - f"(engine={engine_name})") - - return entries - - -# ============================================================================= -# Stage 6: Multi-Pass OCR -# ============================================================================= - -def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str, - psm: int, fallback_psm: Optional[int] = None, - min_confidence: float = 40.0) -> List[Dict[str, Any]]: - """Run Tesseract OCR on a specific region with given PSM. - - Args: - ocr_img: Binarized full-page image. - region: Region to crop and OCR. - lang: Tesseract language string. - psm: Page Segmentation Mode. - fallback_psm: If confidence too low, retry with this PSM per line. - min_confidence: Minimum average confidence before fallback. - - Returns: - List of word dicts with text, position, confidence. 
- """ - # Crop region - crop = ocr_img[region.y:region.y + region.height, - region.x:region.x + region.width] - - if crop.size == 0: - return [] - - # Convert to PIL for pytesseract - pil_img = Image.fromarray(crop) - - # Run Tesseract with specified PSM - config = f'--psm {psm} --oem 3' - try: - data = pytesseract.image_to_data(pil_img, lang=lang, config=config, - output_type=pytesseract.Output.DICT) - except Exception as e: - logger.warning(f"Tesseract failed for region {region.type}: {e}") - return [] - - words = [] - for i in range(len(data['text'])): - text = data['text'][i].strip() - conf = int(data['conf'][i]) - if not text or conf < 10: - continue - words.append({ - 'text': text, - 'left': data['left'][i] + region.x, # Absolute coords - 'top': data['top'][i] + region.y, - 'width': data['width'][i], - 'height': data['height'][i], - 'conf': conf, - 'region_type': region.type, - }) - - # Check average confidence - if words and fallback_psm is not None: - avg_conf = sum(w['conf'] for w in words) / len(words) - if avg_conf < min_confidence: - logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, " - f"trying fallback PSM {fallback_psm}") - words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm) - - return words - - -def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion, - lang: str, psm: int) -> List[Dict[str, Any]]: - """OCR a region line by line (fallback for low-confidence regions). - - Splits the region into horizontal strips based on text density, - then OCRs each strip individually with the given PSM. 
- """ - crop = ocr_img[region.y:region.y + region.height, - region.x:region.x + region.width] - - if crop.size == 0: - return [] - - # Find text lines via horizontal projection - inv = cv2.bitwise_not(crop) - h_proj = np.sum(inv, axis=1) - threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0 - - # Find line boundaries - lines = [] - in_text = False - line_start = 0 - for y in range(len(h_proj)): - if h_proj[y] > threshold and not in_text: - line_start = y - in_text = True - elif h_proj[y] <= threshold and in_text: - if y - line_start > 5: # Minimum line height - lines.append((line_start, y)) - in_text = False - if in_text and len(h_proj) - line_start > 5: - lines.append((line_start, len(h_proj))) - - all_words = [] - config = f'--psm {psm} --oem 3' - - for line_y_start, line_y_end in lines: - # Add small padding - pad = 3 - y1 = max(0, line_y_start - pad) - y2 = min(crop.shape[0], line_y_end + pad) - line_crop = crop[y1:y2, :] - - if line_crop.size == 0: - continue - - pil_img = Image.fromarray(line_crop) - try: - data = pytesseract.image_to_data(pil_img, lang=lang, config=config, - output_type=pytesseract.Output.DICT) - except Exception: - continue - - for i in range(len(data['text'])): - text = data['text'][i].strip() - conf = int(data['conf'][i]) - if not text or conf < 10: - continue - all_words.append({ - 'text': text, - 'left': data['left'][i] + region.x, - 'top': data['top'][i] + region.y + y1, - 'width': data['width'][i], - 'height': data['height'][i], - 'conf': conf, - 'region_type': region.type, - }) - - return all_words - - -def run_multi_pass_ocr(ocr_img: np.ndarray, - regions: List[PageRegion], - lang: str = "eng+deu") -> Dict[str, List[Dict]]: - """Run OCR on each detected region with optimized settings. - - Args: - ocr_img: Binarized full-page image. - regions: Detected page regions. - lang: Default language. - - Returns: - Dict mapping region type to list of word dicts. 
- """ - results: Dict[str, List[Dict]] = {} - - _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'} - for region in regions: - if region.type in _ocr_skip: - continue # Skip non-content regions - - if region.type == 'column_en': - words = ocr_region(ocr_img, region, lang='eng', psm=4) - elif region.type == 'column_de': - words = ocr_region(ocr_img, region, lang='deu', psm=4) - elif region.type == 'column_example': - words = ocr_region(ocr_img, region, lang=lang, psm=6, - fallback_psm=7, min_confidence=40.0) - else: - words = ocr_region(ocr_img, region, lang=lang, psm=6) - - results[region.type] = words - logger.info(f"OCR {region.type}: {len(words)} words") - - return results - - -# ============================================================================= -# Stage 7: Line Alignment → Vocabulary Entries -# ============================================================================= - -def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]: - """Group words by Y position into lines, sorted by X within each line.""" - if not words: - return [] - - sorted_words = sorted(words, key=lambda w: (w['top'], w['left'])) - lines: List[List[Dict]] = [] - current_line: List[Dict] = [sorted_words[0]] - current_y = sorted_words[0]['top'] - - for word in sorted_words[1:]: - if abs(word['top'] - current_y) <= y_tolerance_px: - current_line.append(word) - else: - current_line.sort(key=lambda w: w['left']) - lines.append(current_line) - current_line = [word] - current_y = word['top'] - - if current_line: - current_line.sort(key=lambda w: w['left']) - lines.append(current_line) - - return lines - - -def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]], - regions: List[PageRegion], - y_tolerance_px: int = 25) -> List[VocabRow]: - """Align OCR results from different columns into vocabulary rows. 
- - Uses Y-coordinate matching to pair English words, German translations, - and example sentences that appear on the same line. - - Args: - ocr_results: Dict mapping region type to word lists. - regions: Detected regions (for reference). - y_tolerance_px: Max Y-distance to consider words on the same row. - - Returns: - List of VocabRow objects. - """ - # If no vocabulary columns detected (e.g. plain text page), return empty - if 'column_en' not in ocr_results and 'column_de' not in ocr_results: - logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty") - return [] - - # Group words into lines per column - en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px) - de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px) - ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px) - - def line_y_center(line: List[Dict]) -> float: - return sum(w['top'] + w['height'] / 2 for w in line) / len(line) - - def line_text(line: List[Dict]) -> str: - return ' '.join(w['text'] for w in line) - - def line_confidence(line: List[Dict]) -> float: - return sum(w['conf'] for w in line) / len(line) if line else 0 - - # Build EN entries as the primary reference - vocab_rows: List[VocabRow] = [] - - for en_line in en_lines: - en_y = line_y_center(en_line) - en_text = line_text(en_line) - en_conf = line_confidence(en_line) - - # Skip very short or likely header content - if len(en_text.strip()) < 2: - continue - - # Find matching DE line - de_text = "" - de_conf = 0.0 - best_de_dist = float('inf') - best_de_idx = -1 - for idx, de_line in enumerate(de_lines): - dist = abs(line_y_center(de_line) - en_y) - if dist < y_tolerance_px and dist < best_de_dist: - best_de_dist = dist - best_de_idx = idx - - if best_de_idx >= 0: - de_text = line_text(de_lines[best_de_idx]) - de_conf = line_confidence(de_lines[best_de_idx]) - - # Find matching example line - ex_text = "" - 
ex_conf = 0.0 - best_ex_dist = float('inf') - best_ex_idx = -1 - for idx, ex_line in enumerate(ex_lines): - dist = abs(line_y_center(ex_line) - en_y) - if dist < y_tolerance_px and dist < best_ex_dist: - best_ex_dist = dist - best_ex_idx = idx - - if best_ex_idx >= 0: - ex_text = line_text(ex_lines[best_ex_idx]) - ex_conf = line_confidence(ex_lines[best_ex_idx]) - - avg_conf = en_conf - conf_count = 1 - if de_conf > 0: - avg_conf += de_conf - conf_count += 1 - if ex_conf > 0: - avg_conf += ex_conf - conf_count += 1 - - vocab_rows.append(VocabRow( - english=en_text.strip(), - german=de_text.strip(), - example=ex_text.strip(), - confidence=avg_conf / conf_count, - y_position=int(en_y), - )) - - # Handle multi-line wrapping in example column: - # If an example line has no matching EN/DE, append to previous entry - matched_ex_ys = set() - for row in vocab_rows: - if row.example: - matched_ex_ys.add(row.y_position) - - for ex_line in ex_lines: - ex_y = line_y_center(ex_line) - # Check if already matched - already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys) - if already_matched: - continue - - # Find nearest previous vocab row - best_row = None - best_dist = float('inf') - for row in vocab_rows: - dist = ex_y - row.y_position - if 0 < dist < y_tolerance_px * 3 and dist < best_dist: - best_dist = dist - best_row = row - - if best_row: - continuation = line_text(ex_line).strip() - if continuation: - best_row.example = (best_row.example + " " + continuation).strip() - - # Sort by Y position - vocab_rows.sort(key=lambda r: r.y_position) - - return vocab_rows - - -# ============================================================================= -# Stage 8: Optional LLM Post-Correction -# ============================================================================= - -async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow], - confidence_threshold: float = 50.0, - enabled: bool = False) -> List[VocabRow]: - """Optionally send low-confidence 
regions to Qwen-VL for correction. - - Default: disabled. Enable per parameter. - - Args: - img: Original BGR image. - vocab_rows: Current vocabulary rows. - confidence_threshold: Rows below this get LLM correction. - enabled: Whether to actually run LLM correction. - - Returns: - Corrected vocabulary rows. - """ - if not enabled: - return vocab_rows - - # TODO: Implement Qwen-VL correction for low-confidence entries - # For each row with confidence < threshold: - # 1. Crop the relevant region from img - # 2. Send crop + OCR text to Qwen-VL - # 3. Replace text if LLM provides a confident correction - logger.info(f"LLM post-correction skipped (not yet implemented)") - return vocab_rows - - -# ============================================================================= -# Orchestrator -# ============================================================================= - -async def run_cv_pipeline( - pdf_data: Optional[bytes] = None, - image_data: Optional[bytes] = None, - page_number: int = 0, - zoom: float = 3.0, - enable_dewarp: bool = True, - enable_llm_correction: bool = False, - lang: str = "eng+deu", -) -> PipelineResult: - """Run the complete CV document reconstruction pipeline. - - Args: - pdf_data: Raw PDF bytes (mutually exclusive with image_data). - image_data: Raw image bytes (mutually exclusive with pdf_data). - page_number: 0-indexed page number (for PDF). - zoom: PDF rendering zoom factor. - enable_dewarp: Whether to run dewarp stage. - enable_llm_correction: Whether to run LLM post-correction. - lang: Tesseract language string. - - Returns: - PipelineResult with vocabulary and timing info. 
- """ - if not CV_PIPELINE_AVAILABLE: - return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)") - - result = PipelineResult() - total_start = time.time() - - try: - # Stage 1: Render - t = time.time() - if pdf_data: - img = render_pdf_high_res(pdf_data, page_number, zoom) - elif image_data: - img = render_image_high_res(image_data) - else: - return PipelineResult(error="No input data (pdf_data or image_data required)") - result.stages['render'] = round(time.time() - t, 2) - result.image_width = img.shape[1] - result.image_height = img.shape[0] - logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s") - - # Stage 2: Deskew - t = time.time() - img, angle = deskew_image(img) - result.stages['deskew'] = round(time.time() - t, 2) - logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s") - - # Stage 3: Dewarp - if enable_dewarp: - t = time.time() - img, _dewarp_info = dewarp_image(img) - result.stages['dewarp'] = round(time.time() - t, 2) - - # Stage 4: Dual image preparation - t = time.time() - ocr_img = create_ocr_image(img) - layout_img = create_layout_image(img) - result.stages['image_prep'] = round(time.time() - t, 2) - - # Stage 5: Layout analysis - t = time.time() - regions = analyze_layout(layout_img, ocr_img) - result.stages['layout'] = round(time.time() - t, 2) - result.columns_detected = len([r for r in regions if r.type.startswith('column')]) - logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s") - - # Stage 6: Multi-pass OCR - t = time.time() - ocr_results = run_multi_pass_ocr(ocr_img, regions, lang) - result.stages['ocr'] = round(time.time() - t, 2) - total_words = sum(len(w) for w in ocr_results.values()) - result.word_count = total_words - logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s") - - # Stage 7: Line alignment - t = time.time() - vocab_rows = match_lines_to_vocab(ocr_results, regions) - 
result.stages['alignment'] = round(time.time() - t, 2) - - # Stage 8: Optional LLM correction - if enable_llm_correction: - t = time.time() - vocab_rows = await llm_post_correct(img, vocab_rows) - result.stages['llm_correction'] = round(time.time() - t, 2) - - # Convert to output format - result.vocabulary = [ - { - "english": row.english, - "german": row.german, - "example": row.example, - "confidence": round(row.confidence, 1), - } - for row in vocab_rows - if row.english or row.german # Skip empty rows - ] - - result.duration_seconds = round(time.time() - total_start, 2) - logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s") - - except Exception as e: - logger.error(f"CV Pipeline error: {e}") - import traceback - logger.debug(traceback.format_exc()) - result.error = str(e) - result.duration_seconds = round(time.time() - total_start, 2) - - return result - - -# --------------------------------------------------------------------------- -# LLM-based OCR Correction (Step 6) -# --------------------------------------------------------------------------- - -import httpx -import os -import json as _json -import re as _re - -_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434") -OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b") -_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20")) -logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE) - -# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]" -_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]') - -# Regex: digit adjacent to a letter — the hallmark of OCR digit↔letter confusion. -# Matches digits 0,1,5,6,8 (common OCR confusions: 0→O, 1→l/I, 5→S, 6→G, 8→B) -# when they appear inside or next to a word character. 
-_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])') - - -def _entry_needs_review(entry: Dict) -> bool: - """Check if an entry should be sent to the LLM for review. - - Sends all non-empty entries that don't have IPA phonetic transcriptions. - The LLM prompt and _is_spurious_change() guard against unwanted changes. - """ - en = entry.get("english", "") or "" - de = entry.get("german", "") or "" - - # Skip completely empty entries - if not en.strip() and not de.strip(): - return False - # Skip entries with IPA/phonetic brackets — dictionary-corrected, LLM must not touch them - if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de): - return False - return True - - -def _build_llm_prompt(table_lines: List[Dict]) -> str: - """Build the LLM correction prompt for a batch of entries.""" - return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch). - -DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden. - -NUR diese Korrekturen sind erlaubt: -- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball" -- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old" -- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin" -- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See" -- Ziffer 6 statt G oder g: "6eld" → "Geld" -- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help" - -ABSOLUT VERBOTEN — aendere NIEMALS: -- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst -- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN -- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst -- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest -- Eigennamen: Ben, London, China, Africa, Shakespeare usw. -- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw. 
-- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren -- Beispielsaetze in der ex-Spalte — NIEMALS aendern - -Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false. - -Antworte NUR mit dem JSON-Array. Kein Text davor oder danach. -Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge). - -/no_think - -Eingabe: -{_json.dumps(table_lines, ensure_ascii=False, indent=2)}""" - - -def _is_spurious_change(old_val: str, new_val: str) -> bool: - """Detect LLM changes that are likely wrong and should be discarded. - - Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are - legitimate OCR corrections. Everything else is rejected. - - Filters out: - - Case-only changes - - Changes that don't contain any digit→letter fix - - Completely different words (LLM translating or hallucinating) - - Additions or removals of whole words (count changed) - """ - if not old_val or not new_val: - return False - - # Case-only change — never a real OCR error - if old_val.lower() == new_val.lower(): - return True - - # If the word count changed significantly, the LLM rewrote rather than fixed - old_words = old_val.split() - new_words = new_val.split() - if abs(len(old_words) - len(new_words)) > 1: - return True - - # Core rule: a legitimate correction replaces a digit with the corresponding - # letter. If the change doesn't include such a substitution, reject it. - # Build a set of (old_char, new_char) pairs that differ between old and new. - # Use character-level diff heuristic: if lengths are close, zip and compare. 
- # Map of characters that OCR commonly misreads → set of correct replacements - _OCR_CHAR_MAP = { - # Digits mistaken for letters - '0': set('oOgG'), - '1': set('lLiI'), - '5': set('sS'), - '6': set('gG'), - '8': set('bB'), - # Non-letter symbols mistaken for letters - '|': set('lLiI1'), # pipe → lowercase l, capital I, or digit 1 - 'l': set('iI|1'), # lowercase l → capital I (and reverse) - } - has_valid_fix = False - if len(old_val) == len(new_val): - for oc, nc in zip(old_val, new_val): - if oc != nc: - if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]: - has_valid_fix = True - elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]: - # Reverse check (e.g. l→I where new is the "correct" char) - has_valid_fix = True - else: - # Length changed by 1: accept if old had a suspicious char sequence - _OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]') - if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val): - has_valid_fix = True - - if not has_valid_fix: - return True # Reject — looks like translation or hallucination - - return False - - -def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]: - """Compare original entries with LLM-corrected ones, return (changes, corrected_entries).""" - changes = [] - entries_out = [] - for i, orig in enumerate(originals): - if i < len(corrected): - c = corrected[i] - entry = dict(orig) - for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]: - new_val = c.get(key, "").strip() - old_val = (orig.get(field_name, "") or "").strip() - if new_val and new_val != old_val: - # Filter spurious LLM changes - if _is_spurious_change(old_val, new_val): - continue - changes.append({ - "row_index": orig.get("row_index", i), - "field": field_name, - "old": old_val, - "new": new_val, - }) - entry[field_name] = new_val - entry["llm_corrected"] = True - entries_out.append(entry) - else: - entries_out.append(dict(orig)) - return changes, entries_out - - -# ─── 
Spell-Checker OCR Review (Rule-Based, no LLM) ──────────────────────────── - -REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm" - -try: - from spellchecker import SpellChecker as _SpellChecker - _en_spell = _SpellChecker(language='en', distance=1) - _de_spell = _SpellChecker(language='de', distance=1) - _SPELL_AVAILABLE = True - logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE) -except ImportError: - _SPELL_AVAILABLE = False - logger.warning("pyspellchecker not installed — falling back to LLM review") - -# ─── Page-Ref Normalization ─────────────────────────────────────────────────── -# Normalizes OCR variants like "p-60", "p 61", "p60" → "p.60" -_PAGE_REF_RE = _re.compile(r'\bp[\s\-]?(\d+)', _re.IGNORECASE) - - -def _normalize_page_ref(text: str) -> str: - """Normalize page references: 'p-60' / 'p 61' / 'p60' → 'p.60'.""" - if not text: - return text - return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text) - - -# Suspicious OCR chars → ordered list of most-likely correct replacements -_SPELL_SUBS: Dict[str, List[str]] = { - '0': ['O', 'o'], - '1': ['l', 'I'], - '5': ['S', 's'], - '6': ['G', 'g'], - '8': ['B', 'b'], - '|': ['I', 'l', '1'], -} -_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys()) - -# Tokenizer: word tokens (letters + pipe) alternating with separators -_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)') - - -def _spell_dict_knows(word: str) -> bool: - """True if word is known in EN or DE dictionary.""" - if not _SPELL_AVAILABLE: - return False - w = word.lower() - return bool(_en_spell.known([w])) or bool(_de_spell.known([w])) - - -def _spell_fix_token(token: str, field: str = "") -> Optional[str]: - """Return corrected form of token, or None if no fix needed/possible. - - *field* is 'english' or 'german' — used to pick the right dictionary - for general spell correction (step 3 below). - """ - has_suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token) - - # 1. 
Already known word → no fix needed - if _spell_dict_knows(token): - return None - - # 2. Digit/pipe substitution (existing logic) - if has_suspicious: - # Standalone pipe → capital I - if token == '|': - return 'I' - # Dictionary-backed single-char substitution - for i, ch in enumerate(token): - if ch not in _SPELL_SUBS: - continue - for replacement in _SPELL_SUBS[ch]: - candidate = token[:i] + replacement + token[i + 1:] - if _spell_dict_knows(candidate): - return candidate - # Structural rule: suspicious char at position 0 + rest is all lowercase letters - first = token[0] - if first in _SPELL_SUBS and len(token) >= 2: - rest = token[1:] - if rest.isalpha() and rest.islower(): - candidate = _SPELL_SUBS[first][0] + rest - if not candidate[0].isdigit(): - return candidate - - # 3. OCR umlaut confusion: OCR often drops umlaut dots (ü→i, ä→a, ö→o, ü→u) - # Try single-char umlaut substitutions and check against dictionary. - if len(token) >= 3 and token.isalpha() and field == "german": - _UMLAUT_SUBS = {'a': 'ä', 'o': 'ö', 'u': 'ü', 'i': 'ü', - 'A': 'Ä', 'O': 'Ö', 'U': 'Ü', 'I': 'Ü'} - for i, ch in enumerate(token): - if ch in _UMLAUT_SUBS: - candidate = token[:i] + _UMLAUT_SUBS[ch] + token[i + 1:] - if _spell_dict_knows(candidate): - return candidate - - # 4. General spell correction for unknown words (no digits/pipes) - # e.g. "beautful" → "beautiful" - if not has_suspicious and len(token) >= 3 and token.isalpha(): - spell = _en_spell if field == "english" else _de_spell if field == "german" else None - if spell is not None: - correction = spell.correction(token.lower()) - if correction and correction != token.lower(): - # Preserve original capitalisation pattern - if token[0].isupper(): - correction = correction[0].upper() + correction[1:] - if _spell_dict_knows(correction): - return correction - return None - - -def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]: - """Apply OCR corrections to a text field. Returns (fixed_text, was_changed). 
- - *field* is 'english' or 'german' — forwarded to _spell_fix_token for - dictionary selection. - """ - if not text: - return text, False - has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS) - # If no suspicious chars AND no alpha chars that could be misspelled, skip - if not has_suspicious and not any(c.isalpha() for c in text): - return text, False - # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ") - fixed = _re.sub(r'(? Dict: - """Rule-based OCR correction: spell-checker + structural heuristics. - - Deterministic — never translates, never touches IPA, never hallucinates. - """ - t0 = time.time() - changes: List[Dict] = [] - all_corrected: List[Dict] = [] - for i, entry in enumerate(entries): - e = dict(entry) - # Page-ref normalization (always, regardless of review status) - old_ref = (e.get("source_page") or "").strip() - if old_ref: - new_ref = _normalize_page_ref(old_ref) - if new_ref != old_ref: - changes.append({ - "row_index": e.get("row_index", i), - "field": "source_page", - "old": old_ref, - "new": new_ref, - }) - e["source_page"] = new_ref - e["llm_corrected"] = True - if not _entry_needs_review(e): - all_corrected.append(e) - continue - for field_name in ("english", "german", "example"): - old_val = (e.get(field_name) or "").strip() - if not old_val: - continue - # example field is mixed-language — try German first (for umlauts) - lang = "german" if field_name in ("german", "example") else "english" - new_val, was_changed = _spell_fix_field(old_val, field=lang) - if was_changed and new_val != old_val: - changes.append({ - "row_index": e.get("row_index", i), - "field": field_name, - "old": old_val, - "new": new_val, - }) - e[field_name] = new_val - e["llm_corrected"] = True - all_corrected.append(e) - duration_ms = int((time.time() - t0) * 1000) - return { - "entries_original": entries, - "entries_corrected": all_corrected, - "changes": changes, - "skipped_count": 0, - "model_used": "spell-checker", - 
"duration_ms": duration_ms, - } - - -async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50): - """Async generator yielding SSE-compatible events for spell-checker review.""" - total = len(entries) - yield { - "type": "meta", - "total_entries": total, - "to_review": total, - "skipped": 0, - "model": "spell-checker", - "batch_size": batch_size, - } - result = spell_review_entries_sync(entries) - changes = result["changes"] - yield { - "type": "batch", - "batch_index": 0, - "entries_reviewed": [e.get("row_index", i) for i, e in enumerate(entries)], - "changes": changes, - "duration_ms": result["duration_ms"], - "progress": {"current": total, "total": total}, - } - yield { - "type": "complete", - "changes": changes, - "model_used": "spell-checker", - "duration_ms": result["duration_ms"], - "total_entries": total, - "reviewed": total, - "skipped": 0, - "corrections_found": len(changes), - "entries_corrected": result["entries_corrected"], - } - -# ─── End Spell-Checker ──────────────────────────────────────────────────────── - - -async def llm_review_entries( - entries: List[Dict], - model: str = None, -) -> Dict: - """OCR error correction. 
Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm).""" - if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: - return spell_review_entries_sync(entries) - if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE: - logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM") - - model = model or OLLAMA_REVIEW_MODEL - - # Filter: only entries that need review - reviewable = [(i, e) for i, e in enumerate(entries) if _entry_needs_review(e)] - - if not reviewable: - return { - "entries_original": entries, - "entries_corrected": [dict(e) for e in entries], - "changes": [], - "skipped_count": len(entries), - "model_used": model, - "duration_ms": 0, - } - - review_entries = [e for _, e in reviewable] - table_lines = [ - {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")} - for e in review_entries - ] - - logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)", - len(review_entries), len(entries), model, len(entries) - len(reviewable)) - logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False)) - - prompt = _build_llm_prompt(table_lines) - - t0 = time.time() - async with httpx.AsyncClient(timeout=300.0) as client: - resp = await client.post( - f"{_OLLAMA_URL}/api/chat", - json={ - "model": model, - "messages": [{"role": "user", "content": prompt}], - "stream": False, - "think": False, # qwen3: disable chain-of-thought (Ollama >=0.6) - "options": {"temperature": 0.1, "num_predict": 8192}, - }, - ) - resp.raise_for_status() - content = resp.json().get("message", {}).get("content", "") - duration_ms = int((time.time() - t0) * 1000) - - logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content)) - logger.debug("LLM review raw response (first 500): %.500s", content) - - corrected = _parse_llm_json_array(content) - logger.info("LLM review: parsed %d corrected entries, applying diff...", 
len(corrected)) - changes, corrected_entries = _diff_batch(review_entries, corrected) - - # Merge corrected entries back into the full list - all_corrected = [dict(e) for e in entries] - for batch_idx, (orig_idx, _) in enumerate(reviewable): - if batch_idx < len(corrected_entries): - all_corrected[orig_idx] = corrected_entries[batch_idx] - - return { - "entries_original": entries, - "entries_corrected": all_corrected, - "changes": changes, - "skipped_count": len(entries) - len(reviewable), - "model_used": model, - "duration_ms": duration_ms, - } - - -async def llm_review_entries_streaming( - entries: List[Dict], - model: str = None, - batch_size: int = _REVIEW_BATCH_SIZE, -): - """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE. - - Phase 0 (always): Run _fix_character_confusion and emit any changes so they are - visible in the UI — this is the only place the fix now runs (removed from Step 1 - of build_vocab_pipeline_streaming). - """ - # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) 
--- - _CONF_FIELDS = ('english', 'german', 'example') - originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries] - _fix_character_confusion(entries) # modifies in-place, returns same list - char_changes = [ - {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')} - for i in range(len(entries)) - for f in _CONF_FIELDS - if originals[i][f] != entries[i].get(f, '') - ] - - if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: - # Inject char_changes as a batch right after the meta event from the spell checker - _meta_sent = False - async for event in spell_review_entries_streaming(entries, batch_size): - yield event - if not _meta_sent and event.get('type') == 'meta' and char_changes: - _meta_sent = True - yield { - 'type': 'batch', - 'changes': char_changes, - 'entries_reviewed': sorted({c['row_index'] for c in char_changes}), - 'progress': {'current': 0, 'total': len(entries)}, - } - return - - if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE: - logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM") - - # LLM path: emit char_changes first (before meta) so they appear in the UI - if char_changes: - yield { - 'type': 'batch', - 'changes': char_changes, - 'entries_reviewed': sorted({c['row_index'] for c in char_changes}), - 'progress': {'current': 0, 'total': len(entries)}, - } - - model = model or OLLAMA_REVIEW_MODEL - - # Separate reviewable from skipped entries - reviewable = [] - skipped_indices = [] - for i, e in enumerate(entries): - if _entry_needs_review(e): - reviewable.append((i, e)) - else: - skipped_indices.append(i) - - total_to_review = len(reviewable) - - # meta event - yield { - "type": "meta", - "total_entries": len(entries), - "to_review": total_to_review, - "skipped": len(skipped_indices), - "model": model, - "batch_size": batch_size, - } - - all_changes = [] - all_corrected = [dict(e) for e in entries] - total_duration_ms = 0 - reviewed_count = 0 - - # Process in batches - for 
batch_start in range(0, total_to_review, batch_size): - batch_items = reviewable[batch_start:batch_start + batch_size] - batch_entries = [e for _, e in batch_items] - - table_lines = [ - {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")} - for e in batch_entries - ] - - prompt = _build_llm_prompt(table_lines) - - logger.info("LLM review streaming: batch %d — sending %d entries to %s", - batch_start // batch_size, len(batch_entries), model) - - t0 = time.time() - async with httpx.AsyncClient(timeout=300.0) as client: - resp = await client.post( - f"{_OLLAMA_URL}/api/chat", - json={ - "model": model, - "messages": [{"role": "user", "content": prompt}], - "stream": False, - "think": False, # qwen3: disable chain-of-thought - "options": {"temperature": 0.1, "num_predict": 8192}, - }, - ) - resp.raise_for_status() - content = resp.json().get("message", {}).get("content", "") - batch_ms = int((time.time() - t0) * 1000) - total_duration_ms += batch_ms - - logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content)) - logger.debug("LLM review streaming raw (first 500): %.500s", content) - - corrected = _parse_llm_json_array(content) - logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected)) - batch_changes, batch_corrected = _diff_batch(batch_entries, corrected) - - # Merge back - for batch_idx, (orig_idx, _) in enumerate(batch_items): - if batch_idx < len(batch_corrected): - all_corrected[orig_idx] = batch_corrected[batch_idx] - - all_changes.extend(batch_changes) - reviewed_count += len(batch_items) - - # Yield batch result - yield { - "type": "batch", - "batch_index": batch_start // batch_size, - "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items], - "changes": batch_changes, - "duration_ms": batch_ms, - "progress": {"current": reviewed_count, "total": total_to_review}, - } - - # Complete event - yield { - "type": "complete", - 
"changes": all_changes, - "model_used": model, - "duration_ms": total_duration_ms, - "total_entries": len(entries), - "reviewed": total_to_review, - "skipped": len(skipped_indices), - "corrections_found": len(all_changes), - "entries_corrected": all_corrected, - } - - -def _sanitize_for_json(text: str) -> str: - """Remove or escape control characters that break JSON parsing. - - Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid - JSON whitespace. Removes all other ASCII control characters (0x00-0x1f) - that are only valid inside JSON strings when properly escaped. - """ - # Replace literal control chars (except \\t \\n \\r) with a space - return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text) - - -def _parse_llm_json_array(text: str) -> List[Dict]: - """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags).""" - # Strip qwen3 ... blocks (present even with think=False on some builds) - text = _re.sub(r'.*?', '', text, flags=_re.DOTALL) - # Strip markdown code fences - text = _re.sub(r'```json\s*', '', text) - text = _re.sub(r'```\s*', '', text) - # Sanitize control characters before JSON parsing - text = _sanitize_for_json(text) - # Find first [ ... last ] - match = _re.search(r'\[.*\]', text, _re.DOTALL) - if match: - try: - return _json.loads(match.group()) - except (ValueError, _json.JSONDecodeError) as e: - logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, match.group()[:200]) - else: - logger.warning("LLM review: no JSON array found in response (%.200s)", text[:200]) - return [] +from cv_cell_grid import _cells_to_vocab_entries # noqa: F401 diff --git a/klausur-service/backend/cv_vocab_types.py b/klausur-service/backend/cv_vocab_types.py new file mode 100644 index 0000000..74a6b9c --- /dev/null +++ b/klausur-service/backend/cv_vocab_types.py @@ -0,0 +1,156 @@ +""" +Shared types, constants, and availability guards for the CV vocabulary pipeline. 
+ +Lizenz: Apache 2.0 (kommerziell nutzbar) +DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. +""" + +import json +import logging +import os +import re # noqa: F401 — re-exported for downstream modules +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +import numpy as np # noqa: F401 + +logger = logging.getLogger(__name__) + +# --- Availability Guards --- + +try: + import cv2 # noqa: F401 + CV2_AVAILABLE = True +except ImportError: + cv2 = None # type: ignore[assignment] + CV2_AVAILABLE = False + logger.warning("OpenCV not available — CV pipeline disabled") + +try: + import pytesseract # noqa: F401 + from PIL import Image # noqa: F401 + TESSERACT_AVAILABLE = True +except ImportError: + pytesseract = None # type: ignore[assignment] + Image = None # type: ignore[assignment,misc] + TESSERACT_AVAILABLE = False + logger.warning("pytesseract/Pillow not available — CV pipeline disabled") + +CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE + +# --- IPA Dictionary --- + +IPA_AVAILABLE = False +_ipa_convert_american = None +_britfone_dict: Dict[str, str] = {} + +try: + import eng_to_ipa as _eng_to_ipa + _ipa_convert_american = _eng_to_ipa.convert + IPA_AVAILABLE = True + logger.info("eng_to_ipa available — American IPA lookup enabled") +except ImportError: + logger.info("eng_to_ipa not installed — American IPA disabled") + +# Load Britfone dictionary (MIT license, ~15k British English IPA entries) +_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json') +if os.path.exists(_britfone_path): + try: + with open(_britfone_path, 'r', encoding='utf-8') as f: + _britfone_dict = json.load(f) + IPA_AVAILABLE = True + logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries") + except Exception as e: + logger.warning(f"Failed to load Britfone: {e}") +else: + logger.info("Britfone not found — British IPA disabled") + +# --- Language Detection Constants --- + +GERMAN_FUNCTION_WORDS = {'der', 
'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht', + 'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird', + 'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 'noch', 'aber', 'hat', 'nur', + 'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben', + 'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'} + +ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of', + 'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from', + 'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', + 'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he', + 'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'} + + +# --- Data Classes --- + +@dataclass +class PageRegion: + """A detected region on the page.""" + type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom' + x: int + y: int + width: int + height: int + classification_confidence: float = 1.0 # 0.0-1.0 + classification_method: str = "" # 'content', 'position_enhanced', 'position_fallback' + + +@dataclass +class ColumnGeometry: + """Geometrisch erkannte Spalte vor Typ-Klassifikation.""" + index: int # 0-basiert, links->rechts + x: int + y: int + width: int + height: int + word_count: int + words: List[Dict] # Wort-Dicts aus Tesseract (text, conf, left, top, ...) 
+ width_ratio: float # width / content_width (0.0-1.0) + is_sub_column: bool = False # True if created by _detect_sub_columns() split + + +@dataclass +class RowGeometry: + """Geometrisch erkannte Zeile mit Kopf-/Fusszeilen-Klassifikation.""" + index: int # 0-basiert, oben→unten + x: int # absolute left (= content left_x) + y: int # absolute y start + width: int # content width + height: int # Zeilenhoehe in px + word_count: int + words: List[Dict] + row_type: str = 'content' # 'content' | 'header' | 'footer' + gap_before: int = 0 # Gap in px ueber dieser Zeile + + +@dataclass +class VocabRow: + """A single vocabulary entry assembled from multi-column OCR.""" + english: str = "" + german: str = "" + example: str = "" + source_page: str = "" + confidence: float = 0.0 + y_position: int = 0 + + +@dataclass +class PipelineResult: + """Complete result of the CV pipeline.""" + vocabulary: List[Dict[str, Any]] = field(default_factory=list) + word_count: int = 0 + columns_detected: int = 0 + duration_seconds: float = 0.0 + stages: Dict[str, float] = field(default_factory=dict) + error: Optional[str] = None + image_width: int = 0 + image_height: int = 0 + + +@dataclass +class DocumentTypeResult: + """Result of automatic document type detection.""" + doc_type: str # 'vocab_table' | 'full_text' | 'generic_table' + confidence: float # 0.0-1.0 + pipeline: str # 'cell_first' | 'full_page' + skip_steps: List[str] = field(default_factory=list) # e.g. ['columns', 'rows'] + features: Dict[str, Any] = field(default_factory=dict) # debug info