Fix: Sidebar scrollable + add Eltern-Portal nav link

overflow-hidden → overflow-y-auto so all nav items are reachable.
Added /parent (Eltern-Portal) link with people icon.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 20:49:44 +02:00
parent d87645ffce
commit 45287b3541
48 changed files with 6 additions and 1 deletions
@@ -0,0 +1,498 @@
"""
Cell-grid construction v2 (hybrid: broad columns via word lookup, narrow via cell-crop).
Extracted from cv_cell_grid.py.
Lizenz: Apache 2.0 — DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns,
_clean_cell_text,
_clean_cell_text_lite,
_words_to_reading_order_text,
_words_to_spaced_text,
ocr_region_lighton,
ocr_region_rapid,
ocr_region_trocr,
)
from cv_cell_grid_helpers import (
_MIN_WORD_CONF,
_ensure_minimum_crop_size,
_heal_row_gaps,
_is_artifact_row,
_select_psm_for_column,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
# ---------------------------------------------------------------------------
# _ocr_cell_crop — isolated cell-crop OCR for v2 hybrid mode
# ---------------------------------------------------------------------------
def _tesseract_ocr_region(img, region, lang, psm):
    """Run Tesseract region OCR for the cell-crop paths.

    The module-level import list of this file does not include
    ``ocr_region`` although the Tesseract branches call it, which would
    raise ``NameError`` on the default engine. The import is resolved lazily
    here so that the non-Tesseract engines are unaffected.

    NOTE(review): assumes ``ocr_region`` lives in ``cv_ocr_engines`` next to
    ``ocr_region_rapid`` / ``ocr_region_trocr`` — confirm, and ideally move
    this into the top-of-file import block.
    """
    from cv_ocr_engines import ocr_region

    return ocr_region(img, region, lang=lang, psm=psm)


def _ocr_cell_crop(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell by cropping the column x row intersection.

    The crop is the cell rectangle plus a 3 px margin on every side (clamped
    to the image bounds), so characters touching the cell border are not
    clipped. The reported ``bbox_px`` / ``bbox_pct`` always describe the
    exact, unpadded cell.

    Args:
        row_idx: Row index used for the cell id and ``row_index``.
        col_idx: Column index used for the cell id and ``col_index``.
        row: Row geometry; ``y`` and ``height`` are read.
        col: Column region; ``x``, ``width`` and ``type`` are read.
        ocr_img: Image used for the dark-pixel check and for Tesseract.
            May be None, in which case the Tesseract path yields no words.
        img_bgr: Image passed to the TrOCR / LightOn / RapidOCR engines.
            Those engines are only used when this is not None.
        img_w: Image width used for clamping and percentage boxes.
        img_h: Image height used for clamping and percentage boxes.
        engine_name: One of "tesseract", "rapid", "lighton",
            "trocr-printed", "trocr-handwritten".
        lang: Default Tesseract language string.
        lang_map: Per-column-type language override, keyed by ``col.type``.

    Returns:
        A cell dict with ``cell_id``, indices, ``col_type``, ``text``,
        ``confidence``, ``bbox_px``, ``bbox_pct``, ``ocr_engine`` and
        ``is_bold``; ``word_boxes`` is added when text was recognised.
    """
    # Display bbox: exact column x row intersection
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # Crop boundaries: small padding (3px each side) to avoid clipping
    # characters near column/row edges (e.g. parentheses, descenders).
    # Stays within image bounds but may extend slightly beyond the strict cell.
    _PAD = 3
    cx = max(0, disp_x - _PAD)
    cy = max(0, disp_y - _PAD)
    cx2 = min(img_w, disp_x + disp_w + _PAD)
    cy2 = min(img_h, disp_y + disp_h + _PAD)
    cw = cx2 - cx
    ch = cy2 - cy

    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': '',
        'confidence': 0.0,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': 'cell_crop_v2',
        'is_bold': False,
    }

    if cw <= 0 or ch <= 0:
        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
    # Pixels below 180 count as "dark"; under 0.5% dark pixels means no ink.
    if ocr_img is not None:
        crop = ocr_img[cy:cy + ch, cx:cx + cw]
        if crop.size > 0:
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
                             row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
    cell_lang = lang_map.get(col.type, lang)
    psm = _select_psm_for_column(col.type, col.width, row.height)
    text = ''
    avg_conf = 0.0
    used_engine = 'cell_crop_v2'

    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_trocr(img_bgr, cell_region,
                                 handwritten=(engine_name == "trocr-handwritten"))
    elif engine_name == "lighton" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
        # Upscale small BGR crops for RapidOCR.
        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
        if bgr_crop.size == 0:
            words = []
        else:
            crop_h, crop_w = bgr_crop.shape[:2]
            if crop_h < 80:
                # Force 3x upscale for short rows — small chars need more pixels
                scale = 3.0
                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
                                    interpolation=cv2.INTER_CUBIC)
            else:
                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
            up_h, up_w = bgr_up.shape[:2]
            scale_x = up_w / max(crop_w, 1)
            scale_y = up_h / max(crop_h, 1)
            was_scaled = (up_w != crop_w or up_h != crop_h)
            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
                         row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
            # The crop is OCR'd as a standalone image, hence origin (0, 0).
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region_rapid(bgr_up, tmp_region)
            # Remap positions back to original image coords
            if words and was_scaled:
                for w in words:
                    w['left'] = int(w['left'] / scale_x) + cx
                    w['top'] = int(w['top'] / scale_y) + cy
                    w['width'] = int(w['width'] / scale_x)
                    w['height'] = int(w['height'] / scale_y)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
            crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
            upscaled = _ensure_minimum_crop_size(crop_slice)
            up_h, up_w = upscaled.shape[:2]
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = _tesseract_ocr_region(upscaled, tmp_region, cell_lang, psm)
            # Remap word positions back to original image coordinates
            if words and (up_w != cw or up_h != ch):
                sx = cw / max(up_w, 1)
                sy = ch / max(up_h, 1)
                for w in words:
                    w['left'] = int(w['left'] * sx) + cx
                    w['top'] = int(w['top'] * sy) + cy
                    w['width'] = int(w['width'] * sx)
                    w['height'] = int(w['height'] * sy)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
        else:
            words = []

    # Filter low-confidence words
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Tolerance of at least the crop height: the cell is joined as one line group.
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
                     row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
    else:
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
                     row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
        crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
        upscaled = _ensure_minimum_crop_size(crop_slice)
        up_h, up_w = upscaled.shape[:2]
        tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
        psm7_words = _tesseract_ocr_region(upscaled, tmp_region, cell_lang, 7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_crop_v2_psm7'
                # Remap PSM7 word positions back to original image coords
                if up_w != cw or up_h != ch:
                    sx = cw / max(up_w, 1)
                    sy = ch / max(up_h, 1)
                    for w in psm7_words:
                        w['left'] = int(w['left'] * sx) + cx
                        w['top'] = int(w['top'] * sy) + cy
                        w['width'] = int(w['width'] * sx)
                        w['height'] = int(w['height'] * sy)
                else:
                    for w in psm7_words:
                        w['left'] += cx
                        w['top'] += cy
                words = psm7_words

    # --- Noise filter ---
    if text.strip():
        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
                         row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine

    # Store individual word bounding boxes (absolute image coordinates)
    # for pixel-accurate overlay positioning in the frontend.
    if words and text.strip():
        result['word_boxes'] = [
            {
                'text': w.get('text', ''),
                'left': w['left'],
                'top': w['top'],
                'width': w['width'],
                'height': w['height'],
                'conf': w.get('conf', 0),
            }
            for w in words
            if w.get('text', '').strip()
        ]

    return result
# Threshold: columns narrower than this (% of image width) use single-cell
# crop OCR instead of full-page word assignment.
_NARROW_COL_THRESHOLD_PCT = 15.0
# ---------------------------------------------------------------------------
# build_cell_grid_v2 — hybrid grid builder (current default)
# ---------------------------------------------------------------------------
def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    skip_heal_gaps: bool = False,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build the cell grid in hybrid mode.

    Columns at or above ``_NARROW_COL_THRESHOLD_PCT`` of the image width are
    filled from the words already attached to each row (word lookup).
    Narrower columns are OCR'd cell by cell through ``_ocr_cell_crop`` in a
    thread pool.

    Args:
        ocr_img: Image handed to ``_ocr_cell_crop`` for narrow columns.
        column_regions: All detected regions; margin/header/footer/ignore
            types are dropped.
        row_geometries: All detected rows; only ``row_type == 'content'``
            rows with words that are not artifacts become grid rows.
            Header/footer rows only bound the gap healing.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default OCR language string.
        ocr_engine: Requested engine. Anything other than the TrOCR/LightOn
            names or an available "rapid" resolves to "tesseract".
        img_bgr: Optional image for the non-Tesseract engines.
        skip_heal_gaps: When True, rows keep their exact geometry instead of
            being expanded into gaps left by removed rows.

    Returns:
        ``(cells, columns_meta)``. Cells are sorted by (row, column) with
        rows that have no text at all removed; both lists are empty when no
        usable rows or columns remain.
    """
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"
    else:
        engine_name = "tesseract"
    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")

    # ---- Row selection: content rows, minus phantom and artifact rows ----
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    total_content = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = total_content - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    total_with_words = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = total_with_words - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # ---- Column selection ----
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # ---- Gap healing, bounded by header bottom / footer top if present ----
    content_rows.sort(key=lambda r: r.y)
    header_bottoms = [r.y + r.height for r in row_geometries if r.row_type == 'header']
    footer_tops = [r.y for r in row_geometries if r.row_type == 'footer']
    top_bound = max(header_bottoms) if header_bottoms else content_rows[0].y
    bottom_bound = (
        min(footer_tops) if footer_tops
        else content_rows[-1].y + content_rows[-1].height
    )
    if not skip_heal_gaps:
        _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)
    columns_meta = []
    for ci, c in enumerate(relevant_cols):
        columns_meta.append({'index': ci, 'type': c.type, 'x': c.x, 'width': c.width})

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # ---- Broad vs. narrow classification ----
    narrow_col_indices = {
        ci
        for ci, col in enumerate(relevant_cols)
        if ((col.width / img_w * 100) if img_w > 0 else 0) < _NARROW_COL_THRESHOLD_PCT
    }
    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
                f"{len(narrow_col_indices)} narrow columns (cell-crop)")

    def _pct(value, total):
        # Percentage of the page, guarded against a zero page dimension.
        return round(value / total * 100, 2) if total else 0

    def _word_boxes(word_list):
        # Per-word boxes for overlay rendering; blank-text words are dropped.
        boxes = []
        for w in word_list:
            if not w.get('text', '').strip():
                continue
            boxes.append({
                'text': w.get('text', ''),
                'left': w['left'],
                'top': w['top'],
                'width': w['width'],
                'height': w['height'],
                'conf': w.get('conf', 0),
            })
        return boxes

    # One column spanning >90% of the page: spacing is preserved and the
    # noise filter is skipped. Depends only on the column set, so it is
    # evaluated once rather than per cell.
    is_single_full_column = (
        len(relevant_cols) == 1
        and img_w > 0
        and relevant_cols[0].width / img_w > 0.9
    )

    # ---- Phase 1: broad columns from the rows' pre-assigned words ----
    cells: List[Dict[str, Any]] = []
    for row_idx, row in enumerate(content_rows):
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            if col_idx in narrow_col_indices:
                continue

            words = [
                w for w in col_words.get(col_idx, [])
                if w.get('conf', 0) >= _MIN_WORD_CONF
            ]

            if not words:
                text = ''
                avg_conf = 0.0
                if is_single_full_column:
                    logger.info(f"R{row_idx:02d}: 0 words (row has "
                                f"{row.word_count} total, y={row.y}..{row.y+row.height})")
            else:
                y_tol = max(15, row.height)
                if is_single_full_column:
                    text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
                    logger.info(f"R{row_idx:02d}: {len(words)} words, "
                                f"text={text!r:.100}")
                else:
                    text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)

            if not is_single_full_column:
                text = _clean_cell_text(text)

            cell = {
                'cell_id': f"R{row_idx:02d}_C{col_idx}",
                'row_index': row_idx,
                'col_index': col_idx,
                'col_type': col.type,
                'text': text,
                'confidence': avg_conf,
                'bbox_px': {
                    'x': col.x, 'y': row.y,
                    'w': col.width, 'h': row.height,
                },
                'bbox_pct': {
                    'x': _pct(col.x, img_w),
                    'y': _pct(row.y, img_h),
                    'w': _pct(col.width, img_w),
                    'h': _pct(row.height, img_h),
                },
                'ocr_engine': 'word_lookup',
                'is_bold': False,
            }
            if words and text.strip():
                cell['word_boxes'] = _word_boxes(words)
            cells.append(cell)

    # ---- Phase 2: narrow columns via per-cell crop OCR (thread pool) ----
    narrow_tasks = [
        (row_idx, col_idx, row, col)
        for row_idx, row in enumerate(content_rows)
        for col_idx, col in enumerate(relevant_cols)
        if col_idx in narrow_col_indices
    ]
    if narrow_tasks:
        max_workers = 4 if engine_name == "tesseract" else 2
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {}
            for ri, ci, task_row, task_col in narrow_tasks:
                fut = pool.submit(
                    _ocr_cell_crop,
                    ri, ci, task_row, task_col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                )
                futures[fut] = (ri, ci)
            for future in as_completed(futures):
                try:
                    cell = future.result()
                    cells.append(cell)
                except Exception as e:
                    # A failed cell is logged and omitted from the grid.
                    ri, ci = futures[future]
                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")

    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # ---- Drop rows in which no cell has any text ----
    rows_with_text: set = {c['row_index'] for c in cells if c['text'].strip()}
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
                f"engine={engine_name} (hybrid)")
    return cells, columns_meta
@@ -0,0 +1,60 @@
"""
Cell-grid construction (v2 + legacy), vocab conversion, and word-grid OCR.
Re-export hub — all public and private names remain importable from here
for backward compatibility. The actual implementations live in:
cv_cell_grid_helpers.py — shared helpers (_heal_row_gaps, _is_artifact_row, ...)
cv_cell_grid_build.py — v2 hybrid grid (build_cell_grid_v2, _ocr_cell_crop)
cv_cell_grid_legacy.py — deprecated v1 grid (build_cell_grid, _ocr_single_cell)
cv_cell_grid_streaming.py — streaming variants (build_cell_grid_v2_streaming, ...)
cv_cell_grid_merge.py — row-merging logic (_merge_wrapped_rows, ...)
cv_cell_grid_vocab.py — vocab extraction (_cells_to_vocab_entries, build_word_grid)
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
# --- Helpers ---
from cv_cell_grid_helpers import ( # noqa: F401
_MIN_WORD_CONF,
_compute_cell_padding,
_ensure_minimum_crop_size,
_heal_row_gaps,
_is_artifact_row,
_select_psm_for_column,
)
# --- v2 build (current default) ---
from cv_cell_grid_build import ( # noqa: F401
_NARROW_COL_THRESHOLD_PCT,
_ocr_cell_crop,
build_cell_grid_v2,
)
# --- Legacy build (DEPRECATED) ---
from cv_cell_grid_legacy import ( # noqa: F401
_ocr_single_cell,
build_cell_grid,
)
# --- Streaming variants ---
from cv_cell_grid_streaming import ( # noqa: F401
build_cell_grid_streaming,
build_cell_grid_v2_streaming,
)
# --- Row merging ---
from cv_cell_grid_merge import ( # noqa: F401
_PHONETIC_ONLY_RE,
_is_phonetic_only_text,
_merge_continuation_rows,
_merge_phonetic_continuation_rows,
_merge_wrapped_rows,
)
# --- Vocab extraction ---
from cv_cell_grid_vocab import ( # noqa: F401
_cells_to_vocab_entries,
build_word_grid,
)
@@ -0,0 +1,136 @@
"""
Shared helpers for cell-grid construction (v2 + legacy).
Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and
cv_cell_grid_legacy.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import List
import numpy as np
from cv_vocab_types import RowGeometry
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
# Minimum OCR word confidence to keep (used across multiple functions)
_MIN_WORD_CONF = 30
def _compute_cell_padding(col_width: int, img_w: int) -> int:
"""Adaptive padding for OCR crops based on column width.
Narrow columns (page_ref, marker) need more surrounding context so
Tesseract can segment characters correctly. Wide columns keep the
minimal 4 px padding to avoid pulling in neighbours.
"""
col_pct = col_width / img_w * 100 if img_w > 0 else 100
if col_pct < 5:
return max(20, col_width // 2)
if col_pct < 10:
return max(12, col_width // 4)
if col_pct < 15:
return 8
return 4
def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
max_scale: int = 3) -> np.ndarray:
"""Upscale tiny crops so Tesseract gets enough pixel data.
If either dimension is below *min_dim*, the crop is bicubic-upscaled
so the smallest dimension reaches *min_dim* (capped at *max_scale* x).
"""
h, w = crop.shape[:2]
if h >= min_dim and w >= min_dim:
return crop
scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
if scale <= 1.0:
return crop
new_w = int(w * scale)
new_h = int(h * scale)
return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
def _select_psm_for_column(col_type: str, col_width: int,
row_height: int) -> int:
"""Choose the best Tesseract PSM for a given column geometry.
- page_ref columns are almost always single short tokens -> PSM 8
- Very narrow or short cells -> PSM 7 (single text line)
- Everything else -> PSM 6 (uniform block)
"""
if col_type in ('page_ref', 'marker'):
return 8 # single word
if col_width < 100 or row_height < 30:
return 7 # single line
return 6 # uniform block
def _is_artifact_row(row: RowGeometry) -> bool:
    """Tell whether a row holds nothing but single-character detections.

    A row counts as an artifact when its ``word_count`` is 0, or when every
    entry in ``row.words`` has a stripped ``text`` of at most one character
    (this includes a row whose word list is empty).
    """
    if row.word_count == 0:
        return True
    token_lengths = [len(word.get('text', '').strip()) for word in row.words]
    # No token longer than one character means no real word in this row.
    return max(token_lengths, default=0) <= 1
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Stretch rows in place so vertical gaps between them are closed.

    The list is sorted by ``y`` (in place). Each gap of more than 1 px
    between two neighbouring rows is split at its integer midpoint: the
    upper row grows down to it and the lower row grows up to it. Gaps of
    1 px or less (including overlaps) leave the affected edges untouched.
    The first row's top is set to *top_bound* and the last row's bottom to
    *bottom_bound*. Resulting heights are never below 5 px.

    Args:
        rows: Rows to mutate; ``y`` and ``height`` are rewritten.
        top_bound: New top edge for the first row.
        bottom_bound: New bottom edge for the last row.
    """
    if not rows:
        return

    rows.sort(key=lambda r: r.y)
    row_count = len(rows)
    last_idx = row_count - 1
    # Original (top, bottom) of every row, captured before any row is changed
    # so that neighbours are always measured against unhealed geometry.
    spans = [(r.y, r.y + r.height) for r in rows]

    for idx, (row, (own_top, own_bottom)) in enumerate(zip(rows, spans)):
        if idx == 0:
            healed_top = top_bound
        else:
            above_bottom = spans[idx - 1][1]
            space_above = own_top - above_bottom
            if space_above > 1:
                healed_top = above_bottom + space_above // 2
            else:
                healed_top = own_top

        if idx == last_idx:
            healed_bottom = bottom_bound
        else:
            space_below = spans[idx + 1][0] - own_bottom
            if space_below > 1:
                healed_bottom = own_bottom + space_below // 2
            else:
                healed_bottom = own_bottom

        row.y = healed_top
        row.height = max(5, healed_bottom - healed_top)

    logger.debug(
        f"_heal_row_gaps: {row_count} rows -> y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
@@ -0,0 +1,436 @@
"""
Legacy cell-grid construction (v1) -- DEPRECATED, kept for backward compat.
Extracted from cv_cell_grid.py. Prefer build_cell_grid_v2 for new code.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns,
_clean_cell_text,
_words_to_reading_order_text,
ocr_region_lighton,
ocr_region_rapid,
ocr_region_trocr,
)
from cv_cell_grid_helpers import (
_MIN_WORD_CONF,
_compute_cell_padding,
_ensure_minimum_crop_size,
_heal_row_gaps,
_is_artifact_row,
_select_psm_for_column,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# _ocr_single_cell — legacy per-cell OCR with multi-level fallback
# ---------------------------------------------------------------------------
def _legacy_tesseract_region(img, region, lang, psm):
    """Run Tesseract region OCR for the legacy fallback paths.

    This module calls ``ocr_region`` in its fallbacks but does not import it
    at module level, which would raise ``NameError``. The import is resolved
    lazily here so paths that never reach Tesseract are unaffected.

    NOTE(review): assumes ``ocr_region`` lives in ``cv_ocr_engines`` next to
    ``ocr_region_rapid`` / ``ocr_region_trocr`` — confirm, and ideally move
    this into the top-of-file import block.
    """
    from cv_ocr_engines import ocr_region

    return ocr_region(img, region, lang=lang, psm=psm)


def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection).

    Strategy, each step only runs while the text is still empty:

    1. Word lookup from ``preassigned_words``.
    2. Cell OCR on a padded crop, if the crop has more than 0.5% dark pixels.
    3. Tesseract PSM 7 on the same crop (skipped when ``use_rapid``).
    4. For narrow columns: RapidOCR over the full row strip, keeping words
       that overlap the column by more than 30% of their width.

    Args:
        row_idx: Row index used for the cell id and ``row_index``.
        col_idx: Column index used for the cell id and ``col_index``.
        row: Row geometry; ``y`` and ``height`` are read.
        col: Column region; ``x``, ``width`` and ``type`` are read.
        ocr_img: Image used for the dark-pixel check and Tesseract OCR.
        img_bgr: Image used by TrOCR / LightOn / RapidOCR, if not None.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        use_rapid: Whether RapidOCR is the selected fallback engine.
        engine_name: Resolved engine name.
        lang: Default Tesseract language string.
        lang_map: Per-column-type language override, keyed by ``col.type``.
        preassigned_words: Words already assigned to this cell, if any.

    Returns:
        A cell dict with ``cell_id``, indices, ``col_type``, ``text``,
        ``confidence``, ``bbox_px``, ``bbox_pct`` and ``ocr_engine``.
    """
    # Display bbox: exact column x row intersection (no padding)
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # OCR crop: adaptive padding -- narrow columns get more context
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False

    def _pct(value, total):
        # Guarded like the v2 builder: a zero page dimension yields 0
        # instead of raising ZeroDivisionError.
        return round(value / total * 100, 2) if total else 0

    def _make_cell(cell_text, cell_conf, engine):
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': cell_text,
            'confidence': cell_conf,
            'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
            'bbox_pct': {
                'x': _pct(disp_x, img_w),
                'y': _pct(disp_y, img_h),
                'w': _pct(disp_w, img_w),
                'h': _pct(disp_h, img_h),
            },
            'ocr_engine': engine,
        }

    if disp_w <= 0 or disp_h <= 0:
        return _make_cell('', 0.0, 'word_lookup')

    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

    # Filter low-confidence words
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0

    # --- FALLBACK: Cell-OCR for empty cells ---
    # Only attempted when the padded crop actually contains ink
    # (more than 0.5% of pixels darker than 180).
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005

    if _run_fallback:
        # For narrow columns, upscale the crop before OCR
        if is_narrow and ocr_img is not None:
            _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            _upscaled = _ensure_minimum_crop_size(_crop_slice)
            _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
            cell_lang = lang_map.get(col.type, lang)
            if _upscaled is not _crop_slice:
                # The helper returned a new array, i.e. the crop was resized.
                _up_h, _up_w = _upscaled.shape[:2]
                _tmp_region = PageRegion(
                    type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                )
                fallback_words = _legacy_tesseract_region(
                    _upscaled, _tmp_region, cell_lang, _cell_psm)
                # Remap word positions back to original image coordinates
                _sx = cell_w / max(_up_w, 1)
                _sy = cell_h / max(_up_h, 1)
                for _fw in (fallback_words or []):
                    _fw['left'] = int(_fw['left'] * _sx) + cell_x
                    _fw['top'] = int(_fw['top'] * _sy) + cell_y
                    _fw['width'] = int(_fw['width'] * _sx)
                    _fw['height'] = int(_fw['height'] * _sy)
            else:
                cell_region = PageRegion(
                    type=col.type, x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                fallback_words = _legacy_tesseract_region(
                    ocr_img, cell_region, cell_lang, _cell_psm)
        else:
            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
            if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
            elif engine_name == "lighton" and img_bgr is not None:
                fallback_words = ocr_region_lighton(img_bgr, cell_region)
            elif use_rapid and img_bgr is not None:
                fallback_words = ocr_region_rapid(img_bgr, cell_region)
            else:
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                cell_lang = lang_map.get(col.type, lang)
                fallback_words = _legacy_tesseract_region(
                    ocr_img, cell_region, cell_lang, _cell_psm)

        if fallback_words:
            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if fallback_words:
            # Line tolerance: half the mean word height, at least 10 px.
            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
            fb_y_tol = max(10, int(fb_avg_h * 0.5))
            fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
            if fb_text.strip():
                text = fb_text
                avg_conf = round(
                    sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                )
                used_engine = 'cell_ocr_fallback'

    # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
    # _run_fallback being True implies ocr_img is not None.
    if not text.strip() and _run_fallback and not use_rapid:
        _fb_region = PageRegion(
            type=col.type, x=cell_x, y=cell_y,
            width=cell_w, height=cell_h,
        )
        cell_lang = lang_map.get(col.type, lang)
        psm7_words = _legacy_tesseract_region(ocr_img, _fb_region, cell_lang, 7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_ocr_psm7'

    # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
    # NOTE(review): this calls ocr_region_rapid regardless of use_rapid /
    # RAPIDOCR_AVAILABLE — confirm the helper copes with RapidOCR missing.
    if not text.strip() and is_narrow and img_bgr is not None:
        row_region = PageRegion(
            type='_row_strip', x=0, y=row.y,
            width=img_w, height=row.height,
        )
        strip_words = ocr_region_rapid(img_bgr, row_region)
        if strip_words:
            col_left = col.x
            col_right = col.x + col.width
            col_words = []
            for sw in strip_words:
                sw_left = sw.get('left', 0)
                sw_right = sw_left + sw.get('width', 0)
                overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                # Keep words with more than 30% of their width inside the column.
                if overlap > sw.get('width', 1) * 0.3:
                    col_words.append(sw)
            if col_words:
                col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if col_words:
                rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                if rs_text.strip():
                    text = rs_text
                    avg_conf = round(
                        sum(w['conf'] for w in col_words) / len(col_words), 1
                    )
                    used_engine = 'row_strip_rapid'

    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0

    return _make_cell(text, avg_conf, used_engine)
# ---------------------------------------------------------------------------
# build_cell_grid — legacy grid builder (DEPRECATED)
# ---------------------------------------------------------------------------
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Generic Cell-Grid: Columns x Rows -> cells with OCR text.

    DEPRECATED: Use build_cell_grid_v2 instead.

    Steps:
      1. Resolve the OCR engine from ``ocr_engine``.
      2. Keep only content rows that have words and are not artifact rows,
         and drop header/footer/margin/ignored columns.
      3. OCR every row x column intersection via ``_ocr_single_cell``.
      4. Batch fallback: cells that stayed empty although the crop shows ink
         are grouped per column; a column with at least three such cells is
         re-OCR'd as one vertical strip and the words are mapped back to the
         cells by vertical centre distance.
      5. Drop rows in which every cell is still empty.

    Args:
        ocr_img: Image passed to ``ocr_region`` and used for the ink check.
        column_regions: Column candidates; non-content types are skipped.
        row_geometries: Row candidates; only ``row_type == 'content'`` is used.
        img_w, img_h: Page size, forwarded to the per-cell OCR helper.
        lang: Fallback OCR language for column types not in the language map.
        ocr_engine: 'auto', 'rapid', 'trocr-printed', 'trocr-handwritten',
            'lighton'; anything else selects Tesseract.
        img_bgr: Image for the RapidOCR / TrOCR / LightOn engines. Without it
            'auto' resolves to Tesseract and the batch fallback uses
            ``ocr_region`` on ``ocr_img``.

    Returns:
        ``(cells, columns_meta)``. Both lists are empty when no usable rows
        or columns remain.
    """
    # Resolve engine choice
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    # Rows without any detected word are phantom rows from row detection.
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # Bounds for gap healing come from the vertical extent of the columns.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Left-to-right order defines col_index.
    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    cells: List[Dict[str, Any]] = []
    for row_idx, row in enumerate(content_rows):
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            cells.append(cell)

    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
    # Collect empty cells whose crop still contains ink (> 0.5% of the pixels
    # darker than 180). Cells that already went through the PSM7 fallback are
    # not retried.
    empty_by_col: Dict[int, List[int]] = {}
    for ci, cell in enumerate(cells):
        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
            bpx = cell['bbox_px']
            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
            if w > 0 and h > 0 and ocr_img is not None:
                crop = ocr_img[y:y + h, x:x + w]
                if crop.size > 0:
                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                    if dark_ratio > 0.005:
                        empty_by_col.setdefault(cell['col_index'], []).append(ci)

    for col_idx, cell_indices in empty_by_col.items():
        # A strip re-OCR is only attempted for columns with >= 3 empty cells.
        if len(cell_indices) < 3:
            continue

        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
        col_x = cells[cell_indices[0]]['bbox_px']['x']
        col_w = cells[cell_indices[0]]['bbox_px']['w']

        strip_region = PageRegion(
            type=relevant_cols[col_idx].type,
            x=col_x, y=min_y,
            width=col_w, height=max_y_h - min_y,
        )
        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)

        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        # Same confidence floor as the per-cell OCR paths in this module
        # (previously a separate hard-coded 30).
        strip_words = [w for w in strip_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if not strip_words:
            continue

        for ci in cell_indices:
            cell_y = cells[ci]['bbox_px']['y']
            cell_h = cells[ci]['bbox_px']['h']
            cell_mid_y = cell_y + cell_h / 2

            # A word belongs to the cell when its vertical centre lies within
            # 0.8 x cell height of the cell centre.
            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if matched_words:
                matched_words.sort(key=lambda w: w['left'])
                batch_text = ' '.join(w['text'] for w in matched_words)
                batch_text = _clean_cell_text(batch_text)
                if batch_text.strip():
                    cells[ci]['text'] = batch_text
                    cells[ci]['confidence'] = round(
                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
                    )
                    cells[ci]['ocr_engine'] = 'batch_column_ocr'

        batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
        if batch_filled > 0:
            logger.info(
                f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
                f"empty cells in column {col_idx}"
            )

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
@@ -0,0 +1,235 @@
"""
Row-merging logic for vocabulary entries (phonetic, wrapped, continuation rows).
Extracted from cv_cell_grid.py.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Any, Dict, List
from cv_ocr_engines import _RE_ALPHA
logger = logging.getLogger(__name__)
# Regex meant for lines that hold only phonetic bracket content (no real word
# before it): optional opening brackets/quotes, then any run of characters
# other than ']', then optional closing brackets/parentheses/whitespace.
# NOTE(review): the middle class [^\]]* also accepts bracket-free text (a
# plain word such as "Mandel" matches), so the pattern alone does not prove
# that a line is phonetic. _is_phonetic_only_text below does not use it;
# confirm other callers before relying on it.
_PHONETIC_ONLY_RE = re.compile(
r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)
def _is_phonetic_only_text(text: str) -> bool:
    """Return True when *text* is nothing but a phonetic transcription.

    The text must contain a square bracket, and after removing bracketed
    groups and stray bracket/quote/parenthesis characters fewer than two
    alphabetic characters may remain.

    Examples:
        ['mani serva]          -> True
        [dance]                -> True
        ["a:mand]              -> True
        almond ['a:mand]       -> False  (has real word before bracket)
        Mandel                 -> False
    """
    candidate = text.strip()

    # Empty text, or text without any square bracket, is never phonetic-only.
    if not candidate or not ('[' in candidate or ']' in candidate):
        return False

    # First drop complete [...] groups, then leftover brackets, quotes,
    # parentheses and whitespace.
    residue = re.sub(r"\[.*?\]", '', candidate)
    residue = re.sub(r"[\[\]'\"()\s]", '', residue)

    # Whatever alphabetic material survives is "real" text outside brackets.
    leftover_letters = ''.join(_RE_ALPHA.findall(residue))
    return len(leftover_letters) < 2
def _merge_phonetic_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Fold phonetic-only rows into the entry that precedes them.

    On dictionary pages the phonetic transcription sometimes wraps onto its
    own row, e.g.:

        Row 28: EN="it's a money-saver"  DE="es spart Kosten"
        Row 29: EN="['mani serva]"       DE=""

    Row 29 has phonetic-only EN text and no DE text, so its EN (and example,
    if any) is appended to row 28. The preceding entry dict is modified in
    place. Lists with fewer than two entries are returned unchanged.
    """
    if len(entries) < 2:
        return entries

    result: List[Dict[str, Any]] = []
    for row in entries:
        english = (row.get('english') or '').strip()
        german = (row.get('german') or '').strip()
        example = (row.get('example') or '').strip()

        # Only a row with phonetic-only EN and empty DE, and with something
        # before it, counts as a continuation.
        is_continuation = bool(result) and _is_phonetic_only_text(english) and not german
        if not is_continuation:
            result.append(row)
            continue

        target = result[-1]
        target_en = (target.get('english') or '').strip()
        target['english'] = target_en + ' ' + english if target_en else english

        # Carry over an example from the phonetic row as well.
        if example:
            target_ex = (target.get('example') or '').strip()
            target['example'] = (target_ex + ' ' + example).strip() if target_ex else example

        logger.debug(
            f"Merged phonetic row {row.get('row_index')} "
            f"into previous entry: {target['english']!r}"
        )

    return result
def _merge_wrapped_rows(
entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Merge rows where the primary column (EN) is empty -- cell wrap continuation.
In textbook vocabulary tables, columns are often narrow, so the author
wraps text within a cell. OCR treats each physical line as a separate row.
The key indicator: if the EN column is empty but DE/example have text,
this row is a continuation of the previous row's cells.
Example (original textbook has ONE row):
Row 2: EN="take part (in)" DE="teilnehmen (an), mitmachen" EX="More than 200 singers took"
Row 3: EN="" DE="(bei)" EX="part in the concert."
-> Merged: EN="take part (in)" DE="teilnehmen (an), mitmachen (bei)" EX="..."
Also handles the reverse case: DE empty but EN has text (wrap in EN column).
"""
if len(entries) < 2:
return entries
merged: List[Dict[str, Any]] = []
for entry in entries:
en = (entry.get('english') or '').strip()
de = (entry.get('german') or '').strip()
ex = (entry.get('example') or '').strip()
if not merged:
merged.append(entry)
continue
prev = merged[-1]
prev_en = (prev.get('english') or '').strip()
prev_de = (prev.get('german') or '').strip()
prev_ex = (prev.get('example') or '').strip()
# Case 1: EN is empty -> continuation of previous row
if not en and (de or ex) and prev_en:
if de:
if prev_de.endswith(','):
sep = ' '
elif prev_de.endswith(('-', '(')):
sep = ''
else:
sep = ' '
prev['german'] = (prev_de + sep + de).strip()
if ex:
sep = ' ' if prev_ex else ''
prev['example'] = (prev_ex + sep + ex).strip()
logger.debug(
f"Merged wrapped row {entry.get('row_index')} into previous "
f"(empty EN): DE={prev['german']!r}, EX={prev.get('example', '')!r}"
)
continue
# Case 2: DE is empty, EN has text that looks like continuation
if en and not de and prev_de:
is_paren = en.startswith('(')
first_alpha = next((c for c in en if c.isalpha()), '')
starts_lower = first_alpha and first_alpha.islower()
if (is_paren or starts_lower) and len(en.split()) < 5:
sep = ' ' if prev_en and not prev_en.endswith((',', '-', '(')) else ''
prev['english'] = (prev_en + sep + en).strip()
if ex:
sep2 = ' ' if prev_ex else ''
prev['example'] = (prev_ex + sep2 + ex).strip()
logger.debug(
f"Merged wrapped row {entry.get('row_index')} into previous "
f"(empty DE): EN={prev['english']!r}"
)
continue
merged.append(entry)
if len(merged) < len(entries):
logger.info(
f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
f"continuation rows ({len(entries)} -> {len(merged)})"
)
return merged
def _merge_continuation_rows(
entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Merge multi-line vocabulary entries where text wraps to the next row.
A row is a continuation of the previous entry when:
- EN has text, but DE is empty
- EN starts with a lowercase letter (not a new vocab entry)
- Previous entry's EN does NOT end with a sentence terminator (.!?)
- The continuation text has fewer than 4 words (not an example sentence)
- The row was not already merged as phonetic
Example:
Row 5: EN="to put up" DE="aufstellen"
Row 6: EN="with sth." DE=""
-> Merged: EN="to put up with sth." DE="aufstellen"
"""
if len(entries) < 2:
return entries
merged: List[Dict[str, Any]] = []
for entry in entries:
en = (entry.get('english') or '').strip()
de = (entry.get('german') or '').strip()
if merged and en and not de:
# Check: not phonetic (already handled)
if _is_phonetic_only_text(en):
merged.append(entry)
continue
# Check: starts with lowercase
first_alpha = next((c for c in en if c.isalpha()), '')
starts_lower = first_alpha and first_alpha.islower()
# Check: fewer than 4 words (not an example sentence)
word_count = len(en.split())
is_short = word_count < 4
# Check: previous entry doesn't end with sentence terminator
prev = merged[-1]
prev_en = (prev.get('english') or '').strip()
prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
if starts_lower and is_short and not prev_ends_sentence:
# Merge into previous entry
prev['english'] = (prev_en + ' ' + en).strip()
# Merge example if present
ex = (entry.get('example') or '').strip()
if ex:
prev_ex = (prev.get('example') or '').strip()
prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
logger.debug(
f"Merged continuation row {entry.get('row_index')} "
f"into previous entry: {prev['english']!r}"
)
continue
merged.append(entry)
return merged
@@ -0,0 +1,217 @@
"""
Streaming variants of cell-grid builders (v2 + legacy).
Extracted from cv_cell_grid.py. These yield cells one-by-one as OCR'd,
useful for progress reporting.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Any, Dict, Generator, List, Optional, Tuple
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns,
)
from cv_cell_grid_helpers import (
_heal_row_gaps,
_is_artifact_row,
)
from cv_cell_grid_build import _ocr_cell_crop
from cv_cell_grid_legacy import _ocr_single_cell
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# build_cell_grid_v2_streaming
# ---------------------------------------------------------------------------
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 -- yields each cell as OCR'd.

    Rows are limited to content rows that have words and are not artifact
    rows; header/footer/margin/ignored columns are dropped. Every remaining
    row x column intersection is OCR'd with ``_ocr_cell_crop`` and yielded
    immediately. Nothing is yielded when no usable rows or columns remain.

    Yields:
        (cell_dict, columns_meta, total_cells)
    """
    # 'auto' and unknown engine names resolve to Tesseract here; 'rapid' is
    # honoured only when RapidOCR is available.
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"
    else:
        if ocr_engine == "rapid":
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        engine_name = "tesseract"

    rows = [
        r for r in row_geometries
        if r.row_type == 'content' and r.word_count > 0
    ]
    if not rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    cols = [c for c in column_regions if c.type not in _skip_types]
    if not cols:
        return

    rows = [r for r in rows if not _is_artifact_row(r)]
    if not rows:
        return

    # Gap healing is bounded by the header/footer rows when they exist,
    # otherwise by the first and last content row.
    rows.sort(key=lambda r: r.y)
    header_bottoms = [r.y + r.height for r in row_geometries if r.row_type == 'header']
    footer_tops = [r.y for r in row_geometries if r.row_type == 'footer']
    top_bound = max(header_bottoms) if header_bottoms else rows[0].y
    bottom_bound = min(footer_tops) if footer_tops else rows[-1].y + rows[-1].height

    _heal_row_gaps(rows, top_bound=top_bound, bottom_bound=bottom_bound)

    # Left-to-right order defines col_index.
    cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': idx, 'type': region.type, 'x': region.x, 'width': region.width}
        for idx, region in enumerate(cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(rows) * len(cols)

    for row_idx, row in enumerate(rows):
        for col_idx, col in enumerate(cols):
            yield (
                _ocr_cell_crop(
                    row_idx, col_idx, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ),
                columns_meta,
                total_cells,
            )
# ---------------------------------------------------------------------------
# build_cell_grid_streaming — legacy streaming variant
# ---------------------------------------------------------------------------
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    DEPRECATED: Use build_cell_grid_v2_streaming instead.

    Row and column filtering match the steps visible in this function:
    content rows with words, minus artifact rows; columns minus
    header/footer/margin/ignored types. Each cell is produced by
    ``_ocr_single_cell`` with the words pre-assigned to its column.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    # Defaults cover the plain Tesseract case; the branches below override.
    use_rapid = False
    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        # RapidOCR is used only if installed and a BGR image was supplied.
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        if use_rapid:
            engine_name = "rapid"
    elif ocr_engine == "rapid":
        if RAPIDOCR_AVAILABLE:
            use_rapid = True
            engine_name = "rapid"
        else:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")

    all_content = [r for r in row_geometries if r.row_type == 'content']
    if not all_content:
        return

    rows = [r for r in all_content if r.word_count > 0]
    skipped = len(all_content) - len(rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
    if not rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    cols = [c for c in column_regions if c.type not in _skip_types]
    if not cols:
        return

    n_before_artifacts = len(rows)
    rows = [r for r in rows if not _is_artifact_row(r)]
    artifact_skipped = n_before_artifacts - len(rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
    if not rows:
        return

    # Gap healing is bounded by the vertical extent of the usable columns.
    _heal_row_gaps(
        rows,
        top_bound=min(c.y for c in cols),
        bottom_bound=max(c.y + c.height for c in cols),
    )

    # Left-to-right order defines col_index.
    cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': idx, 'type': region.type, 'x': region.x, 'width': region.width}
        for idx, region in enumerate(cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(rows) * len(cols)

    for row_idx, row in enumerate(rows):
        words_per_col = _assign_row_words_to_columns(row, cols)
        for col_idx, col in enumerate(cols):
            yield (
                _ocr_single_cell(
                    row_idx, col_idx, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    use_rapid, engine_name, lang, lang_map,
                    preassigned_words=words_per_col[col_idx],
                ),
                columns_meta,
                total_cells,
            )
@@ -0,0 +1,200 @@
"""
Vocabulary extraction: cells -> vocab entries, and build_word_grid wrapper.
Extracted from cv_cell_grid.py.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Any, Dict, List
from cv_ocr_engines import (
_attach_example_sentences,
_fix_phonetic_brackets,
_split_comma_entries,
)
from cv_cell_grid_legacy import build_cell_grid
from cv_cell_grid_merge import (
_merge_continuation_rows,
_merge_phonetic_continuation_rows,
_merge_wrapped_rows,
)
logger = logging.getLogger(__name__)
def _cells_to_vocab_entries(
cells: List[Dict[str, Any]],
columns_meta: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Map generic cells to vocab entries with english/german/example fields.
Groups cells by row_index, maps col_type -> field name, and produces
one entry per row (only rows with at least one non-empty field).
"""
col_type_to_field = {
'column_en': 'english',
'column_de': 'german',
'column_example': 'example',
'page_ref': 'source_page',
'column_marker': 'marker',
'column_text': 'text', # generic single-column (box sub-sessions)
}
bbox_key_map = {
'column_en': 'bbox_en',
'column_de': 'bbox_de',
'column_example': 'bbox_ex',
'page_ref': 'bbox_ref',
'column_marker': 'bbox_marker',
'column_text': 'bbox_text',
}
# Group cells by row_index
rows: Dict[int, List[Dict]] = {}
for cell in cells:
ri = cell['row_index']
rows.setdefault(ri, []).append(cell)
entries: List[Dict[str, Any]] = []
for row_idx in sorted(rows.keys()):
row_cells = rows[row_idx]
entry: Dict[str, Any] = {
'row_index': row_idx,
'english': '',
'german': '',
'example': '',
'text': '', # generic single-column (box sub-sessions)
'source_page': '',
'marker': '',
'confidence': 0.0,
'bbox': None,
'bbox_en': None,
'bbox_de': None,
'bbox_ex': None,
'bbox_ref': None,
'bbox_marker': None,
'bbox_text': None,
'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
}
confidences = []
for cell in row_cells:
col_type = cell['col_type']
field = col_type_to_field.get(col_type)
if field:
entry[field] = cell['text']
bbox_field = bbox_key_map.get(col_type)
if bbox_field:
entry[bbox_field] = cell['bbox_pct']
if cell['confidence'] > 0:
confidences.append(cell['confidence'])
# Compute row-level bbox as union of all cell bboxes
all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
if all_bboxes:
min_x = min(b['x'] for b in all_bboxes)
min_y = min(b['y'] for b in all_bboxes)
max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
entry['bbox'] = {
'x': round(min_x, 2),
'y': round(min_y, 2),
'w': round(max_x2 - min_x, 2),
'h': round(max_y2 - min_y, 2),
}
entry['confidence'] = round(
sum(confidences) / len(confidences), 1
) if confidences else 0.0
# Only include if at least one mapped field has text
has_content = any(
entry.get(f)
for f in col_type_to_field.values()
)
if has_content:
entries.append(entry)
return entries
def build_word_grid(
    ocr_img,
    column_regions,
    row_geometries,
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr=None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.

    Runs build_cell_grid() and, when an EN or DE column is present, maps the
    cells to english/german/example entries and post-processes them:
    wrapped/phonetic/continuation row merging, phonetic bracket replacement,
    comma splitting and example attachment. Without an EN or DE column the
    raw cells are returned instead; with no cells at all, an empty list.

    Args:
        ocr_img: Page image forwarded to build_cell_grid().
        column_regions: Classified columns.
        row_geometries: Detected rows.
        img_w, img_h: Image dimensions.
        lang: Default OCR language.
        ocr_engine: Engine selector forwarded to build_cell_grid().
        img_bgr: BGR color image forwarded to build_cell_grid().
        pronunciation: Forwarded to the phonetic bracket fix
            ('british' by default).

    Returns:
        List of entry dicts with english/german/example text and bbox info,
        or the raw cell dicts when no vocab columns were detected.
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )
    if not cells:
        return []

    # Without an EN or DE column there is no vocab layout to map onto.
    detected_types = {meta['type'] for meta in columns_meta}
    if detected_types.isdisjoint({'column_en', 'column_de'}):
        logger.info("build_word_grid: no vocab columns -- returning raw cells")
        return cells

    entries = _cells_to_vocab_entries(cells, columns_meta)
    n_raw = len(entries)

    # --- Post-processing pipeline (deterministic, no LLM) ---
    # Row merging, in order: cell-wrap continuation rows, phonetic-only rows,
    # multi-line continuation rows (lowercase EN, empty DE).
    for merge_step in (
        _merge_wrapped_rows,
        _merge_phonetic_continuation_rows,
        _merge_continuation_rows,
    ):
        entries = merge_step(entries)

    # Character confusion (| -> I, 1 -> I, 8 -> B) is not applied here; it
    # runs in llm_review_entries_streaming so the changes stay visible to
    # the user.

    # Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
    # Split comma-separated word forms (break, broke, broken -> 3 entries)
    entries = _split_comma_entries(entries)
    # Attach example sentences (rows without DE -> examples for preceding entry)
    entries = _attach_example_sentences(entries)

    engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
    logger.info(f"build_word_grid: {len(entries)} entries from "
                f"{n_raw} raw -> {len(entries)} after post-processing "
                f"(engine={engine_name})")

    return entries
@@ -0,0 +1,471 @@
"""
Embedded box detection and page zone splitting for the CV vocabulary pipeline.
Detects boxes (grammar tips, exercises, etc.) that span all or a substantial
part of the content width (at least ~25%) and interrupt the normal column
layout. Splits the page into vertical zones so that column detection can run
independently per zone.
Two-stage algorithm (both run, results merged):
1. Morphological line detection — finds bordered boxes via horizontal lines.
2. Background shading detection — finds shaded/colored boxes via median-blur
background analysis. Works for colored (blue, green) and grayscale
(gray shading on B/W scans) boxes.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import List, Optional, Tuple
import cv2
import numpy as np
from cv_vocab_types import DetectedBox, PageZone
logger = logging.getLogger(__name__)
__all__ = [
"detect_boxes",
"split_page_into_zones",
]
# ---------------------------------------------------------------------------
# Stage 1: Morphological line detection
# ---------------------------------------------------------------------------
def _detect_boxes_by_lines(
    gray: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
) -> List[DetectedBox]:
    """Locate bordered boxes as pairs of long horizontal lines.

    Dark pixels are isolated with a fixed threshold, a wide horizontal
    morphological opening keeps only long strokes, and image rows whose
    line-pixel count inside the content columns reaches 30% of the content
    width are grouped into line segments. Each unused segment is paired with
    the first later unused segment that gives a box height within limits.

    Args:
        gray: Grayscale image (full page).
        content_x, content_w: Horizontal content bounds.
        content_y, content_h: Vertical content bounds (``content_y`` is not
            read; ``content_h`` only limits the box height).

    Returns:
        List of DetectedBox, each spanning the full content width.
    """
    page_h, _ = gray.shape[:2]

    # Inverse binarization: dark ink becomes foreground.
    _, ink_mask = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)

    # Opening with a 1px-high kernel at least half the content width wide
    # removes everything except long horizontal strokes.
    opening_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(50, content_w // 2), 1)
    )
    line_mask = cv2.morphologyEx(ink_mask, cv2.MORPH_OPEN, opening_kernel)

    # Per-row count of line pixels inside the content columns.
    row_counts = np.sum(line_mask[:, content_x:content_x + content_w] > 0, axis=1)
    min_line_pixels = content_w * 0.30

    # Collapse consecutive "line" rows into (y_start, y_end) runs.
    segments: List[Tuple[int, int]] = []
    run_start: Optional[int] = None
    for y in range(page_h):
        if row_counts[y] >= min_line_pixels:
            if run_start is None:
                run_start = y
        elif run_start is not None:
            segments.append((run_start, y))
            run_start = None
    if run_start is not None:
        segments.append((run_start, page_h))

    if len(segments) < 2:
        return []

    # Box height limits: at least 30px, at most 70% of the content height.
    shortest = 30
    tallest = int(content_h * 0.70)

    found: List[DetectedBox] = []
    paired = set()
    for top_idx, (top_start, top_end) in enumerate(segments):
        if top_idx in paired:
            continue
        for bot_idx in range(top_idx + 1, len(segments)):
            if bot_idx in paired:
                continue
            bot_start, bot_end = segments[bot_idx]
            height = bot_end - top_start
            if not (shortest <= height <= tallest):
                continue

            # Border thickness is estimated from the thicker of the two lines.
            found.append(DetectedBox(
                x=content_x,
                y=top_start,
                width=content_w,
                height=height,
                confidence=0.8,
                border_thickness=max(top_end - top_start, bot_end - bot_start),
            ))
            paired.update((top_idx, bot_idx))
            break  # this top line is taken; continue with the next one

    return found
# ---------------------------------------------------------------------------
# Stage 2: Background shading detection (color + grayscale)
# ---------------------------------------------------------------------------
def _detect_boxes_by_shading(
    img_bgr: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
) -> List[DetectedBox]:
    """Locate boxes marked by a shaded or colored background (no border needed).

    A large median blur wipes out text so only the background remains. Pixels
    that are darker than the page background or noticeably saturated form a
    mask; its external contours are filtered by area, width, height and
    rectangularity, and each surviving region is re-checked for shading or
    color before it is reported.

    Args:
        img_bgr: BGR color image (full page).
        content_x, content_w: Horizontal content bounds (``content_x`` is
            not read; ``content_w`` scales the size thresholds).
        content_y, content_h: Vertical content bounds (``content_y`` is not
            read; ``content_h`` limits the box height).

    Returns:
        List of DetectedBox; confidence 0.7 for colored regions, 0.6 for
        gray-shaded ones.
    """
    page_h, page_w = img_bgr.shape[:2]

    # --- Background estimate: heavy median blur removes text strokes ---
    background = cv2.medianBlur(img_bgr, 31)
    bg_gray = cv2.cvtColor(background, cv2.COLOR_BGR2GRAY)
    bg_hsv = cv2.cvtColor(background, cv2.COLOR_BGR2HSV)

    # Page background level = median of the top-left and top-right corners.
    patch = max(20, min(page_h // 10, page_w // 10))
    page_bg = float(np.median(np.concatenate([
        bg_gray[:patch, :patch].ravel(),
        bg_gray[:patch, -patch:].ravel(),
    ])))

    # Mask 1: noticeably darker than the page background (gray shading).
    darker = (bg_gray < max(page_bg - 30, 150)).astype(np.uint8) * 255
    # Mask 2: noticeable saturation (colored boxes).
    saturated = (bg_hsv[:, :, 1] > 20).astype(np.uint8) * 255
    mask = cv2.bitwise_or(darker, saturated)

    # Close small gaps first, then open to drop small noise.
    mask = cv2.morphologyEx(
        mask, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT, (25, 10))
    )
    mask = cv2.morphologyEx(
        mask, cv2.MORPH_OPEN, cv2.getStructuringElement(cv2.MORPH_RECT, (10, 5))
    )

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Size limits: area of a 30px-tall full-width strip, width >= 25% of the
    # content width, height between 25px and 70% of the content height.
    smallest_area = content_w * 30
    narrowest = content_w * 0.25
    shortest = 25
    tallest = int(content_h * 0.70)

    found: List[DetectedBox] = []
    for contour in contours:
        contour_area = cv2.contourArea(contour)
        if contour_area < smallest_area:
            continue

        bx, by, bw, bh = cv2.boundingRect(contour)
        if bw < narrowest or bh < shortest or bh > tallest:
            continue

        # Rectangularity: the contour must fill at least half of its
        # bounding rectangle.
        bounding_area = bw * bh
        if bounding_area > 0 and contour_area / bounding_area < 0.5:
            continue

        # Confirm on the blurred image that the region really is shaded/colored.
        region_gray = bg_gray[by:by + bh, bx:bx + bw]
        if region_gray.size == 0:
            continue
        region_sat = bg_hsv[by:by + bh, bx:bx + bw][:, :, 1]

        shaded = float(np.median(region_gray)) < (page_bg - 15)
        colored = float(np.median(region_sat)) > 15
        if not (shaded or colored):
            continue

        found.append(DetectedBox(
            x=bx,
            y=by,
            width=bw,
            height=bh,
            confidence=0.7 if colored else 0.6,
            border_thickness=0,
        ))

    return found
# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------
def _validate_box(
box: DetectedBox,
gray: np.ndarray,
content_w: int,
content_h: int,
median_row_gap: int,
) -> bool:
"""Validate that a detected box is genuine (not a table-row separator etc.)."""
# Must span > 25% of content width (lowered from 60% to allow smaller boxes)
if box.width < content_w * 0.25:
return False
# Height constraints
if box.height < 25 or box.height > content_h * 0.70:
return False
# Must not be confused with a table-row separator:
# real boxes are at least 3x the median row gap
if median_row_gap > 0 and box.height < median_row_gap * 3:
return False
# Must contain some text (ink density check)
h, w = gray.shape[:2]
y1 = max(0, box.y)
y2 = min(h, box.y + box.height)
x1 = max(0, box.x)
x2 = min(w, box.x + box.width)
roi = gray[y1:y2, x1:x2]
if roi.size == 0:
return False
ink_ratio = np.sum(roi < 128) / roi.size
if ink_ratio < 0.002: # nearly empty → not a real content box
return False
return True
# ---------------------------------------------------------------------------
# Public API: detect_boxes
# ---------------------------------------------------------------------------
def _merge_overlapping_boxes(boxes: List[DetectedBox]) -> List[DetectedBox]:
    """Drop the weaker box of every significantly overlapping pair.

    Two boxes conflict when their intersection covers more than 50% of the
    smaller box's area (full containment is therefore always a conflict).
    Of two conflicting boxes the one with the higher confidence survives;
    on equal confidence the larger one survives.

    Candidates are visited strongest-first, so a box is only removed by a
    box that itself survives. A box is never lost merely because it
    overlapped a candidate that was later discarded.

    Args:
        boxes: Candidate boxes; x, y, width, height and confidence are read.

    Returns:
        The surviving boxes, ordered by area (largest first). Inputs with
        zero or one element are returned unchanged.
    """
    if len(boxes) <= 1:
        return boxes

    def _area(box) -> int:
        return box.width * box.height

    def _conflicts(p, q) -> bool:
        # Intersection rectangle; non-positive extent means no overlap.
        inter_w = min(p.x + p.width, q.x + q.width) - max(p.x, q.x)
        inter_h = min(p.y + p.height, q.y + q.height) - max(p.y, q.y)
        if inter_w <= 0 or inter_h <= 0:
            return False
        smaller_area = min(_area(p), _area(q))
        return smaller_area > 0 and (inter_w * inter_h) / smaller_area > 0.50

    # Strongest first: confidence, then area as the tie-breaker.
    ranked = sorted(boxes, key=lambda b: (b.confidence, _area(b)), reverse=True)

    survivors: List[DetectedBox] = []
    for candidate in ranked:
        if not any(_conflicts(candidate, kept) for kept in survivors):
            survivors.append(candidate)

    # Preserve the previous output convention (largest boxes first).
    survivors.sort(key=_area, reverse=True)
    return survivors
def detect_boxes(
    img_bgr: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
    median_row_gap: int = 0,
) -> List[DetectedBox]:
    """Find embedded boxes on a page image.

    Both detectors (border lines and background shading) are run, their
    candidates are pooled, de-duplicated and validated.

    Args:
        img_bgr: BGR color image (full page or cropped).
        content_x, content_w: Horizontal content bounds.
        content_y, content_h: Vertical content bounds.
        median_row_gap: Median row gap height, used to filter out table
            row separators.

    Returns:
        Validated DetectedBox instances ordered by ascending y.
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    bounds = (content_x, content_w, content_y, content_h)

    # Stage 1: bordered boxes; stage 2: shaded / colored boxes.
    line_boxes = _detect_boxes_by_lines(gray, *bounds)
    shade_boxes = _detect_boxes_by_shading(img_bgr, *bounds)
    n_line, n_shade = len(line_boxes), len(shade_boxes)
    logger.debug("BoxDetect: %d line-based, %d shading-based candidates",
                 n_line, n_shade)

    # Pool the candidates and remove duplicates.
    merged = _merge_overlapping_boxes(line_boxes + shade_boxes)

    # Keep only plausible boxes, ordered top to bottom.
    validated = sorted(
        (
            candidate
            for candidate in merged
            if _validate_box(candidate, gray, content_w, content_h, median_row_gap)
        ),
        key=lambda b: b.y,
    )

    if not validated:
        logger.debug("BoxDetect: no boxes detected")
        return validated

    logger.info("BoxDetect: %d box(es) detected (line=%d, shade=%d, merged=%d)",
                len(validated), n_line, n_shade, len(merged))
    return validated
# ---------------------------------------------------------------------------
# Zone Splitting
# ---------------------------------------------------------------------------
def split_page_into_zones(
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    boxes: List[DetectedBox],
    min_zone_height: int = 40,
) -> List[PageZone]:
    """Split a page into vertical zones based on detected boxes.

    Regions above, between, and below boxes become 'content' zones; each
    box becomes a 'box' zone. Content zones span the full content width,
    box zones use the box's own geometry.

    Args:
        content_x, content_y, content_w, content_h: Content area bounds.
        boxes: Detected boxes. They are processed in ascending y order;
            unsorted input is sorted here.
        min_zone_height: Minimum height for a content zone to be kept.

    Returns:
        List of PageZone ordered top to bottom, indexed consecutively
        from 0. Without boxes a single content zone covering the whole
        content area is returned.
    """
    if not boxes:
        # Single zone: entire content area
        return [PageZone(
            index=0,
            zone_type='content',
            y=content_y,
            height=content_h,
            x=content_x,
            width=content_w,
        )]

    zones: List[PageZone] = []
    cursor_y = content_y
    content_bottom = content_y + content_h

    def _content_zone(top: int, height: int) -> PageZone:
        # Content zones always span the full content width.
        return PageZone(
            index=len(zones),
            zone_type='content',
            y=top,
            height=height,
            x=content_x,
            width=content_w,
        )

    for box in sorted(boxes, key=lambda b: b.y):
        # Content zone above this box (negative when boxes overlap vertically)
        gap_above = box.y - cursor_y
        if gap_above >= min_zone_height:
            zones.append(_content_zone(cursor_y, gap_above))

        zones.append(PageZone(
            index=len(zones),
            zone_type='box',
            y=box.y,
            height=box.height,
            x=box.x,
            width=box.width,
            box=box,
        ))

        # Never move the cursor upwards: a box that ends above the bottom
        # of an earlier (taller) box must not re-open the area that the
        # earlier box already covers.
        cursor_y = max(cursor_y, box.y + box.height)

    # Content zone below last box
    remaining = content_bottom - cursor_y
    if remaining >= min_zone_height:
        zones.append(_content_zone(cursor_y, remaining))

    logger.info(
        "ZoneSplit: %d zones from %d box(es): %s",
        len(zones), len(boxes), [z.zone_type for z in zones],
    )
    return zones
@@ -0,0 +1,339 @@
"""
Box layout classifier — detects internal layout type of embedded boxes.
Classifies each box as: flowing | columnar | bullet_list | header_only
and provides layout-appropriate grid building.
Used by the Box-Grid-Review step to rebuild box zones with correct structure.
"""
import logging
import re
import statistics
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# Bullet / list-item patterns at the start of a line.
# Every alternative requires a whitespace character after the marker, so a
# token that consists of the marker alone (e.g. "-") does not match.
_BULLET_RE = re.compile(
    r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s'  # dash, bullet chars
    r'|^\d{1,2}[.)]\s'  # numbered: "1) " or "1. "
    r'|^[a-z][.)]\s'  # lettered (lower-case only): "a) " or "a. "
)
def classify_box_layout(
    words: List[Dict],
    box_w: int,
    box_h: int,
) -> str:
    """Classify the internal layout of a detected box.

    The checks run in a fixed order and the first match wins:
    header_only -> bullet_list -> columnar -> flowing.

    Args:
        words: OCR word dicts within the box (with top, left, width, height, text)
        box_w: Box width in pixels
        box_h: Box height in pixels (not used by the current heuristics)

    Returns:
        'header_only' | 'bullet_list' | 'columnar' | 'flowing'
    """
    if not words:
        return "header_only"

    # Group words into lines by y-proximity
    lines = _group_into_lines(words)

    # Header only: very few words (<= 5) or a single line
    total_words = sum(len(line) for line in lines)
    if total_words <= 5 or len(lines) <= 1:
        return "header_only"

    # Bullet list: count lines whose FIRST word looks like a list marker.
    # Only the first word's text is tested, not the joined line text.
    bullet_count = 0
    for line in lines:
        first_text = line[0].get("text", "") if line else ""
        if _BULLET_RE.match(first_text):
            bullet_count += 1
        # Also check if first word IS a bullet char
        # NOTE(review): several entries below are empty strings as seen here —
        # presumably bullet glyphs lost in an encoding step; verify against
        # the original file. As written, an empty first word counts as a bullet.
        elif first_text.strip() in ("-", "", "", "", "·", "", ""):
            bullet_count += 1
    # At least 40% of the lines and at least two lines must start with a marker
    if bullet_count >= len(lines) * 0.4 and bullet_count >= 2:
        return "bullet_list"

    # Columnar: check for multiple distinct x-clusters
    if len(lines) >= 3 and _has_column_structure(words, box_w):
        return "columnar"

    # Default: flowing text
    return "flowing"
def _group_into_lines(words: List[Dict]) -> List[List[Dict]]:
    """Cluster word dicts into text lines by vertical proximity.

    Words are ordered by (top, left). A word joins the current line while
    its top lies within the tolerance of the line's first word; the
    tolerance is half the median positive word height (20 if no height is
    available), but never below 5. Each returned line is ordered by left.
    """
    if not words:
        return []

    def _left(word: Dict):
        return word["left"]

    ordered = sorted(words, key=lambda w: (w["top"], w["left"]))
    positive_heights = [w["height"] for w in ordered if w.get("height", 0) > 0]
    typical_height = statistics.median(positive_heights) if positive_heights else 20
    tolerance = max(typical_height * 0.5, 5)

    grouped: List[List[Dict]] = []
    head, *tail = ordered
    bucket: List[Dict] = [head]
    anchor_top = head["top"]  # top of the word that opened the current line

    for word in tail:
        if abs(word["top"] - anchor_top) <= tolerance:
            bucket.append(word)
            continue
        # Word is too far below: close the line and open a new one.
        grouped.append(sorted(bucket, key=_left))
        bucket = [word]
        anchor_top = word["top"]

    grouped.append(sorted(bucket, key=_left))
    return grouped
def _has_column_structure(words: List[Dict], box_w: int) -> bool:
    """Check if words have multiple distinct left-edge clusters (columns).

    The left edges of all non-first words of every line are sorted; the box
    counts as columnar when at least one gap between neighbouring edges
    exceeds 15% of the box width.

    Args:
        words: OCR word dicts (top, left, height are read).
        box_w: Box width in pixels; non-positive widths yield False.

    Returns:
        True if a column-sized gap exists, otherwise False. Fewer than
        three lines or fewer than four collected edges yield False.
    """
    if box_w <= 0:
        return False

    lines = _group_into_lines(words)
    if len(lines) < 3:
        return False

    # Collect left-edges of non-first words in each line
    # (first word of each line often aligns regardless of columns)
    left_edges = sorted(w["left"] for line in lines for w in line[1:])
    if len(left_edges) < 4:
        return False

    # A column gap is typically > 15% of box width
    column_gap_threshold = box_w * 0.15
    return any(
        right - left > column_gap_threshold
        for left, right in zip(left_edges, left_edges[1:])
    )
def build_box_zone_grid(
    zone_words: List[Dict],
    box_x: int,
    box_y: int,
    box_w: int,
    box_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    layout_type: Optional[str] = None,
) -> Dict[str, Any]:
    """Build a grid for a box zone with layout-aware processing.

    If layout_type is None (or empty), it is auto-detected with
    classify_box_layout.

    - 'header_only': a single header row with one cell holding all text.
    - 'flowing' / 'bullet_list': a single column; lines are grouped into
      items by indentation, one row and one cell per item.
    - anything else ('columnar'): delegated to _build_zone_grid with
      independent column detection.

    Args:
        zone_words: OCR word dicts inside the box (top, left, height, text).
        box_x, box_y, box_w, box_h: Box geometry in pixels.
        zone_index: Zone number, used in cell ids ("Z{zone}_R{row}C{col}")
            and log messages.
        img_w, img_h: Image size used for the percentage fields; a falsy
            value yields 0 for those fields.
        layout_type: Optional layout override.

    Returns:
        Dict with columns, rows, cells and header_rows (the same format as
        _build_zone_grid, according to the original docstring), plus
        box_layout_type and box_grid_reviewed=False.
    """
    # NOTE(review): _cluster_rows is imported but not used in this function.
    from grid_editor_helpers import _build_zone_grid, _cluster_rows

    if not zone_words:
        # Nothing to lay out: empty grid, layout defaults to "header_only".
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
            "box_layout_type": layout_type or "header_only",
            "box_grid_reviewed": False,
        }

    # Auto-detect layout if not specified
    if not layout_type:
        layout_type = classify_box_layout(zone_words, box_w, box_h)

    logger.info(
        "Box zone %d: layout_type=%s, %d words, %dx%d",
        zone_index, layout_type, len(zone_words), box_w, box_h,
    )

    if layout_type == "header_only":
        # Single cell with all text concatenated (reading order: top, then left)
        all_text = " ".join(
            w.get("text", "") for w in sorted(zone_words, key=lambda ww: (ww["top"], ww["left"]))
        ).strip()
        return {
            "columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
                         "x_min_px": box_x, "x_max_px": box_x + box_w,
                         "x_min_pct": round(box_x / img_w * 100, 2) if img_w else 0,
                         "x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
                         "bold": False}],
            "rows": [{"index": 0, "row_index": 0,
                      "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2,
                      "y_min_px": box_y, "y_max_px": box_y + box_h,
                      "y_min_pct": round(box_y / img_h * 100, 2) if img_h else 0,
                      "y_max_pct": round((box_y + box_h) / img_h * 100, 2) if img_h else 0,
                      "is_header": True}],
            "cells": [{
                "cell_id": f"Z{zone_index}_R0C0",
                "row_index": 0,
                "col_index": 0,
                "col_type": "column_1",
                "text": all_text,
                "word_boxes": zone_words,
            }],
            "header_rows": [0],
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    if layout_type in ("flowing", "bullet_list"):
        # Force single column — each logical item becomes one row with one cell.
        # Detect bullet structure from indentation and merge continuation
        # lines into the bullet they belong to.
        lines = _group_into_lines(zone_words)
        column = {
            "col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
            "x_min_px": box_x, "x_max_px": box_x + box_w,
            "x_min_pct": round(box_x / img_w * 100, 2) if img_w else 0,
            "x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
            "bold": False,
        }

        # --- Detect indentation levels ---
        # Indent = leftmost word edge of the line relative to the box's left edge.
        line_indents = []
        for line_words in lines:
            if not line_words:
                line_indents.append(0)
                continue
            min_left = min(w["left"] for w in line_words)
            line_indents.append(min_left - box_x)

        # Find the minimum indent (= bullet/main level); negative indents
        # (words left of the box edge) are ignored for this minimum.
        valid_indents = [ind for ind in line_indents if ind >= 0]
        min_indent = min(valid_indents) if valid_indents else 0

        # Indentation threshold: lines indented > 15px more than minimum
        # are continuation lines belonging to the previous bullet
        INDENT_THRESHOLD = 15

        # --- Group lines into logical items (bullet + continuations) ---
        # Each item is a list of line indices
        items: List[List[int]] = []
        for li, indent in enumerate(line_indents):
            # The very first line can never be a continuation (no previous item).
            is_continuation = (indent > min_indent + INDENT_THRESHOLD) and len(items) > 0
            if is_continuation:
                items[-1].append(li)
            else:
                items.append([li])

        logger.info(
            "Box zone %d flowing: %d lines → %d items (indents=%s, min=%d, threshold=%d)",
            zone_index, len(lines), len(items),
            [int(i) for i in line_indents], int(min_indent), INDENT_THRESHOLD,
        )

        # --- Build rows and cells from grouped items ---
        rows = []
        cells = []
        header_rows = []
        for row_idx, item_line_indices in enumerate(items):
            # Collect all words from all lines in this item
            item_words = []
            item_texts = []
            for li in item_line_indices:
                if li < len(lines):
                    item_words.extend(lines[li])
                    line_text = " ".join(w.get("text", "") for w in lines[li]).strip()
                    if line_text:
                        item_texts.append(line_text)
            if not item_words:
                # Skipped items leave a gap in the row indices.
                continue

            # Vertical extent of the item = union of its word boxes.
            y_min = min(w["top"] for w in item_words)
            y_max = max(w["top"] + w["height"] for w in item_words)
            y_center = (y_min + y_max) / 2
            # NOTE(review): "is_header" stays False here even for a row that
            # is later listed in header_rows — confirm consumers rely on
            # header_rows only.
            row = {
                "index": row_idx,
                "row_index": row_idx,
                "y_min": y_min,
                "y_max": y_max,
                "y_center": y_center,
                "y_min_px": y_min,
                "y_max_px": y_max,
                "y_min_pct": round(y_min / img_h * 100, 2) if img_h else 0,
                "y_max_pct": round(y_max / img_h * 100, 2) if img_h else 0,
                "is_header": False,
            }
            rows.append(row)

            # Join multi-line text with newline for display
            merged_text = "\n".join(item_texts)

            # Add bullet marker if this is a bullet item without one.
            # An item counts as a bullet when it has continuation lines or
            # its first line matches _BULLET_RE; the first row is never prefixed.
            first_text = item_texts[0] if item_texts else ""
            is_bullet = len(item_line_indices) > 1 or _BULLET_RE.match(first_text)
            if is_bullet and not _BULLET_RE.match(first_text) and row_idx > 0:
                # Continuation item without bullet — add one
                # NOTE(review): the prefix literal is an empty string as seen
                # here — presumably a bullet glyph lost in encoding; verify.
                merged_text = "" + merged_text

            cell = {
                "cell_id": f"Z{zone_index}_R{row_idx}C0",
                "row_index": row_idx,
                "col_index": 0,
                "col_type": "column_1",
                "text": merged_text,
                "word_boxes": item_words,
            }
            cells.append(cell)

        # Detect header: with at least two items, the first item is marked as
        # header when its joined text is shorter than 40 characters, is all
        # upper-case, or ends with ':'. Continuation lines are not checked.
        if len(items) >= 2:
            first_item_texts = []
            for li in items[0]:
                if li < len(lines):
                    first_item_texts.append(" ".join(w.get("text", "") for w in lines[li]).strip())
            first_text = " ".join(first_item_texts)
            if (len(first_text) < 40
                    or first_text.isupper()
                    or first_text.rstrip().endswith(':')):
                header_rows = [0]

        return {
            "columns": [column],
            "rows": rows,
            "cells": cells,
            "header_rows": header_rows,
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    # Columnar: use standard grid builder with independent column detection
    result = _build_zone_grid(
        zone_words, box_x, box_y, box_w, box_h,
        zone_index, img_w, img_h,
        global_columns=None,  # detect columns independently
    )
    # Colspan detection is now handled generically by _detect_colspan_cells
    # in grid_editor_helpers.py (called inside _build_zone_grid).
    result["box_layout_type"] = layout_type
    result["box_grid_reviewed"] = False
    return result
@@ -0,0 +1,312 @@
"""
Color detection for OCR word boxes.
Detects the text color of existing OCR words and recovers colored text
regions (e.g. red markers, blue headings) that standard OCR may have missed.
Standard OCR (Tesseract, PaddleOCR) binarises images before processing,
destroying all color information. This module adds it back by sampling
HSV pixel values at word-box positions and finding colored regions that
no word-box covers.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy as np
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# HSV color ranges (OpenCV: H 0-180, S 0-255, V 0-255)
# ---------------------------------------------------------------------------
# Per-color list of (lower, upper) HSV bounds. Red needs two ranges because
# its hue sits at both ends of the 0-180 hue scale. All ranges require
# S >= 70 and V >= 50.
# NOTE(review): hues between 85 and 100 are not covered by any range here,
# while _hue_to_color_name maps that interval to "blue" — confirm intended.
_COLOR_RANGES: Dict[str, List[Tuple[np.ndarray, np.ndarray]]] = {
    "red": [
        (np.array([0, 70, 50]), np.array([10, 255, 255])),
        (np.array([170, 70, 50]), np.array([180, 255, 255])),
    ],
    "orange": [
        (np.array([10, 70, 50]), np.array([25, 255, 255])),
    ],
    "yellow": [
        (np.array([25, 70, 50]), np.array([35, 255, 255])),
    ],
    "green": [
        (np.array([35, 70, 50]), np.array([85, 255, 255])),
    ],
    "blue": [
        (np.array([100, 70, 50]), np.array([130, 255, 255])),
    ],
    "purple": [
        (np.array([130, 70, 50]), np.array([170, 255, 255])),
    ],
}

# Hex color assigned to each color name in the word-box annotations.
# "gray" has an entry but is not produced by the code visible in this module.
_COLOR_HEX: Dict[str, str] = {
    "black": "#000000",
    "gray": "#6b7280",
    "red": "#dc2626",
    "orange": "#ea580c",
    "yellow": "#ca8a04",
    "green": "#16a34a",
    "blue": "#2563eb",
    "purple": "#9333ea",
}
def _hue_to_color_name(hue: float) -> str:
    """Translate an OpenCV hue value (0-180) into a color name.

    Red wraps around both ends of the hue scale; every other color owns a
    half-open interval that ends at its exclusive upper bound.
    """
    if hue < 10 or hue > 170:
        return "red"

    # (exclusive upper bound, name), ascending
    upper_bounds = (
        (25, "orange"),
        (35, "yellow"),
        (85, "green"),
        (130, "blue"),
    )
    for bound, name in upper_bounds:
        if hue < bound:
            return name
    return "purple"
# ---------------------------------------------------------------------------
# 1. Color annotation for existing word boxes
# ---------------------------------------------------------------------------
def detect_word_colors(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    sat_threshold: int = 55,
    min_sat_ratio: float = 0.25,
) -> None:
    """Annotate each word_box in-place with its detected text color.

    Adds ``color`` (hex string) and ``color_name`` (e.g. 'red', 'black')
    keys to each dict. Words that cannot be classified are marked black.

    Algorithm per word:
    1. Crop the word region from the image (clipped to the image bounds).
    2. Otsu-threshold for text/background separation, OR-ed with a mask
       of highly saturated pixels.
    3. Sample background color from border pixels of the crop.
    4. Remove text pixels that match the background (avoids colored
       backgrounds like blue boxes leaking into the result).
    5. Use **median** hue (robust to outliers) and require a minimum
       ratio of saturated pixels before classifying as colored.

    Args:
        img_bgr: BGR page image; nothing happens if it is None.
        word_boxes: Word dicts with left, top, width, height; mutated in place.
        sat_threshold: Saturation above which a pixel counts as colored.
        min_sat_ratio: Minimum share of saturated text pixels for a
            word to be classified as colored.
    """
    if img_bgr is None or not word_boxes:
        return

    img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    img_h, img_w = img_bgr.shape[:2]
    colored_count = 0

    for wb in word_boxes:
        # Clip the word box to the image.
        x1 = max(0, int(wb["left"]))
        y1 = max(0, int(wb["top"]))
        x2 = min(img_w, int(wb["left"] + wb["width"]))
        y2 = min(img_h, int(wb["top"] + wb["height"]))
        if x2 <= x1 or y2 <= y1:
            # Empty crop (box outside the image or zero-sized): default to black.
            wb["color"] = _COLOR_HEX["black"]
            wb["color_name"] = "black"
            continue

        crop_hsv = img_hsv[y1:y2, x1:x2]
        crop_bgr = img_bgr[y1:y2, x1:x2]
        crop_gray = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2GRAY)
        ch, cw = crop_hsv.shape[:2]

        # --- Text mask: Otsu (adaptive) + high-saturation pixels ---
        _, dark_mask = cv2.threshold(
            crop_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU,
        )
        sat_mask = (crop_hsv[:, :, 1] > sat_threshold).astype(np.uint8) * 255
        text_mask = cv2.bitwise_or(dark_mask, sat_mask)
        text_pixels = crop_hsv[text_mask > 0]  # rows of (H, S, V)

        if len(text_pixels) < 3:
            # Too few text pixels for a meaningful median.
            wb["color"] = _COLOR_HEX["black"]
            wb["color_name"] = "black"
            continue

        # --- Background subtraction via border pixels ---
        # Sample background from the 2px border ring of the crop
        # (only for crops larger than 6x6 px).
        if ch > 6 and cw > 6:
            border = 2
            bg_top = crop_hsv[:border, :].reshape(-1, 3)
            bg_bot = crop_hsv[-border:, :].reshape(-1, 3)
            bg_lft = crop_hsv[border:-border, :border].reshape(-1, 3)
            bg_rgt = crop_hsv[border:-border, -border:].reshape(-1, 3)
            bg_pixels = np.vstack([bg_top, bg_bot, bg_lft, bg_rgt])
            bg_med_h = float(np.median(bg_pixels[:, 0]))
            bg_med_s = float(np.median(bg_pixels[:, 1]))

            # If background is tinted (S > 15), remove text pixels
            # with similar hue to avoid false colored detections
            if bg_med_s > 15:
                # Circular hue distance on the 0-180 hue scale.
                hue_diff = np.minimum(
                    np.abs(text_pixels[:, 0].astype(float) - bg_med_h),
                    180.0 - np.abs(text_pixels[:, 0].astype(float) - bg_med_h),
                )
                keep = hue_diff > 20
                # If every pixel resembles the background, keep them all.
                if np.any(keep):
                    text_pixels = text_pixels[keep]
                    if len(text_pixels) < 3:
                        wb["color"] = _COLOR_HEX["black"]
                        wb["color_name"] = "black"
                        continue

        # --- Classification using MEDIAN (robust to outliers) ---
        median_sat = float(np.median(text_pixels[:, 1]))
        sat_count = int(np.sum(text_pixels[:, 1] > sat_threshold))
        sat_ratio = sat_count / len(text_pixels)

        if median_sat < sat_threshold or sat_ratio < min_sat_ratio:
            wb["color"] = _COLOR_HEX["black"]
            wb["color_name"] = "black"
        else:
            # Use median hue of saturated pixels only for cleaner signal
            # NOTE(review): the median is taken on the raw 0-180 hue values.
            # Red sits at both ends of that scale, so a mix of low and
            # high red hues could yield a mid-range median — confirm whether
            # this matters for real scans.
            sat_pixels = text_pixels[text_pixels[:, 1] > sat_threshold]
            median_hue = float(np.median(sat_pixels[:, 0]))
            name = _hue_to_color_name(median_hue)

            # Red requires higher saturation — scanner artifacts on black
            # text often produce a slight warm tint (hue ~0) with low
            # saturation that would otherwise be misclassified as red.
            if name == "red" and median_sat < 90:
                wb["color"] = _COLOR_HEX["black"]
                wb["color_name"] = "black"
                continue

            wb["color"] = _COLOR_HEX.get(name, _COLOR_HEX["black"])
            wb["color_name"] = name
            colored_count += 1

    if colored_count:
        logger.info("color annotation: %d / %d words are colored",
                    colored_count, len(word_boxes))
# ---------------------------------------------------------------------------
# 2. Recover colored text that OCR missed
# ---------------------------------------------------------------------------
def recover_colored_text(
    img_bgr: np.ndarray,
    existing_words: List[Dict],
    min_area: int = 40,
    max_regions: int = 60,
) -> List[Dict]:
    """Find colored text regions not covered by any existing word box.

    Returns a list of recovered word dicts with ``color``, ``color_name``,
    and ``recovered=True`` fields. The ``text`` is set via a lightweight
    shape heuristic (see _identify_shape, e.g. ``!`` for tall narrow
    shapes) or ``?``.

    Args:
        img_bgr: BGR page image; an empty list is returned if it is None.
        existing_words: Word dicts (left, top, width, height) already found
            by OCR; their padded boxes are excluded from the search.
        min_area: Minimum contour area for a candidate region.
        max_regions: Maximum number of regions kept **per color** (the
            limit is applied inside the per-color loop).

    Returns:
        Recovered word dicts, grouped by color in _COLOR_RANGES order,
        largest contour area first within a color. ``conf`` is fixed at 45.
    """
    if img_bgr is None:
        return []

    img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    ih, iw = img_bgr.shape[:2]
    # Regions larger than 0.5% of the image area are not considered text.
    max_area = int(ih * iw * 0.005)

    # --- Build occupancy mask from existing words (adaptive padding) ---
    # Pad word boxes generously to prevent colored-pixel artifacts in
    # narrow inter-word gaps from being recovered as false characters.
    heights = [wb["height"] for wb in existing_words if wb.get("height", 0) > 0]
    median_h = int(np.median(heights)) if heights else 20
    pad = max(8, int(median_h * 0.35))

    occupied = np.zeros((ih, iw), dtype=np.uint8)
    for wb in existing_words:
        x1 = max(0, int(wb["left"]) - pad)
        y1 = max(0, int(wb["top"]) - pad)
        x2 = min(iw, int(wb["left"] + wb["width"]) + pad)
        y2 = min(ih, int(wb["top"] + wb["height"]) + pad)
        occupied[y1:y2, x1:x2] = 255

    recovered: List[Dict] = []

    for color_name, ranges in _COLOR_RANGES.items():
        # Create mask for this color (union of all its HSV ranges)
        mask = np.zeros((ih, iw), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(img_hsv, lower, upper))

        # Remove pixels already covered by existing OCR words
        mask = cv2.bitwise_and(mask, cv2.bitwise_not(occupied))

        # Morphological cleanup:
        # - Close with tall kernel to merge ! stroke + dot
        # - Open to remove noise specks
        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 8))
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel_close)
        kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel_open)

        contours, _ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
        )

        candidates = []
        for cnt in contours:
            area = cv2.contourArea(cnt)
            if area < min_area or area > max_area:
                continue
            bx, by, bw, bh = cv2.boundingRect(cnt)
            # Regions lower than 6 px are skipped.
            if bh < 6:
                continue
            # Reject regions too wide to be single characters
            if bw > median_h * 4:
                continue
            candidates.append((area, bx, by, bw, bh))

        # Keep largest first, limited count
        candidates.sort(key=lambda c: c[0], reverse=True)

        for area, bx, by, bw, bh in candidates[:max_regions]:
            text = _identify_shape(bw, bh)
            recovered.append({
                "text": text,
                "left": bx,
                "top": by,
                "width": bw,
                "height": bh,
                "conf": 45,
                "color": _COLOR_HEX.get(color_name, "#000000"),
                "color_name": color_name,
                "recovered": True,
            })

    if recovered:
        # Log the total and a per-color breakdown (color names sorted).
        logger.info(
            "color recovery: %d colored regions found (%s)",
            len(recovered),
            ", ".join(
                f"{c}: {sum(1 for r in recovered if r['color_name'] == c)}"
                for c in sorted({r["color_name"] for r in recovered})
            ),
        )

    return recovered
def _identify_shape(w: int, h: int) -> str:
    """Simple shape heuristic for common single-character text markers.

    Classifies a region by its bounding-box size only:
    tall and narrow -> "!", small and roughly square -> bullet/dot marker,
    anything else -> "?". A zero height is treated as aspect ratio 1.0.
    """
    aspect = w / h if h > 0 else 1.0
    if aspect < 0.55 and h > 10:
        # Tall, narrow — likely exclamation mark
        return "!"
    if 0.6 < aspect < 1.5 and max(w, h) < 25:
        # Small, roughly square — bullet or dot
        # NOTE(review): the literal is an empty string as seen here —
        # presumably a bullet glyph lost in encoding; verify against the
        # original file.
        return ""
    return "?"
@@ -0,0 +1,413 @@
"""
PP-DocLayout ONNX Document Layout Detection.
Uses PP-DocLayout ONNX model to detect document structure regions:
table, figure, title, text, list, header, footer, equation, reference, abstract
Fallback: If ONNX model not available, returns empty list (caller should
fall back to OpenCV-based detection in cv_graphic_detect.py).
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
logger = logging.getLogger(__name__)
__all__ = [
"detect_layout_regions",
"is_doclayout_available",
"get_doclayout_status",
"LayoutRegion",
"DOCLAYOUT_CLASSES",
]
# ---------------------------------------------------------------------------
# Class labels (PP-DocLayout default order)
# ---------------------------------------------------------------------------
# Index -> label mapping used to translate the model's class index into a
# label; indices outside this list are reported as "class_<index>".
# NOTE(review): the order and number of classes must match the exported
# model's label list — verify against the model actually deployed.
DOCLAYOUT_CLASSES = [
    "table", "figure", "title", "text", "list",
    "header", "footer", "equation", "reference", "abstract",
]
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@dataclass
class LayoutRegion:
    """A detected document layout region.

    Instances are created by the post-processing step, which fills the
    geometry in original-image pixel coordinates.
    """
    x: int  # left edge in pixels
    y: int  # top edge in pixels
    width: int  # region width in pixels
    height: int  # region height in pixels
    label: str  # table, figure, title, text, list, etc.
    confidence: float  # detection score
    label_index: int  # raw class index
# ---------------------------------------------------------------------------
# ONNX model loading
# ---------------------------------------------------------------------------
# Candidate model locations, checked in order; empty entries are skipped.
# The environment variable is read once, when this module is imported, so
# later changes to DOCLAYOUT_ONNX_PATH are not picked up.
_MODEL_SEARCH_PATHS = [
    # 1. Explicit environment variable
    os.environ.get("DOCLAYOUT_ONNX_PATH", ""),
    # 2. Docker default cache path
    "/root/.cache/huggingface/onnx/pp-doclayout/model.onnx",
    # 3. Local dev relative to working directory
    "models/onnx/pp-doclayout/model.onnx",
]

# Module-level loader state, written by _load_onnx_session.
_onnx_session: Optional[object] = None  # loaded session, or None
_model_path: Optional[str] = None  # path the session was loaded from
_load_attempted: bool = False  # True after the first load attempt
_load_error: Optional[str] = None  # reason of the last load failure
def _find_model_path() -> Optional[str]:
    """Return the resolved path of the first existing model file, or None.

    Candidates come from _MODEL_SEARCH_PATHS and are tried in order;
    empty entries are ignored.
    """
    candidates = (Path(entry) for entry in _MODEL_SEARCH_PATHS if entry)
    match = next((cand for cand in candidates if cand.is_file()), None)
    if match is None:
        return None
    return str(match.resolve())
def _load_onnx_session() -> Optional[object]:
    """Lazy-load the ONNX runtime session (once).

    Only the first call attempts to load; every later call returns the
    cached result, including a cached failure (None). On failure the
    reason is stored in the module-level ``_load_error``.

    Returns:
        The inference session, or None if the model file was not found,
        onnxruntime is not installed, or session creation failed.
    """
    global _onnx_session, _model_path, _load_attempted, _load_error

    if _load_attempted:
        return _onnx_session

    # Mark as attempted before loading so that a failure is not retried.
    _load_attempted = True

    path = _find_model_path()
    if path is None:
        _load_error = "ONNX model not found in any search path"
        logger.info("PP-DocLayout: %s", _load_error)
        return None

    try:
        # Imported lazily so that the module can be imported without onnxruntime.
        import onnxruntime as ort  # type: ignore[import-untyped]

        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # Run on CPU only — this keeps the GPU free for OCR / LLM.
        providers = ["CPUExecutionProvider"]
        _onnx_session = ort.InferenceSession(path, sess_options, providers=providers)
        _model_path = path
        logger.info("PP-DocLayout: model loaded from %s", path)
    except ImportError:
        _load_error = "onnxruntime not installed"
        logger.info("PP-DocLayout: %s", _load_error)
    except Exception as exc:
        # Any other failure (e.g. an unreadable model file) disables the
        # backend instead of propagating to the caller.
        _load_error = str(exc)
        logger.warning("PP-DocLayout: failed to load model from %s: %s", path, exc)

    return _onnx_session
# ---------------------------------------------------------------------------
# Public helpers
# ---------------------------------------------------------------------------
def is_doclayout_available() -> bool:
    """Report whether the ONNX session could be loaded.

    Triggers the (one-time) lazy load if it has not been attempted yet.
    """
    session = _load_onnx_session()
    return session is not None
def get_doclayout_status() -> Dict:
    """Collect diagnostic information about the DocLayout backend.

    Forces a load attempt first so that the reported state is final.
    """
    _load_onnx_session()  # make sure loading has been tried

    status: Dict = {}
    status["available"] = _onnx_session is not None
    status["model_path"] = _model_path
    status["load_error"] = _load_error
    status["classes"] = DOCLAYOUT_CLASSES
    status["class_count"] = len(DOCLAYOUT_CLASSES)
    return status
# ---------------------------------------------------------------------------
# Pre-processing
# ---------------------------------------------------------------------------
_INPUT_SIZE = 800 # PP-DocLayout expects 800x800
def preprocess_image(img_bgr: np.ndarray) -> tuple:
    """Resize + normalize image for PP-DocLayout ONNX input.

    The image is scaled to fit into an _INPUT_SIZE x _INPUT_SIZE square
    while keeping its aspect ratio, centered on a gray (114) canvas,
    scaled to [0, 1] and converted to a (1, 3, H, W) float32 tensor.

    Args:
        img_bgr: BGR color image of shape (H, W, 3).

    Returns:
        (input_tensor, scale, pad_x, pad_y): the network input, the single
        uniform scale factor applied to both axes, and the left / top
        padding in pixels. A box in network coordinates maps back to the
        original image via ``(coord - pad) / scale``.
    """
    orig_h, orig_w = img_bgr.shape[:2]

    # Compute scale to fit within _INPUT_SIZE keeping aspect ratio
    scale = min(_INPUT_SIZE / orig_w, _INPUT_SIZE / orig_h)
    # Clamp to 1 px: for extreme aspect ratios the short side would
    # otherwise truncate to 0 and make the resize call fail.
    new_w = max(1, int(orig_w * scale))
    new_h = max(1, int(orig_h * scale))

    import cv2  # local import — cv2 is always available in this service

    resized = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Pad to _INPUT_SIZE x _INPUT_SIZE with gray (114), image centered
    pad_x = (_INPUT_SIZE - new_w) // 2
    pad_y = (_INPUT_SIZE - new_h) // 2
    padded = np.full((_INPUT_SIZE, _INPUT_SIZE, 3), 114, dtype=np.uint8)
    padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized

    # Normalize to [0, 1] float32
    blob = padded.astype(np.float32) / 255.0
    # HWC → CHW
    blob = blob.transpose(2, 0, 1)
    # Add batch dimension → (1, 3, 800, 800)
    blob = np.expand_dims(blob, axis=0)

    return blob, scale, pad_x, pad_y
# ---------------------------------------------------------------------------
# Non-Maximum Suppression (NMS)
# ---------------------------------------------------------------------------
def _compute_iou(box_a: np.ndarray, box_b: np.ndarray) -> float:
    """Intersection-over-union of two [x1, y1, x2, y2] boxes.

    Returns 0.0 for disjoint boxes and for a non-positive union.
    """
    ax1, ay1, ax2, ay2 = box_a[0], box_a[1], box_a[2], box_a[3]
    bx1, by1, bx2, by2 = box_b[0], box_b[1], box_b[2], box_b[3]

    # Extent of the intersection rectangle, clamped at zero.
    overlap_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0.0, min(ay2, by2) - max(ay1, by1))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0.0

    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    if union > 0:
        return inter / union
    return 0.0
def nms(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float = 0.5) -> List[int]:
    """Apply greedy Non-Maximum Suppression.

    Boxes are visited in descending score order. Each visited box is kept
    and removes every remaining box whose IoU with it is at or above
    ``iou_threshold``. Suppression ignores classes.

    Args:
        boxes: (N, 4) array of [x1, y1, x2, y2].
        scores: (N,) confidence scores.
        iou_threshold: Overlap threshold for suppression.

    Returns:
        List of kept indices into ``boxes``, highest score first.
    """
    if len(boxes) == 0:
        return []

    coords = np.asarray(boxes, dtype=np.float64)
    x1, y1, x2, y2 = coords[:, 0], coords[:, 1], coords[:, 2], coords[:, 3]
    areas = (x2 - x1) * (y2 - y1)

    order = np.argsort(scores)[::-1]
    keep: List[int] = []

    while order.size:
        best = int(order[0])
        keep.append(best)
        rest = order[1:]
        if rest.size == 0:
            break

        # IoU of the kept box against all remaining boxes in one pass.
        inter_w = np.maximum(0.0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
        inter_h = np.maximum(0.0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
        inter = inter_w * inter_h
        union = areas[best] + areas[rest] - inter

        # Disjoint pairs and degenerate unions count as IoU 0.
        iou = np.zeros_like(inter)
        np.divide(inter, union, out=iou, where=(inter > 0) & (union > 0))

        order = rest[iou < iou_threshold]

    return keep
# ---------------------------------------------------------------------------
# Post-processing
# ---------------------------------------------------------------------------
def _postprocess(
    outputs: list,
    scale: float,
    pad_x: int,
    pad_y: int,
    orig_w: int,
    orig_h: int,
    confidence_threshold: float,
    max_regions: int,
) -> List[LayoutRegion]:
    """Parse ONNX output tensors into LayoutRegion list.

    Handled output layouts:
    - one tensor with 6 columns: x1, y1, x2, y2, score, class_id
    - one tensor with more than 6 columns: x1, y1, x2, y2, objectness,
      then one score per class
    - three tensors: boxes, scores, class_ids

    NOTE(review): these column orders are assumptions made by this parser;
    verify them against the outputs of the model actually deployed.

    Args:
        outputs: Raw output tensors of the inference session.
        scale, pad_x, pad_y: Letterbox parameters used to map boxes back
            to original-image coordinates.
        orig_w, orig_h: Original image size, used for clamping.
        confidence_threshold: Minimum score to keep a detection.
        max_regions: Maximum number of regions returned.

    Returns:
        Regions sorted by confidence (descending), at most ``max_regions``.
        Unsupported output layouts yield an empty list.
    """
    regions: List[LayoutRegion] = []

    # --- Determine output format ---
    if len(outputs) == 1:
        # Single tensor: (1, N, 4+1+1) = (batch, detections, [x1,y1,x2,y2,score,class])
        raw = np.squeeze(outputs[0])  # (N, 6) or (N, 5+num_classes)
        if raw.ndim == 1:
            # A single detection was squeezed to 1-D; restore the row axis.
            raw = raw.reshape(1, -1)
        if raw.shape[0] == 0:
            return []

        if raw.shape[1] == 6:
            # Format: x1, y1, x2, y2, score, class_id
            all_boxes = raw[:, :4]
            all_scores = raw[:, 4]
            all_classes = raw[:, 5].astype(int)
        elif raw.shape[1] > 6:
            # Format: x1, y1, x2, y2, obj_conf, cls0_conf, cls1_conf, ...
            # Final score = objectness * best class score.
            all_boxes = raw[:, :4]
            cls_scores = raw[:, 5:]
            all_classes = np.argmax(cls_scores, axis=1)
            all_scores = raw[:, 4] * np.max(cls_scores, axis=1)
        else:
            logger.warning("PP-DocLayout: unexpected output shape %s", raw.shape)
            return []

    elif len(outputs) == 3:
        # Three tensors: boxes (N,4), scores (N,), class_ids (N,)
        all_boxes = np.squeeze(outputs[0])
        all_scores = np.squeeze(outputs[1])
        all_classes = np.squeeze(outputs[2]).astype(int)
        if all_boxes.ndim == 1:
            # A single detection was squeezed away; restore the row axis.
            all_boxes = all_boxes.reshape(1, 4)
            all_scores = np.array([all_scores])
            all_classes = np.array([all_classes])
    else:
        logger.warning("PP-DocLayout: unexpected %d output tensors", len(outputs))
        return []

    # --- Confidence filter ---
    mask = all_scores >= confidence_threshold
    boxes = all_boxes[mask]
    scores = all_scores[mask]
    classes = all_classes[mask]

    if len(boxes) == 0:
        return []

    # --- NMS ---
    # Suppression is applied across all classes at IoU 0.5.
    keep_idxs = nms(boxes, scores, iou_threshold=0.5)
    boxes = boxes[keep_idxs]
    scores = scores[keep_idxs]
    classes = classes[keep_idxs]

    # --- Scale boxes back to original image coordinates ---
    for i in range(len(boxes)):
        x1, y1, x2, y2 = boxes[i]

        # Remove padding offset
        x1 = (x1 - pad_x) / scale
        y1 = (y1 - pad_y) / scale
        x2 = (x2 - pad_x) / scale
        y2 = (y2 - pad_y) / scale

        # Clamp to original dimensions
        x1 = max(0, min(x1, orig_w))
        y1 = max(0, min(y1, orig_h))
        x2 = max(0, min(x2, orig_w))
        y2 = max(0, min(y2, orig_h))

        w = int(round(x2 - x1))
        h = int(round(y2 - y1))
        # Regions smaller than 5 px in either dimension are dropped.
        if w < 5 or h < 5:
            continue

        cls_idx = int(classes[i])
        # Unknown class indices get a synthetic "class_<idx>" label.
        label = DOCLAYOUT_CLASSES[cls_idx] if 0 <= cls_idx < len(DOCLAYOUT_CLASSES) else f"class_{cls_idx}"

        regions.append(LayoutRegion(
            x=int(round(x1)),
            y=int(round(y1)),
            width=w,
            height=h,
            label=label,
            confidence=round(float(scores[i]), 4),
            label_index=cls_idx,
        ))

    # Sort by confidence descending, limit
    regions.sort(key=lambda r: r.confidence, reverse=True)
    return regions[:max_regions]
# ---------------------------------------------------------------------------
# Main detection function
# ---------------------------------------------------------------------------
def detect_layout_regions(
    img_bgr: np.ndarray,
    confidence_threshold: float = 0.5,
    max_regions: int = 50,
) -> List[LayoutRegion]:
    """Run the PP-DocLayout ONNX model on an image and return its regions.

    Args:
        img_bgr: BGR color image (OpenCV format).
        confidence_threshold: Minimum confidence to keep a detection.
        max_regions: Maximum number of regions to return.

    Returns:
        The regions produced by ``_postprocess`` (sorted by confidence,
        highest first). An empty list when the ONNX session is not
        available, the image is missing or empty, or inference raises.
    """
    onnx_session = _load_onnx_session()
    if onnx_session is None:
        return []
    if img_bgr is None or img_bgr.size == 0:
        return []

    page_h, page_w = img_bgr.shape[:2]

    # Pre-process: the returned scale/padding is needed to map boxes back.
    input_tensor, scale, pad_x, pad_y = preprocess_image(img_bgr)

    # Inference is best-effort: any failure is logged and yields no regions.
    try:
        feed = {onnx_session.get_inputs()[0].name: input_tensor}
        outputs = onnx_session.run(None, feed)
    except Exception as exc:
        logger.warning("PP-DocLayout inference failed: %s", exc)
        return []

    regions = _postprocess(
        outputs,
        scale=scale,
        pad_x=pad_x,
        pad_y=pad_y,
        orig_w=page_w,
        orig_h=page_h,
        confidence_threshold=confidence_threshold,
        max_regions=max_regions,
    )

    if not regions:
        logger.debug("PP-DocLayout: no regions above threshold %.2f", confidence_threshold)
        return regions

    # Summarise how many regions were found per label for the log line.
    per_label: Dict[str, int] = {}
    for region in regions:
        per_label[region.label] = per_label.get(region.label, 0) + 1
    summary = ", ".join(f"{k}: {v}" for k, v in sorted(per_label.items()))
    logger.info("PP-DocLayout: %d regions (%s)", len(regions), summary)
    return regions
@@ -0,0 +1,422 @@
"""
Graphical element detection for OCR pages.
Region-based approach:
1. Build a color mask (saturation channel — black text is invisible).
2. Dilate heavily to merge nearby colored pixels into regions.
3. For each region, check overlap with OCR word boxes:
- High word overlap → colored text (skip)
- Low word overlap → colored graphic / image (keep)
4. Separately detect large black-ink illustrations via ink mask.
Boxes and text colors are handled by cv_box_detect / cv_color_detect.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import cv2
import numpy as np
logger = logging.getLogger(__name__)
__all__ = ["detect_graphic_elements", "GraphicElement"]
@dataclass
class GraphicElement:
    """One detected graphical (non-text) element on a page.

    Instances are created by ``detect_graphic_elements``. The bounding box
    is given by ``x``/``y`` (top-left corner) and ``width``/``height``.
    """

    # Bounding box in image pixels.
    x: int
    y: int
    width: int
    height: int
    # Size measure used for sorting. Depending on the producing pass this is
    # the colored-pixel count, the contour area, or width * height.
    area: int
    # "image" or "illustration" from the OpenCV passes; the PP-DocLayout
    # path may also store a raw layout label here.
    shape: str
    # Dominant color name, or "black".
    color_name: str
    # Hex string (e.g. "#dc2626") matching ``color_name``.
    color_hex: str
    # Detection confidence as reported by the producing pass.
    confidence: float
    # Contour from cv2.findContours when available; left out of repr().
    contour: Any = field(repr=False, default=None)
# ---------------------------------------------------------------------------
# Color helpers
# ---------------------------------------------------------------------------
_COLOR_HEX = {
"black": "#000000",
"gray": "#6b7280",
"red": "#dc2626",
"orange": "#ea580c",
"yellow": "#ca8a04",
"green": "#16a34a",
"blue": "#2563eb",
"purple": "#9333ea",
}
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple:
"""Return (color_name, color_hex) for an HSV region."""
if hsv_roi.size == 0:
return "black", _COLOR_HEX["black"]
pixels = hsv_roi.reshape(-1, 3)
sat = pixels[:, 1]
sat_mask = sat > sat_threshold
sat_ratio = np.sum(sat_mask) / len(pixels) if len(pixels) > 0 else 0
if sat_ratio < 0.15:
return "black", _COLOR_HEX["black"]
sat_pixels = pixels[sat_mask]
if len(sat_pixels) < 3:
return "black", _COLOR_HEX["black"]
med_hue = float(np.median(sat_pixels[:, 0]))
if med_hue < 10 or med_hue > 170:
name = "red"
elif med_hue < 25:
name = "orange"
elif med_hue < 35:
name = "yellow"
elif med_hue < 85:
name = "green"
elif med_hue < 130:
name = "blue"
else:
name = "purple"
return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
# ---------------------------------------------------------------------------
# Main detection
# ---------------------------------------------------------------------------
def detect_graphic_elements(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]] = None,
    max_elements: int = 50,
) -> List[GraphicElement]:
    """Find non-text graphical regions on the page.

    If the ``GRAPHIC_DETECT_BACKEND`` environment variable is ``"auto"``
    (default) or ``"doclayout"`` and PP-DocLayout is available and returns
    regions, those regions are converted and returned directly. Otherwise
    (or when that path raises) the OpenCV heuristics run:

    * Pass 1 dilates a color mask into regions and uses word overlap and
      color density to tell colored text from colored graphics.
    * Pass 2 looks for large black-ink contours outside words, detected
      boxes and colored pixels.
    * Candidates overlapping a larger accepted one by more than 50% of the
      smaller box are dropped.

    Args:
        img_bgr: BGR color image.
        word_boxes: List of OCR word dicts with left/top/width/height.
        detected_boxes: Optional list of detected box dicts (x/y/w/h).
        max_elements: Maximum number of elements to return.

    Returns:
        List of GraphicElement, sorted by area descending. Empty when the
        image is ``None`` or has zero size.
    """
    # A zero-size image would make cv2.cvtColor raise further down.
    if img_bgr is None or img_bgr.size == 0:
        return []

    # ------------------------------------------------------------------
    # Try PP-DocLayout ONNX first if available
    # ------------------------------------------------------------------
    import os
    backend = os.environ.get("GRAPHIC_DETECT_BACKEND", "auto")
    if backend in ("doclayout", "auto"):
        try:
            from cv_doclayout_detect import detect_layout_regions, is_doclayout_available
            if is_doclayout_available():
                regions = detect_layout_regions(img_bgr)
                if regions:
                    # label -> (shape, color_name, color_hex); other labels
                    # keep their own name as shape and are shown in gray.
                    _LABEL_TO_COLOR = {
                        "figure": ("image", "green", _COLOR_HEX.get("green", "#16a34a")),
                        "table": ("image", "blue", _COLOR_HEX.get("blue", "#2563eb")),
                    }
                    converted: List[GraphicElement] = []
                    for r in regions:
                        shape, color_name, color_hex = _LABEL_TO_COLOR.get(
                            r.label,
                            (r.label, "gray", _COLOR_HEX.get("gray", "#6b7280")),
                        )
                        converted.append(GraphicElement(
                            x=r.x,
                            y=r.y,
                            width=r.width,
                            height=r.height,
                            area=r.width * r.height,
                            shape=shape,
                            color_name=color_name,
                            color_hex=color_hex,
                            confidence=r.confidence,
                            contour=None,
                        ))
                    converted.sort(key=lambda g: g.area, reverse=True)
                    result = converted[:max_elements]
                    if result:
                        shape_counts: Dict[str, int] = {}
                        for g in result:
                            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
                        logger.info(
                            "GraphicDetect (PP-DocLayout): %d elements (%s)",
                            len(result),
                            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
                        )
                    return result
        except Exception as e:
            # Best-effort backend: import or inference problems fall through
            # to the OpenCV heuristics below.
            logger.warning("PP-DocLayout failed, falling back to OpenCV: %s", e)

    # ------------------------------------------------------------------
    # OpenCV fallback (original logic)
    # ------------------------------------------------------------------
    h, w = img_bgr.shape[:2]
    logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
                 w, h, len(word_boxes), len(detected_boxes or []))
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    candidates: List[GraphicElement] = []

    # --- Build word mask (for overlap checking) ---
    word_mask = np.zeros((h, w), dtype=np.uint8)
    for wb in word_boxes:
        x1 = max(0, int(wb.get("left", 0)))
        y1 = max(0, int(wb.get("top", 0)))
        x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)))
        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)))
        word_mask[y1:y2, x1:x2] = 255

    # Word centroids do not depend on the region, so compute them once.
    word_centroids = [
        (
            int(wb.get("left", 0) + wb.get("width", 0) / 2),
            int(wb.get("top", 0) + wb.get("height", 0) / 2),
        )
        for wb in word_boxes
    ]

    # =====================================================================
    # PASS 1 — COLORED IMAGE REGIONS
    # =====================================================================
    # Color mask: saturated pixels (black text has sat ≈ 0 → invisible)
    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
    val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
    color_pixels = cv2.bitwise_and(sat_mask, val_mask)

    # Remove tiny speckle
    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open)

    # Snapshot of the raw colored pixels before dilation (for density check)
    color_pixel_raw = color_pixels.copy()

    # Heavy dilation to merge nearby colored elements into regions.
    # A 25x25 kernel merges elements within ~12px of each other.
    kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25))
    region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1)
    contours_regions, _ = cv2.findContours(
        region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))

    for cnt in contours_regions:
        bx, by, bw, bh = cv2.boundingRect(cnt)

        # Skip tiny regions
        if bw < 15 or bh < 15:
            continue

        # Skip page-spanning regions
        if bw > w * 0.6 or bh > h * 0.6:
            logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
            continue

        bbox_area = bw * bh

        # Check: how much of this region's bounding box overlaps with words?
        roi_words = word_mask[by:by + bh, bx:bx + bw]
        word_pixel_count = int(np.sum(roi_words > 0))
        word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0

        # Check: how many OCR word centroids fall inside this region?
        # Colored text that OCR detected will have multiple centroids inside.
        # Actual images may have 0-1 spurious OCR artifacts.
        word_centroid_count = sum(
            1 for cx, cy in word_centroids
            if bx <= cx <= bx + bw and by <= cy <= by + bh
        )

        # Check: how many actual colored pixels are in this region?
        roi_color = color_pixel_raw[by:by + bh, bx:bx + bw]
        color_pixel_count = int(np.sum(roi_color > 0))

        # Color pixel density (before any skip checks so we can log it)
        density = color_pixel_count / bbox_area if bbox_area > 0 else 0

        # --- Skip heuristics for colored TEXT (not images) ---
        # (a) High word-box pixel overlap → clearly text
        if word_overlap > 0.40:
            logger.info(
                "GraphicDetect PASS1 skip text-overlap (%d,%d) %dx%d "
                "overlap=%.0f%% centroids=%d",
                bx, by, bw, bh, word_overlap * 100, word_centroid_count,
            )
            continue

        # (b) Multiple OCR words detected inside → colored text
        # (images rarely produce 2+ confident word detections)
        if word_centroid_count >= 2:
            logger.info(
                "GraphicDetect PASS1 skip multi-word (%d,%d) %dx%d "
                "centroids=%d overlap=%.0f%% density=%.0f%%",
                bx, by, bw, bh, word_centroid_count,
                word_overlap * 100, density * 100,
            )
            continue

        # (c) Even 1 word + some pixel overlap → likely text
        if word_centroid_count >= 1 and word_overlap > 0.10:
            logger.info(
                "GraphicDetect PASS1 skip word+overlap (%d,%d) %dx%d "
                "centroids=%d overlap=%.0f%%",
                bx, by, bw, bh, word_centroid_count, word_overlap * 100,
            )
            continue

        # Need a minimum number of colored pixels (not just dilated area)
        if color_pixel_count < 200:
            continue

        # (d) Very low density → thin strokes, almost certainly text.
        # Large regions (photos/illustrations) can have low color density
        # because most pixels are grayscale ink. Use a lower threshold
        # for regions bigger than 100×80 px.
        _min_density = 0.05 if (bw > 100 and bh > 80) else 0.20
        if density < _min_density:
            logger.info(
                "GraphicDetect PASS1 skip low-density (%d,%d) %dx%d "
                "density=%.0f%% (min=%.0f%%, likely colored text)",
                bx, by, bw, bh, density * 100, _min_density * 100,
            )
            continue

        # (e) Moderate density + small height → colored text line
        if density < 0.35 and bh < h * 0.05:
            logger.info(
                "GraphicDetect PASS1 skip text-height (%d,%d) %dx%d "
                "density=%.0f%% height=%.1f%%",
                bx, by, bw, bh, density * 100, 100.0 * bh / h,
            )
            continue

        # Determine dominant color from the actual colored pixels
        roi_hsv = hsv[by:by + bh, bx:bx + bw]
        color_px_mask = roi_color > 0
        if np.sum(color_px_mask) > 0:
            masked_hsv = roi_hsv[color_px_mask]
            color_name, color_hex = _dominant_color(masked_hsv)
        else:
            color_name, color_hex = "black", _COLOR_HEX["black"]

        # Confidence grows with color density, capped at 0.95
        conf = min(0.95, 0.5 + density * 0.5)
        logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d density=%.0f%% overlap=%.0f%% %s",
                     bx, by, bw, bh, color_pixel_count, density * 100, word_overlap * 100, color_name)
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=color_pixel_count,
            shape="image",
            color_name=color_name, color_hex=color_hex,
            confidence=round(conf, 2), contour=cnt,
        ))

    # =====================================================================
    # PASS 2 — LARGE BLACK-INK ILLUSTRATIONS
    # =====================================================================
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Exclude words (padded) and the interior of detected boxes
    exclusion = np.zeros((h, w), dtype=np.uint8)
    word_pad = 5
    for wb in word_boxes:
        x1 = max(0, int(wb.get("left", 0)) - word_pad)
        y1 = max(0, int(wb.get("top", 0)) - word_pad)
        x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)) + word_pad)
        y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
        exclusion[y1:y2, x1:x2] = 255

    if detected_boxes:
        # Only the area `inset` px inside each box is excluded.
        inset = 8
        for box in detected_boxes:
            bbx = int(box.get("x", 0))
            bby = int(box.get("y", 0))
            bbw = int(box.get("w", box.get("width", 0)))
            bbh = int(box.get("h", box.get("height", 0)))
            x1 = max(0, bbx + inset)
            y1 = max(0, bby + inset)
            x2 = min(w, bbx + bbw - inset)
            y2 = min(h, bby + bbh - inset)
            if x2 > x1 and y2 > y1:
                exclusion[y1:y2, x1:x2] = 255

    # Dark ink that is neither excluded nor part of the colored-pixel mask
    ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
    ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels))
    contours_ink, _ = cv2.findContours(
        ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink))

    for cnt in contours_ink:
        area = cv2.contourArea(cnt)
        bx, by, bw, bh = cv2.boundingRect(cnt)
        # Keep only large contours that do not span most of the page
        if area < 5000 or min(bw, bh) < 40:
            continue
        if bw > w * 0.8 or bh > h * 0.8:
            continue
        logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d",
                     bx, by, bw, bh, int(area))
        candidates.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=int(area), shape="illustration",
            color_name="black", color_hex="#000000",
            confidence=0.5, contour=cnt,
        ))

    # =====================================================================
    # Deduplicate and return
    # =====================================================================
    # Largest first, so a smaller candidate is dropped when more than half
    # of the smaller of the two boxes overlaps an already accepted element.
    candidates.sort(key=lambda g: g.area, reverse=True)
    final: List[GraphicElement] = []
    for c in candidates:
        overlap = False
        for f in final:
            ix1 = max(c.x, f.x)
            iy1 = max(c.y, f.y)
            ix2 = min(c.x + c.width, f.x + f.width)
            iy2 = min(c.y + c.height, f.y + f.height)
            if ix2 > ix1 and iy2 > iy1:
                inter = (ix2 - ix1) * (iy2 - iy1)
                smaller = min(c.width * c.height, f.width * f.height)
                if smaller > 0 and inter / smaller > 0.5:
                    overlap = True
                    break
        if not overlap:
            final.append(c)

    result = final[:max_elements]
    if result:
        shape_counts: Dict[str, int] = {}
        for g in result:
            shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
        logger.info(
            "GraphicDetect: %d elements found (%s)",
            len(result),
            ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())),
        )
    else:
        logger.info("GraphicDetect: no graphic elements found")
    return result
@@ -0,0 +1,231 @@
"""
Syllable Core — hyphenator init, word validation, pipe autocorrect.
Extracted from cv_syllable_detect.py for modularity.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# IPA/phonetic characters -- skip cells containing these
_IPA_RE = re.compile(r'[\[\]\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u00e6\u0254\u0259\u025b\u025c\u026a\u028a\u028c]')
# Common German words that should NOT be merged with adjacent tokens.
_STOP_WORDS = frozenset([
# Articles
'der', 'die', 'das', 'dem', 'den', 'des',
'ein', 'eine', 'einem', 'einen', 'einer',
# Pronouns
'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
# Prepositions
'mit', 'von', 'zu', 'f\u00fcr', 'auf', 'in', 'an', 'um', 'am', 'im',
'aus', 'bei', 'nach', 'vor', 'bis', 'durch', '\u00fcber', 'unter',
'zwischen', 'ohne', 'gegen',
# Conjunctions
'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
# Adverbs
'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
# Verbs
'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
'sein', 'haben',
# Other
'kein', 'keine', 'keinem', 'keinen', 'keiner',
])
# Cached hyphenators
_hyph_de = None
_hyph_en = None
# Cached spellchecker (for autocorrect_pipe_artifacts)
_spell_de = None
def _get_hyphenators():
"""Lazy-load pyphen hyphenators (cached across calls)."""
global _hyph_de, _hyph_en
if _hyph_de is not None:
return _hyph_de, _hyph_en
try:
import pyphen
except ImportError:
return None, None
_hyph_de = pyphen.Pyphen(lang='de_DE')
_hyph_en = pyphen.Pyphen(lang='en_US')
return _hyph_de, _hyph_en
def _get_spellchecker():
"""Lazy-load German spellchecker (cached across calls)."""
global _spell_de
if _spell_de is not None:
return _spell_de
try:
from spellchecker import SpellChecker
except ImportError:
return None
_spell_de = SpellChecker(language='de')
return _spell_de
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
"""Check whether pyphen recognises a word (DE or EN)."""
if len(word) < 2:
return False
return ('|' in hyph_de.inserted(word, hyphen='|')
or '|' in hyph_en.inserted(word, hyphen='|'))
def _is_real_word(word: str) -> bool:
    """Return True when the spellchecker contains the lower-cased *word*.

    Always False when no spellchecker is available.
    """
    checker = _get_spellchecker()
    return checker is not None and word.lower() in checker
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
"""Try to hyphenate a word using DE then EN dictionary.
Returns word with | separators, or None if not recognized.
"""
hyph = hyph_de.inserted(word, hyphen='|')
if '|' in hyph:
return hyph
hyph = hyph_en.inserted(word, hyphen='|')
if '|' in hyph:
return hyph
return None
def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
"""Try to correct a word that has OCR pipe artifacts.
Printed syllable divider lines on dictionary pages confuse OCR:
the vertical stroke is often read as an extra character (commonly
``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
Uses ``spellchecker`` (frequency-based word list) for validation.
Strategy:
1. Strip ``|`` -- if spellchecker knows the result, done.
2. Try deleting each pipe-like character (l, I, 1, i, t).
3. Fall back to spellchecker's own ``correction()`` method.
4. Preserve the original casing of the first letter.
"""
stripped = word_with_pipes.replace('|', '')
if not stripped or len(stripped) < 3:
return stripped # too short to validate
# Step 1: if the stripped word is already a real word, done
if _is_real_word(stripped):
return stripped
# Step 2: try deleting pipe-like characters (most likely artifacts)
_PIPE_LIKE = frozenset('lI1it')
for idx in range(len(stripped)):
if stripped[idx] not in _PIPE_LIKE:
continue
candidate = stripped[:idx] + stripped[idx + 1:]
if len(candidate) >= 3 and _is_real_word(candidate):
return candidate
# Step 3: use spellchecker's built-in correction
spell = _get_spellchecker()
if spell is not None:
suggestion = spell.correction(stripped.lower())
if suggestion and suggestion != stripped.lower():
# Preserve original first-letter case
if stripped[0].isupper():
suggestion = suggestion[0].upper() + suggestion[1:]
return suggestion
return None # could not fix
def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).

    For every cell whose ``col_type`` starts with ``column_``:
        1. Each word box containing ``|`` is split into leading
           non-letters, core and trailing non-letters; the core is passed to
           ``_autocorrect_piped_word``.
        2. If that yields a different word, the word-box text is replaced
           and the cell text is rebuilt from all word boxes.
        3. Any ``|`` still left in the cell text is removed.

    Missing or ``None`` values for ``cells``, ``word_boxes``, ``col_type``
    and ``text`` are treated as empty.

    Args:
        zones_data: Zone dicts with a ``cells`` list; modified in place.
        session_id: Only used in the log message.

    Returns:
        The number of cells modified.
    """
    if _get_spellchecker() is None:
        # Pipes are still stripped from cell text without the spellchecker.
        logger.warning("spellchecker not available -- pipe autocorrect limited")

    # lead (non-letters) / core / trail (non-letters)
    affix_re = re.compile(
        r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)'
        r'(.*?)'
        r'([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$'
    )

    modified = 0
    for z in zones_data:
        for cell in z.get("cells") or []:
            ct = cell.get("col_type") or ""
            if not ct.startswith("column_"):
                continue

            cell_changed = False
            word_boxes = cell.get("word_boxes") or []

            # --- Fix word boxes ---
            for wb in word_boxes:
                # `or ""`: a word box may carry text=None (the join below
                # already allows for that).
                wb_text = wb.get("text") or ""
                if "|" not in wb_text:
                    continue
                m = affix_re.match(wb_text)
                if not m:
                    continue
                lead, core, trail = m.groups()
                if "|" not in core:
                    continue
                corrected = _autocorrect_piped_word(core)
                if corrected is not None and corrected != core:
                    wb["text"] = lead + corrected + trail
                    cell_changed = True

            # --- Rebuild cell text from word boxes ---
            if cell_changed:
                cell["text"] = " ".join(
                    (wb.get("text") or "") for wb in word_boxes
                )
                modified += 1

            # --- Fallback: strip residual | from cell text ---
            text = cell.get("text") or ""
            if "|" in text:
                cell["text"] = text.replace("|", "")
                if not cell_changed:
                    modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified
@@ -0,0 +1,32 @@
"""
Syllable divider insertion for dictionary pages — barrel re-export.
All implementation split into:
cv_syllable_core — hyphenator init, word validation, pipe autocorrect
cv_syllable_merge — word gap merging, syllabification, divider insertion
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
# Core: init, validation, autocorrect
from cv_syllable_core import ( # noqa: F401
_IPA_RE,
_STOP_WORDS,
_get_hyphenators,
_get_spellchecker,
_is_known_word,
_is_real_word,
_hyphenate_word,
_autocorrect_piped_word,
autocorrect_pipe_artifacts,
)
# Merge: gap merging, syllabify, insert
from cv_syllable_merge import ( # noqa: F401
_try_merge_pipe_gaps,
merge_word_gaps_in_zones,
_try_merge_word_gaps,
_syllabify_text,
insert_syllable_dividers,
)
@@ -0,0 +1,300 @@
"""
Syllable Merge — word gap merging, syllabification, divider insertion.
Extracted from cv_syllable_detect.py for modularity.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Any, Dict, List, Optional
import numpy as np
from cv_syllable_core import (
_get_hyphenators,
_hyphenate_word,
_IPA_RE,
_STOP_WORDS,
)
logger = logging.getLogger(__name__)
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
"""Merge fragments separated by single spaces where OCR split at a pipe.
Example: "Kaf fee" -> "Kaffee" (pyphen recognizes the merged word).
Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".
Guards against false merges:
- The FIRST token must be pure alpha (word start -- no attached punctuation)
- The second token may have trailing punctuation (comma, period) which
stays attached to the merged word: "Ka" + "fer," -> "Kafer,"
- Common German function words (der, die, das, ...) are never merged
- At least one fragment must be very short (<=3 alpha chars)
"""
parts = text.split(' ')
if len(parts) < 2:
return text
result = [parts[0]]
i = 1
while i < len(parts):
prev = result[-1]
curr = parts[i]
# Extract alpha-only core for lookup
prev_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', prev)
curr_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', curr)
# Guard 1: first token must be pure alpha (word-start fragment)
# second token may have trailing punctuation
# Guard 2: neither alpha core can be a common German function word
# Guard 3: the shorter fragment must be <= 3 chars (pipe-gap signal)
# Guard 4: combined length must be >= 4
should_try = (
prev == prev_alpha # first token: pure alpha (word start)
and prev_alpha and curr_alpha
and prev_alpha.lower() not in _STOP_WORDS
and curr_alpha.lower() not in _STOP_WORDS
and min(len(prev_alpha), len(curr_alpha)) <= 3
and len(prev_alpha) + len(curr_alpha) >= 4
)
if should_try:
merged_alpha = prev_alpha + curr_alpha
hyph = hyph_de.inserted(merged_alpha, hyphen='-')
if '-' in hyph:
# pyphen recognizes merged word -- collapse the space
result[-1] = prev + curr
i += 1
continue
result.append(curr)
i += 1
return ' '.join(result)
def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
    """Merge OCR word-gap fragments in cell texts using pyphen validation.

    OCR often splits words at syllable boundaries into separate word_boxes,
    producing text like "zerknit tert" instead of "zerknittert". Every cell
    whose ``col_type`` starts with ``column_`` and whose text contains a
    space is passed through ``_try_merge_word_gaps``. Cells that match
    ``_IPA_RE`` outside of ``[...]`` spans are left alone.

    Args:
        zones_data: Zone dicts with a ``cells`` list; cell texts are
            updated in place.
        session_id: Only used in the log message.

    Returns:
        The number of cells modified (0 when pyphen is unavailable).
    """
    hyph_de, _ = _get_hyphenators()
    if hyph_de is None:
        return 0

    changed_cells = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            if not cell.get("col_type", "").startswith("column_"):
                continue
            original = cell.get("text", "")
            if not original or " " not in original:
                continue
            # Skip IPA cells (bracketed spans are ignored for this check)
            if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', original)):
                continue
            merged = _try_merge_word_gaps(original, hyph_de)
            if merged != original:
                cell["text"] = merged
                changed_cells += 1

    if changed_cells:
        logger.info(
            "build-grid session %s: merged word gaps in %d cells",
            session_id, changed_cells,
        )
    return changed_cells
def _try_merge_word_gaps(text: str, hyph_de) -> str:
"""Merge OCR word fragments with relaxed threshold (max_short=5).
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
(max_short=5 instead of 3). Still requires pyphen to recognize the
merged word.
"""
parts = text.split(' ')
if len(parts) < 2:
return text
result = [parts[0]]
i = 1
while i < len(parts):
prev = result[-1]
curr = parts[i]
prev_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', prev)
curr_alpha = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', curr)
should_try = (
prev == prev_alpha
and prev_alpha and curr_alpha
and prev_alpha.lower() not in _STOP_WORDS
and curr_alpha.lower() not in _STOP_WORDS
and min(len(prev_alpha), len(curr_alpha)) <= 5
and len(prev_alpha) + len(curr_alpha) >= 4
)
if should_try:
merged_alpha = prev_alpha + curr_alpha
hyph = hyph_de.inserted(merged_alpha, hyphen='-')
if '-' in hyph:
result[-1] = prev + curr
i += 1
continue
result.append(curr)
i += 1
return ' '.join(result)
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
"""Syllabify all significant words in a text string.
1. Strip existing | dividers
2. Merge pipe-gap spaces where possible
3. Apply pyphen to each word >= 3 alphabetic chars
4. Words pyphen doesn't recognize stay as-is (no bad guesses)
"""
if not text:
return text
# Skip cells that contain IPA transcription characters outside brackets.
text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
if _IPA_RE.search(text_no_brackets):
return text
# Phase 1: strip existing pipe dividers for clean normalization
clean = text.replace('|', '')
# Phase 2: merge pipe-gap spaces (OCR fragments from pipe splitting)
clean = _try_merge_pipe_gaps(clean, hyph_de)
# Phase 3: tokenize and syllabify each word
# Split on whitespace and comma/semicolon sequences, keeping separators
tokens = re.split(r'(\s+|[,;:]+\s*)', clean)
result = []
for tok in tokens:
if not tok or re.match(r'^[\s,;:]+$', tok):
result.append(tok)
continue
# Strip trailing/leading punctuation for pyphen lookup
m = re.match(r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)(.*?)([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$', tok)
if not m:
result.append(tok)
continue
lead, word, trail = m.group(1), m.group(2), m.group(3)
if len(word) < 3 or not re.search(r'[a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df]', word):
result.append(tok)
continue
hyph = _hyphenate_word(word, hyph_de, hyph_en)
if hyph:
result.append(lead + hyph + trail)
else:
result.append(tok)
return ''.join(result)
def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
    *,
    force: bool = False,
    col_filter: Optional[set] = None,
) -> int:
    """Insert pipe syllable dividers into dictionary cells.

    Content cells (``col_type`` starting with ``column_``) are run through
    ``_syllabify_text``, which strips existing pipes, merges pipe-gap spaces
    and re-syllabifies with pyphen.

    Pre-check (skipped with ``force``): at least 1% of all content cells
    must already contain ``|`` from OCR, otherwise nothing is changed. The
    pre-check does not take ``col_filter`` into account.

    Missing or ``None`` values for ``cells``, ``col_type`` and ``text`` are
    treated as empty.

    Args:
        zones_data: Zone dicts with a ``cells`` list; cell texts are
            updated in place.
        img_bgr: Not used by this implementation.
        session_id: Only used in log messages.
        force: If True, skip the pipe-ratio pre-check and syllabify all
            content words regardless of whether the original has pipe
            dividers.
        col_filter: If set, only process cells whose col_type is in this
            set. None means process all content columns.

    Returns:
        The number of cells modified (0 when pyphen is not installed).
    """
    hyph_de, hyph_en = _get_hyphenators()
    if hyph_de is None:
        logger.warning("pyphen not installed -- skipping syllable insertion")
        return 0

    def _content_cells():
        """Yield (col_type, cell) for every content-column cell."""
        for z in zones_data:
            for cell in z.get("cells") or []:
                ct = cell.get("col_type") or ""
                if ct.startswith("column_"):
                    yield ct, cell

    # Pre-check: count cells that already have | from OCR.
    if not force:
        total_col_cells = 0
        cells_with_pipes = 0
        for _, cell in _content_cells():
            total_col_cells += 1
            if "|" in (cell.get("text") or ""):
                cells_with_pipes += 1
        if total_col_cells > 0:
            pipe_ratio = cells_with_pipes / total_col_cells
            if pipe_ratio < 0.01:
                logger.info(
                    "build-grid session %s: skipping syllable insertion -- "
                    "only %.1f%% of cells have existing pipes (need >=1%%)",
                    session_id, pipe_ratio * 100,
                )
                return 0

    insertions = 0
    for ct, cell in _content_cells():
        if col_filter is not None and ct not in col_filter:
            continue
        text = cell.get("text") or ""
        if not text:
            continue
        # In auto mode (force=False), only normalize cells that already
        # have | from OCR (i.e. printed syllable dividers on the original
        # scan). Don't add new syllable marks to other words.
        if not force and "|" not in text:
            continue
        new_text = _syllabify_text(text, hyph_de, hyph_en)
        if new_text != text:
            cell["text"] = new_text
            insertions += 1

    if insertions:
        logger.info(
            "build-grid session %s: syllable dividers inserted/normalized "
            "in %d cells (pyphen)",
            session_id, insertions,
        )
    return insertions
@@ -0,0 +1,493 @@
"""
Cell text filtering, column/row word assignment, and bold detection.
This module contains:
- _assign_row_words_to_columns(): spatial assignment of OCR words to grid columns
- Cell text noise filtering (_clean_cell_text, _clean_cell_text_lite, etc.)
- Bold detection via stroke-width analysis (_measure_stroke_width, _classify_bold_cells)
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import re
import logging
from typing import Any, Dict, List, Optional
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
# ---------------------------------------------------------------------------
# Column / Row word assignment
# ---------------------------------------------------------------------------
def _assign_row_words_to_columns(
row: RowGeometry,
columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
"""Assign each word in a row to exactly one column.
Uses a two-pass strategy:
1. Containment: if a word's center falls within a column's horizontal
bounds (with padding), assign it to that column.
2. Nearest center: for words not contained by any column, fall back to
nearest column center distance.
This prevents long sentences in wide columns (e.g. example) from having
their rightmost words stolen by an adjacent column.
Args:
row: Row with words (relative coordinates).
columns: Sorted list of columns (absolute coordinates).
Returns:
Dict mapping col_index -> list of words assigned to that column.
"""
result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))}
if not row.words or not columns:
return result
left_x = row.x # content ROI left (absolute)
# Build non-overlapping column assignment ranges using midpoints.
# For adjacent columns, the boundary is the midpoint between them.
# This prevents words near column borders from being assigned to
# the wrong column (e.g. "We" at the start of an example sentence
# being stolen by the preceding DE column).
n = len(columns)
col_ranges_rel = [] # (assign_left, assign_right) per column
for ci, col in enumerate(columns):
col_left_rel = col.x - left_x
col_right_rel = col_left_rel + col.width
# Left boundary: midpoint to previous column, or 0
if ci == 0:
assign_left = 0
else:
prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
assign_left = (prev_right + col_left_rel) / 2
# Right boundary: midpoint to next column, or infinity (row width)
if ci == n - 1:
assign_right = row.width + 100 # generous for last column
else:
next_left = columns[ci + 1].x - left_x
assign_right = (col_right_rel + next_left) / 2
col_ranges_rel.append((assign_left, assign_right))
for w in row.words:
w_left = w['left']
w_right = w_left + w['width']
w_center_x = w_left + w['width'] / 2
# Primary: overlap-based matching — assign to column with most overlap.
# This is more robust than center-based for narrow columns (page_ref)
# where the last character's center may fall into the next column.
best_col = -1
best_overlap = 0
for ci, col in enumerate(columns):
col_left_rel = col.x - left_x
col_right_rel = col_left_rel + col.width
overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
if overlap > best_overlap:
best_overlap = overlap
best_col = ci
if best_col >= 0 and best_overlap > 0:
result[best_col].append(w)
else:
# Fallback: center-based range matching
assigned = False
for ci, (al, ar) in enumerate(col_ranges_rel):
if al <= w_center_x < ar:
result[ci].append(w)
assigned = True
break
if not assigned:
# Last resort: nearest column center
best_col = 0
col_left_0 = columns[0].x - left_x
best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
for ci in range(1, n):
col_left = columns[ci].x - left_x
dist = abs(w_center_x - (col_left + columns[ci].width / 2))
if dist < best_dist:
best_dist = dist
best_col = ci
result[best_col].append(w)
return result
# ---------------------------------------------------------------------------
# Cell text noise filtering
# ---------------------------------------------------------------------------
# Regex: a run of at least 2 consecutive letters. The character class covers
# ASCII a-z/A-Z, German umlauts and sharp s, and a set of lowercase accented
# letters (é è ê ë à â î ï ô û ù ç). Used to test for a "real word" in a cell.
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
# Regex: one single letter from the same character class. Used with findall()
# to pull only the alphabetic characters out of a token or cell text.
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')
# Common short EN/DE words (1-3 chars), all lowercase. Tokens at the end of a
# cell that do NOT appear here (and are not known abbreviations) are treated
# as trailing OCR noise. Lookups are done on the lowercased alphabetic
# characters of a token. Some words are listed in both the EN and the DE
# section (e.g. 'bin', 'den'); that is harmless because this is a set.
_COMMON_SHORT_WORDS: set = {
# EN 1-2 letter
'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
'or', 'so', 'to', 'up', 'us', 'we',
# EN 3 letter
'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
'zap', 'zip', 'zoo',
# DE 2-3 letter
'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
'wut', 'zum', 'zur',
}
# Known abbreviations found in EN/DE textbooks and dictionaries.
# Stored lowercase and WITHOUT any periods: the filters in this module look a
# token up by its lowercased alphabetic characters only, so "e.g." is matched
# via 'eg' and "z.B." via 'zb'.
# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
_KNOWN_ABBREVIATIONS: set = {
# EN dictionary meta-words
'sth', 'sb', 'smth', 'smb', 'sbd',
# EN general
'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
# EN references / textbook
'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
'ans', 'wb', 'tb', 'vocab',
# EN parts of speech / grammar
'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
'syn', 'ant', 'opp', 'var', 'orig',
# EN titles
'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
# EN pronunciation
'br', 'am', 'brit', 'amer',
# EN units
'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
# DE general
'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
'bes', 'insb', 'insbes', 'bspw', 'ca',
'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
'inkl', 'exkl', 'zzgl', 'abzgl',
# DE references
'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
's', 'sp', 'zit', 'zs', 'vlg',
# DE grammar
'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
'trennb', 'untrennb', 'ugs', 'geh', 'pej',
# DE regional
'nordd', 'österr', 'schweiz',
# Linguistic
'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
'count', 'uncount', 'indef', 'def', 'poss', 'demon',
}
def _is_noise_tail_token(token: str) -> bool:
"""Check if a token at the END of cell text is trailing OCR noise.
Trailing fragments are very common OCR artifacts from image edges,
borders, and neighbouring cells. This is more aggressive than a
general word filter: any short token that isn't in the dictionary
of common EN/DE words is considered noise.
Examples of noise: "Es)", "3", "ee", "B"
Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]"
"""
t = token.strip()
if not t:
return True
# Keep ellipsis
if t in ('...', ''):
return False
# Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
if t.startswith('[') or t.startswith('["') or t.startswith("['"):
return False
if t.endswith(']'):
return False
# Keep meaningful punctuation tokens used in textbooks
# = (definition marker), (= (definition opener), ; (separator)
if t in ('=', '(=', '=)', ';', ':', '-', '', '', '/', '+', '&'):
return False
# Pure non-alpha -> noise ("3", ")", "|")
alpha_chars = _RE_ALPHA.findall(t)
if not alpha_chars:
return True
# Extract only alpha characters for dictionary lookup
cleaned = ''.join(alpha_chars)
# Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep
if cleaned.lower() in _KNOWN_ABBREVIATIONS:
return False
# Strip normal trailing punctuation before checking for internal noise.
stripped_punct = re.sub(r'[.,;:!?]+$', '', t) # "cupcakes." -> "cupcakes"
t_check = stripped_punct if stripped_punct else t
# Check for legitimate punctuation patterns vs. real noise.
# Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir",
# "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen"
# Noise: "3d", "B|", "x7"
# Strategy: strip common dictionary punctuation (parens, hyphens, slashes),
# THEN check if residual contains only alpha characters.
t_inner = t_check
# Remove all parentheses, hyphens, slashes, and dots — these are normal
# in dictionary entries: "(Salat-)Gurke", "Tanz(veranstaltung)",
# "(zer)brechen", "wir/uns", "e.g."
t_inner = re.sub(r'[()\-/.,;:!?]', '', t_inner)
# Now check: does the inner form still have non-alpha noise?
inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False
# Long alpha words (4+ chars) without internal noise are likely real
if len(cleaned) >= 4 and not has_internal_noise:
return False
# Short words: check dictionary (uses only alpha chars)
if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise:
return False
# Default: short or suspicious -> noise
return True
def _is_garbage_text(text: str) -> bool:
    """Decide whether a whole cell text is OCR garbage from image areas.

    The text counts as garbage when none of its letter runs looks like a
    real word, e.g. "(ci]oeu" or "uanoaain.".

    - Without any run of 2+ letters, the text is garbage unless all of its
      letters together (lowercased) form a known abbreviation such as
      "e.g.".
    - Otherwise the first letter run that is a common short word, a known
      abbreviation, or a 4+ letter run with a vowel share between 15% and
      65% (vowels: a e i o u ä ö ü) makes the text non-garbage.

    Returns:
        True if the text is considered garbage, False otherwise.
    """
    candidates = _RE_REAL_WORD.findall(text)

    if not candidates:
        # No multi-letter run at all: only a dotted abbreviation can save it.
        letters = ''.join(_RE_ALPHA.findall(text)).lower()
        return letters not in _KNOWN_ABBREVIATIONS

    for candidate in candidates:
        lowered = candidate.lower()
        if lowered in _COMMON_SHORT_WORDS or lowered in _KNOWN_ABBREVIATIONS:
            return False
        if len(lowered) < 4:
            # Unknown 2-3 letter runs prove nothing; look at the next run.
            continue
        vowel_count = sum(lowered.count(vowel) for vowel in 'aeiouäöü')
        if 0.15 <= vowel_count / len(lowered) <= 0.65:
            return False  # plausible vowel share -> treat as a real word

    return True
def _clean_cell_text(text: str) -> str:
"""Remove OCR noise from cell text. Generic filters:
1. If the entire text has no real alphabetic word (>= 2 letters), clear.
2. If the entire text is garbage (no dictionary word), clear.
3. Strip trailing noise tokens from the end of the text.
"""
stripped = text.strip()
if not stripped:
return ''
# --- Filter 1: No real word at all ---
if not _RE_REAL_WORD.search(stripped):
# Exception: dotted abbreviations like "e.g.", "z.B.", "i.e."
alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower()
if alpha_only not in _KNOWN_ABBREVIATIONS:
return ''
# --- Filter 2: Entire text is garbage ---
if _is_garbage_text(stripped):
return ''
# --- Filter 3: Strip trailing noise tokens ---
tokens = stripped.split()
while tokens and _is_noise_tail_token(tokens[-1]):
tokens.pop()
if not tokens:
return ''
return ' '.join(tokens)
def _clean_cell_text_lite(text: str) -> str:
"""Simplified noise filter for cell-first OCR (isolated cell crops).
Since each cell is OCR'd in isolation (no neighbour content visible),
trailing-noise stripping is unnecessary. Only 2 filters remain:
1. No real alphabetic word (>= 2 letters) and not a known abbreviation -> empty.
2. Entire text is garbage (no dictionary word) -> empty.
"""
stripped = text.strip()
if not stripped:
return ''
# --- Filter 1: No real word at all ---
if not _RE_REAL_WORD.search(stripped):
alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower()
if alpha_only not in _KNOWN_ABBREVIATIONS:
return ''
# --- Filter 2: Entire text is garbage ---
if _is_garbage_text(stripped):
return ''
return stripped
# ---------------------------------------------------------------------------
# Bold detection via stroke-width analysis (relative / page-level)
# ---------------------------------------------------------------------------
def _measure_stroke_width(gray_crop: np.ndarray) -> float:
"""Measure mean stroke width in a binarised cell crop.
Returns a DPI-normalised value (mean stroke width as % of crop height),
or 0.0 if measurement is not possible.
"""
if gray_crop is None or gray_crop.size == 0:
return 0.0
h, w = gray_crop.shape[:2]
if h < 10 or w < 10:
return 0.0
# Binarise: text = white (255), background = black (0)
_, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
if cv2.countNonZero(bw) < 20:
return 0.0
# Distance transform: value at each white pixel = distance to nearest black
dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3)
# Skeleton via morphological thinning
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
thin = bw.copy()
for _ in range(max(1, min(h, w) // 6)):
eroded = cv2.erode(thin, kernel)
if cv2.countNonZero(eroded) < 5:
break
thin = eroded
skeleton_pts = thin > 0
if not np.any(skeleton_pts):
return 0.0
mean_stroke = float(np.mean(dist[skeleton_pts]))
return mean_stroke / max(h, 1) * 100 # normalised: % of cell height
def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
img_w: int, img_h: int) -> None:
"""Two-pass bold detection: measure all cells, then compare against median.
Cells with stroke width > 1.4x the page median are marked as bold.
This adapts automatically to font, DPI and scan quality.
Modifies cells in-place (sets 'is_bold' key).
"""
if ocr_img is None:
return
# Pass 1: measure stroke width for every cell with text
metrics: List[float] = []
cell_strokes: List[float] = []
for cell in cells:
sw = 0.0
if cell.get('text', '').strip():
bp = cell['bbox_px']
y1 = max(0, bp['y'])
y2 = min(img_h, bp['y'] + bp['h'])
x1 = max(0, bp['x'])
x2 = min(img_w, bp['x'] + bp['w'])
if y2 > y1 and x2 > x1:
sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2])
cell_strokes.append(sw)
if sw > 0:
metrics.append(sw)
if len(metrics) < 3:
# Too few cells to compare — leave all as non-bold
return
median_sw = float(np.median(metrics))
if median_sw <= 0:
return
# Pass 2: cells significantly above median -> bold
for cell, sw in zip(cells, cell_strokes):
cell['is_bold'] = sw > 0 and (sw / median_sw) > 1.4
@@ -0,0 +1,189 @@
"""Cell-level IPA phonetic fixes for overlay mode.
In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
(entry['english']). But the overlay reads cell['text'] directly, so
phonetic fixes must be applied to cells too.
Split from cv_ocr_engines.py — contains fix_cell_phonetics() and helpers.
"""
import logging
import re
from typing import Any, Dict, List
from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
_insert_missing_ipa,
_replace_phonetics_in_text,
_text_has_garbled_ipa,
)
from cv_ocr_ipa_repair import (
_has_non_dict_trailing,
_insert_headword_ipa,
_strip_post_bracket_garbled,
)
logger = logging.getLogger(__name__)
def fix_cell_phonetics(
cells: List[Dict[str, Any]],
pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
"""Apply IPA phonetic fixes to cell texts for overlay mode.
In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
(entry['english']). But the overlay reads cell['text'] directly, so
phonetic fixes must be applied to cells too.
Only cells whose 'col_type' is 'column_en' or 'column_text' and whose
text is non-blank are processed; all other cells are left untouched.
Processing depends on column type:
- column_en: Full processing. Garbled IPA is replaced with orphan
brackets stripped; missing IPA may be inserted when the text shows
garbled phonetics or trailing non-dictionary words; garbled fragments
after a closing ']' are stripped.
- column_text: Light processing. Garbled IPA is replaced WITHOUT orphan
bracket stripping (brackets may be German content like "(probieren)").
If that changed nothing, headword IPA is inserted only when the cell's
word_boxes show a gap (see _has_ipa_gap); in that case the word_boxes
are updated as well so the overlay keeps one box per token.
Args:
cells: Cell dicts; 'text' (and possibly 'word_boxes') are modified in
place.
pronunciation: Pronunciation variant passed through to the IPA helpers.
Returns:
The same list object that was passed in. When IPA_AVAILABLE is falsy
the cells are returned unchanged.
"""
if not IPA_AVAILABLE:
return cells
ipa_col_types = {'column_en', 'column_text'}
# Number of cells whose text was changed (for the summary log line).
replaced = 0
for cell in cells:
col_type = cell.get('col_type', '')
if col_type not in ipa_col_types:
continue
# `or ''` also covers an explicit None stored under 'text'.
text = cell.get('text', '') or ''
if not text.strip():
continue
if col_type == 'column_en':
# Full processing: replace garbled IPA, strip orphan brackets.
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
if new_text == text:
# Insert IPA when garbled phonetics exist OR when trailing
# non-dictionary words suggest garbled IPA in plain ASCII.
if _text_has_garbled_ipa(text) or _has_non_dict_trailing(text, pronunciation):
new_text = _insert_missing_ipa(text, pronunciation)
# Strip trailing garbled fragments after proper [IPA] brackets
# (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
if ']' in new_text:
new_text = _strip_post_bracket_garbled(new_text, pronunciation)
else:
# column_text: replace garbled IPA, no orphan stripping
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
# Insert headword IPA ONLY if there's a gap in word_boxes
# suggesting Tesseract missed an IPA bracket on the page.
# Without gap evidence, the original page had no IPA.
if new_text == text:
wb = cell.get('word_boxes', [])
if _has_ipa_gap(text, wb):
inserted = _insert_headword_ipa(text, pronunciation)
if inserted != text:
new_text = inserted
# Keep word_boxes aligned with the tokens of the new text.
_sync_word_boxes_after_ipa_insert(cell, text, new_text)
if new_text != text:
logger.debug(f"fix_cell_phonetics: '{text}''{new_text}'")
cell['text'] = new_text
replaced += 1
if replaced:
logger.info(f"fix_cell_phonetics: {replaced} IPA fixes in {len(cells)} cells")
return cells
def _has_ipa_gap(text: str, word_boxes: List[Dict]) -> bool:
"""Check if word_boxes show a gap where IPA brackets should be.
On a typical vocab page, the layout is:
headword [ipa] German translation
If Tesseract missed the IPA bracket, the gap between the headword
and the next word (German translation) is unusually large (>80px)
because the IPA occupied physical space on the page.
If no IPA was on the page (e.g. "be good at sth."), the words are
close together (<30px).
"""
if not word_boxes or len(word_boxes) < 2:
return False
tokens = text.split()
if not tokens:
return False
# Find the headword index: skip numeric prefixes like "».55", "0.56"
hw_box_idx = 0
for i, wb in enumerate(word_boxes):
wt = wb.get('text', '')
clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wt)
if len(clean) >= 2:
hw_box_idx = i
break
if hw_box_idx >= len(word_boxes) - 1:
return False
# Check gap between headword and the next word_box
hw = word_boxes[hw_box_idx]
next_wb = word_boxes[hw_box_idx + 1]
gap = next_wb['left'] - (hw['left'] + hw['width'])
return gap > 80
def _sync_word_boxes_after_ipa_insert(
cell: Dict[str, Any],
old_text: str,
new_text: str,
) -> None:
"""Insert a synthetic word_box for an IPA token added by IPA insertion.
E.g. "challenge ...""challenge [tʃælɪndʒ] ..."
Adds a new word_box right after the headword's box so the 1:1
token-to-box mapping in the frontend overlay stays consistent.
"""
word_boxes = cell.get('word_boxes')
if not word_boxes:
return
old_tokens = old_text.split()
new_tokens = new_text.split()
if len(new_tokens) != len(old_tokens) + 1:
return # unexpected change, skip
# Find the inserted token by walking both lists in parallel.
# One token in new_tokens won't match — that's the inserted IPA.
insert_idx = -1
j = 0 # index into old_tokens
for i in range(len(new_tokens)):
if j < len(old_tokens) and new_tokens[i] == old_tokens[j]:
j += 1
else:
insert_idx = i
break
if insert_idx < 0 or insert_idx >= len(new_tokens):
return
ipa_token = new_tokens[insert_idx]
# The headword is at insert_idx - 1 in old_tokens (and word_boxes)
ref_idx = insert_idx - 1
if ref_idx < 0 or ref_idx >= len(word_boxes):
return
ref_box = word_boxes[ref_idx]
ipa_box = {
'text': ipa_token,
'left': ref_box['left'] + ref_box['width'] + 2,
'top': ref_box['top'],
'width': ref_box['width'],
'height': ref_box['height'],
'conf': ref_box.get('conf', 90),
}
word_boxes.insert(insert_idx, ipa_box)
@@ -0,0 +1,381 @@
"""
OCR engines (RapidOCR, TrOCR, LightOn) and re-exports.
This module contains the OCR engine wrappers and re-exports all functions
from the split sub-modules for backward compatibility.
Sub-modules:
- cv_ocr_word_assembly: Word grouping and text assembly
- cv_ocr_vocab_postprocess: Vocabulary postprocessing (char confusion, comma split)
- cv_ocr_ipa_lookup: Core IPA lookup and bracket handling
- cv_ocr_ipa_repair: Advanced IPA repair (continuation cells, post-bracket cleanup)
- cv_ocr_cell_phonetics: Cell-level phonetics for overlay
- cv_ocr_cell_filter: Cell text filtering, column assignment, bold detection
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import io
import logging
import os
import re
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import (
IPA_AVAILABLE,
PageRegion,
RowGeometry,
_britfone_dict,
_ipa_convert_american,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
from PIL import Image
except ImportError:
Image = None # type: ignore[assignment,misc]
# ── Re-exports from sub-modules (backward compatibility) ──────────────────
from cv_ocr_word_assembly import ( # noqa: F401
_group_words_into_lines,
_words_to_reading_order_lines,
_rejoin_hyphenated,
_words_to_reading_order_text,
_words_to_spaced_text,
)
from cv_ocr_vocab_postprocess import ( # noqa: F401
_CHAR_CONFUSION_RULES,
_DE_INDICATORS_FOR_EN_I,
_fix_character_confusion,
_is_singular_plural_pair,
_split_comma_entries,
_split_by_comma,
_find_best_vocab_match,
_attach_example_sentences,
)
from cv_ocr_ipa_lookup import ( # noqa: F401
_PHONETIC_BRACKET_RE,
_IPA_CHARS,
_MIN_WORD_CONF,
_GRAMMAR_BRACKET_WORDS,
_lookup_ipa,
_fix_phonetic_brackets,
_is_grammar_bracket_content,
_replace_phonetics_in_text,
_text_has_garbled_ipa,
_decompose_compound,
_insert_missing_ipa,
)
from cv_ocr_ipa_repair import ( # noqa: F401
_has_non_dict_trailing,
_strip_post_bracket_garbled,
fix_ipa_continuation_cell,
_insert_headword_ipa,
)
from cv_ocr_cell_phonetics import ( # noqa: F401
fix_cell_phonetics,
_has_ipa_gap,
_sync_word_boxes_after_ipa_insert,
)
from cv_ocr_cell_filter import ( # noqa: F401
_RE_REAL_WORD,
_RE_ALPHA,
_COMMON_SHORT_WORDS,
_KNOWN_ABBREVIATIONS,
_assign_row_words_to_columns,
_is_noise_tail_token,
_is_garbage_text,
_clean_cell_text,
_clean_cell_text_lite,
_measure_stroke_width,
_classify_bold_cells,
)
# ── OCR Engine Wrappers ───────────────────────────────────────────────────
# Cached RapidOCR engine instance; stays None until _get_rapid_engine()
# creates it on first use.
_rapid_engine = None
# Becomes True only if the optional `rapidocr` package can be imported below.
RAPIDOCR_AVAILABLE = False
try:
from rapidocr import RapidOCR as _RapidOCRClass
from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
RAPIDOCR_AVAILABLE = True
logger.info("RapidOCR available — can be used as alternative to Tesseract")
except ImportError:
# Optional dependency: without it the _RapidOCRClass / _LangRec /
# _OCRVersion / _ModelType names stay undefined.
logger.info("RapidOCR not installed — using Tesseract only")
def _get_rapid_engine():
    """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support.

    The engine is created on the first call and cached in the module-level
    ``_rapid_engine``; later calls return the cached instance.

    Returns:
        The shared RapidOCR engine instance.

    Raises:
        RuntimeError: If the optional ``rapidocr`` package could not be
            imported (``RAPIDOCR_AVAILABLE`` is False). Without this check
            the call would fail with a NameError on the undefined engine
            class.
    """
    global _rapid_engine
    if _rapid_engine is None:
        if not RAPIDOCR_AVAILABLE:
            raise RuntimeError("RapidOCR is not installed")
        _rapid_engine = _RapidOCRClass(params={
            "Rec.lang_type": _LangRec.LATIN,
            "Rec.model_type": _ModelType.SERVER,
            "Rec.ocr_version": _OCRVersion.PPOCRV5,
            "Det.unclip_ratio": 1.3,
            "Det.box_thresh": 0.4,
            "Global.log_level": "critical",
        })
        logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """OCR one page region with RapidOCR and return Tesseract-style word dicts.

    The region is cropped from img_bgr and passed to the shared RapidOCR
    engine. Each recognised text becomes one dict with 'text', 'left',
    'top', 'width', 'height', 'conf' (score * 100 as int) and
    'region_type'. 'left' and 'top' are shifted by the region origin, so
    they refer to the full image rather than to the crop.

    Returns:
        List of word dicts; empty if the crop is empty or the engine
        returned no boxes or texts. Blank texts are skipped.
    """
    rapid = _get_rapid_engine()

    y0, x0 = region.y, region.x
    crop = img_bgr[y0:y0 + region.height, x0:x0 + region.width]
    if crop.size == 0:
        return []

    result = rapid(crop)
    if result is None or result.boxes is None or result.txts is None:
        return []

    words = []
    for box, txt, score in zip(result.boxes, result.txts, result.scores):
        if not txt or not txt.strip():
            continue

        # Axis-aligned bounding box around the detected polygon's points
        xs = [pt[0] for pt in box]
        ys = [pt[1] for pt in box]
        left = int(min(xs))
        top = int(min(ys))

        words.append({
            'text': txt.strip(),
            'left': left + region.x,
            'top': top + region.y,
            'width': int(max(xs) - left),
            'height': int(max(ys) - top),
            'conf': int(score * 100),
            'region_type': region.type,
        })
    return words
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """Run TrOCR on a region and return one word dict per recognised line.

    The region crop is split into line images, each line is decoded by the
    TrOCR model, and every non-empty line becomes a dict spanning the full
    region width. The region height is divided evenly among the lines.
    All lines share one confidence: the average of 0.85 for lines longer
    than 3 characters and 0.5 for shorter ones, as a percentage.

    If TrOCR is unavailable or its model is not loaded, the region is
    handed to the Tesseract-based ``ocr_region`` instead.
    NOTE(review): ``ocr_region`` does not appear to be imported or defined
    in this module -- confirm it is in scope, otherwise the fallback
    raises NameError.

    Returns:
        List of line-level word dicts; empty on an empty crop, when no
        text was recognised, or when any exception occurred during
        recognition (the exception is logged).
    """
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available

    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        if region.height > 0 and region.width > 0 and img_bgr is not None:
            gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(gray, region, lang="eng+deu", psm=6)
        return []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(gray, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        # Fall back to treating the whole crop as one line
        line_images = _split_into_lines(pil_crop) or [pil_crop]

        device = next(model.parameters()).device
        recognised = []
        confidences = []
        for line_img in line_images:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if not decoded:
                continue
            recognised.append(decoded)
            # Heuristic confidence: very short lines are trusted less
            confidences.append(0.85 if len(decoded) > 3 else 0.5)

        if not recognised:
            return []

        avg_conf = int(sum(confidences) / len(confidences) * 100)
        line_h = region.height // max(len(recognised), 1)
        return [
            {
                "text": line,
                "left": region.x,
                "top": region.y + idx * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            }
            for idx, line in enumerate(recognised)
        ]
    except Exception as e:
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region and return one word dict per text line.

    The region crop is sent to the model as a single image prompt. The
    decoded output is split at newlines; every non-blank line becomes a
    dict spanning the full region width with a fixed confidence of 85. The
    region height is divided evenly among the lines.

    If LightOnOCR is unavailable or its model is not loaded, RapidOCR is
    used when available, otherwise the Tesseract-based ``ocr_region``.
    NOTE(review): ``ocr_region`` does not appear to be imported or defined
    in this module -- confirm it is in scope, otherwise the Tesseract
    fallback raises NameError.

    Returns:
        List of line-level word dicts; empty on an empty crop, empty model
        output, or when any exception occurred during recognition (the
        exception is logged).
    """
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        if img_bgr is None:
            return []
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        return ocr_region(gray, region, lang="eng+deu", psm=6)

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(gray, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)
        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        text_lines = [piece.strip() for piece in text.split("\n") if piece.strip()]
        line_h = region.height // max(len(text_lines), 1)
        return [
            {
                "text": line,
                "left": region.x,
                "top": region.y + idx * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,
                "region_type": region.type,
            }
            for idx, line in enumerate(text_lines)
        ]
    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
async def ocr_region_paddle(
    img_bgr: np.ndarray,
    region: Optional["PageRegion"] = None,
) -> List[Dict[str, Any]]:
    """Run OCR via local RapidOCR (default) or remote PaddleOCR (fallback).

    Unless the environment variable FORCE_REMOTE_PADDLE is "1", local
    RapidOCR is tried first and its words are returned if there are any.
    If it yields no words or raises, the remote PaddleOCR service is used:
    the crop is downscaled so that its longer side is at most 1500 px,
    sent as a JPEG, and the returned word boxes are scaled back and
    shifted by the region origin.

    Args:
        img_bgr: Source image.
        region: Region to OCR; None means the whole image.

    Returns:
        List of word dicts. On the remote path 'left', 'top', 'width' and
        'height' are rescaled to the original resolution, and
        'region_type' is set when a region was given. Empty if the crop is
        empty or JPEG encoding fails.
    """
    force_remote = os.environ.get("FORCE_REMOTE_PADDLE", "").strip() == "1"

    if not force_remote:
        # Best effort: any local failure simply falls through to remote.
        try:
            if region is None:
                h, w = img_bgr.shape[:2]
                _region = PageRegion(type="full_page", x=0, y=0, width=w, height=h)
            else:
                _region = region
            words = ocr_region_rapid(img_bgr, _region)
            if words:
                logger.info("ocr_region_paddle: used local RapidOCR (%d words)", len(words))
                return words
            logger.warning("ocr_region_paddle: RapidOCR returned 0 words, trying remote")
        except Exception as e:
            logger.warning("ocr_region_paddle: RapidOCR failed (%s), trying remote", e)

    from services.paddleocr_remote import ocr_remote_paddle

    if region is not None:
        crop = img_bgr[
            region.y : region.y + region.height,
            region.x : region.x + region.width,
        ]
        offset_x, offset_y = region.x, region.y
    else:
        crop = img_bgr
        offset_x, offset_y = 0, 0

    if crop.size == 0:
        return []

    h, w = crop.shape[:2]
    scale = 1.0
    _MAX_DIM = 1500
    if max(h, w) > _MAX_DIM:
        scale = _MAX_DIM / max(h, w)
        # Clamp to 1 px: for very elongated crops the short side would
        # otherwise round down to 0, which cv2.resize rejects.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        crop = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
        logger.info("ocr_region_paddle: downscaled %dx%d -> %dx%d (scale=%.2f)",
                    w, h, new_w, new_h, scale)

    success, jpg_buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 90])
    if not success:
        logger.error("ocr_region_paddle: cv2.imencode failed")
        return []

    words, _w, _h = await ocr_remote_paddle(jpg_buf.tobytes(), filename="scan.jpg")
    logger.info("ocr_region_paddle: used remote PaddleOCR (%d words)", len(words))

    # Map boxes from the (possibly downscaled) crop back to full-image pixels.
    inv_scale = 1.0 / scale if scale != 1.0 else 1.0
    for wd in words:
        wd["left"] = int(wd["left"] * inv_scale) + offset_x
        wd["top"] = int(wd["top"] * inv_scale) + offset_y
        wd["width"] = int(wd["width"] * inv_scale)
        wd["height"] = int(wd["height"] * inv_scale)
        if region is not None:
            wd["region_type"] = region.type
    return words
@@ -0,0 +1,476 @@
"""
IPA lookup and phonetic bracket handling for OCR-extracted vocabulary.
Tesseract and other OCR engines frequently garble IPA phonetic transcriptions
in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)). This module
provides functions to:
- Look up correct IPA pronunciations (British/American) for English words.
- Detect and replace garbled phonetic brackets with dictionary IPA.
- Insert missing IPA for headwords where OCR destroyed the brackets entirely.
- Strip orphan brackets and post-bracket garbled fragments.
- Handle IPA continuation cells (phonetics on a separate row from headword).
All IPA data comes from open-source dictionaries:
- Britfone (MIT) for British English
- eng_to_ipa / CMU (MIT) for American English
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Any, Dict, List, Optional
from cv_vocab_types import (
IPA_AVAILABLE,
_britfone_dict,
_ipa_convert_american,
)
logger = logging.getLogger(__name__)
# --- D. Phonetic Bracket IPA Replacement ---
# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
# Group 1 is the word (ASCII letters plus German umlauts/ß), group 2 is the
# bracket content (non-greedy, may be empty, contains no closing bracket).
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)
# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
# A bracket whose content contains any of these is kept as-is.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')
# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
# NOTE(review): not referenced by the functions visible in this module
# section — presumably kept for importers; confirm before removing.
_MIN_WORD_CONF = 30
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
"""Look up IPA for a word using the selected pronunciation dictionary.
Args:
word: English word to look up.
pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).
Returns:
IPA string or None if not found.
"""
word_lower = word.lower().strip()
if not word_lower:
return None
if pronunciation == 'british' and _britfone_dict:
ipa = _britfone_dict.get(word_lower)
if ipa:
return ipa
# Fallback to American if not in Britfone
if _ipa_convert_american:
result = _ipa_convert_american(word_lower)
if result and '*' not in result:
return result
return None
if pronunciation == 'american' and _ipa_convert_american:
result = _ipa_convert_american(word_lower)
if result and '*' not in result:
return result
# Fallback to Britfone if not in CMU
if _britfone_dict:
ipa = _britfone_dict.get(word_lower)
if ipa:
return ipa
return None
# Try any available source
if _britfone_dict:
ipa = _britfone_dict.get(word_lower)
if ipa:
return ipa
if _ipa_convert_american:
result = _ipa_convert_american(word_lower)
if result and '*' not in result:
return result
return None
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Replace OCR'd phonetic transcriptions with dictionary IPA.

    Detects patterns like "dance [du:ns]" in the ``english`` field of each
    entry and rewrites them via ``_replace_phonetics_in_text``.

    Only the ``english`` field is touched.  German and example fields
    contain meaningful parenthetical content such as
    "Eis (gefrorenes Wasser)", "(Salat-)Gurke" or "(sich beschweren)" and
    must never be processed as phonetic transcriptions.

    Args:
        entries: Vocabulary entry dicts; modified in place.
        pronunciation: 'british' or 'american'.

    Returns:
        The same ``entries`` list (unchanged when IPA support is missing).
    """
    if not IPA_AVAILABLE:
        return entries

    replaced_count = 0
    for entry in entries:
        text = entry.get('english', '') or ''
        # Cheap pre-filter: nothing to do without an opening bracket.
        if not any(ch in text for ch in '[{('):
            continue
        new_text = _replace_phonetics_in_text(text, pronunciation)
        if new_text != text:
            logger.debug("_fix_phonetic_brackets: '%s' -> '%s'", text, new_text)
            replaced_count += 1
            entry['english'] = new_text

    if replaced_count:
        logger.info(
            "_fix_phonetic_brackets: %d IPA replacements in %d entries",
            replaced_count, len(entries),
        )
    return entries
# Grammar particles that appear in brackets after English words:
# cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({
# English prepositions/particles commonly in vocab tables
'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
# English grammar abbreviations used in vocab tables
'sth', 'sb', 'adj', 'adv',
# Number/plural/grammar annotations
'pl', 'sg', 'sing', 'no', 'also', 'auch',
# Regional English markers
'ae', 'be', 'ame', 'bre',
})
def _is_grammar_bracket_content(content: str) -> bool:
"""Return True if bracket content is grammar info in the ENGLISH field.
Grammar info: cross (with), complain (about/of), agree (on/with)
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
Since we only process the English field, we only need to recognize
English grammar particles. Everything else is (garbled) IPA.
"""
if not content:
return False
# Split on / and spaces for patterns like (about/of), (no pl)
tokens = re.split(r'[/\s]+', content.strip().lower())
tokens = [t for t in tokens if t]
if not tokens:
return False
# ALL tokens must be known grammar words
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
def _replace_phonetics_in_text(
    text: str,
    pronunciation: str = 'british',
    strip_orphans: bool = True,
) -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    OCR garbles IPA brackets, e.g. China [ˈtʃaɪnə] becomes China {'tfatno].
    Any bracket type directly following a word is matched; when the word
    has a dictionary IPA entry the bracket is rewritten as ``word [ipa]``.
    Brackets holding grammar particles (``cross (with)``) or more than
    three whitespace-separated tokens are left untouched.

    Args:
        text: Cell text to process.
        pronunciation: 'british' or 'american'.
        strip_orphans: If True, a second pass removes remaining brackets
            that look like garbled IPA.  Set to False for column_text
            where brackets may be German content.

    Returns:
        The processed text; ``text`` unchanged when IPA support is missing.
    """
    if not IPA_AVAILABLE:
        return text

    # IPA strings written by the first pass.  The second pass must not
    # strip them again: a correct dictionary transcription can be short and
    # pure ASCII (e.g. American "bi"), which would otherwise look exactly
    # like a garbled fragment.
    inserted_ipa = set()

    def replacer(match):
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)
        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match
        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)
        if not ipa:
            # No IPA for this word — keep as-is
            return full_match
        # Word has IPA → bracket content is phonetic (garbled or correct).
        # Exception: grammar particles like cross (with) — keep those.
        if _is_grammar_bracket_content(bracket_content):
            return full_match
        logger.debug("phonetic: '%s' -> '%s [%s]'", full_match, word, ipa)
        inserted_ipa.add(ipa.strip())
        return f"{word} [{ipa}]"

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    if strip_orphans:
        # Second pass: strip remaining brackets that are garbled IPA.
        # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
        # Keep: grammar notes "(about/of)", real words "(sich beschweren)",
        # correct IPA "[dˈɑːns]".
        def _strip_orphan_bracket(m):
            content = m.group(1).strip()
            # Keep IPA that the first pass has just inserted
            if content in inserted_ipa:
                return m.group(0)
            # Keep grammar info: (about/of), (no pl)
            if _is_grammar_bracket_content(content):
                return m.group(0)
            # Keep correct IPA (contains Unicode IPA characters)
            if any(ch in _IPA_CHARS for ch in content):
                return m.group(0)
            # Keep real-word parentheticals like (probieren), (Profit), (Geld).
            # Garbled IPA fragments are short nonsense like (kros), (cy), (mais);
            # anything with at least 4 letters is assumed to be real text.
            content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
            if len(content_alpha) >= 4:
                return m.group(0)
            logger.debug("phonetic: stripping orphan bracket '%s'", m.group(0))
            return ''

        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
        text = text.strip()

    return text
def _text_has_garbled_ipa(text: str) -> bool:
"""Check if text contains garbled IPA-like fragments from OCR.
Returns True if there is evidence of OCR-mangled phonetic
transcription, e.g. stress marks, length marks, or IPA special chars.
This is used to decide whether ``_insert_missing_ipa`` should run:
it must only insert IPA to *replace* garbled phonetics that are already
in the text — never to ADD phonetics where none existed on the page.
"""
# Bracketed text that doesn't contain valid IPA symbols is garbled OCR
# of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
stripped = text.strip()
if stripped.startswith('[') and stripped.endswith(']'):
inner = stripped[1:-1]
# Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
# Not a valid dictionary-style bracket like "(no pl)" — those
# use parentheses, not square brackets. Square brackets with
# no IPA chars are garbled phonetics.
return True
for w in text.strip().split():
# Skip delimiters and very short tokens
if len(w) <= 1 or w in ('', '', '-', '/', '|', ',', ';'):
continue
# Starts with stress mark (OCR read IPA stress ' as apostrophe)
if w.startswith("'") and len(w) > 1 and not w[1:].istitle():
return True
if w.startswith("\u02c8") or w.startswith("\u02cc"): # ˈ ˌ
return True
# Contains IPA length mark ':' in a short non-word fragment
if ':' in w and len(w) < 12:
# But not things like "3:00" (time) or common words
stripped = re.sub(r'[^a-zA-Z:]', '', w)
if ':' in stripped and not stripped.replace(':', '').isalpha():
continue
return True
# Contains IPA special characters
if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
return True
# Embedded apostrophe suggesting merged garbled IPA with stress mark.
# E.g. "Scotland'skotland" — OCR reads ˈ as '.
# Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
# chars to avoid contractions (don't, won't, o'clock).
if "'" in w and not w.startswith("'"):
apos_idx = w.index("'")
after = w[apos_idx + 1:]
if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
return True
return False
def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Build IPA for a compound by concatenating the IPA of its two halves.

    E.g. "schoolbag" is split into "school" + "bag".  Every cut position
    that leaves at least 3 characters on each side is tried; a cut counts
    only when BOTH halves have a dictionary entry.  When several cuts
    work, the one with the longest first half wins.

    Returns None when IPA support is missing, the word has fewer than
    6 characters, or no cut works.
    """
    if not IPA_AVAILABLE:
        return None

    candidate = word.lower().strip()
    if len(candidate) < 6:
        return None  # too short for a compound

    combined = None
    # Cut positions ascend, so a later success always has a longer first
    # half and simply overrides an earlier one.
    for cut in range(3, len(candidate) - 2):
        head_ipa = _lookup_ipa(candidate[:cut], pronunciation)
        tail_ipa = _lookup_ipa(candidate[cut:], pronunciation)
        if head_ipa and tail_ipa:
            combined = head_ipa + tail_ipa
    return combined
def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA pronunciation for English words that have no brackets at all.

    OCR sometimes garbles the phonetic transcription into plain-text fragments
    (e.g. "scare skea" where "skea" is garbled /skɛə/).  This scans the text
    for the headword, inserts correct [IPA], and strips the garbled fragments
    that follow.

    Text is returned unchanged when IPA support is missing, the text is
    blank, it already contains an opening bracket, or it has more than
    6 whitespace-separated tokens (long sentences get no IPA).

    For the first token that yields IPA the lookup order is:
    plain lookup → lookup without hyphens → compound decomposition →
    split at the first IPA marker character (merged "headword+IPA" token) →
    longest dictionary prefix of at least 4 letters.

    Args:
        text: Cell text without brackets.
        pronunciation: 'british' or 'american'.

    Returns:
        The text with ``[ipa]`` inserted and garbled trailing fragments
        removed, or the original tokens re-joined with single spaces when
        no headword was found.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text
    # Skip if already has brackets (IPA replacement handles those)
    if any(ch in text for ch in '[{('):
        return text
    # Only process short text fragments (typical vocab cells).
    # Long sentences / paragraphs should not get IPA insertions.
    words = text.strip().split()
    if len(words) > 6:
        return text

    # Stand-alone separator tokens.  En dash and em dash are written as
    # escapes so they cannot get lost in transit; a separator marks the
    # start of content that must be kept.
    delimiters = ('\u2013', '\u2014', '-', '/', '|', ',', ';')
    # Characters that mark the start of (garbled) IPA inside a merged token.
    _IPA_SPLIT_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')

    # Try to insert IPA for the first alphanumeric word
    # Typical patterns: "challenge", "profit", "film", "badge"
    for i, w in enumerate(words):
        # Clean punctuation for lookup
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        # Skip German/grammar words
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
        # Fallback 0b: compound word decomposition
        # E.g. "schoolbag" → "school"+"bag" → concatenated IPA
        if not ipa:
            ipa = _decompose_compound(clean, pronunciation)
        # Fallback 1: IPA-marker split for merged tokens where OCR
        # joined headword with its IPA (e.g. "schoolbagsku:lbæg").
        # Find the first IPA marker character (:, æ, ɪ, etc.), walk
        # backwards ≤3 chars for the onset consonant cluster, and
        # split into headword + OCR IPA.
        if not ipa:
            first_marker = next(
                (p for p, ch in enumerate(w) if ch in _IPA_SPLIT_CHARS), -1,
            )
            if first_marker >= 3:
                split = first_marker
                while (split > 0
                       and split > first_marker - 3
                       and w[split - 1].isalpha()
                       and w[split - 1].islower()):
                    split -= 1
                if split >= 2:
                    headword = w[:split]
                    ocr_ipa = w[split:]
                    hw_ipa = _lookup_ipa(headword, pronunciation)
                    if not hw_ipa:
                        # Try compound decomposition for the headword part
                        hw_ipa = _decompose_compound(headword, pronunciation)
                    if hw_ipa:
                        words[i] = f"{headword} [{hw_ipa}]"
                    else:
                        # Word not in dictionary — use OCR IPA
                        words[i] = f"{headword} [{ocr_ipa}]"
                    # Everything after the merged token is dropped.
                    words = words[:i + 1]
                    ipa = True  # signal that we handled it
                    break
        # Fallback 2: prefix matching for merged tokens WITHOUT IPA
        # markers (e.g. "Scotland'skotland"). Find longest dictionary
        # prefix using only alpha chars to avoid punctuation matches.
        if not ipa:
            alpha = re.sub(r'[^a-zA-Z]', '', clean)
            if len(alpha) > 5:  # need at least 6 chars for meaningful split
                for end in range(len(alpha), 3, -1):  # min prefix 4 chars
                    prefix = alpha[:end]
                    test_ipa = _lookup_ipa(prefix, pronunciation)
                    if test_ipa:
                        ipa = test_ipa
                        w = prefix
                        words[i] = prefix
                        break
        if ipa:
            words[i] = f"{w} [{ipa}]"
            # Strip garbled OCR phonetics after the IPA bracket.
            # On scanned vocab pages, printed IPA is read as garbled
            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
            # After inserting correct IPA, remove remaining words that
            # aren't real English words, delimiters, or German text.
            kept = words[:i + 1]
            for j in range(i + 1, len(words)):
                wj = words[j]
                # Delimiter — keep this and everything after
                if wj in delimiters:
                    kept.extend(words[j:])
                    break
                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
                if re.match(r'^[\d.)\-]+$', wj):
                    kept.extend(words[j:])
                    break
                # Starts with uppercase — likely German or proper noun
                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                if clean_j and clean_j[0].isupper():
                    kept.extend(words[j:])
                    break
                # Known English word (≥2 chars) — keep it and rest
                if clean_j and len(clean_j) >= 2:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
                # Merged token: dictionary word + garbled IPA stuck together.
                # E.g. "fictionsalans'fIkfn" starts with "fiction".
                # Extract the dictionary prefix (≥4 chars) and add it with
                # IPA, but only if enough chars remain after the prefix (≥3)
                # to look like garbled IPA, not just a plural 's'.
                if clean_j and len(clean_j) >= 7:
                    for pend in range(min(len(clean_j) - 3, 15), 3, -1):
                        prefix_j = clean_j[:pend]
                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
                        if prefix_ipa:
                            kept.append(f"{prefix_j} [{prefix_ipa}]")
                            break
                    break  # rest of this token is garbled
                # Otherwise — likely garbled phonetics, skip
            words = kept
            break
    return ' '.join(words)
@@ -0,0 +1,287 @@
"""
Advanced IPA repair for OCR-extracted vocabulary.
Functions that detect and fix garbled IPA fragments trailing after
headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py
to stay within the 500 LOC budget.
Contains:
- _has_non_dict_trailing: detect non-dictionary trailing words
- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
"""
import logging
import re
from typing import Any, Dict, List, Optional
from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
_lookup_ipa,
_GRAMMAR_BRACKET_WORDS,
)
logger = logging.getLogger(__name__)
def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
    """Check if text has a headword followed by non-dictionary trailing words.

    Used as an additional trigger for ``_insert_missing_ipa`` when
    ``_text_has_garbled_ipa`` returns False because the garbled IPA
    happens to look like plain ASCII (e.g. "skea" for /skɛə/).

    Returns True only when the text has 2 to 6 tokens, a dictionary
    headword is found before the last token, and none of the tokens after
    it is a separator, a number, a capitalised word, or a dictionary word.
    """
    if not IPA_AVAILABLE:
        return False
    words = text.strip().split()
    if len(words) < 2 or len(words) > 6:
        return False

    # Find first dictionary word
    hw_idx = -1
    for i, w in enumerate(words):
        clean = re.sub(r'[^a-zA-Z\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        if _lookup_ipa(clean, pronunciation):
            hw_idx = i
            break
    if hw_idx < 0 or hw_idx >= len(words) - 1:
        return False

    # Stand-alone separator tokens (en dash / em dash written as escapes).
    # A separator means real content follows, so the tail is not garbage.
    delimiters = ('\u2013', '\u2014', '-', '/', '|', ',', ';')

    # Check ALL remaining words — if none are dictionary/delimiter/German,
    # they are likely garbled IPA.
    for wj in words[hw_idx + 1:]:
        if wj in delimiters:
            return False
        # Pure digits or numbering (e.g. "1", "2.", "3)") — not garbled IPA
        if re.match(r'^[\d.)\-]+$', wj):
            return False
        clean_j = re.sub(r'[^a-zA-Z]', '', wj)
        # Capitalised — likely German or a proper noun
        if clean_j and clean_j[0].isupper():
            return False
        if clean_j and len(clean_j) >= 2 and _lookup_ipa(clean_j, pronunciation):
            return False
    return True
def _strip_post_bracket_garbled(
text: str, pronunciation: str = 'british',
) -> str:
"""Strip garbled IPA fragments that trail after proper [IPA] brackets.
E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
``seat [sˈiːt] belt si:t belt`` → ``seat [sˈiːt] belt``
For multi-word headwords like "seat belt", a real English word ("belt")
may be followed by garbled IPA duplicates. We detect this by checking
whether the sequence after a real word contains IPA markers (`:`, `ə`,
etc.) — if so, everything from the first garbled token onward is stripped.
"""
if ']' not in text:
return text
last_bracket = text.rfind(']')
if last_bracket >= len(text) - 1:
return text
before = text[:last_bracket + 1].rstrip()
after = text[last_bracket + 1:].strip()
if not after:
return text
_IPA_MARKER_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
after_words = after.split()
kept: List[str] = []
for idx, w in enumerate(after_words):
# Delimiter — keep rest
if w in ('', '', '-', '/', '|', ',', ';'):
kept.extend(after_words[idx:])
break
# Contains IPA markers (length mark, IPA chars) — garbled, skip
if any(c in w for c in _IPA_MARKER_CHARS):
# Everything from here is garbled IPA — stop scanning
# but look ahead: if any remaining words are real English
# words WITHOUT IPA markers, they might be a different headword
# following. Only skip the contiguous garbled run.
continue
clean = re.sub(r'[^a-zA-Z]', '', w)
# Uppercase — likely German, keep rest
if clean and clean[0].isupper():
kept.extend(after_words[idx:])
break
# Known English word — keep it, but check if followed by garbled IPA
# (multi-word headword case like "seat [siːt] belt si:t belt")
if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation):
# Peek ahead: if next word has IPA markers, the rest is garbled
remaining = after_words[idx + 1:]
has_garbled_after = any(
any(c in rw for c in _IPA_MARKER_CHARS)
for rw in remaining
)
if has_garbled_after:
# Keep this real word but stop — rest is garbled duplication
kept.append(w)
# Still scan for delimiters/German in the remaining words
for ridx, rw in enumerate(remaining):
if rw in ('', '', '-', '/', '|', ',', ';'):
kept.extend(remaining[ridx:])
break
rclean = re.sub(r'[^a-zA-Z]', '', rw)
if rclean and rclean[0].isupper():
kept.extend(remaining[ridx:])
break
break
else:
kept.extend(after_words[idx:])
break
# Unknown short word — likely garbled, skip
if kept:
return before + ' ' + ' '.join(kept)
return before
def fix_ipa_continuation_cell(
    garbled_text: str,
    headword_text: str,
    pronunciation: str = 'british',
) -> str:
    """Replace garbled IPA in a continuation row with proper IPA.

    Continuation rows appear below the headword and contain only the
    printed phonetic transcription, which OCR garbles into fragments
    like ``ska:f ska:vz``.  The replacement is rebuilt from the headword
    row instead: each dash-separated headword part becomes one bracket,
    e.g. ``[skˈɑːf] [skˈɑːvz]``.

    Args:
        garbled_text: The OCR-garbled IPA text from the continuation row.
        headword_text: The headword text from the previous row.
        pronunciation: ``'british'`` or ``'american'``.

    Returns:
        Corrected IPA text, or ``garbled_text`` if no fix could be applied.
        When the headword row already ends with an inline ``[...]`` bracket,
        that bracket is returned as-is.
    """
    if not IPA_AVAILABLE or not garbled_text or not headword_text:
        return garbled_text

    original_headword = headword_text

    # If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
    # only generate continuation IPA for words NOT already covered.
    covered_words: set = set()
    has_inline_ipa = bool(re.search(r'\[[^\]]*\]', headword_text))
    if has_inline_ipa:
        # Words before the first bracket already have their IPA shown
        first_bracket = headword_text.index('[')
        pre_bracket = headword_text[:first_bracket].strip()
        for w in pre_bracket.split():
            clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
            if clean and len(clean) >= 2:
                covered_words.add(clean)
        last_bracket_end = headword_text.rfind(']')
        tail = headword_text[last_bracket_end + 1:].strip()
        if not tail or not re.search(r'[a-zA-Z]{2,}', tail):
            # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
            # — return the inline IPA directly (continuation duplicates it).
            # Search for '[' only up to the last ']' so that a stray
            # unbalanced '[' further right cannot produce an empty slice.
            last_bracket_start = headword_text.rfind('[', 0, last_bracket_end)
            inline_ipa = headword_text[last_bracket_start:last_bracket_end + 1]
            return inline_ipa
        # Only the tail words need continuation IPA
        headword_text = tail

    # Strip existing IPA brackets and parenthetical grammar annotations
    # like "(no pl)", "(sth)", "(sb)" from headword text
    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
    if not clean_hw:
        return garbled_text

    # Split headword on en/em dashes and on hyphens surrounded by spaces:
    #   "see - saw - seen" → ["see", "saw", "seen"]
    parts = re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
    parts = [p.strip() for p in parts if p.strip()]
    if not parts:
        return garbled_text

    # Look up IPA for each headword part.
    # Skip articles (the, a, an) — they never get IPA in vocab books.
    # Other function words like "down", "up" are kept because they are
    # integral parts of phrasal verbs (e.g. "close down").
    # Skip words that already have inline IPA in the headword row.
    _ARTICLES = {'the', 'a', 'an'}
    ipa_parts: List[str] = []
    for part in parts:
        # A part may be multi-word like "secondary school"
        word_ipas: List[str] = []
        for w in part.split():
            clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
            if not clean_w or len(clean_w) < 2:
                continue
            if covered_words and clean_w.lower() in covered_words:
                continue  # Already has IPA inline in the headword
            if clean_w.lower() in _ARTICLES:
                continue  # Articles never get IPA in vocab books
            ipa = _lookup_ipa(clean_w, pronunciation)
            if ipa:
                word_ipas.append(ipa)
        if word_ipas:
            ipa_parts.append('[' + ' '.join(word_ipas) + ']')
    if not ipa_parts:
        return garbled_text

    # One bracket per headword part, separated by single spaces
    result = ' '.join(ipa_parts)
    logger.debug(
        "fix_ipa_continuation: '%s' -> '%s' (headwords: '%s')",
        garbled_text, result, original_headword,
    )
    return result
def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
    """Add ``[ipa]`` after the leading English headword of a mixed-language line.

    Unlike ``_insert_missing_ipa`` (meant for short column_en cells) this
    accepts lines of any length.  Only the first three tokens are
    examined: tokens with fewer than two letters (numeric prefixes such as
    "0.56") and grammar particles are skipped, and the first remaining
    token decides — IPA is inserted when the dictionary knows it,
    otherwise the text is returned unchanged.

    Text is also returned unchanged when IPA support is missing, the text
    is blank, or the second token already starts with a bracket.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    tokens = text.strip().split()
    if not tokens:
        return text

    # A bracket right after the first token means IPA is already present
    if len(tokens) > 1 and tokens[1].startswith(('[', '{', '(')):
        return text

    for pos, token in enumerate(tokens[:3]):
        letters = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', token)
        if not letters or len(letters) < 2:
            continue
        if letters.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(letters, pronunciation)
        if not ipa:
            # First real word has no IPA — do not try later words
            break
        tokens[pos] = f"{token} [{ipa}]"
        return ' '.join(tokens)
    return text
@@ -0,0 +1,318 @@
"""
Vocab postprocessing: deterministic quality fixes for OCR-extracted vocabulary.
- Character confusion fix (I/1/l/|)
- Comma-separated word form splitting
- Example sentence attachment to matching vocab entries
Split from cv_ocr_engines.py for maintainability.
"""
import re
from typing import Any, Dict, List
# =============================================================================
# Post-Processing: Deterministic Quality Fixes
# =============================================================================
# --- A. Character Confusion Fix (I/1/l) ---
# Common OCR confusion pairs in vocabulary context
_CHAR_CONFUSION_RULES = [
# "1" at word start followed by lowercase → likely "I" or "l"
# Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
# "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
# and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
(re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
]
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Fix common OCR character confusions using context.
Deterministic rules:
- "1" at word start → "I" or "l" based on context
- Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1""I"
- "y " artifact at word boundaries → remove (e.g. "y you""you")
"""
for entry in entries:
en = entry.get('english', '') or ''
de = entry.get('german', '') or ''
ex = entry.get('example', '') or ''
# Apply general rules to all fields
for pattern, replacement in _CHAR_CONFUSION_RULES:
en = pattern.sub(replacement, en)
de = pattern.sub(replacement, de)
ex = pattern.sub(replacement, ex)
# Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
de_lower_words = set(de.lower().replace(',', ' ').split())
if de_lower_words & _DE_INDICATORS_FOR_EN_I:
# Any remaining "1" in EN that looks like "I"
en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
# Fix "y " artifact before repeated word: "y you" → "you"
en = re.sub(r'\by\s+([a-z])', r'\1', en)
ex = re.sub(r'\by\s+([a-z])', r'\1', ex)
entry['english'] = en.strip()
entry['german'] = de.strip()
entry['example'] = ex.strip()
return entries
# --- B. Comma-Separated Word Form Splitting ---
def _is_singular_plural_pair(parts: List[str]) -> bool:
"""Detect if comma-separated parts are singular/plural forms of the same word.
E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
"break, broke, broken" → False (different verb forms, OK to split).
Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
"""
if len(parts) != 2:
return False
a, b = parts[0].lower().strip(), parts[1].lower().strip()
if not a or not b:
return False
# Common prefix heuristic: if words share >= 50% of the shorter word,
# they are likely forms of the same word (Maus/Mäuse, child/children).
min_len = min(len(a), len(b))
common = 0
for ca, cb in zip(a, b):
if ca == cb:
common += 1
else:
break
if common >= max(2, min_len * 0.5):
return True
# Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
umlaut_map = str.maketrans('aou', 'äöü')
if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
return True
return False
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Split entries with comma-separated word forms into individual entries.
E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
→ 3 entries: break/brechen, broke/brach, broken/gebrochen
Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse"
because those are forms of the same vocabulary entry.
Only splits when both EN and DE have the same number of comma-parts,
parts are short (word forms, not sentences), and at least 3 parts
(to avoid splitting pairs that likely belong together).
"""
result: List[Dict[str, Any]] = []
for entry in entries:
en = (entry.get('english', '') or '').strip()
de = (entry.get('german', '') or '').strip()
# Split by comma (but not inside brackets or parentheses)
en_parts = _split_by_comma(en)
de_parts = _split_by_comma(de)
# Only split if we have multiple parts and counts match
should_split = False
if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
# All parts must be short (word forms, not sentences)
if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
# Do NOT split singular/plural pairs (2 parts that are
# forms of the same word)
if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts):
should_split = False
else:
should_split = True
if not should_split:
result.append(entry)
continue
# Split into individual entries
for k in range(len(en_parts)):
sub = dict(entry) # shallow copy
sub['english'] = en_parts[k].strip()
sub['german'] = de_parts[k].strip() if k < len(de_parts) else ''
sub['example'] = '' # examples get attached later
sub['split_from_comma'] = True
result.append(sub)
# Re-number
for i, e in enumerate(result):
e['row_index'] = i
return result
def _split_by_comma(text: str) -> List[str]:
"""Split text by commas, but not inside brackets [...] or parens (...)."""
if ',' not in text:
return [text]
parts = []
depth_bracket = 0
depth_paren = 0
current = []
for ch in text:
if ch == '[':
depth_bracket += 1
elif ch == ']':
depth_bracket = max(0, depth_bracket - 1)
elif ch == '(':
depth_paren += 1
elif ch == ')':
depth_paren = max(0, depth_paren - 1)
elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
parts.append(''.join(current).strip())
current = []
continue
current.append(ch)
if current:
parts.append(''.join(current).strip())
# Filter empty parts
return [p for p in parts if p]
# --- C. Example Sentence Attachment ---
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
"""Find the vocab entry whose English word(s) best match the example sentence.
Returns index into vocab_entries, or -1 if no match found.
Uses word stem overlap: "a broken arm" matches "broken" or "break".
"""
if not vocab_entries or not example_text:
return -1
example_lower = example_text.lower()
example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
best_idx = -1
best_score = 0
for i, entry in enumerate(vocab_entries):
en = (entry.get('english', '') or '').lower()
if not en:
continue
# Extract vocab words (split on space, comma, newline)
vocab_words = set(re.findall(r'[a-zäöüß]+', en))
# Score: how many vocab words appear in the example?
# Also check if example words share a common stem (first 4 chars)
direct_matches = vocab_words & example_words
score = len(direct_matches) * 10
# Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
if score == 0:
for vw in vocab_words:
if len(vw) < 3:
continue
stem = vw[:4] if len(vw) >= 4 else vw[:3]
for ew in example_words:
if len(ew) >= len(stem) and ew[:len(stem)] == stem:
score += 5
break
if score > best_score:
best_score = score
best_idx = i
return best_idx if best_score > 0 else -1
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Attach rows with EN text but no DE translation as examples to matching vocab entries.
Vocabulary worksheets often have:
Row 1: break, broke, broken / brechen, brach, gebrochen
Row 2: a broken arm (no DE → example for "broken")
Row 3: a broken plate (no DE → example for "broken")
Row 4: egg / Ei (has DE → new vocab entry)
Rules (deterministic, generic):
- A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars)
- Find the best matching vocab entry by checking which entry's English words
appear in the example sentence (semantic matching via word overlap)
- Fall back to the nearest preceding entry if no word match found
- Multiple examples get joined with " | "
"""
if not entries:
return entries
# Separate into vocab entries (have DE) and example candidates (no DE)
vocab_entries: List[Dict[str, Any]] = []
examples_for: Dict[int, List[str]] = {} # vocab_index → list of example texts
for entry in entries:
en = (entry.get('english', '') or '').strip()
de = (entry.get('german', '') or '').strip()
ex = (entry.get('example', '') or '').strip()
# Treat single-char DE as OCR noise, not real translation.
# "Ei" (2 chars) is a valid German word, so threshold is 1.
has_de = len(de) > 1
has_en = bool(en)
# Heuristic: a row without DE is an "example sentence" only if
# the EN text looks like a sentence (>= 4 words, or contains
# typical sentence punctuation). Short EN text (1-3 words) is
# more likely a vocab entry whose DE was missed by OCR.
_looks_like_sentence = (
len(en.split()) >= 4
or en.rstrip().endswith(('.', '!', '?'))
)
is_example_candidate = (
has_en and not has_de and _looks_like_sentence and vocab_entries
)
if is_example_candidate:
# This is an example sentence — find best matching vocab entry
example_text = en
match_idx = _find_best_vocab_match(en, vocab_entries)
if match_idx < 0:
# No word match → fall back to last entry
match_idx = len(vocab_entries) - 1
if match_idx not in examples_for:
examples_for[match_idx] = []
examples_for[match_idx].append(example_text)
else:
vocab_entries.append(entry)
# Attach examples to their matched vocab entries
for idx, example_list in examples_for.items():
if 0 <= idx < len(vocab_entries):
entry = vocab_entries[idx]
existing_ex = (entry.get('example', '') or '').strip()
new_examples = ' | '.join(example_list)
entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
# Re-number
for i, e in enumerate(vocab_entries):
e['row_index'] = i
return vocab_entries
@@ -0,0 +1,134 @@
"""
Word assembly helpers for OCR output.
Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys)
into visual lines, rejoins hyphenated words, and produces reading-order
text. All functions are pure standard-library; no NumPy or project
imports required.
"""
import logging
from typing import Dict, List
logger = logging.getLogger(__name__)
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
"""Group words by Y position into lines, sorted by X within each line."""
if not words:
return []
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
lines: List[List[Dict]] = []
current_line: List[Dict] = [sorted_words[0]]
current_y = sorted_words[0]['top']
for word in sorted_words[1:]:
if abs(word['top'] - current_y) <= y_tolerance_px:
current_line.append(word)
else:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
current_line = [word]
current_y = word['top']
if current_line:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
return lines
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
"""Group OCR words into visual lines in reading order.
Returns a list of line strings (one per visual line in the cell).
"""
if not words:
return []
lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
return [' '.join(w['text'] for w in line) for line in lines]
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
"""Rejoin words split by line-break hyphenation.
E.g. ['Fu\u00df-', 'boden'] \u2192 ['Fu\u00dfboden']
['some text-', 'thing here'] \u2192 ['something here']
"""
if len(lines) <= 1:
return lines
result = []
i = 0
while i < len(lines):
line = lines[i]
# If line ends with '-' and there's a next line, rejoin
if i + 1 < len(lines) and line.rstrip().endswith('-'):
stripped = line.rstrip()
# Get the word fragment before hyphen (last word)
prefix = stripped[:-1] # remove trailing hyphen
next_line = lines[i + 1]
# Join: last word of this line + first word of next line
prefix_words = prefix.rsplit(' ', 1)
next_words = next_line.split(' ', 1)
if len(prefix_words) > 1:
joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
else:
joined = prefix_words[0] + next_words[0]
remainder = next_words[1] if len(next_words) > 1 else ''
if remainder:
result.append(joined + ' ' + remainder)
else:
result.append(joined)
i += 2
else:
result.append(line)
i += 1
return result
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Render OCR words as newline-separated text in reading order.

    Pipeline: group the words into visual lines (Y tolerance, X-sorted),
    rejoin line-break hyphenation, then join the lines with '\\n'.
    """
    visual_lines = _words_to_reading_order_lines(words, y_tolerance_px)
    return '\n'.join(_rejoin_hyphenated(visual_lines))
def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words preserving proportional horizontal spacing.

    Instead of a single space between words, the pixel gap between two
    neighbours is divided by the line's average character width and that
    many spaces (at least one) are inserted.  Useful for box sub-sessions
    where spatial layout matters.

    Args:
        words: Word dicts with 'left', 'top', 'width' and 'text'.
        y_tolerance_px: Y tolerance used to group words into visual lines.

    Returns:
        The visual lines joined with '\\n'.
    """
    fallback_char_width = 10  # px, used when no usable width is available

    result_lines = []
    for line_words in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
        if not line_words:
            continue

        sorted_words = sorted(line_words, key=lambda w: w['left'])

        # Average character width over all words of the line that have text.
        total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
        total_width = sum(w['width'] for w in sorted_words if w.get('text'))
        avg_char_width = (
            total_width / total_chars if total_chars > 0 else fallback_char_width
        )
        if avg_char_width <= 0:
            # Degenerate boxes (all widths 0) would otherwise divide by zero.
            avg_char_width = fallback_char_width

        parts = [sorted_words[0].get('text', '')]
        for word, next_word in zip(sorted_words, sorted_words[1:]):
            gap_px = next_word['left'] - (word['left'] + word['width'])
            # Overlapping boxes give a negative gap; still emit one space.
            parts.append(' ' * max(1, round(gap_px / avg_char_width)))
            parts.append(next_word.get('text', ''))

        result_lines.append(''.join(parts))

    return '\n'.join(result_lines)
+275
View File
@@ -0,0 +1,275 @@
"""
Gutter Repair Core — spellchecker setup, data types, and single-word repair logic.
Extracted from cv_gutter_repair.py for modularity.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import itertools
import logging
import re
import uuid
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Spellchecker setup (lazy, cached)
# ---------------------------------------------------------------------------
_spell_de = None
_spell_en = None
_SPELL_AVAILABLE = False
def _init_spellcheckers():
    """Lazy-load DE + EN spellcheckers (cached across calls).

    On success the module globals ``_spell_de`` / ``_spell_en`` are set and
    ``_SPELL_AVAILABLE`` becomes True.  If pyspellchecker cannot be imported,
    the failure is remembered on the function object so later calls return
    immediately instead of retrying the import and logging the warning
    again (this function is called once per dictionary lookup).
    """
    global _spell_de, _spell_en, _SPELL_AVAILABLE
    if _spell_de is not None or getattr(_init_spellcheckers, "_import_failed", False):
        return
    try:
        from spellchecker import SpellChecker
    except ImportError:
        _init_spellcheckers._import_failed = True
        logger.warning("pyspellchecker not installed — gutter repair unavailable")
        return

    # Build both checkers before publishing either, so a failure while
    # constructing the second one cannot leave the globals half-initialised.
    spell_de = SpellChecker(language='de', distance=1)
    spell_en = SpellChecker(language='en', distance=1)
    _spell_de, _spell_en = spell_de, spell_en
    _SPELL_AVAILABLE = True
    logger.info("Gutter repair: spellcheckers loaded (DE + EN)")
def _is_known(word: str) -> bool:
    """Return True if the lower-cased word is in the DE or the EN dictionary.

    Always False when the spellcheckers could not be loaded.
    """
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
        return False

    lowered = word.lower()
    if _spell_de.known([lowered]):
        return True
    return bool(_spell_en.known([lowered]))
def _spell_candidates(word: str, lang: str = "both") -> List[str]:
    """Collect spellchecker candidates for a word, without duplicates.

    ``lang`` selects the dictionaries: "both" queries DE then EN, "de" only
    DE, anything else only EN.  The lower-cased input itself and empty
    candidates are excluded; first-seen order is kept.  Returns an empty
    list when the spellcheckers are unavailable.
    """
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
        return []

    if lang == "both":
        checkers = [_spell_de, _spell_en]
    elif lang == "de":
        checkers = [_spell_de]
    else:
        checkers = [_spell_en]

    lowered = word.lower()
    emitted: set = set()
    ordered: List[str] = []
    for checker in checkers:
        if checker is None:
            continue
        # candidates() may yield nothing usable; treat that as "no candidates".
        for suggestion in checker.candidates(lowered) or ():
            if not suggestion or suggestion == lowered or suggestion in emitted:
                continue
            emitted.add(suggestion)
            ordered.append(suggestion)
    return ordered
# ---------------------------------------------------------------------------
# Gutter position detection
# ---------------------------------------------------------------------------
# Minimum word length for spell-fix (very short words are often legitimate)
_MIN_WORD_LEN_SPELL = 3
# Minimum word length for hyphen-join candidates (fragments at the gutter
# can be as short as 1-2 chars, e.g. "ve" from "ver-künden")
_MIN_WORD_LEN_HYPHEN = 2
# How close to the right column edge a word must be to count as "gutter-adjacent".
# Expressed as a fraction of column width (0.70 = the word's right edge lies in the rightmost 30%).
_GUTTER_EDGE_THRESHOLD = 0.70
# Small common words / abbreviations that should NOT be repaired
_STOPWORDS = frozenset([
# German
"ab", "an", "am", "da", "er", "es", "im", "in", "ja", "ob", "so", "um",
"zu", "wo", "du", "eh", "ei", "je", "na", "nu", "oh",
# English
"a", "am", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in",
"is", "it", "me", "my", "no", "of", "on", "or", "so", "to", "up", "us",
"we",
])
# IPA / phonetic patterns — skip these cells
_IPA_RE = re.compile(r'[\[\]/ˈˌːʃʒθðŋɑɒæɔəɛɪʊʌ]')
def _is_ipa_text(text: str) -> bool:
    """Return True when the text contains any character matched by ``_IPA_RE``."""
    return _IPA_RE.search(text) is not None
def _word_is_at_gutter_edge(word_bbox: Dict, col_x: float, col_width: float) -> bool:
"""Check if a word's right edge is near the right boundary of its column."""
if col_width <= 0:
return False
word_right = word_bbox.get("left", 0) + word_bbox.get("width", 0)
col_right = col_x + col_width
# Word's right edge within the rightmost portion of the column
relative_pos = (word_right - col_x) / col_width
return relative_pos >= _GUTTER_EDGE_THRESHOLD
# ---------------------------------------------------------------------------
# Suggestion types
# ---------------------------------------------------------------------------
@dataclass
class GutterSuggestion:
    """One proposed correction for a word at the gutter edge of a cell."""

    # --- identity and location -------------------------------------------
    # Short random id: the first 8 hex digits of a UUID4.
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:8])
    type: str = ""  # "hyphen_join" | "spell_fix"
    zone_index: int = 0
    row_index: int = 0
    col_index: int = 0
    col_type: str = ""
    cell_id: str = ""

    # --- the correction itself -------------------------------------------
    original_text: str = ""
    suggested_text: str = ""

    # --- hyphen_join only -------------------------------------------------
    next_row_index: int = -1
    next_row_cell_id: str = ""
    next_row_text: str = ""
    missing_chars: str = ""
    display_parts: List[str] = field(default_factory=list)

    # --- other plausible corrections the user can pick from ---------------
    alternatives: List[str] = field(default_factory=list)

    # --- meta --------------------------------------------------------------
    confidence: float = 0.0
    reason: str = ""  # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)
# ---------------------------------------------------------------------------
# Core repair logic
# ---------------------------------------------------------------------------
_TRAILING_PUNCT_RE = re.compile(r'[.,;:!?\)\]]+$')
def _try_hyphen_join(
    word_text: str,
    next_word_text: str,
    max_missing: int = 3,
) -> Optional[Tuple[str, str, float]]:
    """Try joining two fragments with 0..max_missing interpolated chars.

    Trailing hyphens/whitespace are removed from the first fragment and
    trailing punctuation from the continuation (e.g. "künden," -> "künden")
    so the dictionary lookup can succeed.

    The direct join is tested first (confidence 0.95).  After that, every
    combination of 1..max_missing letters drawn from the 15 most common
    letters is inserted between the fragments; the first known word wins,
    with confidence 0.90 for one missing letter and 0.10 less per extra one.

    Returns (joined_word, missing_chars, confidence) or None.
    """
    stem = word_text.rstrip("-").rstrip()
    tail = _TRAILING_PUNCT_RE.sub('', next_word_text.lstrip())
    if not (stem and tail):
        return None

    whole = stem + tail
    if _is_known(whole):
        return (whole, "", 0.95)

    # Letters ordered by rough frequency in German/English; only the 15 most
    # common are tried, which bounds the brute-force search.
    alphabet = "enristaldhgcmobwfkzpvjyxqu"[:15]
    for gap in range(1, max_missing + 1):
        confidence = 0.90 - (gap - 1) * 0.10
        for combo in itertools.product(alphabet, repeat=gap):
            filler = "".join(combo)
            attempt = stem + filler + tail
            if _is_known(attempt):
                return (attempt, filler, confidence)

    return None
def _try_spell_fix(
    word_text: str, col_type: str = "",
) -> Optional[Tuple[str, float, List[str]]]:
    """Suggest a spellchecker correction for one garbled gutter word.

    Returns (best_correction, confidence, alternatives) or None.  None is
    returned for words shorter than ``_MIN_WORD_LEN_SPELL``, for words that
    are valid once surrounding parentheses are removed (e.g. "probieren)"),
    and when no candidate exists.  ``alternatives`` holds up to five other
    candidates the user may pick instead (e.g. "stammelt" vs "stammeln").
    """
    if len(word_text) < _MIN_WORD_LEN_SPELL:
        return None

    # A valid word wrapped in parentheses is punctuation, not an OCR error.
    bare = word_text.strip("()")
    if bare and _is_known(bare):
        return None

    # The column type decides which dictionary is consulted first.
    if "en" in col_type:
        lang = "en"
    elif "de" in col_type:
        lang = "de"
    else:
        lang = "both"

    candidates = _spell_candidates(word_text, lang=lang)
    if not candidates and lang != "both":
        # Nothing in the preferred language: retry against both dictionaries.
        candidates = _spell_candidates(word_text, lang="both")
    if not candidates:
        return None

    capitalised = word_text[0].isupper()

    def _recase(candidate: str) -> str:
        """Upper-case the first letter when the original word had one."""
        if capitalised and candidate:
            return candidate[0].upper() + candidate[1:]
        return candidate

    # Closest candidates first; the sort is stable, so ties keep their order.
    target = word_text.lower()
    ranked = sorted(
        ((_edit_distance(target, c.lower()), c) for c in candidates),
        key=lambda pair: pair[0],
    )

    best_dist, raw_best = ranked[0]
    best = _recase(raw_best)
    conf = max(0.5, 1.0 - best_dist * 0.15)

    alts = [_recase(c) for _, c in ranked[1:] if c.lower() != best.lower()][:5]
    return (best, conf, alts)
def _edit_distance(a: str, b: str) -> int:
"""Simple Levenshtein distance."""
if len(a) < len(b):
return _edit_distance(b, a)
if len(b) == 0:
return len(a)
prev = list(range(len(b) + 1))
for i, ca in enumerate(a):
curr = [i + 1]
for j, cb in enumerate(b):
cost = 0 if ca == cb else 1
curr.append(min(curr[j] + 1, prev[j + 1] + 1, prev[j] + cost))
prev = curr
return prev[len(b)]
+356
View File
@@ -0,0 +1,356 @@
"""
Gutter Repair Grid — grid analysis and suggestion application.
Extracted from cv_gutter_repair.py for modularity.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import time
from typing import Any, Dict, List, Tuple
from cv_gutter_repair_core import (
_init_spellcheckers,
_is_ipa_text,
_is_known,
_MIN_WORD_LEN_HYPHEN,
_SPELL_AVAILABLE,
_STOPWORDS,
_TRAILING_PUNCT_RE,
_try_hyphen_join,
_try_spell_fix,
_word_is_at_gutter_edge,
GutterSuggestion,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Grid analysis
# ---------------------------------------------------------------------------
def analyse_grid_for_gutter_repair(
    grid_data: Dict[str, Any],
    image_width: int = 0,
) -> Dict[str, Any]:
    """Analyse a structured grid and return gutter repair suggestions.

    For every non-empty, non-IPA cell the LAST word is inspected.  It becomes
    a gutter candidate when it is not a stopword, is long enough, sits at the
    right edge of its column and is not a known dictionary word.  Candidates
    are first tried as a hyphen join with the cell directly below, then as a
    single-word spell fix.

    Args:
        grid_data: The grid_editor_result from the session (zones→cells structure).
        image_width: Image width in pixels.  Currently unused; kept for
            interface compatibility.

    Returns:
        Dict with "suggestions" (list of dicts), "stats" and
        "duration_seconds".
    """
    # `from cv_gutter_repair_core import _SPELL_AVAILABLE` only copies the
    # value the flag had at import time.  _init_spellcheckers() rebinds the
    # flag inside the core module, so the live value must be read from there.
    import cv_gutter_repair_core as _core

    t0 = time.time()

    _init_spellcheckers()
    if not _core._SPELL_AVAILABLE:
        return {
            "suggestions": [],
            "stats": {"error": "pyspellchecker not installed"},
            "duration_seconds": 0,
        }

    zones = grid_data.get("zones", [])
    suggestions: List[GutterSuggestion] = []
    words_checked = 0
    gutter_candidates = 0

    for zi, zone in enumerate(zones):
        columns = zone.get("columns", [])
        cells = zone.get("cells", [])
        if not columns or not cells:
            continue

        # Build column lookup: col_index → {x, width, type}
        col_info: Dict[int, Dict] = {}
        for col in columns:
            ci = col.get("index", col.get("col_index", -1))
            col_x = col.get("x_min_px", col.get("x", 0))
            if "x_max_px" in col:
                col_w = col["x_max_px"] - col_x
            else:
                # Only x/width given: "width" already is the width and must
                # not have the x offset subtracted from it.
                col_w = col.get("width", 0)
            col_info[ci] = {
                "x": col_x,
                "width": col_w,
                "type": col.get("type", col.get("col_type", "")),
            }

        # Build (row, col) → cell lookup
        cell_map: Dict[Tuple[int, int], Dict] = {}
        for cell in cells:
            cell_map[(cell.get("row_index", 0), cell.get("col_index", 0))] = cell

        # All columns are checked: a word is a candidate if it is at the
        # right edge of its column AND not a known word.
        for (ri, ci), cell in cell_map.items():
            text = (cell.get("text") or "").strip()
            if not text:
                continue
            if _is_ipa_text(text):
                continue

            words_checked += 1
            col = col_info.get(ci, {})
            col_type = col.get("type", "")

            # Get word boxes to check position
            word_boxes = cell.get("word_boxes", [])

            # Check the LAST word in the cell (rightmost, closest to gutter)
            cell_words = text.split()
            if not cell_words:
                continue
            last_word = cell_words[-1]

            # Skip stopwords
            if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
                continue

            last_word_clean = last_word.rstrip(".,;:!?)(")
            if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
                continue

            # Check if the last word is at the gutter edge
            if word_boxes:
                is_at_edge = _word_is_at_gutter_edge(
                    word_boxes[-1], col.get("x", 0), col.get("width", 1)
                )
            else:
                # No word boxes — use cell bbox
                bbox = cell.get("bbox_px", {})
                is_at_edge = _word_is_at_gutter_edge(
                    {"left": bbox.get("x", 0), "width": bbox.get("w", 0)},
                    col.get("x", 0), col.get("width", 1)
                )
            if not is_at_edge:
                continue

            # Word is at gutter edge — check if it's a known word
            if _is_known(last_word_clean):
                continue

            # Check if the word ends with "-" (explicit hyphen break)
            ends_with_hyphen = last_word.endswith("-")

            # A visible hyphen + known stem is a VALID line-break
            # hyphenation, not a gutter error: gutter problems cause the
            # hyphen to be LOST ("ve" instead of "ver-").
            # Example: "wunder-" → "wunder" is known → skip.
            if ends_with_hyphen:
                stem = last_word_clean.rstrip("-")
                if stem and _is_known(stem):
                    continue

            gutter_candidates += 1

            # --- Strategy 1: Hyphen join with next row ---
            next_cell = cell_map.get((ri + 1, ci))
            if next_cell:
                next_text = (next_cell.get("text") or "").strip()
                next_words = next_text.split()
                if next_words:
                    first_next = next_words[0]
                    first_next_clean = _TRAILING_PUNCT_RE.sub('', first_next)
                    first_alpha = next((c for c in first_next if c.isalpha()), "")

                    # Also skip if the joined word is known (covers compound
                    # words where the stem alone might not be in the dictionary)
                    if ends_with_hyphen and first_next_clean:
                        direct = last_word_clean.rstrip("-") + first_next_clean
                        if _is_known(direct):
                            continue

                    # Continuation likely if:
                    # - explicit hyphen, OR
                    # - next row starts lowercase (= not a new entry)
                    if ends_with_hyphen or (first_alpha and first_alpha.islower()):
                        result = _try_hyphen_join(last_word_clean, first_next)
                        if result:
                            joined, missing, conf = result
                            # Display part for the current row: the restored
                            # fragment followed by exactly one hyphen.
                            if ends_with_hyphen:
                                display_base = last_word_clean.rstrip("-")
                            else:
                                display_base = last_word_clean
                            display_p1 = display_base + missing + "-"

                            suggestions.append(GutterSuggestion(
                                type="hyphen_join",
                                zone_index=zi,
                                row_index=ri,
                                col_index=ci,
                                col_type=col_type,
                                cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
                                original_text=last_word,
                                suggested_text=joined,
                                next_row_index=ri + 1,
                                next_row_cell_id=next_cell.get("cell_id", f"R{ri+1:02d}_C{ci}"),
                                next_row_text=next_text,
                                missing_chars=missing,
                                display_parts=[display_p1, first_next],
                                confidence=conf,
                                reason="gutter_truncation" if missing else "hyphen_continuation",
                            ))
                            continue  # skip spell_fix if hyphen_join found

            # --- Strategy 2: Single-word spell fix ---
            # (_try_spell_fix itself rejects words that are too short.)
            fix_result = _try_spell_fix(last_word_clean, col_type)
            if fix_result:
                corrected, conf, alts = fix_result
                suggestions.append(GutterSuggestion(
                    type="spell_fix",
                    zone_index=zi,
                    row_index=ri,
                    col_index=ci,
                    col_type=col_type,
                    cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
                    original_text=last_word,
                    suggested_text=corrected,
                    alternatives=alts,
                    confidence=conf,
                    reason="gutter_blur",
                ))

    duration = round(time.time() - t0, 3)

    logger.info(
        "Gutter repair: checked %d words, %d gutter candidates, %d suggestions (%.2fs)",
        words_checked, gutter_candidates, len(suggestions), duration,
    )

    return {
        "suggestions": [s.to_dict() for s in suggestions],
        "stats": {
            "words_checked": words_checked,
            "gutter_candidates": gutter_candidates,
            "suggestions_found": len(suggestions),
        },
        "duration_seconds": duration,
    }
def apply_gutter_suggestions(
    grid_data: Dict[str, Any],
    accepted_ids: List[str],
    suggestions: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Apply accepted gutter repair suggestions to the grid data.

    Modifies cells in-place and returns a summary of changes.  In the target
    cell the LAST occurrence of the suggestion's ``original_text`` is
    replaced:
      * "spell_fix":   by ``suggested_text``;
      * "hyphen_join": by ``display_parts[0]`` (e.g. "ve" → "ver-").  The
        next row is NOT modified, because the continuation stays where the
        original layout has it.

    A suggestion is skipped when its zone or cell cannot be found, required
    fields are empty, its type is unknown, or the original word is no longer
    present in the cell text.

    Args:
        grid_data: The grid_editor_result (zones→cells).
        accepted_ids: List of suggestion IDs the user accepted.
        suggestions: The full suggestions list (from analyse_grid_for_gutter_repair).

    Returns:
        Dict with "applied_count" (number of suggestions actually applied,
        equal to ``len(changes)``) and the "changes" list.
    """
    accepted_set = set(accepted_ids)
    accepted_suggestions = [s for s in suggestions if s.get("id") in accepted_set]

    zones = grid_data.get("zones", [])
    changes: List[Dict[str, Any]] = []

    for s in accepted_suggestions:
        zi = s.get("zone_index", 0)
        ri = s.get("row_index", 0)
        ci = s.get("col_index", 0)
        stype = s.get("type", "")

        # Reject out-of-range indices, including negative ones that would
        # silently address a zone from the end of the list.
        if not 0 <= zi < len(zones):
            continue

        target_cell = next(
            (
                cell for cell in zones[zi].get("cells", [])
                if cell.get("row_index") == ri and cell.get("col_index") == ci
            ),
            None,
        )
        if not target_cell:
            continue

        original_word = s.get("original_text", "")
        extra: Dict[str, Any] = {}

        if stype == "spell_fix":
            replacement = s.get("suggested_text", "")
            if not original_word or not replacement:
                continue
        elif stype == "hyphen_join":
            joined = s.get("suggested_text", "")
            display_parts = s.get("display_parts", [])
            if not original_word or not joined or not display_parts:
                continue
            # The first display part is what goes in the current row.
            replacement = display_parts[0]
            extra = {"joined_word": joined}
        else:
            continue

        # Cell text may be missing or None; treat both as empty.
        old_text = target_cell.get("text") or ""

        # Replace from the right (last occurrence).
        idx = old_text.rfind(original_word)
        if idx < 0:
            continue

        new_text = old_text[:idx] + replacement + old_text[idx + len(original_word):]
        target_cell["text"] = new_text
        changes.append({
            "type": stype,
            "zone_index": zi,
            "row_index": ri,
            "col_index": ci,
            "cell_id": target_cell.get("cell_id", ""),
            "old_text": old_text,
            "new_text": new_text,
            **extra,
        })

    logger.info("Gutter repair applied: %d/%d suggestions", len(changes), len(accepted_suggestions))

    return {
        "applied_count": len(changes),
        "changes": changes,
    }
@@ -0,0 +1,35 @@
"""
Gutter Repair — barrel re-export.
All implementation split into:
cv_gutter_repair_core — spellchecker setup, data types, single-word repair
cv_gutter_repair_grid — grid analysis, suggestion application
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
# Core: spellchecker, data types, repair helpers
from cv_gutter_repair_core import ( # noqa: F401
_init_spellcheckers,
_is_known,
_spell_candidates,
_MIN_WORD_LEN_SPELL,
_MIN_WORD_LEN_HYPHEN,
_GUTTER_EDGE_THRESHOLD,
_STOPWORDS,
_IPA_RE,
_is_ipa_text,
_word_is_at_gutter_edge,
GutterSuggestion,
_TRAILING_PUNCT_RE,
_try_hyphen_join,
_try_spell_fix,
_edit_distance,
)
# Grid: analysis and application
from cv_gutter_repair_grid import ( # noqa: F401
analyse_grid_for_gutter_repair,
apply_gutter_suggestions,
)
@@ -0,0 +1,92 @@
"""
OCR Image Enhancement — Improve scan quality before OCR.
Applies CLAHE contrast enhancement + bilateral filter denoising
to degraded scans. Only runs when scan_quality.is_degraded is True.
Pattern adapted from handwriting_htr_api.py (lines 50-68) and
cv_layout.py (lines 229-241).
All operations use OpenCV (Apache-2.0).
"""
import logging
import cv2
import numpy as np
logger = logging.getLogger(__name__)
def enhance_for_ocr(
    img_bgr: np.ndarray,
    is_degraded: bool = False,
    clip_limit: float = 3.0,
    tile_size: int = 8,
    denoise_d: int = 9,
    denoise_sigma_color: float = 75,
    denoise_sigma_space: float = 75,
    sharpen: bool = True,
) -> np.ndarray:
    """
    Enhance image quality for OCR processing.

    Good scans (``is_degraded`` False) only get a light CLAHE pass with a
    fixed clip limit of 2.0 on an 8x8 grid.  Degraded scans run the full
    pipeline: CLAHE with the given parameters, bilateral denoising, and an
    optional unsharp mask.

    Args:
        img_bgr: Input BGR image
        is_degraded: Whether the scan is degraded (from ScanQualityReport)
        clip_limit: CLAHE clip limit (higher = more contrast)
        tile_size: CLAHE tile grid size
        denoise_d: Bilateral filter diameter
        denoise_sigma_color: Bilateral filter sigma for color
        denoise_sigma_space: Bilateral filter sigma for space
        sharpen: Apply unsharp mask for blurry scans

    Returns:
        Enhanced BGR image
    """

    def _equalise_lightness(image, clip, grid):
        """Run CLAHE on the L channel in LAB space and convert back to BGR."""
        lightness, chroma_a, chroma_b = cv2.split(
            cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
        )
        equaliser = cv2.createCLAHE(clipLimit=clip, tileGridSize=grid)
        merged = cv2.merge([equaliser.apply(lightness), chroma_a, chroma_b])
        return cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)

    if not is_degraded:
        # Good scan: light CLAHE only, no denoising or sharpening.
        result = _equalise_lightness(img_bgr, 2.0, (8, 8))
        logger.info("enhance_for_ocr: light CLAHE applied (good scan)")
        return result

    # Degraded scan: full enhancement pipeline
    logger.info(
        f"enhance_for_ocr: full enhancement "
        f"(CLAHE clip={clip_limit}, denoise d={denoise_d}, sharpen={sharpen})"
    )

    # Step 1: contrast. Only the L channel is equalised; colour is kept.
    enhanced = _equalise_lightness(img_bgr, clip_limit, (tile_size, tile_size))

    # Step 2: bilateral filter, which denoises while preserving edges.
    enhanced = cv2.bilateralFilter(
        enhanced,
        d=denoise_d,
        sigmaColor=denoise_sigma_color,
        sigmaSpace=denoise_sigma_space,
    )

    # Step 3: unsharp mask (1.5 * image - 0.5 * blurred) for blurry text.
    if sharpen:
        blurred = cv2.GaussianBlur(enhanced, (0, 0), 3)
        enhanced = cv2.addWeighted(enhanced, 1.5, blurred, -0.5, 0)

    logger.info("enhance_for_ocr: full enhancement pipeline complete")
    return enhanced
+135
View File
@@ -0,0 +1,135 @@
"""German IPA insertion for grid editor cells.
Hybrid approach:
1. Primary lookup: wiki-pronunciation-dict (636k entries, CC-BY-SA)
2. Fallback: epitran rule-based G2P (MIT license)
German IPA data sourced from Wiktionary contributors (CC-BY-SA 4.0).
Attribution required — see grid editor UI.
Lizenz: Code Apache-2.0, IPA-Daten CC-BY-SA 4.0 (Wiktionary)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Dict, List, Optional, Set
logger = logging.getLogger(__name__)
# IPA/phonetic characters — skip cells that already contain IPA
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
def _lookup_ipa_de(word: str) -> Optional[str]:
    """Look up German IPA for a single word.

    Tries the pronunciation dictionary first (keyed by the lower-cased,
    stripped word) and falls back to epitran's rule-based transliteration.
    An epitran result equal to the lower-cased input is treated as
    "no transcription".

    Returns the IPA string, or None if nothing was found, no backend is
    available, or the epitran call failed (the failure is logged at debug
    level).
    """
    from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        return None

    lower = word.lower().strip()
    if not lower:
        return None

    # 1. Dictionary lookup.
    # NOTE(review): the dictionary's state when DE_IPA_AVAILABLE is False is
    # not visible here, so an empty/None dictionary is tolerated on the
    # epitran-only path.
    ipa = _de_ipa_dict.get(lower) if _de_ipa_dict else None
    if ipa:
        return ipa

    # 2. epitran fallback (rule-based)
    if _epitran_de is not None:
        try:
            result = _epitran_de.transliterate(word)
        except Exception:
            # Best effort: one failing word must not abort IPA insertion,
            # but the failure should not vanish without a trace either.
            logger.debug("epitran transliteration failed for %r", word, exc_info=True)
            return None
        if result and result != word.lower():
            return result

    return None
def _insert_ipa_for_text(text: str) -> str:
    """Insert German IPA after each recognized word in a text string.

    Handles comma-separated lists:
      "bildschön, blendend" → "bildschön [bɪltʃøn], blendend [blɛndənt]"

    Text that already contains IPA characters or brackets is returned
    unchanged, as is text in which no word could be transcribed.  Words
    with fewer than three letters are not looked up.  Whitespace between
    words (including line breaks in multi-line cells) is preserved.
    """
    if not text or _IPA_RE.search(text):
        return text

    changed = False

    def _annotate(match: "re.Match") -> str:
        """Return the matched word, followed by " [ipa]" when a lookup succeeds."""
        nonlocal changed
        word = match.group(0)
        # Strip punctuation/digits for the lookup; the original word is kept.
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', word)
        if len(clean) < 3:
            return word
        ipa = _lookup_ipa_de(clean)
        if not ipa:
            return word
        changed = True
        return f"{word} [{ipa}]"

    # Split on comma/semicolon/colon runs and keep the separators, so the IPA
    # lands before the separator ("wort [ipa], ...").
    pieces = []
    for tok in re.split(r'([,;:]+\s*)', text):
        if not tok or re.match(r'^[,;:\s]+$', tok):
            pieces.append(tok)  # separator or empty piece: keep as-is
        else:
            # Substituting per non-whitespace run leaves spacing and newlines
            # of the token untouched.
            pieces.append(re.sub(r'\S+', _annotate, tok))

    return ''.join(pieces) if changed else text
def insert_german_ipa(
    cells: List[Dict],
    target_cols: Set[str],
) -> int:
    """Insert German IPA transcriptions into cells of target columns.

    Cells whose text changes get the new text and the marker
    ``cell["_ipa_corrected"] = True``.  Cells with missing, None or blank
    text are skipped.

    Args:
        cells: Flat list of all cells (modified in-place).
        target_cols: Set of col_type values to process.

    Returns:
        Number of cells modified (0 when no German IPA backend is available).
    """
    from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        logger.warning("German IPA not available — skipping")
        return 0

    count = 0
    for cell in cells:
        if cell.get("col_type", "") not in target_cols:
            continue

        # "text" may be present but None; treat that like an empty cell.
        text = cell.get("text") or ""
        if not text.strip():
            continue

        new_text = _insert_ipa_for_text(text)
        if new_text != text:
            cell["text"] = new_text
            cell["_ipa_corrected"] = True
            count += 1

    if count:
        logger.info("German IPA inserted in %d cells", count)
    return count
@@ -0,0 +1,257 @@
"""
Legacy layout analysis using projection profiles.
Extracted from cv_layout_columns.py — contains:
- analyze_layout() (projection-profile based column/header/footer detection)
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import List
import numpy as np
from cv_vocab_types import PageRegion
from cv_layout_detection import _find_content_bounds
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
    """Detect columns, header, and footer using projection profiles.

    Uses content-bounds detection to exclude page margins before searching
    for column separators within the actual text area.  Column separators are
    found as low-density "valleys" in the smoothed vertical projection of the
    inverted image; if fewer than two valleys survive filtering, a coarser
    20-segment local-minima search is tried.

    Depending on the number of valleys, the result contains three columns
    (column_en / column_de / column_example), two columns (column_en /
    column_de) or a single full-width column_en.  Header/footer regions are
    then appended by ``_add_header_footer``.

    Note: the outermost column regions start at x=0 and end at the image
    width ``w`` (not at the content bounds), so they include the page margins.

    Args:
        layout_img: CLAHE-enhanced grayscale image.  Not read by this
            function body; only ``ocr_img`` is used.
        ocr_img: Binarized image for text density analysis.

    Returns:
        List of PageRegion objects describing detected regions.

    Raises:
        AttributeError: if the module-level ``cv2`` import failed (``cv2`` is
            then ``None`` and ``cv2.bitwise_not`` cannot be called).
    """
    h, w = ocr_img.shape[:2]

    # Invert: black text on white → white text on black for projection
    inv = cv2.bitwise_not(ocr_img)

    # --- Find actual content bounds (exclude page margins) ---
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")

    if content_w < w * 0.3 or content_h < h * 0.3:
        # Fallback if detection seems wrong: content smaller than 30% of the
        # image in either dimension -> use the whole image instead
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    # --- Vertical projection within content area to find column separators ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    v_proj = np.sum(content_strip, axis=0).astype(float)
    # Normalise each column sum to [0, 1] (fraction of "ink" pixels per x position)
    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj

    # Smooth the projection profile (moving average with an odd window size)
    kernel_size = max(5, content_w // 50)
    if kernel_size % 2 == 0:
        kernel_size += 1
    v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Debug: log projection profile statistics
    p_mean = float(np.mean(v_proj_smooth))
    p_median = float(np.median(v_proj_smooth))
    p_min = float(np.min(v_proj_smooth))
    p_max = float(np.max(v_proj_smooth))
    logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
                f"mean={p_mean:.4f}, median={p_median:.4f}")

    # Find valleys using multiple threshold strategies
    # Strategy 1: relative to median (catches clear separators)
    # Strategy 2: local minima approach (catches subtle gaps)
    threshold = max(p_median * 0.3, p_mean * 0.2)
    logger.info(f"Layout: valley threshold={threshold:.4f}")

    in_valley = v_proj_smooth < threshold

    # Find contiguous valley regions.
    # Each entry is (start, end, center, width, depth) in content-relative x.
    # A valley that is still open at the right end of the profile is never
    # closed by this loop and is therefore not recorded.
    all_valleys = []
    start = None
    for x in range(len(v_proj_smooth)):
        if in_valley[x] and start is None:
            start = x
        elif not in_valley[x] and start is not None:
            valley_width = x - start
            valley_depth = float(np.min(v_proj_smooth[start:x]))
            # Valley must be at least 3px wide
            if valley_width >= 3:
                all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
            start = None

    logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)}"
                f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")

    # Filter: valleys must be inside the content area (not at edges);
    # the valley center has to be more than 8% of the content width away
    # from either side
    inner_margin = int(content_w * 0.08)
    valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]

    # If fewer than two valleys were found with the strict threshold, try the
    # local minima approach
    if len(valleys) < 2:
        logger.info("Layout: trying local minima approach for column detection")
        # Divide content into 20 segments, find the 2 lowest
        seg_count = 20
        seg_width = content_w // seg_count
        seg_scores = []
        for i in range(seg_count):
            sx = i * seg_width
            ex = min((i + 1) * seg_width, content_w)
            seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
            seg_scores.append((i, sx, ex, seg_mean))

        seg_scores.sort(key=lambda s: s[3])
        logger.info(f"Layout: segment scores (lowest 5): "
                    f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")

        # Find two lowest non-adjacent segments that create reasonable columns
        candidate_valleys = []
        for seg_idx, sx, ex, seg_mean in seg_scores:
            # Must not be at the edges (first two / last two segments)
            if seg_idx <= 1 or seg_idx >= seg_count - 2:
                continue
            # Must be significantly lower than overall mean
            if seg_mean < p_mean * 0.6:
                center = (sx + ex) // 2
                candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))

        if len(candidate_valleys) >= 2:
            # Pick the best pair: non-adjacent, creating reasonable column widths
            candidate_valleys.sort(key=lambda v: v[2])
            best_pair = None
            best_score = float('inf')
            for i in range(len(candidate_valleys)):
                for j in range(i + 1, len(candidate_valleys)):
                    c1 = candidate_valleys[i][2]
                    c2 = candidate_valleys[j][2]
                    # Must be at least 20% apart
                    if (c2 - c1) < content_w * 0.2:
                        continue
                    col1 = c1
                    col2 = c2 - c1
                    col3 = content_w - c2
                    # Each column at least 12% of content width (looser than
                    # the 15% used for threshold-based valleys below)
                    if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12:
                        continue
                    # Score: spread between widest and narrowest column (lower is better)
                    parts = sorted([col1, col2, col3])
                    score = parts[2] - parts[0]
                    if score < best_score:
                        best_score = score
                        best_pair = (candidate_valleys[i], candidate_valleys[j])

            if best_pair:
                # Replaces any single valley found by the threshold strategy
                valleys = list(best_pair)
                logger.info(f"Layout: local minima found 2 valleys: "
                            f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    logger.info(f"Layout: final {len(valleys)} valleys: "
                f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    regions = []

    if len(valleys) >= 2:
        # 3-column layout detected
        valleys.sort(key=lambda v: v[2])

        if len(valleys) == 2:
            sep1_center = valleys[0][2]
            sep2_center = valleys[1][2]
        else:
            # Pick the two valleys that best divide into 3 parts
            # Prefer wider valleys (more likely true separators)
            best_pair = None
            best_score = float('inf')
            for i in range(len(valleys)):
                for j in range(i + 1, len(valleys)):
                    c1, c2 = valleys[i][2], valleys[j][2]
                    # Each column should be at least 15% of content width
                    col1 = c1
                    col2 = c2 - c1
                    col3 = content_w - c2
                    if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15:
                        continue
                    # Score: lower is better (more even distribution)
                    parts = sorted([col1, col2, col3])
                    score = parts[2] - parts[0]
                    # Bonus for wider valleys (subtract valley width)
                    score -= (valleys[i][3] + valleys[j][3]) * 0.5
                    if score < best_score:
                        best_score = score
                        best_pair = (c1, c2)
            if best_pair:
                sep1_center, sep2_center = best_pair
            else:
                # No pair satisfied the width constraint: use the two leftmost valleys
                sep1_center = valleys[0][2]
                sep2_center = valleys[1][2]

        # Convert from content-relative to absolute coordinates
        abs_sep1 = sep1_center + left_x
        abs_sep2 = sep2_center + left_x

        logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep1, y=top_y,
            width=abs_sep2 - abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_example', x=abs_sep2, y=top_y,
            width=w - abs_sep2, height=content_h
        ))

    elif len(valleys) == 1:
        # 2-column layout
        abs_sep = valleys[0][2] + left_x

        logger.info(f"Layout: 2 columns at separator x={abs_sep}")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep, y=top_y,
            width=w - abs_sep, height=content_h
        ))

    else:
        # No columns detected — fall back to a single full-width column
        logger.warning("Layout: no column separators found, using full page")
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=w, height=content_h
        ))

    # Add header/footer info (gap-based detection with fallback)
    # Function-local import of the helper from cv_layout_detection.
    # NOTE(review): the original comment cited a circular dependency with
    # cv_layout.py, but this module already imports _find_content_bounds from
    # cv_layout_detection at top level — confirm whether the lazy import is
    # still needed.
    from cv_layout_detection import _add_header_footer
    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

    # Summarise the outcome for the log ('none' when no such region was added)
    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
    col_count = len([r for r in regions if r.type.startswith('column')])
    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

    return regions
@@ -0,0 +1,494 @@
"""
Column type classification for OCR layout analysis.
Entry point: classify_column_types() with 4-level fallback chain.
Also provides positional_column_regions() and _build_margin_regions().
Position-based classifiers (Level 2+3) in cv_layout_classify_position.py.
"""
import logging
from typing import Dict, List, Optional
import numpy as np
from cv_vocab_types import ColumnGeometry, PageRegion
from cv_layout_scoring import (
_score_language,
_score_role,
_score_dictionary_signals,
_classify_dictionary_columns,
)
from cv_layout_classify_position import (
_classify_by_position_enhanced,
_classify_by_position_fallback,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Margin Region Building
# ---------------------------------------------------------------------------
def _build_margin_regions(
all_regions: List[PageRegion],
left_x: int,
right_x: int,
img_w: int,
top_y: int,
content_h: int,
) -> List[PageRegion]:
"""Create margin_left / margin_right PageRegions from content bounds.
Margins represent the space between the image edge and the first/last
content column. They are used downstream for faithful page
reconstruction but are skipped during OCR.
"""
margins: List[PageRegion] = []
# Minimum gap (px) to create a margin region
_min_gap = 5
if left_x > _min_gap:
margins.append(PageRegion(
type='margin_left', x=0, y=top_y,
width=left_x, height=content_h,
classification_confidence=1.0,
classification_method='content_bounds',
))
# Right margin: from end of last content column to image edge
non_margin = [r for r in all_regions
if r.type not in ('margin_left', 'margin_right', 'header', 'footer',
'margin_top', 'margin_bottom')]
if non_margin:
last_col_end = max(r.x + r.width for r in non_margin)
else:
last_col_end = right_x
if img_w - last_col_end > _min_gap:
margins.append(PageRegion(
type='margin_right', x=last_col_end, y=top_y,
width=img_w - last_col_end, height=content_h,
classification_confidence=1.0,
classification_method='content_bounds',
))
if margins:
logger.info(f"Margins: {[(m.type, m.x, m.width) for m in margins]} "
f"(left_x={left_x}, right_x={right_x}, img_w={img_w})")
return margins
# ---------------------------------------------------------------------------
# Positional Column Regions
# ---------------------------------------------------------------------------
def positional_column_regions(
    geometries: List[ColumnGeometry],
    content_w: int,
    content_h: int,
    left_x: int,
) -> List[PageRegion]:
    """Label columns from geometry and position alone (no language scoring).

    Narrow or nearly empty columns are emitted as structural regions
    (page_ref / column_marker).  The remaining content columns are sorted
    left-to-right and named column_en, column_de, column_example; these names
    are positional only.  A single content column becomes column_text.

    Args:
        geometries: Detected column geometries (absolute x coordinates).
        content_w: Content width, used to express a column's x as a ratio.
        content_h: Height assigned to every produced region.
        left_x: Left content bound, subtracted from each column's x.

    Returns:
        Structural regions followed by the labelled content regions.
    """
    def _region(kind: str, geom: ColumnGeometry, confidence: float) -> PageRegion:
        return PageRegion(
            type=kind, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=confidence,
            classification_method='positional',
        )

    structural: List[PageRegion] = []
    content_cols: List[ColumnGeometry] = []

    for g in geometries:
        rel_pos = (g.x - left_x) / content_w if content_w else 0

        # page_ref: narrow column in the leftmost 20% region
        if g.width_ratio < 0.12 and rel_pos < 0.20:
            structural.append(_region('page_ref', g, 0.95))
            continue
        # column_marker: very narrow, few words
        if g.width_ratio < 0.06 and g.word_count <= 15:
            structural.append(_region('column_marker', g, 0.95))
            continue
        # empty or near-empty narrow column -> also treated as a marker
        if g.word_count <= 2 and g.width_ratio < 0.15:
            structural.append(_region('column_marker', g, 0.85))
            continue
        content_cols.append(g)

    # No content columns at all -> only structural regions
    if not content_cols:
        return structural

    # Exactly one content column -> plain text page
    if len(content_cols) == 1:
        return structural + [_region('column_text', content_cols[0], 0.9)]

    content_cols.sort(key=lambda g: g.x)

    # Two content columns with a very wide left one (>35%): the left column
    # likely holds EN+DE combined, so the right one is labelled as examples.
    wide_left_pair = (
        len(content_cols) == 2
        and content_cols[0].width_ratio > 0.35
        and content_cols[1].width_ratio > 0.20
    )
    labels = ['column_en', 'column_example'] if wide_left_pair \
        else ['column_en', 'column_de', 'column_example']
    # Any column beyond the label list is an example column as well.
    labels += ['column_example'] * max(0, len(content_cols) - len(labels))

    regions = list(structural)
    for g, label in zip(content_cols, labels):
        regions.append(_region(label, g, 0.95))

    logger.info(f"PositionalColumns: {len(structural)} structural, "
                f"{len(content_cols)} content -> "
                f"{[r.type for r in regions]}")

    return regions
# ---------------------------------------------------------------------------
# Main Classification Entry Point
# ---------------------------------------------------------------------------
def classify_column_types(geometries: List[ColumnGeometry],
                          content_w: int,
                          top_y: int,
                          img_w: int,
                          img_h: int,
                          bottom_y: int,
                          left_x: int = 0,
                          right_x: int = 0,
                          inv: Optional[np.ndarray] = None,
                          document_category: Optional[str] = None,
                          margin_strip_detected: bool = False) -> List[PageRegion]:
    """Classify column types using a fallback chain of four levels (0-3).

    Level 0: Dictionary detection (if signals are strong enough)
    Level 1: Content-based (language + role scoring)
    Level 2: Position + language (old rules enhanced with language detection)
    Level 3: Pure position (exact old code, no regression)

    Before the chain runs, a single-column input is returned directly as
    column_text, and first/last columns with fewer than 8 words (that are not
    sub-columns) are set aside as column_ignore.  Every return path appends
    margin_left / margin_right regions; header/footer regions are added only
    on the Level 0-3 paths.

    Side effect: the ``index`` attribute of every geometry that survives the
    pre-filter is rewritten to its position among the surviving geometries.

    Args:
        geometries: List of ColumnGeometry from Phase A.
        content_w: Total content width.
        top_y: Top Y of content area.
        img_w: Full image width.
        img_h: Full image height.
        bottom_y: Bottom Y of content area.
        left_x: Left content bound (from _find_content_bounds).
        right_x: Right content bound (from _find_content_bounds).
        inv: Passed through unchanged to ``_add_header_footer``.
        document_category: User-selected category (e.g. 'woerterbuch').
        margin_strip_detected: Whether a decorative A-Z margin strip was found.

    Returns:
        List of PageRegion with types, confidence, and method.
    """
    # Imported inside the function rather than at module level; the original
    # note gives avoiding a circular import as the reason.
    from cv_layout_detection import _add_header_footer  # noqa: E402

    content_h = bottom_y - top_y

    def _with_margins(result: List[PageRegion]) -> List[PageRegion]:
        """Return *result* extended by margin_left / margin_right regions."""
        return result + _build_margin_regions(result, left_x, right_x, img_w, top_y, content_h)

    def _text_region(geom: ColumnGeometry) -> PageRegion:
        """Build the column_text region used for single-column pages."""
        return PageRegion(
            type='column_text', x=geom.x, y=geom.y,
            width=geom.width, height=geom.height,
            classification_confidence=0.9,
            classification_method='content',
        )

    # Special case: single column -> plain text page
    if len(geometries) == 1:
        return _with_margins([_text_region(geometries[0])])

    # --- Pre-filter: first/last columns with very few words -> column_ignore ---
    # Sub-columns are exempt: they intentionally have few words (page refs,
    # markers) and should not be discarded.
    ignore_regions = []
    active_geometries = []
    last_position = len(geometries) - 1
    for idx, g in enumerate(geometries):
        on_edge = idx == 0 or idx == last_position
        if on_edge and g.word_count < 8 and not g.is_sub_column:
            ignore_regions.append(PageRegion(
                type='column_ignore', x=g.x, y=g.y,
                width=g.width, height=content_h,
                classification_confidence=0.95,
                classification_method='content',
            ))
            logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) -> column_ignore (edge, few words)")
            continue
        active_geometries.append(g)

    # Re-index the surviving geometries for classification
    for new_idx, g in enumerate(active_geometries):
        g.index = new_idx
    geometries = active_geometries

    # Edge cases: everything ignored, or only one column left
    if not geometries:
        return _with_margins(ignore_regions)
    if len(geometries) == 1:
        ignore_regions.append(_text_region(geometries[0]))
        return _with_margins(ignore_regions)

    def _finish(regions: List[PageRegion]) -> List[PageRegion]:
        """Add header/footer to *regions*, then prepend ignores and add margins."""
        _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
        return _with_margins(ignore_regions + regions)

    # --- Score all columns ---
    lang_scores = [_score_language(g.words) for g in geometries]
    role_scores = [_score_role(g) for g in geometries]

    logger.info(f"ClassifyColumns: language scores: "
                f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}")
    logger.info(f"ClassifyColumns: role scores: "
                f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}")

    # --- Level 0: Dictionary detection ---
    dict_signals = _score_dictionary_signals(
        geometries,
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
    )
    if dict_signals["is_dictionary"]:
        regions = _classify_dictionary_columns(
            geometries, dict_signals, lang_scores, content_h,
        )
        if regions is not None:
            logger.info("ClassifyColumns: Level 0 (dictionary) succeeded, confidence=%.3f",
                        dict_signals["confidence"])
            return _finish(regions)

    # --- Level 1: Content-based classification ---
    regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
        return _finish(regions)

    # --- Level 2: Position + language enhanced ---
    regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
        return _finish(regions)

    # --- Level 3: Pure position fallback (old code, no regression) ---
    logger.info("ClassifyColumns: Level 3 (position fallback)")
    regions = _classify_by_position_fallback(geometries, content_w, content_h)
    return _finish(regions)
# ---------------------------------------------------------------------------
# Level 1: Content-Based Classification
# ---------------------------------------------------------------------------
def _classify_by_content(geometries: List[ColumnGeometry],
lang_scores: List[Dict[str, float]],
role_scores: List[Dict[str, float]],
content_w: int,
content_h: int) -> Optional[List[PageRegion]]:
"""Level 1: Classify columns purely by content analysis.
Requires clear language signals to distinguish EN/DE columns.
Returns None if language signals are too weak.
"""
regions = []
assigned = set()
# Step 1: Assign structural roles first (reference, marker)
# left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref
left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0
for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
is_left_side = geom.x < left_20_threshold
has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
regions.append(PageRegion(
type='page_ref', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=rs['reference'],
classification_method='content',
))
assigned.add(i)
elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
regions.append(PageRegion(
type='column_marker', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=rs['marker'],
classification_method='content',
))
assigned.add(i)
elif geom.width_ratio < 0.05 and not is_left_side:
# Narrow column on the right side -> marker, not page_ref
regions.append(PageRegion(
type='column_marker', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.8,
classification_method='content',
))
assigned.add(i)
# Step 2: Among remaining columns, find EN and DE by language scores
remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
for i in range(len(geometries)) if i not in assigned]
if len(remaining) < 2:
# Not enough columns for EN/DE pair
if len(remaining) == 1:
i, geom, ls, rs = remaining[0]
regions.append(PageRegion(
type='column_text', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.6,
classification_method='content',
))
regions.sort(key=lambda r: r.x)
return regions
# Check if we have enough language signal
en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]
# Position tiebreaker: when language signals are weak, use left=EN, right=DE
if (not en_candidates or not de_candidates) and len(remaining) >= 2:
max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
if max_eng < 0.15 and max_deu < 0.15:
# Both signals weak -- fall back to positional: left=EN, right=DE
sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
en_conf = 0.4
de_conf = 0.4
regions.append(PageRegion(
type='column_en', x=best_en[1].x, y=best_en[1].y,
width=best_en[1].width, height=content_h,
classification_confidence=en_conf,
classification_method='content',
))
assigned.add(best_en[0])
regions.append(PageRegion(
type='column_de', x=best_de[1].x, y=best_de[1].y,
width=best_de[1].width, height=content_h,
classification_confidence=de_conf,
classification_method='content',
))
assigned.add(best_de[0])
# Assign remaining as example
for i, geom, ls, rs in remaining:
if i not in assigned:
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.4,
classification_method='content',
))
regions.sort(key=lambda r: r.x)
return regions
if not en_candidates or not de_candidates:
# Language signals too weak for content-based classification
logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
return None
# Pick the best EN and DE candidates
best_en = max(en_candidates, key=lambda x: x[2]['eng'])
best_de = max(de_candidates, key=lambda x: x[2]['deu'])
# Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
# Example sentences contain English function words ("the", "a", "is") which inflate
# the eng score of the Example column. When the best EN candidate sits to the RIGHT
# of the DE column and there is another EN candidate to the LEFT, prefer the left one
# -- it is almost certainly the real vocabulary column.
if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
if left_of_de:
alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
logger.info(
f"ClassifyColumns: Level 1 position fix -- best EN col {best_en[0]} "
f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
best_en = alt_en
if best_en[0] == best_de[0]:
# Same column scored highest for both -- ambiguous
logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
return None
en_conf = best_en[2]['eng']
de_conf = best_de[2]['deu']
regions.append(PageRegion(
type='column_en', x=best_en[1].x, y=best_en[1].y,
width=best_en[1].width, height=content_h,
classification_confidence=round(en_conf, 2),
classification_method='content',
))
assigned.add(best_en[0])
regions.append(PageRegion(
type='column_de', x=best_de[1].x, y=best_de[1].y,
width=best_de[1].width, height=content_h,
classification_confidence=round(de_conf, 2),
classification_method='content',
))
assigned.add(best_de[0])
# Step 3: Remaining columns -> example or text based on role scores
for i, geom, ls, rs in remaining:
if i in assigned:
continue
if rs['sentence'] > 0.4:
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=round(rs['sentence'], 2),
classification_method='content',
))
else:
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.5,
classification_method='content',
))
regions.sort(key=lambda r: r.x)
return regions
@@ -0,0 +1,218 @@
"""
Position-based column type classification for OCR layout analysis.
Contains Level 2 and Level 3 classification functions:
Level 2 _classify_by_position_enhanced: Position + language confirmation
Level 3 _classify_by_position_fallback: Pure positional (no regression)
Extracted from cv_layout_classify.py during file-size split.
"""
import logging
from typing import Dict, List, Optional
from cv_vocab_types import ColumnGeometry, PageRegion
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Level 2: Position-Enhanced Classification
# ---------------------------------------------------------------------------
def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
lang_scores: List[Dict[str, float]],
content_w: int,
content_h: int) -> Optional[List[PageRegion]]:
"""Level 2: Position-based rules enhanced with language confirmation.
Uses the old positional heuristics but confirms EN/DE assignment
with language scores (swapping if needed).
"""
regions = []
untyped = list(range(len(geometries)))
first_x = geometries[0].x if geometries else 0
left_20_threshold = first_x + content_w * 0.20
# Rule 1: Leftmost narrow column -> page_ref (only if in left 20%, no strong language)
g0 = geometries[0]
ls0 = lang_scores[0]
has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
regions.append(PageRegion(
type='page_ref', x=g0.x, y=g0.y,
width=g0.width, height=content_h,
classification_confidence=0.8,
classification_method='position_enhanced',
))
untyped.remove(0)
# Rule 2: Narrow columns with few words -> marker
for i in list(untyped):
geom = geometries[i]
if geom.width_ratio < 0.06 and geom.word_count <= 15:
regions.append(PageRegion(
type='column_marker', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.7,
classification_method='position_enhanced',
))
untyped.remove(i)
# Rule 3: Rightmost remaining -> column_example (if 3+ remaining)
if len(untyped) >= 3:
last_idx = untyped[-1]
geom = geometries[last_idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.7,
classification_method='position_enhanced',
))
untyped.remove(last_idx)
# Rule 4: First two remaining -> EN/DE, but check language to possibly swap
if len(untyped) >= 2:
idx_a = untyped[0]
idx_b = untyped[1]
ls_a = lang_scores[idx_a]
ls_b = lang_scores[idx_b]
# Default: first=EN, second=DE (old behavior)
en_idx, de_idx = idx_a, idx_b
conf = 0.7
# Swap if language signals clearly indicate the opposite
if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
en_idx, de_idx = idx_b, idx_a
conf = 0.85
logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores")
regions.append(PageRegion(
type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
width=geometries[en_idx].width, height=content_h,
classification_confidence=conf,
classification_method='position_enhanced',
))
regions.append(PageRegion(
type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
width=geometries[de_idx].width, height=content_h,
classification_confidence=conf,
classification_method='position_enhanced',
))
untyped = untyped[2:]
elif len(untyped) == 1:
idx = untyped[0]
geom = geometries[idx]
regions.append(PageRegion(
type='column_en', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.5,
classification_method='position_enhanced',
))
untyped = []
# Remaining -> example
for idx in untyped:
geom = geometries[idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.5,
classification_method='position_enhanced',
))
regions.sort(key=lambda r: r.x)
return regions
# ---------------------------------------------------------------------------
# Level 3: Position Fallback Classification
# ---------------------------------------------------------------------------
def _classify_by_position_fallback(geometries: List[ColumnGeometry],
content_w: int,
content_h: int) -> List[PageRegion]:
"""Level 3: Pure position-based fallback (identical to old code).
Guarantees no regression from the previous behavior.
"""
regions = []
untyped = list(range(len(geometries)))
first_x = geometries[0].x if geometries else 0
left_20_threshold = first_x + content_w * 0.20
# Rule 1: Leftmost narrow column -> page_ref (only if in left 20%)
g0 = geometries[0]
if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
regions.append(PageRegion(
type='page_ref', x=g0.x, y=g0.y,
width=g0.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped.remove(0)
# Rule 2: Narrow + few words -> marker
for i in list(untyped):
geom = geometries[i]
if geom.width_ratio < 0.06 and geom.word_count <= 15:
regions.append(PageRegion(
type='column_marker', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped.remove(i)
# Rule 3: Rightmost remaining -> example (if 3+)
if len(untyped) >= 3:
last_idx = untyped[-1]
geom = geometries[last_idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped.remove(last_idx)
# Rule 4: First remaining -> EN, second -> DE
if len(untyped) >= 2:
en_idx = untyped[0]
de_idx = untyped[1]
regions.append(PageRegion(
type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
width=geometries[en_idx].width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
regions.append(PageRegion(
type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
width=geometries[de_idx].width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped = untyped[2:]
elif len(untyped) == 1:
idx = untyped[0]
geom = geometries[idx]
regions.append(PageRegion(
type='column_en', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped = []
for idx in untyped:
geom = geometries[idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
regions.sort(key=lambda r: r.x)
return regions
@@ -0,0 +1,458 @@
"""
Post-processing refinements for column geometry.
Extracted from cv_layout_columns.py — contains:
- _detect_sub_columns() (sub-column detection via left-edge alignment)
- _split_broad_columns() (broad column splitting via word-coverage gaps)
- expand_narrow_columns() (narrow column expansion into whitespace)
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import statistics
from typing import Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import ColumnGeometry
logger = logging.getLogger(__name__)
def _detect_sub_columns(
geometries: List[ColumnGeometry],
content_w: int,
left_x: int = 0,
top_y: int = 0,
header_y: Optional[int] = None,
footer_y: Optional[int] = None,
_edge_tolerance: int = 8,
_min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
"""Split columns that contain internal sub-columns based on left-edge alignment.
For each column, clusters word left-edges into alignment bins (within
``_edge_tolerance`` px). The leftmost bin whose word count reaches
``_min_col_start_ratio`` of the column total is treated as the true column
start. Any words to the left of that bin form a sub-column, provided they
number >= 2 and < 35 % of total.
Word ``left`` values are relative to the content ROI (offset by *left_x*),
while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
bridges the two coordinate systems.
If *header_y* / *footer_y* are provided (absolute y-coordinates), words
in header/footer regions are excluded from alignment clustering to avoid
polluting the bins with page numbers or chapter titles. Word ``top``
values are relative to *top_y*.
Returns a new list of ColumnGeometry potentially longer than the input.
"""
if content_w <= 0:
return geometries
result: List[ColumnGeometry] = []
for geo in geometries:
# Only consider wide-enough columns with enough words
if geo.width_ratio < 0.15 or geo.word_count < 5:
result.append(geo)
continue
# Collect left-edges of confident words, excluding header/footer
# Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y)
min_top_rel = (header_y - top_y) if header_y is not None else None
max_top_rel = (footer_y - top_y) if footer_y is not None else None
confident = [w for w in geo.words
if w.get('conf', 0) >= 30
and (min_top_rel is None or w['top'] >= min_top_rel)
and (max_top_rel is None or w['top'] <= max_top_rel)]
if len(confident) < 3:
result.append(geo)
continue
# --- Cluster left-edges into alignment bins ---
sorted_edges = sorted(w['left'] for w in confident)
bins: List[Tuple[int, int, int, int]] = [] # (center, count, min_edge, max_edge)
cur = [sorted_edges[0]]
for i in range(1, len(sorted_edges)):
if sorted_edges[i] - cur[-1] <= _edge_tolerance:
cur.append(sorted_edges[i])
else:
bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
cur = [sorted_edges[i]]
bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
# --- Find the leftmost bin qualifying as a real column start ---
total = len(confident)
min_count = max(3, int(total * _min_col_start_ratio))
col_start_bin = None
for b in bins:
if b[1] >= min_count:
col_start_bin = b
break
if col_start_bin is None:
result.append(geo)
continue
# Words to the left of the column-start bin are sub-column candidates
split_threshold = col_start_bin[2] - _edge_tolerance
sub_words = [w for w in geo.words if w['left'] < split_threshold]
main_words = [w for w in geo.words if w['left'] >= split_threshold]
# Count only body words (excluding header/footer) for the threshold check
# so that header/footer words don't artificially trigger a split.
sub_body = [w for w in sub_words
if (min_top_rel is None or w['top'] >= min_top_rel)
and (max_top_rel is None or w['top'] <= max_top_rel)]
if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
result.append(geo)
continue
# --- Guard against inline markers (bullet points, numbering) ---
# Bullet points like "1.", "2.", "•", "-" sit close to the main
# column text and are part of the cell, not a separate column.
# Only split if the horizontal gap between the rightmost sub-word
# and the main column start is large enough.
max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
gap_to_main = col_start_bin[2] - max_sub_right # px gap
median_heights = [w.get('height', 20) for w in confident]
med_h = statistics.median(median_heights) if median_heights else 20
min_gap = max(med_h * 1.2, 20) # at least 1.2× word height or 20px
if gap_to_main < min_gap:
logger.debug(
"SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
"(likely inline markers, not a sub-column)",
geo.index, gap_to_main, min_gap)
result.append(geo)
continue
# --- Build two sub-column geometries ---
# Word 'left' values are relative to left_x; geo.x is absolute.
# Convert the split position from relative to absolute coordinates.
max_sub_left = max(w['left'] for w in sub_words)
split_rel = (max_sub_left + col_start_bin[2]) // 2
split_abs = split_rel + left_x
sub_x = geo.x
sub_width = split_abs - geo.x
main_x = split_abs
main_width = (geo.x + geo.width) - split_abs
if sub_width <= 0 or main_width <= 0:
result.append(geo)
continue
sub_geo = ColumnGeometry(
index=0,
x=sub_x,
y=geo.y,
width=sub_width,
height=geo.height,
word_count=len(sub_words),
words=sub_words,
width_ratio=sub_width / content_w if content_w > 0 else 0.0,
is_sub_column=True,
)
main_geo = ColumnGeometry(
index=0,
x=main_x,
y=geo.y,
width=main_width,
height=geo.height,
word_count=len(main_words),
words=main_words,
width_ratio=main_width / content_w if content_w > 0 else 0.0,
is_sub_column=True,
)
result.append(sub_geo)
result.append(main_geo)
logger.info(
f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
f"(rel={split_rel}), sub={len(sub_words)} words, "
f"main={len(main_words)} words, "
f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
)
# Re-index by left-to-right order
result.sort(key=lambda g: g.x)
for i, g in enumerate(result):
g.index = i
return result
def _split_broad_columns(
geometries: List[ColumnGeometry],
content_w: int,
left_x: int = 0,
_broad_threshold: float = 0.35,
_min_gap_px: int = 15,
_min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
"""Split overly broad columns that contain two language blocks (EN+DE).
Uses word-coverage gap analysis: builds a per-pixel coverage array from the
words inside each broad column, finds the largest horizontal gap, and splits
the column at that gap.
Args:
geometries: Column geometries from _detect_sub_columns.
content_w: Width of the content area in pixels.
left_x: Left edge of content ROI in absolute image coordinates.
_broad_threshold: Minimum width_ratio to consider a column "broad".
_min_gap_px: Minimum gap width (pixels) to trigger a split.
_min_words_per_split: Both halves must have at least this many words.
Returns:
Updated list of ColumnGeometry (possibly with more columns).
"""
result: List[ColumnGeometry] = []
logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")
for geo in geometries:
if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
result.append(geo)
continue
# Build word-coverage array (per pixel within column)
col_left_rel = geo.x - left_x # column left in content-relative coords
coverage = np.zeros(geo.width, dtype=np.float32)
for wd in geo.words:
# wd['left'] is relative to left_x (content ROI)
wl = wd['left'] - col_left_rel
wr = wl + wd.get('width', 0)
wl = max(0, int(wl))
wr = min(geo.width, int(wr))
if wr > wl:
coverage[wl:wr] += 1.0
# Light smoothing (kernel=3px) to avoid noise
if len(coverage) > 3:
kernel = np.ones(3, dtype=np.float32) / 3.0
coverage = np.convolve(coverage, kernel, mode='same')
# Normalise to [0, 1]
cmax = coverage.max()
if cmax > 0:
coverage /= cmax
# Find INTERNAL gaps where coverage < 0.5
# Exclude edge gaps (touching pixel 0 or geo.width) — those are margins.
low_mask = coverage < 0.5
all_gaps = []
_gs = None
for px in range(len(low_mask)):
if low_mask[px]:
if _gs is None:
_gs = px
else:
if _gs is not None:
all_gaps.append((_gs, px, px - _gs))
_gs = None
if _gs is not None:
all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))
# Filter: only internal gaps (not touching column edges)
_edge_margin = 10 # pixels from edge to ignore
internal_gaps = [g for g in all_gaps
if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None
logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
f"{[g for g in all_gaps if g[2] >= 5]}, "
f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
f"best={best_gap}")
if best_gap is None or best_gap[2] < _min_gap_px:
result.append(geo)
continue
gap_center = (best_gap[0] + best_gap[1]) // 2
# Split words by midpoint relative to gap
left_words = []
right_words = []
for wd in geo.words:
wl = wd['left'] - col_left_rel
mid = wl + wd.get('width', 0) / 2.0
if mid < gap_center:
left_words.append(wd)
else:
right_words.append(wd)
if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
result.append(geo)
continue
# Build two new ColumnGeometry objects
split_x_abs = geo.x + gap_center
left_w = gap_center
right_w = geo.width - gap_center
left_geo = ColumnGeometry(
index=0,
x=geo.x,
y=geo.y,
width=left_w,
height=geo.height,
word_count=len(left_words),
words=left_words,
width_ratio=left_w / content_w if content_w else 0,
is_sub_column=True,
)
right_geo = ColumnGeometry(
index=0,
x=split_x_abs,
y=geo.y,
width=right_w,
height=geo.height,
word_count=len(right_words),
words=right_words,
width_ratio=right_w / content_w if content_w else 0,
is_sub_column=True,
)
logger.info(
f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
f"left={len(left_words)} words (w={left_w}), "
f"right={len(right_words)} words (w={right_w})"
)
result.append(left_geo)
result.append(right_geo)
# Re-index left-to-right
result.sort(key=lambda g: g.x)
for i, g in enumerate(result):
g.index = i
return result
def expand_narrow_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int,
    word_dicts: List[Dict],
) -> List[ColumnGeometry]:
    """Widen narrow columns into the unused space next to them.

    A column narrower than 10 % of *content_w* is grown towards each
    neighbour until it is 4 px short of the neighbour's nearest word.  When a
    neighbour holds no words, the narrow column may grow to within 4 px of
    that neighbour's far edge.  After an expansion the column's words are
    re-collected from *word_dicts*, and any neighbour that now overlaps is
    shrunk (its words are re-collected as well; its width never drops below
    0).

    Word ``left`` values are relative to *left_x*; ``ColumnGeometry.x`` is in
    absolute image coordinates.

    Intended to run after ``_detect_sub_columns()``, because sub-column
    splits are what create the narrowest columns.

    Args:
        geometries: Columns in left-to-right order.  Modified in place.
        content_w: Content width in px.  With ``content_w <= 0`` no column is
            considered narrow and nothing is changed.
        left_x: Absolute x of the content ROI's left edge.
        word_dicts: All words of the page (used to re-assign words after a
            boundary moved).

    Returns:
        The same *geometries* list.
    """
    _NARROW_THRESHOLD_PCT = 10.0
    _MIN_WORD_MARGIN = 4

    if len(geometries) < 2:
        return geometries

    def _pct(width: int) -> float:
        # Guarded everywhere, including the log statements, so a zero
        # content width can no longer raise ZeroDivisionError.
        return width / content_w * 100 if content_w > 0 else 100.0

    def _ratio(width: int) -> float:
        return width / content_w if content_w > 0 else 0.0

    def _words_between(x_abs: int, width: int) -> List[Dict]:
        # Words whose left edge lies inside [x_abs, x_abs + width).
        lo_rel = x_abs - left_x
        hi_rel = lo_rel + width
        return [wd for wd in word_dicts if lo_rel <= wd['left'] < hi_rel]

    logger.info("ExpandNarrowCols: input %d cols: %s",
                len(geometries),
                [(i, g.x, g.width, round(_pct(g.width), 1))
                 for i, g in enumerate(geometries)])

    last_idx = len(geometries) - 1
    for i, g in enumerate(geometries):
        orig_pct = _pct(g.width)
        if orig_pct >= _NARROW_THRESHOLD_PCT:
            continue

        left_nb = geometries[i - 1] if i > 0 else None
        right_nb = geometries[i + 1] if i < last_idx else None
        expanded = False

        # --- try expanding to the LEFT ---
        # The geometric gap may be 0 (adjacent columns after a sub-column
        # split), so the limit is derived from where the neighbour's words
        # actually end rather than from its right edge.
        if left_nb is not None:
            word_rights = [wd['left'] + wd.get('width', 0) for wd in left_nb.words]
            if word_rights:
                safe_left_abs = left_x + max(word_rights) + _MIN_WORD_MARGIN
            else:
                # Empty neighbour: everything but a small margin is free.
                safe_left_abs = left_nb.x + _MIN_WORD_MARGIN
            if safe_left_abs < g.x:
                g.width += g.x - safe_left_abs
                g.x = safe_left_abs
                expanded = True

        # --- try expanding to the RIGHT ---
        if right_nb is not None:
            word_lefts = [wd['left'] for wd in right_nb.words]
            if word_lefts:
                safe_right_abs = left_x + min(word_lefts) - _MIN_WORD_MARGIN
            else:
                safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN
            if safe_right_abs > g.x + g.width:
                g.width = safe_right_abs - g.x
                expanded = True

        if not expanded:
            continue

        g.words = _words_between(g.x, g.width)
        g.word_count = len(g.words)
        g.width_ratio = _ratio(g.width)
        logger.info(
            "ExpandNarrowCols: col %d (%.1f%% -> %.1f%%) x=%d w=%d words=%d",
            i, orig_pct, _pct(g.width), g.x, g.width, g.word_count)

        # --- Shrink overlapping neighbours to the new boundaries ---
        # Left neighbour: its right edge must not exceed our new left edge.
        if left_nb is not None and left_nb.x + left_nb.width > g.x:
            left_nb.width = max(0, g.x - left_nb.x)
            left_nb.width_ratio = _ratio(left_nb.width)
            left_nb.words = _words_between(left_nb.x, left_nb.width)
            left_nb.word_count = len(left_nb.words)

        # Right neighbour: its left edge must not precede our new right edge.
        my_right = g.x + g.width
        if right_nb is not None and right_nb.x < my_right:
            nb_right_edge = right_nb.x + right_nb.width
            right_nb.x = my_right
            right_nb.width = max(0, nb_right_edge - my_right)
            right_nb.width_ratio = _ratio(right_nb.width)
            right_nb.words = _words_between(right_nb.x, right_nb.width)
            right_nb.word_count = len(right_nb.words)

    return geometries
@@ -0,0 +1,589 @@
"""
Core column detection: gap-based geometry and clustering fallback.
Extracted from the original cv_layout_columns.py; contains:
- _detect_columns_by_clustering() (fallback clustering)
- _build_geometries_from_starts() (geometry construction)
- detect_column_geometry() (main column detection)
Post-processing (sub-columns, broad-column split, narrow expansion)
lives in cv_layout_column_refine.py.
Legacy projection-profile layout lives in cv_layout_analyze.py.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import ColumnGeometry
from cv_layout_detection import _find_content_bounds
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
import pytesseract
from PIL import Image
except ImportError:
pytesseract = None # type: ignore[assignment]
Image = None # type: ignore[assignment,misc]
# =============================================================================
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
# =============================================================================
# --- Phase A: Geometry Detection ---
def _detect_columns_by_clustering(
word_dicts: List[Dict],
left_edges: List[int],
edge_word_indices: List[int],
content_w: int,
content_h: int,
left_x: int,
right_x: int,
top_y: int,
bottom_y: int,
inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
"""Fallback: detect columns by clustering left-aligned word positions.
Used when the primary gap-based algorithm finds fewer than 2 gaps.
"""
tolerance = max(10, int(content_w * 0.01))
sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])
clusters = []
cluster_widxs = []
cur_edges = [sorted_pairs[0][0]]
cur_widxs = [sorted_pairs[0][1]]
for edge, widx in sorted_pairs[1:]:
if edge - cur_edges[-1] <= tolerance:
cur_edges.append(edge)
cur_widxs.append(widx)
else:
clusters.append(cur_edges)
cluster_widxs.append(cur_widxs)
cur_edges = [edge]
cur_widxs = [widx]
clusters.append(cur_edges)
cluster_widxs.append(cur_widxs)
MIN_Y_COVERAGE_PRIMARY = 0.30
MIN_Y_COVERAGE_SECONDARY = 0.15
MIN_WORDS_SECONDARY = 5
cluster_infos = []
for c_edges, c_widxs in zip(clusters, cluster_widxs):
if len(c_edges) < 2:
continue
y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
y_span = max(y_positions) - min(y_positions)
y_coverage = y_span / content_h if content_h > 0 else 0.0
cluster_infos.append({
'mean_x': int(np.mean(c_edges)),
'count': len(c_edges),
'min_edge': min(c_edges),
'max_edge': max(c_edges),
'y_min': min(y_positions),
'y_max': max(y_positions),
'y_coverage': y_coverage,
})
primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
primary_set = set(id(c) for c in primary)
secondary = [c for c in cluster_infos
if id(c) not in primary_set
and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
and c['count'] >= MIN_WORDS_SECONDARY]
significant = sorted(primary + secondary, key=lambda c: c['mean_x'])
if len(significant) < 3:
logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
return None
merge_distance = max(30, int(content_w * 0.06))
merged = [significant[0].copy()]
for s in significant[1:]:
if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
prev = merged[-1]
total = prev['count'] + s['count']
avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
prev['mean_x'] = avg_x
prev['count'] = total
prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
else:
merged.append(s.copy())
if len(merged) < 3:
logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
return None
logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")
margin_px = max(6, int(content_w * 0.003))
return _build_geometries_from_starts(
[(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
)
def _build_geometries_from_starts(
col_starts: List[Tuple[int, int]],
word_dicts: List[Dict],
left_x: int,
right_x: int,
top_y: int,
bottom_y: int,
content_w: int,
content_h: int,
inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
"""Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs."""
geometries = []
for i, (start_x, count) in enumerate(col_starts):
if i + 1 < len(col_starts):
col_width = col_starts[i + 1][0] - start_x
else:
col_width = right_x - start_x
col_left_rel = start_x - left_x
col_right_rel = col_left_rel + col_width
col_words = [w for w in word_dicts
if col_left_rel <= w['left'] < col_right_rel]
geometries.append(ColumnGeometry(
index=i,
x=start_x,
y=top_y,
width=col_width,
height=content_h,
word_count=len(col_words),
words=col_words,
width_ratio=col_width / content_w if content_w > 0 else 0.0,
))
logger.info(f"ColumnGeometry: {len(geometries)} columns: "
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
def _tess_conf_to_int(raw) -> int:
    """Convert a Tesseract confidence value to int; -1 if it is not numeric.

    Accepts ints, floats and numeric strings, including float-formatted ones
    such as ``"96.06"`` (truncated to 96).
    """
    try:
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return -1


def _mask_runs(mask, min_len: int) -> List[Tuple[int, int]]:
    """Return ``(start, end)`` (end exclusive) of every run of truthy values
    in *mask* that is at least *min_len* long."""
    runs: List[Tuple[int, int]] = []
    run_start = None
    for pos, flag in enumerate(mask):
        if flag:
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            if pos - run_start >= min_len:
                runs.append((run_start, pos))
            run_start = None
    # A run may extend to the very end of the mask.
    if run_start is not None and len(mask) - run_start >= min_len:
        runs.append((run_start, len(mask)))
    return runs


def _words_with_left_in(word_dicts: List[Dict], lo_rel: int, hi_rel: int) -> List[Dict]:
    """Return the words whose ROI-relative ``left`` lies in ``[lo_rel, hi_rel)``."""
    return [wd for wd in word_dicts if lo_rel <= wd['left'] < hi_rel]


def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
    """Detect column geometry using whitespace-gap analysis with word validation.

    Phase A of the two-phase column detection.  Pipeline:

      1. Content bounds from the inverted image (falls back to the full image
         when the detected area is smaller than 30 % in either dimension).
      2. Word boxes from Tesseract on the content ROI (confidence >= 30).
         2b. The content is cut at large horizontal whitespace bands and only
         the tallest segment is used for the projection, so sub-headers and
         section dividers do not pollute it.
      3. Smoothed vertical ink projection of that segment.
      4. Runs of low ink density become raw gaps.
      5. Gaps overlapping a word box are shrunk or discarded.
         5b. With fewer than 2 gaps left, gaps are searched in the word
         coverage instead of the pixels.
      6. With still fewer than 2 gaps, the clustering fallback takes over.
      7./8. Gaps are turned into column starts and ColumnGeometry objects; the
         rightmost column always extends to the full image width.
      9. Phantom columns (narrower than 3 % of the content width with fewer
         than 3 words) are removed and their space is given to the column on
         their left.

    Word ``left`` / ``top`` values are relative to the content ROI;
    ``ColumnGeometry.x`` / ``y`` are absolute image coordinates.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
        or None if detection fails entirely (missing OCR/imaging libraries,
        Tesseract error, fewer than 5 words, or no usable columns in the
        fallback).
    """
    # The optional imports at module level are set to None when unavailable;
    # fail the documented way instead of with an AttributeError further down.
    if cv2 is None or pytesseract is None or Image is None:
        logger.warning("ColumnGeometry: OpenCV, pytesseract or Pillow is not available")
        return None

    img_h, img_w = ocr_img.shape[:2]

    # --- Step 1: Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_w < img_w * 0.3 or content_h < img_h * 0.3:
        # Implausibly small content area: use the whole image.
        left_x, right_x = 0, img_w
        top_y, bottom_y = 0, img_h
        content_w, content_h = img_w, img_h

    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Step 2: Get word bounding boxes from Tesseract ---
    # Crop from left_x to the full image width (not right_x) so words at the
    # right edge of the last column are included even if they extend past the
    # detected content boundary.
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:img_w]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
        return None

    word_dicts: List[Dict] = []
    left_edges: List[int] = []
    edge_word_indices: List[int] = []
    rows = zip(data['text'], data['conf'], data['left'],
               data['top'], data['width'], data['height'])
    for raw_text, raw_conf, raw_left, raw_top, raw_width, raw_height in rows:
        conf = _tess_conf_to_int(raw_conf)
        text = str(raw_text).strip()
        if conf < 30 or not text:
            continue
        lx = int(raw_left)
        left_edges.append(lx)
        edge_word_indices.append(len(word_dicts))
        word_dicts.append({
            'text': text, 'conf': conf,
            'left': lx, 'top': int(raw_top),
            'width': int(raw_width), 'height': int(raw_height),
        })

    if len(left_edges) < 5:
        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
        return None

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

    # --- Step 2b: Segment by sub-headers ---
    # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
    # text bands that pollute the vertical projection.  Large horizontal gaps
    # (whitespace rows separating sections) are detected and only the tallest
    # content segment is used for the projection.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    h_proj_row = np.sum(content_strip, axis=1).astype(float)
    h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row

    H_GAP_THRESH = 0.02  # rows with <2% ink density are "empty"
    H_MIN_GAP = max(5, content_h // 200)
    h_gaps = _mask_runs(h_proj_row_norm < H_GAP_THRESH, H_MIN_GAP)

    # "Large" gaps (clearly bigger than the median gap) mark section
    # boundaries.  With fewer than 3 gaps every gap is treated as large.
    if len(h_gaps) >= 3:
        gap_sizes = sorted(ge - gs for gs, ge in h_gaps)
        median_gap_h = gap_sizes[len(gap_sizes) // 2]
        large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
        large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
    else:
        large_gaps = h_gaps

    # Segments are the stretches between consecutive large gaps.
    seg_tops = [0] + [ge for _, ge in large_gaps]
    seg_bots = [gs for gs, _ in large_gaps] + [content_h]
    segments = [(seg_top, seg_bot, seg_bot - seg_top)
                for seg_top, seg_bot in zip(seg_tops, seg_bots)
                if seg_bot - seg_top > 20]  # ignore tiny fragments

    best_seg = None
    if segments:
        best_seg = max(segments, key=lambda s: s[2])  # first tallest segment
        proj_strip = content_strip[best_seg[0]:best_seg[1], :]
        effective_h = best_seg[2]
        if len(segments) > 1:
            logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
                        f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
                        f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
    else:
        proj_strip = content_strip
        effective_h = content_h

    # --- Step 3: Vertical projection profile ---
    v_proj = np.sum(proj_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps.
    kernel_size = max(5, content_w // 80)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for symmetry
    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # --- Step 4: Find whitespace gaps ---
    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.005)
    MIN_GAP_WIDTH = max(8, content_w // 200)

    # (start_x_rel, end_x_rel) relative to the content ROI
    raw_gaps = _mask_runs(v_smooth < gap_threshold, MIN_GAP_WIDTH)

    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_width={MIN_GAP_WIDTH}px): "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")

    # --- Step 5: Validate gaps against word bounding boxes ---
    # When a segment was used for the projection, validate only against words
    # fully inside that segment; words from sub-headers or other sections
    # would wrongly overlap real column gaps.
    if len(segments) > 1:
        seg_top_rel, seg_bot_rel = best_seg[0], best_seg[1]
        segment_words = [wd for wd in word_dicts
                         if wd['top'] >= seg_top_rel
                         and wd['top'] + wd['height'] <= seg_bot_rel]
        logger.info(f"ColumnGeometry: filtering words to segment: "
                    f"{len(segment_words)}/{len(word_dicts)} words")
    else:
        segment_words = word_dicts

    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        # Horizontal extents of all words that reach into this gap.
        blockers = [(wd['left'], wd['left'] + wd['width'])
                    for wd in segment_words
                    if wd['left'] < gap_end_rel and wd['left'] + wd['width'] > gap_start_rel]
        if not blockers:
            validated_gaps.append((gap_start_rel, gap_end_rel))
            continue

        # Try to keep the part of the gap that is free of words.
        min_word_left = min(content_w, min(wl for wl, _ in blockers))
        max_word_right = max(0, max(wr for _, wr in blockers))

        if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
            validated_gaps.append((gap_start_rel, min_word_left))
            logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
        elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
            validated_gaps.append((max_word_right, gap_end_rel))
            logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
        else:
            logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                         f"discarded (word overlap, no room to shift)")

    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

    # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
    # When the pixel projection fails (illustrations, coloured bands), look
    # for vertical gaps in the word boxes instead.
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
        word_coverage = np.zeros(content_w, dtype=np.int32)
        for wd in segment_words:
            wl = max(0, wd['left'])
            wr = min(wd['left'] + wd['width'], content_w)
            if wr > wl:
                word_coverage[wl:wr] += 1

        # Smooth slightly to bridge tiny 1-2px noise gaps between words.
        wc_kernel = max(3, content_w // 300)
        if wc_kernel % 2 == 0:
            wc_kernel += 1
        wc_smooth = np.convolve(word_coverage.astype(float),
                                np.ones(wc_kernel) / wc_kernel, mode='same')

        WC_MIN_GAP = max(4, content_w // 300)
        wc_gaps = _mask_runs(wc_smooth < 0.5, WC_MIN_GAP)  # < 0.5: effectively no words

        logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
                    f"(min_width={WC_MIN_GAP}px): "
                    f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")

        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
        return _detect_columns_by_clustering(
            word_dicts, left_edges, edge_word_indices,
            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
        )

    # --- Step 7: Derive column boundaries from gaps ---
    validated_gaps.sort(key=lambda g: g[0])

    # A margin gap touches the edge of the content area (within 2% tolerance).
    edge_tolerance = max(10, int(content_w * 0.02))
    is_left_margin = validated_gaps[0][0] <= edge_tolerance
    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance

    if is_left_margin:
        # First column starts after the left margin gap.
        first_gap_end = validated_gaps[0][1]
        interior_gaps = validated_gaps[1:]
    else:
        # No left margin gap: first column starts at the content left edge.
        first_gap_end = 0
        interior_gaps = validated_gaps[:]

    if is_right_margin:
        # The right margin gap does not start a new column.
        interior_gaps = interior_gaps[:-1]

    # A column starts where a gap ends.
    col_starts = [left_x + first_gap_end]
    col_starts.extend(left_x + gap_end_rel for _, gap_end_rel in interior_gaps)

    # --- Step 8: Build ColumnGeometry objects ---
    # Each column ends where the next one starts.  The rightmost column always
    # extends to the full image width: the page margin is white space, so this
    # is safe and keeps text near the right border from being cut off.
    col_ends = col_starts[1:] + [img_w]

    geometries = []
    for i, (start_x, end_x) in enumerate(zip(col_starts, col_ends)):
        col_width = end_x - start_x
        col_left_rel = start_x - left_x
        col_words = _words_with_left_in(word_dicts, col_left_rel, col_left_rel + col_width)
        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    col_start_counts = [(g.x, g.word_count) for g in geometries]
    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
                f"{col_start_counts}")
    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    # --- Step 9: Filter phantom narrow columns ---
    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
    # columns with almost no words.  Remove them and let the column on their
    # left absorb the freed space.
    min_real_col_w = max(20, int(content_w * 0.03))
    filtered_geoms = [g for g in geometries
                      if not (g.word_count < 3 and g.width < min_real_col_w)]
    if len(filtered_geoms) < len(geometries):
        n_removed = len(geometries) - len(filtered_geoms)
        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
                    f"(width < {min_real_col_w}px and words < 3)")
        new_ends = [g.x for g in filtered_geoms[1:]] + [img_w]
        for i, (g, end_x) in enumerate(zip(filtered_geoms, new_ends)):
            g.width = end_x - g.x
            g.index = i
            col_left_rel = g.x - left_x
            g.words = _words_with_left_in(word_dicts, col_left_rel, col_left_rel + g.width)
            g.word_count = len(g.words)
        geometries = filtered_geoms
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
@@ -0,0 +1,479 @@
"""
Document type detection, image preparation, content bounds, and header/footer detection.
Extracted from cv_layout.py — these are the "input-side" helpers that run before
column/row geometry analysis.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
from typing import List, Optional, Tuple
import numpy as np
from cv_vocab_types import (
DocumentTypeResult,
PageRegion,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
# =============================================================================
# Document Type Detection
# =============================================================================
def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult:
    """Classify a page as vocab table, generic table, or full text.

    The verdict is derived only from projection profiles and ink density of
    the binarized image; no OCR is run.

    Args:
        ocr_img: Binarized grayscale image; pixels below 128 count as ink.
        img_bgr: BGR color image. Accepted for interface compatibility; it
            is not read by this function.

    Returns:
        DocumentTypeResult with doc_type, confidence, pipeline, skip_steps
        and the measured features.
    """
    if ocr_img is None or ocr_img.size == 0:
        return DocumentTypeResult(
            doc_type='full_text', confidence=0.5, pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features={'error': 'empty image'},
        )

    h, w = ocr_img.shape[:2]
    ink = ocr_img < 128  # dark pixels on a light background

    def _box_smooth(profile, width):
        """Smooth *profile* with a box filter of odd width (at least 3)."""
        width = max(3, width)
        if width % 2 == 0:
            width += 1
        return np.convolve(profile, np.ones(width) / width, mode='same')

    def _closed_gaps(profile, length, limit, min_len):
        """Return (start, end, size) of low-density runs of at least *min_len*.

        Only runs that are terminated by a sample at or above *limit* are
        reported; a run that is still open at the end of the profile is
        dropped.
        """
        found = []
        run_start = None
        for pos in range(length):
            if profile[pos] < limit:
                if run_start is None:
                    run_start = pos
            elif run_start is not None:
                size = pos - run_start
                if size >= min_len:
                    found.append((run_start, pos, size))
                run_start = None
        return found

    # --- 1. Vertical projection profile: valleys are candidate column gaps ---
    vert_smooth = _box_smooth(np.sum(ink, axis=0).astype(float), w // 100)
    # A gap is at least 1% of the width (min 5 px) below 5% of peak density.
    gap_threshold = max(vert_smooth.max(), 1) * 0.05
    vert_gaps = _closed_gaps(vert_smooth, w, gap_threshold, max(5, w // 100))
    gap_count = len(vert_gaps)

    # Gaps within 10% of either image edge are page margins, not column gaps.
    margin_threshold = w * 0.10
    internal_gap_count = sum(
        1 for start, end, _size in vert_gaps
        if start > margin_threshold and end < w - margin_threshold
    )

    # --- 2. Horizontal projection profile: valleys are row gaps ---
    horiz_smooth = _box_smooth(np.sum(ink, axis=1).astype(float), h // 200)
    h_gap_threshold = max(horiz_smooth.max(), 1) * 0.05
    row_gap_count = len(
        _closed_gaps(horiz_smooth, h, h_gap_threshold, max(3, h // 200))
    )

    # --- 3. Ink density over a 4x4 grid of cells ---
    grid_rows, grid_cols = 4, 4
    cell_h, cell_w = h // grid_rows, w // grid_cols
    densities = []
    for gr in range(grid_rows):
        for gc in range(grid_cols):
            cell = ink[gr * cell_h:(gr + 1) * cell_h,
                       gc * cell_w:(gc + 1) * cell_w]
            if cell.size > 0:
                densities.append(float(np.count_nonzero(cell)) / cell.size)
    density_std = float(np.std(densities)) if densities else 0
    density_mean = float(np.mean(densities)) if densities else 0

    features = {
        'vertical_gaps': gap_count,
        'internal_vertical_gaps': internal_gap_count,
        'vertical_gap_details': list(vert_gaps[:10]),
        'row_gaps': row_gap_count,
        'density_mean': round(density_mean, 4),
        'density_std': round(density_std, 4),
        'image_size': (w, h),
    }

    def _verdict(doc_type, confidence, pipeline, skip_steps):
        """Bundle a decision together with the measured features."""
        return DocumentTypeResult(
            doc_type=doc_type,
            confidence=confidence,
            pipeline=pipeline,
            skip_steps=skip_steps,
            features=features,
        )

    # --- 4. Decision tree (margin gaps are excluded from the column count) ---
    if internal_gap_count >= 2 and row_gap_count >= 5:
        # Several internal column gaps plus many row gaps: vocabulary table.
        confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005)
        return _verdict('vocab_table', round(confidence, 2), 'cell_first', [])

    if internal_gap_count >= 1 and row_gap_count >= 3:
        # Some internal structure: treat as a generic table.
        confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01)
        return _verdict('generic_table', round(confidence, 2), 'cell_first', [])

    if internal_gap_count == 0:
        # No internal column gaps: full text, regardless of density.
        confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15)
        return _verdict('full_text', round(confidence, 2), 'full_page',
                        ['columns', 'rows'])

    # Ambiguous: fall back to vocab_table with a neutral confidence.
    return _verdict('vocab_table', 0.5, 'cell_first', [])
# =============================================================================
# Image Creation (Dual Image Preparation)
# =============================================================================
def create_ocr_image(img: np.ndarray) -> np.ndarray:
    """Build a binarized image intended for Tesseract OCR.

    Pipeline: grayscale -> background normalization -> adaptive threshold
    -> light denoise.

    Args:
        img: BGR image.

    Returns:
        Single-channel binary image (values 0 and 255) produced by
        ``cv2.THRESH_BINARY`` and then median-filtered.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Flatten uneven illumination by dividing by a heavily blurred copy.
    background = cv2.GaussianBlur(gray, (51, 51), 0)
    flattened = cv2.divide(gray, background, scale=255)

    # Gaussian-weighted adaptive threshold: block size 31, constant 10.
    binarized = cv2.adaptiveThreshold(
        flattened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )

    # 3x3 median filter removes isolated speckles.
    return cv2.medianBlur(binarized, 3)
def create_layout_image(img: np.ndarray) -> np.ndarray:
    """Build a contrast-enhanced grayscale image for layout analysis.

    Args:
        img: BGR image.

    Returns:
        Grayscale image after CLAHE (clip limit 2.0, 8x8 tiles).
    """
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equalizer.apply(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
# =============================================================================
# Content Bounds Detection
# =============================================================================
def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
"""Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask."""
out = mask.copy()
n = len(out)
i = 0
while i < n:
if out[i]:
start = i
while i < n and out[i]:
i += 1
if (i - start) < min_width:
out[start:i] = False
else:
i += 1
return out
def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
    """Locate the bounding box of the page content, ignoring margins.

    Row and column ink densities are thresholded at 0.5%, and contiguous
    runs narrower than 1% of the respective image dimension (min 5 px) are
    discarded so that thin scan artefacts at the page edges do not count.

    Returns:
        Tuple of (left_x, right_x, top_y, bottom_y).
    """
    h, w = inv.shape[:2]
    threshold = 0.005

    def _span(mask, size, pad):
        """Return (low, high) around the first/last True entry, padded by *pad*.

        Defaults to (0, size) when nothing is set. The high bound is only
        taken from a hit at index >= 1, matching a backward scan that stops
        before index 0.
        """
        low, high = 0, size
        hits = np.flatnonzero(mask[:size])
        if hits.size:
            low = max(0, int(hits[0]) - pad)
            if hits[-1] >= 1:
                high = min(size, int(hits[-1]) + pad)
        return low, high

    # --- Rows: horizontal projection gives top/bottom ---
    row_density = np.sum(inv, axis=1).astype(float) / (w * 255)
    row_mask = _filter_narrow_runs(row_density > threshold, max(5, h // 100))
    top_y, bottom_y = _span(row_mask, h, 5)

    # --- Columns: vertical projection inside the row band gives left/right ---
    band_h = bottom_y - top_y
    col_density = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
    if band_h > 0:
        col_density = col_density / (band_h * 255)
    col_mask = _filter_narrow_runs(col_density > threshold, max(5, w // 100))
    left_x, right_x = _span(col_mask, w, 2)

    return left_x, right_x, top_y, bottom_y
# =============================================================================
# Header / Footer Detection
# =============================================================================
def _detect_header_footer_gaps(
inv: np.ndarray,
img_w: int,
img_h: int,
) -> Tuple[Optional[int], Optional[int]]:
"""Detect header/footer boundaries via horizontal projection gap analysis.
Scans the full-page inverted image for large horizontal gaps in the top/bottom
20% that separate header/footer content from the main body.
Returns:
(header_y, footer_y) absolute y-coordinates.
header_y = bottom edge of header region (None if no header detected).
footer_y = top edge of footer region (None if no footer detected).
"""
HEADER_FOOTER_ZONE = 0.20
GAP_MULTIPLIER = 2.0
# Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
actual_h = min(inv.shape[0], img_h)
roi = inv[:actual_h, :]
h_proj = np.sum(roi, axis=1).astype(float)
proj_w = roi.shape[1]
h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj
# Step 2: Smoothing
kernel_size = max(3, actual_h // 200)
if kernel_size % 2 == 0:
kernel_size += 1
h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
# Step 3: Gap threshold
positive = h_smooth[h_smooth > 0]
median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
gap_threshold = max(median_density * 0.15, 0.003)
in_gap = h_smooth < gap_threshold
MIN_GAP_HEIGHT = max(3, actual_h // 500)
# Step 4: Collect contiguous gaps
raw_gaps: List[Tuple[int, int]] = []
gap_start: Optional[int] = None
for y in range(len(in_gap)):
if in_gap[y]:
if gap_start is None:
gap_start = y
else:
if gap_start is not None:
gap_height = y - gap_start
if gap_height >= MIN_GAP_HEIGHT:
raw_gaps.append((gap_start, y))
gap_start = None
if gap_start is not None:
gap_height = len(in_gap) - gap_start
if gap_height >= MIN_GAP_HEIGHT:
raw_gaps.append((gap_start, len(in_gap)))
if not raw_gaps:
return None, None
# Step 5: Compute median gap size and large-gap threshold
gap_sizes = [g[1] - g[0] for g in raw_gaps]
median_gap = float(np.median(gap_sizes))
large_gap_threshold = median_gap * GAP_MULTIPLIER
# Step 6: Find largest qualifying gap in header / footer zones
# A separator gap must have content on BOTH sides — edge-touching gaps
# (e.g. dewarp padding at bottom) are not valid separators.
EDGE_MARGIN = max(5, actual_h // 400)
header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
header_y: Optional[int] = None
footer_y: Optional[int] = None
best_header_size = 0
for gs, ge in raw_gaps:
if gs <= EDGE_MARGIN:
continue # skip gaps touching the top edge
gap_mid = (gs + ge) / 2
gap_size = ge - gs
if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
if gap_size > best_header_size:
best_header_size = gap_size
header_y = ge # bottom edge of gap
best_footer_size = 0
for gs, ge in raw_gaps:
if ge >= actual_h - EDGE_MARGIN:
continue # skip gaps touching the bottom edge
gap_mid = (gs + ge) / 2
gap_size = ge - gs
if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
if gap_size > best_footer_size:
best_footer_size = gap_size
footer_y = gs # top edge of gap
if header_y is not None:
logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
if footer_y is not None:
logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")
return header_y, footer_y
def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
min_density: float = 0.005) -> bool:
"""Check whether a horizontal strip contains meaningful ink.
Args:
inv: Inverted binarized image (white-on-black).
y_start: Top of the region (inclusive).
y_end: Bottom of the region (exclusive).
min_density: Fraction of white pixels required to count as content.
Returns:
True if the region contains text/graphics, False if empty margin.
"""
if y_start >= y_end:
return False
strip = inv[y_start:y_end, :]
density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
return density > min_density
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                       img_w: int, img_h: int,
                       inv: Optional[np.ndarray] = None) -> None:
    """Append top and bottom page regions to *regions* in place.

    When *inv* is given, boundaries come from gap-based header/footer
    detection; otherwise (or when no gap is found) the content bounds
    *top_y* / *bottom_y* are used. A boundary closer than 10 px to the
    page edge is ignored.

    The region type reflects whether the strip holds ink:
    - 'header' / 'footer': the strip contains content
    - 'margin_top' / 'margin_bottom': the strip is empty, or *inv* was not
      provided so content could not be checked
    """
    header_y: Optional[int] = None
    footer_y: Optional[int] = None
    if inv is not None:
        header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)

    # --- Top region ---
    if header_y is not None and header_y > 10:
        top_boundary = header_y
    elif top_y > 10:
        top_boundary = top_y
    else:
        top_boundary = None

    if top_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
        rtype = 'header' if has_content else 'margin_top'
        regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
                    f"(has_content={has_content})")

    # --- Bottom region ---
    if footer_y is not None and footer_y < img_h - 10:
        bottom_boundary = footer_y
    elif bottom_y < img_h - 10:
        bottom_boundary = bottom_y
    else:
        bottom_boundary = None

    if bottom_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
        rtype = 'footer' if has_content else 'margin_bottom'
        regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
                                  height=img_h - bottom_boundary))
        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
                    f"height={img_h - bottom_boundary}px (has_content={has_content})")
@@ -0,0 +1,274 @@
"""
Layout analysis for OCR vocabulary pages — orchestration and re-exports.
This module provides the high-level entry points for layout analysis and
re-exports all functions from sub-modules for backward compatibility.
Sub-modules:
- cv_layout_detection: Document type detection, image creation, content bounds, header/footer
- cv_layout_analyze: Legacy projection-based layout analysis
- cv_layout_columns: Core column geometry detection
- cv_layout_column_refine: Sub-column, broad-column, expand operations
- cv_layout_rows: Row geometry detection
- cv_layout_row_regularize: Row grid regularization
- cv_layout_scoring: Language/role scoring, dictionary signals
- cv_layout_classify: Column type classification (Phase B)
- cv_layout_classify_position: Position-based classification fallbacks
"""
import logging
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import ColumnGeometry, DetectedBox, PageRegion
logger = logging.getLogger(__name__)
# ── Re-exports (backward compatibility) ───────────────────────────────────
from cv_layout_detection import ( # noqa: F401
detect_document_type,
create_ocr_image,
create_layout_image,
_filter_narrow_runs,
_find_content_bounds,
_detect_header_footer_gaps,
_region_has_content,
_add_header_footer,
)
from cv_layout_analyze import ( # noqa: F401
analyze_layout,
)
from cv_layout_columns import ( # noqa: F401
detect_column_geometry,
_detect_columns_by_clustering,
_build_geometries_from_starts,
)
from cv_layout_column_refine import ( # noqa: F401
_detect_sub_columns,
_split_broad_columns,
expand_narrow_columns,
)
from cv_layout_rows import ( # noqa: F401
detect_row_geometry,
_build_rows_from_word_grouping,
)
from cv_layout_row_regularize import ( # noqa: F401
_regularize_row_grid,
)
from cv_layout_scoring import ( # noqa: F401
_score_language,
_score_role,
_score_dictionary_signals,
_classify_dictionary_columns,
)
from cv_layout_classify import ( # noqa: F401
_build_margin_regions,
positional_column_regions,
classify_column_types,
_classify_by_content,
)
from cv_layout_classify_position import ( # noqa: F401
_classify_by_position_enhanced,
_classify_by_position_fallback,
)
# ── Orchestration Functions ───────────────────────────────────────────────
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect page columns from word positions and turn them into regions.

    Column geometry comes from detect_column_geometry(); the columns are
    then refined (sub-column detection, broad-column splitting) and mapped
    to regions by positional_column_regions().

    If geometry detection returns None, the projection-based
    analyze_layout() is used instead.
    """
    page_h, page_w = ocr_img.shape[:2]

    detection = detect_column_geometry(ocr_img, dewarped_bgr)
    if detection is None:
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        return analyze_layout(create_layout_image(dewarped_bgr), ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = detection
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    # Header/footer boundaries are only available with the inverted image.
    if _inv is not None:
        header_y, footer_y = _detect_header_footer_gaps(_inv, page_w, page_h)
    else:
        header_y, footer_y = None, None

    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)
    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)

    regions = positional_column_regions(geometries, content_w, content_h, left_x)

    col_count = sum(
        1 for r in regions
        if r.type.startswith('column') or r.type == 'page_ref'
    )
    methods = {r.classification_method for r in regions if r.classification_method}
    summary = [
        (r.type, r.x, r.width, r.classification_confidence)
        for r in regions
        if r.type not in ('header', 'footer', 'margin_top', 'margin_bottom')
    ]
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{summary}")
    return regions
def detect_column_geometry_zoned(
    ocr_img: np.ndarray,
    dewarped_bgr: np.ndarray,
) -> Optional[Tuple[
    List[ColumnGeometry],
    int, int, int, int,
    List[Dict],
    np.ndarray,
    List[Dict],
    List[DetectedBox],
]]:
    """Column geometry detection that ignores boxed areas of the page.

    1. Runs detect_column_geometry() on the full page (also yields the
       content bounds).
    2. Runs box detection inside the content area.
    3. Without boxes, or without a content zone at least 40 px tall, the
       full-page result is returned with a single content zone.
    4. Otherwise the content zones are stacked into one image, columns are
       detected on it, and their y-extents are mapped back to page
       coordinates. Words whose center lies inside a box are left out of
       the per-column word lists.

    Returns:
        None if the full-page detection fails, else the tuple
        (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv,
        zones_data, boxes).
    """
    from cv_box_detect import detect_boxes, split_page_into_zones

    page_result = detect_column_geometry(ocr_img, dewarped_bgr)
    if page_result is None:
        return None
    geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = page_result
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    boxes = detect_boxes(dewarped_bgr, left_x, content_w, top_y, content_h)

    def _single_zone_result():
        """Full-page result with the whole content area as the only zone."""
        whole_page = {
            "index": 0, "zone_type": "content",
            "y": top_y, "height": content_h,
            "x": left_x, "width": content_w, "columns": [],
        }
        return (geometries, left_x, right_x, top_y, bottom_y,
                word_dicts, inv, [whole_page], boxes)

    if not boxes:
        return _single_zone_result()

    zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes)

    # (y_start, y_end) of every content zone tall enough to analyse.
    content_strips: List[Tuple[int, int]] = [
        (zone.y, zone.y + zone.height)
        for zone in zones
        if zone.zone_type == 'content' and zone.height >= 40
    ]
    if not content_strips:
        logger.info("ZonedColumns: no content zones with height >= 40, using original result")
        return _single_zone_result()

    combined_ocr = np.vstack([ocr_img[ys:ye, :] for ys, ye in content_strips])
    combined_bgr = np.vstack([dewarped_bgr[ys:ye, :] for ys, ye in content_strips])
    logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} "
                f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}")

    combined_result = detect_column_geometry(combined_ocr, combined_bgr)
    if combined_result is None:
        logger.info("ZonedColumns: combined image column detection failed, using original")
        combined_geoms = geometries
    else:
        combined_geoms = combined_result[0]

        # (start in combined image, strip height, start on the page) per strip.
        strip_offsets: List[Tuple[int, int, int]] = []
        stacked_y = 0
        for ys, ye in content_strips:
            strip_h = ye - ys
            strip_offsets.append((stacked_y, strip_h, ys))
            stacked_y += strip_h

        def _combined_y_to_abs(cy: int) -> int:
            """Map a y in the stacked image back to a page y-coordinate."""
            for c_start, s_h, abs_start in strip_offsets:
                if cy < c_start + s_h:
                    return abs_start + (cy - c_start)
            # At or past the end of the stack: bottom of the last strip.
            _last_c, last_h, last_abs = strip_offsets[-1]
            return last_abs + last_h

        for geom in combined_geoms:
            abs_top = _combined_y_to_abs(geom.y)
            abs_bottom = _combined_y_to_abs(geom.y + geom.height)
            geom.y = abs_top
            geom.height = abs_bottom - abs_top

    if word_dicts:
        def _center_in_box(word: Dict) -> bool:
            """True if the word's center (page coordinates) is inside a box."""
            cx = word['left'] + left_x + word['width'] / 2
            cy = word['top'] + top_y + word['height'] / 2
            return any(
                box.x <= cx <= box.x + box.width
                and box.y <= cy <= box.y + box.height
                for box in boxes
            )

        content_words = [word for word in word_dicts if not _center_in_box(word)]

        # combined_geoms is the full-page list when the combined run failed,
        # so this always enriches the geometries that get returned.
        for geom in combined_geoms:
            rel_left = geom.x - left_x
            rel_right = rel_left + geom.width
            geom.words = [
                word for word in content_words
                if rel_left <= word['left'] + word['width'] / 2 < rel_right
            ]
            geom.word_count = len(geom.words)

        excluded_count = len(word_dicts) - len(content_words)
        if excluded_count:
            logger.info(
                "ZonedColumns: enriched geometries with %d content words "
                "(excluded %d box-interior words)",
                len(content_words), excluded_count,
            )

    zones_data: List[Dict] = []
    for zone in zones:
        entry: Dict = {
            "index": zone.index,
            "zone_type": zone.zone_type,
            "y": zone.y,
            "height": zone.height,
            "x": zone.x,
            "width": zone.width,
            "columns": [],
        }
        if zone.box is not None:
            entry["box"] = {
                "x": zone.box.x, "y": zone.box.y,
                "width": zone.box.width, "height": zone.box.height,
                "confidence": zone.box.confidence,
                "border_thickness": zone.box.border_thickness,
            }
        zones_data.append(entry)

    all_geometries = combined_geoms or geometries
    logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), "
                f"{len(all_geometries)} total columns (combined-image approach)")
    return (all_geometries, left_x, right_x, top_y, bottom_y,
            word_dicts, inv, zones_data, boxes)
@@ -0,0 +1,329 @@
"""
Row grid regularization for document layout analysis.
Provides word-center-based row boundary refinement to improve
gap-based row detection. Extracted from cv_layout_rows.py.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
from typing import Dict, List
import numpy as np
from cv_vocab_types import RowGeometry
logger = logging.getLogger(__name__)
def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    Step A: Group all content words into line clusters by Y-proximity.
            Tolerance = 40% of median gap-based row height (at least 10 px).
    Step B: For each cluster compute:
            - center_y = median of (word_top + word_height/2) for all words
            - letter_h = median of word heights (excluding outliers > 2x median)
    Step B2: Merge clusters whose centers are closer than 30% of row height
             (at least 8 px) -- spurious splits from OCR jitter.
    Step C: Compute pitches (distances between consecutive centers).
            Detect section breaks where gap > 1.8x median pitch.
    Step D: Split clusters into sections at the section breaks.
    Step E: Within each section, place row boundaries at midpoints between
            consecutive line centers:
            - First row top = center - local_pitch/2
            - Last row bottom = center + local_pitch/2
            - Interior boundaries = (center_i + center_{i+1}) / 2
            This ensures rows tile seamlessly without gaps or overlaps.
    Step F: Re-assign words to the nearest grid row by vertical center distance.
    Step G: Validate that >= 85% of words land in a grid row; otherwise
            fall back to the original gap-based rows.
    Step H: Merge with preserved header/footer rows and re-index.

    Guard: Requires >= 5 content rows from gap-based detection to activate.
    This prevents the regularizer from running on very small images (e.g.
    box sub-sessions with only 3-6 rows) where the gap-based detection
    is already accurate enough.

    Header/footer rows from the gap-based detection are preserved.

    Args:
        rows: Gap-based rows; those with row_type 'content' are rebuilt,
            all others are carried over.
        word_dicts: Not referenced in the body; the words are taken from
            the content rows instead.
        left_x: X position assigned to every rebuilt row.
        right_x: Not referenced in the body.
        top_y: Offset added to word 'top' values to obtain absolute y
            (word coordinates are treated as relative to the content area).
        content_w: Width assigned to every rebuilt row.
        content_h: Content height; multi-line section rows are clamped to
            [top_y, top_y + content_h].
        inv: Not referenced in the body.

    Returns:
        The non-content rows plus the rebuilt content rows, sorted by y and
        re-indexed; or the unchanged *rows* whenever a guard or the
        validation step rejects the regularized grid.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']
    if len(content_rows) < 5:
        return rows

    # --- Step A: Group ALL words into line clusters ---
    # Collect words that belong to content rows (deduplicated by bbox, since
    # the same word may be listed in more than one row)
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
        for w in r.words:
            key = (w['left'], w['top'], w['width'], w['height'])
            if key not in seen_keys:
                seen_keys.add(key)
                content_words.append(w)
    if len(content_words) < 5:
        return rows

    # Median word height (upper median of the sorted heights); used below as
    # the reference for excluding outliers like tall brackets/IPA
    word_heights = sorted(w['height'] for w in content_words)
    median_wh = word_heights[len(word_heights) // 2]

    # Compute median gap-based row height — this is the actual line height
    # as detected by the horizontal projection. We use 40% of this as
    # grouping tolerance. This is much more reliable than using word height
    # alone, because words on the same line can have very different heights
    # (e.g. lowercase vs uppercase, brackets, phonetic symbols).
    gap_row_heights = sorted(r.height for r in content_rows)
    median_row_h = gap_row_heights[len(gap_row_heights) // 2]

    # Tolerance: 40% of row height. Words on the same line should have
    # centers within this range. Even if a word's bbox is taller/shorter,
    # its center should stay within half a row height of the line center.
    y_tol = max(10, int(median_row_h * 0.4))

    # Sort by center_y, then group by proximity. Each word is compared with
    # the center of the FIRST word of the current cluster (not a running mean).
    words_by_center = sorted(content_words,
                             key=lambda w: (w['top'] + w['height'] / 2, w['left']))
    line_clusters: List[List[Dict]] = []
    current_line: List[Dict] = [words_by_center[0]]
    current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2
    for w in words_by_center[1:]:
        w_center = w['top'] + w['height'] / 2
        if abs(w_center - current_center) <= y_tol:
            current_line.append(w)
        else:
            current_line.sort(key=lambda w: w['left'])
            line_clusters.append(current_line)
            current_line = [w]
            current_center = w_center
    if current_line:
        current_line.sort(key=lambda w: w['left'])
        line_clusters.append(current_line)
    if len(line_clusters) < 3:
        return rows

    # --- Step B: Compute center_y per cluster ---
    # center_y = median of (word_top + word_height/2) across all words in cluster
    # letter_h = median of word heights, but excluding outlier-height words
    # (>2× median) so that tall brackets/IPA don't skew the height
    cluster_info: List[Dict] = []
    for cl_words in line_clusters:
        centers = [w['top'] + w['height'] / 2 for w in cl_words]
        # Filter outlier heights for letter_h computation
        normal_heights = [w['height'] for w in cl_words
                          if w['height'] <= median_wh * 2.0]
        if not normal_heights:
            # Every word is an outlier: fall back to all heights
            normal_heights = [w['height'] for w in cl_words]
        center_y = float(np.median(centers))
        letter_h = float(np.median(normal_heights))
        cluster_info.append({
            'center_y_rel': center_y,  # relative to content ROI
            'center_y_abs': center_y + top_y,  # absolute
            'letter_h': letter_h,
            'words': cl_words,
        })
    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    # Even with center-based grouping, some edge cases can produce
    # spurious clusters. Merge any pair whose centers are closer
    # than 30% of the row height (they're definitely the same text line).
    merge_threshold = max(8, median_row_h * 0.3)
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            # Merge: combine words, recompute center (updates prev in place)
            combined_words = prev['words'] + cl['words']
            centers = [w['top'] + w['height'] / 2 for w in combined_words]
            normal_heights = [w['height'] for w in combined_words
                              if w['height'] <= median_wh * 2.0]
            if not normal_heights:
                normal_heights = [w['height'] for w in combined_words]
            prev['center_y_rel'] = float(np.median(centers))
            prev['center_y_abs'] = prev['center_y_rel'] + top_y
            prev['letter_h'] = float(np.median(normal_heights))
            prev['words'] = combined_words
        else:
            merged.append(cl)
    cluster_info = merged
    if len(cluster_info) < 3:
        return rows

    # --- Step C: Compute pitches and detect section breaks ---
    pitches: List[float] = []
    for i in range(1, len(cluster_info)):
        pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        pitches.append(pitch)
    if not pitches:
        return rows
    median_pitch = float(np.median(pitches))
    if median_pitch <= 5:
        # Lines this close together are not a usable grid
        return rows

    # A section break is where the gap between line centers is much larger
    # than the normal pitch (sub-headings, section titles, etc.)
    BREAK_FACTOR = 1.8

    # --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
    sections: List[List[Dict]] = []
    current_section: List[Dict] = [cluster_info[0]]
    for i in range(1, len(cluster_info)):
        gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        if gap > median_pitch * BREAK_FACTOR:
            sections.append(current_section)
            current_section = [cluster_info[i]]
        else:
            current_section.append(cluster_info[i])
    if current_section:
        sections.append(current_section)

    # --- Step E: Build row boundaries per section ---
    # Rows are created with index=0; final indices are assigned in Step H.
    grid_rows: List[RowGeometry] = []
    for section in sections:
        if not section:
            continue
        if len(section) == 1:
            # Single-line section (likely a heading): extend half_h above and
            # below the center. Unlike the multi-line case, no clamping to
            # the content bounds is applied here.
            cl = section[0]
            half_h = max(cl['letter_h'], median_pitch * 0.4)
            row_top = cl['center_y_abs'] - half_h
            row_bot = cl['center_y_abs'] + half_h
            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))
            continue

        # Compute local pitch for this section
        local_pitches = []
        for i in range(1, len(section)):
            local_pitches.append(
                section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
            )
        local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch

        # Row boundaries are placed at midpoints between consecutive centers.
        # First row: top = center - local_pitch/2
        # Last row: bottom = center + local_pitch/2
        for i, cl in enumerate(section):
            if i == 0:
                row_top = cl['center_y_abs'] - local_pitch / 2
            else:
                # Midpoint between this center and previous center
                prev_center = section[i - 1]['center_y_abs']
                row_top = (prev_center + cl['center_y_abs']) / 2
            if i == len(section) - 1:
                row_bot = cl['center_y_abs'] + local_pitch / 2
            else:
                next_center = section[i + 1]['center_y_abs']
                row_bot = (cl['center_y_abs'] + next_center) / 2

            # Clamp to the content area
            row_top = max(top_y, row_top)
            row_bot = min(top_y + content_h, row_bot)
            if row_bot - row_top < 5:
                # Row collapsed to under 5 px after clamping: drop it
                continue
            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))
    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    # Words may have shifted slightly; assign each word to the row whose
    # center is closest to the word's vertical center.
    for gr in grid_rows:
        gr.words = []
    for w in content_words:
        w_center = w['top'] + top_y + w['height'] / 2
        best_row = None
        best_dist = float('inf')
        for gr in grid_rows:
            row_center = gr.y + gr.height / 2
            dist = abs(w_center - row_center)
            if dist < best_dist:
                best_dist = dist
                best_row = gr
        # Words farther than one median pitch from every row stay unassigned
        if best_row is not None and best_dist < median_pitch:
            best_row.words.append(w)
    for gr in grid_rows:
        gr.word_count = len(gr.words)

    # --- Step G: Validate ---
    words_placed = sum(gr.word_count for gr in grid_rows)
    if len(content_words) > 0:
        match_ratio = words_placed / len(content_words)
        if match_ratio < 0.85:
            logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                        f"of words, keeping gap-based rows")
            return rows

    # Remove empty grid rows (no words assigned)
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge header/footer + re-index ---
    # Note: the preserved non-content row objects get their index rewritten.
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i
    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights) if row_heights else 0
    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")
    return result
+352
View File
@@ -0,0 +1,352 @@
"""
Row geometry detection for document layout analysis.
Provides horizontal whitespace-gap analysis to detect text rows,
word-center grid regularization, and fallback word-grouping.
Extracted from cv_layout.py.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Dict, List
import numpy as np
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
from cv_vocab_types import RowGeometry
from cv_ocr_word_assembly import _group_words_into_lines
from cv_layout_row_regularize import _regularize_row_grid
logger = logging.getLogger(__name__)
# =============================================================================
# Row Geometry Detection (horizontal whitespace-gap analysis)
# =============================================================================
def detect_row_geometry(
    inv: np.ndarray,
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
) -> List['RowGeometry']:
    """Detect row geometry using horizontal whitespace-gap analysis.

    Phase 1 - gap-based detection (Steps 1-6):
      1. Build a horizontal projection profile: for each y-pixel, sum the
         ink across the content width. Only pixels inside (vertically
         padded) word bounding boxes contribute, so images/illustrations
         do not merge adjacent text rows.
      2. Smooth the profile and mark y-positions below a threshold as
         "gap". The threshold is 15% of the median non-zero density,
         with a floor of 0.003.
      3. Collect contiguous gap runs of at least MIN_GAP_HEIGHT pixels.
      4. Validate gaps against word boxes: a gap overlapping a word is
         shrunk to end above the topmost overlapping word, or to start
         below the lowest one; if neither leaves MIN_GAP_HEIGHT, it is
         discarded.
      5. Header/footer: the largest gap whose midpoint lies in the
         top/bottom 15% of the content area and whose size exceeds
         2x the median gap marks the header/footer boundary.
      6. Rows are the spans between consecutive validated gaps.

    Phase 2 - word-center regularization (Step 7):
      The gap-based rows are passed to ``_regularize_row_grid`` which
      returns the final row list.

    Fallback:
      With fewer than 2 validated gaps the result comes from
      ``_build_rows_from_word_grouping``.

    Args:
        inv: Inverted binarized image (full page); indexed as
            ``inv[top_y:bottom_y, left_x:right_x]``.
        word_dicts: Word boxes with 'left', 'top', 'width', 'height'
            keys, relative to the content area.
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects; empty if the content area is
        smaller than 10 px in either dimension.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y
    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255
    # NumPy instead of cv2.bitwise_and: cv2 is an optional import in this
    # module, and the element-wise AND of two uint8 arrays is identical.
    masked_strip = np.bitwise_and(content_strip, word_mask)
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255)

    # --- Step 2: Smoothing + gap threshold ---
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep the box filter symmetric
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
    median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)
    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = _find_gap_runs(in_gap, MIN_GAP_HEIGHT)
    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    validated_gaps = _validate_gaps_against_words(
        raw_gaps, word_dicts, content_h, MIN_GAP_HEIGHT,
    )
    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )
    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    HEADER_FOOTER_ZONE = 0.15
    GAP_MULTIPLIER = 2.0
    gap_sizes = [ge - gs for gs, ge in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER
    header_boundary_rel = None  # rows whose midpoint is above this are header
    footer_boundary_rel = None  # rows whose midpoint is below this are footer
    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    best_header_gap = _largest_gap_where(
        validated_gaps, large_gap_threshold, lambda mid: mid < header_zone_limit,
    )
    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    best_footer_gap = _largest_gap_where(
        validated_gaps, large_gap_threshold, lambda mid: mid > footer_zone_start,
    )
    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    row_boundaries = []  # (start_y_rel, end_y_rel)
    first_gap_start = validated_gaps[0][0]
    if first_gap_start > MIN_GAP_HEIGHT:
        row_boundaries.append((0, first_gap_start))
    for (_, prev_end), (next_start, _) in zip(validated_gaps, validated_gaps[1:]):
        if next_start - prev_end > 0:
            row_boundaries.append((prev_end, next_start))
    last_gap_end = validated_gaps[-1][1]
    if last_gap_end < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((last_gap_end, content_h))

    # Size of the gap that ends exactly where a row starts (first match wins).
    gap_size_by_end: Dict[int, int] = {}
    for gs, ge in validated_gaps:
        gap_size_by_end.setdefault(ge, ge - gs)

    # Vertical centers are reused for every row, so compute them once.
    word_centers = [(w['top'] + w['height'] / 2, w) for w in word_dicts]

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # A word belongs to the row that contains its vertical center.
        row_words = [w for center, w in word_centers
                     if row_start_rel <= center < row_end_rel]

        if idx == 0:
            # NOTE(review): for the first row this stores the START offset of
            # the first gap, not the size of a preceding gap. Kept as-is to
            # preserve behaviour - confirm the intent with downstream users.
            gap_before = first_gap_start if first_gap_start > 0 else 0
        else:
            gap_before = gap_size_by_end.get(row_start_rel, 0)

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts: Dict[str, int] = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")
    return rows


def _find_gap_runs(in_gap, min_height: int) -> List[tuple]:
    """Return (start, end) spans of consecutive truthy entries >= min_height long."""
    runs = []
    run_start = None
    for y, is_gap in enumerate(in_gap):
        if is_gap:
            if run_start is None:
                run_start = y
        elif run_start is not None:
            if y - run_start >= min_height:
                runs.append((run_start, y))
            run_start = None
    # A run that reaches the bottom edge is closed at len(in_gap).
    if run_start is not None and len(in_gap) - run_start >= min_height:
        runs.append((run_start, len(in_gap)))
    return runs


def _validate_gaps_against_words(
    raw_gaps: List[tuple],
    word_dicts: List[Dict],
    content_h: int,
    min_gap_height: int,
) -> List[tuple]:
    """Keep, shrink or drop each gap so that no word box overlaps it vertically."""
    validated = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        overlapping = False
        min_word_top = content_h
        max_word_bottom = 0
        # Single pass: detect overlap and track the extent of overlapping words.
        for wd in word_dicts:
            word_top = wd['top']
            word_bottom = word_top + wd['height']
            if word_top < gap_end_rel and word_bottom > gap_start_rel:
                overlapping = True
                min_word_top = min(min_word_top, word_top)
                max_word_bottom = max(max_word_bottom, word_bottom)
        if not overlapping:
            validated.append((gap_start_rel, gap_end_rel))
        elif min_word_top - gap_start_rel >= min_gap_height:
            # Enough room above the topmost overlapping word.
            validated.append((gap_start_rel, min_word_top))
        elif gap_end_rel - max_word_bottom >= min_gap_height:
            # Enough room below the lowest overlapping word.
            validated.append((max_word_bottom, gap_end_rel))
        else:
            logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                         f"discarded (word overlap, no room to shift)")
    return validated


def _largest_gap_where(gaps: List[tuple], min_size: float, midpoint_ok):
    """Return the first largest gap that is > min_size and whose midpoint passes midpoint_ok, else None."""
    candidates = [
        (gs, ge) for gs, ge in gaps
        if midpoint_ok((gs + ge) / 2) and ge - gs > min_size
    ]
    return max(candidates, key=lambda g: g[1] - g[0], default=None)
def _build_rows_from_word_grouping(
word_dicts: List[Dict],
left_x: int, right_x: int,
top_y: int, bottom_y: int,
content_w: int, content_h: int,
) -> List['RowGeometry']:
"""Fallback: build rows by grouping words by Y position.
Uses _group_words_into_lines() with a generous tolerance.
No header/footer detection in fallback mode.
"""
if not word_dicts:
return []
y_tolerance = max(20, content_h // 100)
lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance)
rows = []
for idx, line_words in enumerate(lines):
if not line_words:
continue
min_top = min(w['top'] for w in line_words)
max_bottom = max(w['top'] + w['height'] for w in line_words)
row_height = max_bottom - min_top
rows.append(RowGeometry(
index=idx,
x=left_x,
y=top_y + min_top,
width=content_w,
height=row_height,
word_count=len(line_words),
words=line_words,
row_type='content',
gap_before=0,
))
logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
return rows
@@ -0,0 +1,441 @@
"""
Language scoring, role scoring, and dictionary detection/classification.
Extracted from cv_layout.py to keep modules under 500 LOC.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from collections import Counter
from typing import Any, Dict, List, Optional
from cv_vocab_types import (
ColumnGeometry,
ENGLISH_FUNCTION_WORDS,
GERMAN_FUNCTION_WORDS,
PageRegion,
)
logger = logging.getLogger(__name__)
# --- Dictionary / Wörterbuch Detection ---
# Article words that appear as a dedicated column in dictionaries.
# Entries are lowercase; the dictionary-signal scoring below compares them
# against stripped, lowercased OCR tokens.
_DICT_ARTICLE_WORDS = {
    # German articles
    "die", "der", "das", "dem", "den", "des", "ein", "eine", "einem", "einer",
    # English articles / infinitive marker
    "the", "a", "an", "to",
}
# --- Phase B: Content-Based Classification ---
def _score_language(words: List[Dict]) -> Dict[str, float]:
"""Score the language of a column's words.
Analyzes function words, umlauts, and capitalization patterns
to determine whether text is English or German.
Args:
words: List of word dicts with 'text' and 'conf' keys.
Returns:
Dict with 'eng' and 'deu' scores (0.0-1.0).
"""
if not words:
return {'eng': 0.0, 'deu': 0.0}
# Only consider words with decent confidence
good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0]
if not good_words:
return {'eng': 0.0, 'deu': 0.0}
total = len(good_words)
en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS)
de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS)
# Check for umlauts (strong German signal)
raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40]
umlaut_count = sum(1 for t in raw_texts
for c in t if c in 'äöüÄÖÜß')
# German capitalization: nouns are capitalized mid-sentence
# Count words that start with uppercase but aren't at position 0
cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2)
en_score = en_hits / total if total > 0 else 0.0
de_score = de_hits / total if total > 0 else 0.0
# Boost German score for umlauts
if umlaut_count > 0:
de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))
# Boost German score for high capitalization ratio (typical for German nouns)
if total > 5:
cap_ratio = cap_words / total
if cap_ratio > 0.3:
de_score = min(1.0, de_score + 0.1)
return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}
def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
"""Score the role of a column based on its geometry and content patterns.
Args:
geom: ColumnGeometry with words and dimensions.
Returns:
Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'.
"""
scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0}
if not geom.words:
return scores
texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40]
if not texts:
return scores
avg_word_len = sum(len(t) for t in texts) / len(texts)
has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,'))
digit_words = sum(1 for t in texts if any(c.isdigit() for c in t))
digit_ratio = digit_words / len(texts) if texts else 0.0
# Reference: narrow + mostly numbers/page references
if geom.width_ratio < 0.12:
scores['reference'] = 0.5
if digit_ratio > 0.4:
scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5)
# Marker: narrow + few short entries
if geom.width_ratio < 0.06 and geom.word_count <= 15:
scores['marker'] = 0.7
if avg_word_len < 4:
scores['marker'] = 0.9
# Very narrow non-edge column → strong marker regardless of word count
if geom.width_ratio < 0.04 and geom.index > 0:
scores['marker'] = max(scores['marker'], 0.9)
# Sentence: longer words + punctuation present
if geom.width_ratio > 0.15 and has_punctuation > 2:
scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts))
if avg_word_len > 4:
scores['sentence'] = min(1.0, scores['sentence'] + 0.2)
# Vocabulary: medium width + medium word length
if 0.10 < geom.width_ratio < 0.45:
scores['vocabulary'] = 0.4
if 3 < avg_word_len < 8:
scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3)
return {k: round(v, 3) for k, v in scores.items()}
def _score_dictionary_signals(
geometries: List[ColumnGeometry],
document_category: Optional[str] = None,
margin_strip_detected: bool = False,
) -> Dict[str, Any]:
"""Score dictionary-specific patterns across all columns.
Combines 4 independent signals to determine if the page is a dictionary:
1. Alphabetical ordering of words in each column
2. Article column detection (der/die/das, to)
3. First-letter uniformity (most headwords share a letter)
4. Decorative A-Z margin strip (detected upstream)
Args:
geometries: List of ColumnGeometry with words.
document_category: User-selected category (e.g. 'woerterbuch').
margin_strip_detected: Whether a decorative A-Z margin strip was found.
Returns:
Dict with 'is_dictionary', 'confidence', 'article_col_index',
'headword_col_index', and 'signals' sub-dict.
"""
result: Dict[str, Any] = {
"is_dictionary": False,
"confidence": 0.0,
"article_col_index": None,
"headword_col_index": None,
"signals": {},
}
if not geometries or len(geometries) < 2:
return result
# --- Signal 1: Alphabetical ordering per column (weight 0.35) ---
best_alpha_score = 0.0
best_alpha_col = -1
for geom in geometries:
texts = [
w["text"].strip().lower()
for w in sorted(geom.words, key=lambda w: w.get("top", 0))
if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2
]
if len(texts) < 5:
continue
# Deduplicate consecutive identical words (OCR double-reads)
deduped = [texts[0]]
for t in texts[1:]:
if t != deduped[-1]:
deduped.append(t)
if len(deduped) < 5:
continue
# Count consecutive pairs in alphabetical order
ordered_pairs = sum(
1 for i in range(len(deduped) - 1)
if deduped[i] <= deduped[i + 1]
)
alpha_score = ordered_pairs / (len(deduped) - 1)
if alpha_score > best_alpha_score:
best_alpha_score = alpha_score
best_alpha_col = geom.index
result["signals"]["alphabetical_score"] = round(best_alpha_score, 3)
result["signals"]["alphabetical_col"] = best_alpha_col
# --- Signal 2: Article detection (weight 0.25) ---
# Check three patterns:
# (a) Dedicated narrow article column (der/die/das only)
# (b) Inline articles: multi-word texts starting with "der X", "die X"
# (c) High article word frequency: many individual words ARE articles
# (common when OCR splits "der Zustand" into separate word_boxes)
best_article_density = 0.0
best_article_col = -1
best_inline_article_ratio = 0.0
best_article_word_ratio = 0.0
for geom in geometries:
texts = [
w["text"].strip().lower()
for w in geom.words
if w.get("conf", 0) > 30 and len(w["text"].strip()) > 0
]
if len(texts) < 3:
continue
# (a) Dedicated article column: narrow, mostly article words
article_count = sum(1 for t in texts if t in _DICT_ARTICLE_WORDS)
if geom.width_ratio <= 0.20:
density = article_count / len(texts)
if density > best_article_density:
best_article_density = density
best_article_col = geom.index
# (b) Inline articles: "der Zustand", "die Zutat", etc.
inline_count = sum(
1 for t in texts
if any(t.startswith(art + " ") for art in _DICT_ARTICLE_WORDS)
)
inline_ratio = inline_count / len(texts)
if inline_ratio > best_inline_article_ratio:
best_inline_article_ratio = inline_ratio
# (c) Article word frequency in any column (for OCR-split word_boxes)
# In dictionaries, articles appear frequently among headwords
# Require at least 10% articles and >= 3 article words
if article_count >= 3:
art_ratio = article_count / len(texts)
# Only count if column has enough non-article words too
# (pure article column is handled by (a))
non_art = len(texts) - article_count
if non_art >= 3 and art_ratio > best_article_word_ratio:
best_article_word_ratio = art_ratio
# Use the strongest signal
effective_article_score = max(
best_article_density,
best_inline_article_ratio,
best_article_word_ratio * 0.8, # slight discount for raw word ratio
)
result["signals"]["article_density"] = round(best_article_density, 3)
result["signals"]["inline_article_ratio"] = round(best_inline_article_ratio, 3)
result["signals"]["article_word_ratio"] = round(best_article_word_ratio, 3)
result["signals"]["article_col"] = best_article_col
# --- Signal 3: First-letter uniformity (weight 0.25) ---
best_uniformity = 0.0
best_uniform_col = -1
has_letter_transition = False
for geom in geometries:
texts = [
w["text"].strip().lower()
for w in sorted(geom.words, key=lambda w: w.get("top", 0))
if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2
]
if len(texts) < 5:
continue
# Count first letters
first_letters = [t[0] for t in texts if t[0].isalpha()]
if not first_letters:
continue
letter_counts = Counter(first_letters)
most_common_letter, most_common_count = letter_counts.most_common(1)[0]
uniformity = most_common_count / len(first_letters)
# Check for orderly letter transitions (A→B or Y→Z)
# Group consecutive words by first letter, check if groups are in order
groups = []
current_letter = first_letters[0]
for fl in first_letters:
if fl != current_letter:
groups.append(current_letter)
current_letter = fl
groups.append(current_letter)
if len(groups) >= 2 and len(groups) <= 5:
# Check if groups are alphabetically ordered
if all(groups[i] <= groups[i + 1] for i in range(len(groups) - 1)):
has_letter_transition = True
# Boost uniformity for orderly transitions
uniformity = max(uniformity, 0.70)
if uniformity > best_uniformity:
best_uniformity = uniformity
best_uniform_col = geom.index
result["signals"]["first_letter_uniformity"] = round(best_uniformity, 3)
result["signals"]["uniform_col"] = best_uniform_col
result["signals"]["has_letter_transition"] = has_letter_transition
# --- Signal 4: Decorative margin strip (weight 0.15) ---
result["signals"]["margin_strip_detected"] = margin_strip_detected
# --- Combine signals ---
s1 = min(best_alpha_score, 1.0) * 0.35
s2 = min(effective_article_score, 1.0) * 0.25
s3 = min(best_uniformity, 1.0) * 0.25
s4 = (1.0 if margin_strip_detected else 0.0) * 0.15
combined = s1 + s2 + s3 + s4
# Boost if user set document_category to 'woerterbuch'
if document_category == "woerterbuch":
combined = min(1.0, combined + 0.20)
result["signals"]["category_boost"] = True
result["confidence"] = round(combined, 3)
# Threshold: combined >= 0.40 to classify as dictionary
# (at least 2 strong signals or 3 moderate ones)
if combined >= 0.40:
result["is_dictionary"] = True
# Identify headword column: best alphabetical OR best uniform
if best_alpha_col >= 0 and best_alpha_score >= 0.60:
result["headword_col_index"] = best_alpha_col
elif best_uniform_col >= 0 and best_uniformity >= 0.50:
result["headword_col_index"] = best_uniform_col
if best_article_col >= 0 and best_article_density >= 0.30:
result["article_col_index"] = best_article_col
# If inline articles are strong but no dedicated column, note it
if best_inline_article_ratio >= 0.30 and result["article_col_index"] is None:
result["signals"]["inline_articles_detected"] = True
logger.info(
"DictionaryDetection: combined=%.3f is_dict=%s signals=%s",
combined, result["is_dictionary"], result["signals"],
)
return result
def _classify_dictionary_columns(
geometries: List[ColumnGeometry],
dict_signals: Dict[str, Any],
lang_scores: List[Dict[str, float]],
content_h: int,
) -> Optional[List[PageRegion]]:
"""Classify columns for a detected dictionary page.
Assigns column_headword, column_article, column_ipa, and
column_de/column_en based on dictionary signals and language scores.
Returns None if classification fails.
"""
if not dict_signals.get("is_dictionary"):
return None
regions: List[PageRegion] = []
assigned = set()
article_idx = dict_signals.get("article_col_index")
headword_idx = dict_signals.get("headword_col_index")
# 1. Assign article column if detected
if article_idx is not None:
for geom in geometries:
if geom.index == article_idx:
regions.append(PageRegion(
type="column_article",
x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=round(
dict_signals["signals"].get("article_density", 0.5), 2),
classification_method="dictionary",
))
assigned.add(geom.index)
break
# 2. Assign headword column
if headword_idx is not None and headword_idx not in assigned:
for geom in geometries:
if geom.index == headword_idx:
regions.append(PageRegion(
type="column_headword",
x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=round(
dict_signals["confidence"], 2),
classification_method="dictionary",
))
assigned.add(geom.index)
break
# 3. Assign remaining columns by language + content
remaining = [g for g in geometries if g.index not in assigned]
for geom in remaining:
ls = lang_scores[geom.index] if geom.index < len(lang_scores) else {"eng": 0, "deu": 0}
# Check if column contains IPA (brackets like [, /, ˈ)
ipa_chars = sum(
1 for w in geom.words
if any(c in (w.get("text") or "") for c in "[]/ˈˌːɪəɒʊæɑɔ")
)
ipa_ratio = ipa_chars / max(len(geom.words), 1)
if ipa_ratio > 0.25:
col_type = "column_ipa"
conf = round(min(1.0, ipa_ratio), 2)
elif ls["deu"] > ls["eng"] and ls["deu"] > 0.05:
col_type = "column_de"
conf = round(ls["deu"], 2)
elif ls["eng"] > ls["deu"] and ls["eng"] > 0.05:
col_type = "column_en"
conf = round(ls["eng"], 2)
else:
# Positional fallback: leftmost unassigned = EN, next = DE
left_unassigned = sorted(
[g for g in remaining if g.index not in assigned],
key=lambda g: g.x,
)
if geom == left_unassigned[0] if left_unassigned else None:
col_type = "column_en"
else:
col_type = "column_de"
conf = 0.4
regions.append(PageRegion(
type=col_type,
x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=conf,
classification_method="dictionary",
))
assigned.add(geom.index)
regions.sort(key=lambda r: r.x)
return regions
+37
View File
@@ -0,0 +1,37 @@
"""
CV-based Document Reconstruction Pipeline for Vocabulary Extraction.
Re-export facade — all logic lives in the sub-modules:
    cv_vocab_types    — data classes, constants, IPA, feature flags
    cv_preprocessing  — image I/O, orientation, deskew, dewarp
    cv_layout         — document type, columns, rows, classification
    cv_ocr_engines    — OCR engines, vocab post-processing, text cleaning
    cv_cell_grid      — cell grid (v2 + legacy), vocab conversion
    cv_review         — LLM/spell review, pipeline orchestration
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
from cv_vocab_types import * # noqa: F401,F403
from cv_preprocessing import * # noqa: F401,F403
from cv_layout import * # noqa: F401,F403
from cv_ocr_engines import * # noqa: F401,F403
from cv_cell_grid import * # noqa: F401,F403
from cv_box_detect import * # noqa: F401,F403
from cv_review import * # noqa: F401,F403
# Private names used by consumers — not covered by wildcard re-exports.
from cv_preprocessing import _apply_shear # noqa: F401
from cv_layout import ( # noqa: F401
_detect_header_footer_gaps,
_detect_sub_columns,
_split_broad_columns,
)
from cv_ocr_engines import ( # noqa: F401
_fix_character_confusion,
_fix_phonetic_brackets,
)
from cv_cell_grid import _cells_to_vocab_entries # noqa: F401
from cv_words_first import build_grid_from_words # noqa: F401
@@ -0,0 +1,437 @@
"""
CV Preprocessing — Deskew: rotation correction via Hough lines, word alignment, and iterative projection.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from collections import defaultdict
from typing import Any, Dict, Tuple
import numpy as np
from cv_vocab_types import (
CV2_AVAILABLE,
TESSERACT_AVAILABLE,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
import pytesseract
from PIL import Image
except ImportError:
pytesseract = None # type: ignore[assignment]
Image = None # type: ignore[assignment,misc]
# =============================================================================
# Deskew via Hough Lines
# =============================================================================
def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
    """Straighten a page using the median tilt of detected Hough line segments.

    Only segments tilted less than 15 degrees are considered, the median
    tilt is limited to +/-5 degrees, and tilts below 0.1 degrees are
    treated as "no rotation".

    Args:
        img: BGR image.

    Returns:
        Tuple of (corrected image, applied angle in degrees). The input
        image and 0.0 are returned when no correction is performed.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, ink = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    segments = cv2.HoughLinesP(ink, 1, np.pi / 180, threshold=100,
                               minLineLength=img.shape[1] // 4, maxLineGap=20)
    if segments is None or len(segments) < 3:
        return img, 0.0

    # Tilt of every segment; steep segments (>= 15 degrees) are ignored.
    tilts = [
        np.degrees(np.arctan2(y2 - y1, x2 - x1))
        for x1, y1, x2, y2 in (segment[0] for segment in segments)
    ]
    near_level = [tilt for tilt in tilts if abs(tilt) < 15]
    if not near_level:
        return img, 0.0

    median_angle = float(np.median(near_level))
    if abs(median_angle) > 5.0:
        # Limit the correction to 5 degrees in either direction.
        median_angle = 5.0 * np.sign(median_angle)
    if abs(median_angle) < 0.1:
        return img, 0.0

    height, width = img.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), median_angle, 1.0)
    corrected = cv2.warpAffine(img, rotation, (width, height),
                               flags=cv2.INTER_LINEAR,
                               borderMode=cv2.BORDER_REPLICATE)
    logger.info(f"Deskew: corrected {median_angle:.2f}\u00b0 rotation")
    return corrected, median_angle
# =============================================================================
# Deskew via Word Alignment
# =============================================================================
def deskew_image_by_word_alignment(
    image_data: bytes,
    lang: str = "eng+deu",
    downscale_factor: float = 0.5,
) -> Tuple[bytes, float]:
    """Correct rotation by fitting a line through the left-most word of each text line.

    A quick Tesseract pass on a downscaled copy yields word boxes. For every
    text line the left-most word gives one (x, center_y) point; points whose
    x lies within 3% of the page width around the median x are fitted with
    a straight line x = f(y), and its slope gives the angle (limited to
    +/-5 degrees).

    Args:
        image_data: Raw encoded image bytes.
        lang: Tesseract language string for the quick pass.
        downscale_factor: Shrink factor for the quick Tesseract pass.

    Returns:
        Tuple of (rotated image as PNG bytes, angle in degrees). The
        original bytes and 0.0 are returned when the required libraries are
        unavailable, decoding/OCR/encoding fails, fewer than 5 lines or
        aligned points exist, or the angle is below 0.05 degrees.
    """
    unchanged = (image_data, 0.0)
    if not (CV2_AVAILABLE and TESSERACT_AVAILABLE):
        return unchanged

    page = cv2.imdecode(np.frombuffer(image_data, dtype=np.uint8), cv2.IMREAD_COLOR)
    if page is None:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return unchanged

    page_h, page_w = page.shape[:2]
    preview_size = (int(page_w * downscale_factor), int(page_h * downscale_factor))
    preview = cv2.resize(page, preview_size, interpolation=cv2.INTER_AREA)
    preview_rgb = Image.fromarray(cv2.cvtColor(preview, cv2.COLOR_BGR2RGB))
    try:
        ocr = pytesseract.image_to_data(
            preview_rgb, lang=lang, config="--psm 6 --oem 3",
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
        return unchanged

    # Group usable word indices by (block, paragraph, line).
    words_by_line: Dict[tuple, list] = defaultdict(list)
    for i, raw_text in enumerate(ocr["text"]):
        token = (raw_text or "").strip()
        confidence = int(ocr["conf"][i])
        if token and confidence >= 20:
            line_key = (ocr["block_num"][i], ocr["par_num"][i], ocr["line_num"][i])
            words_by_line[line_key].append(i)
    if len(words_by_line) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(words_by_line)} lines, skipping")
        return unchanged

    # One point per line: left edge and vertical center of its left-most
    # word, scaled back to full resolution.
    upscale = 1.0 / downscale_factor
    left_edges = []
    center_ys = []
    for members in words_by_line.values():
        first = min(members, key=lambda i: ocr["left"][i])
        left_edges.append(ocr["left"][first] * upscale)
        center_ys.append(ocr["top"][first] * upscale + (ocr["height"][first] * upscale) / 2.0)
    xs = np.array(left_edges)
    ys = np.array(center_ys)

    # Keep only points near the median left edge.
    keep = np.abs(xs - float(np.median(xs))) <= page_w * 0.03
    filtered_xs = xs[keep]
    filtered_ys = ys[keep]
    if len(filtered_xs) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
        return unchanged

    # Fit x as a linear function of y; the slope is the tangent of the tilt.
    slope = np.polyfit(filtered_ys, filtered_xs, 1)[0]
    angle_deg = float(np.degrees(np.arctan(slope)))
    angle_deg = max(-5.0, min(5.0, angle_deg))
    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}\u00b0 from {len(filtered_xs)} points "
                f"(total lines: {len(words_by_line)})")
    if abs(angle_deg) < 0.05:
        return unchanged

    rotation = cv2.getRotationMatrix2D((page_w // 2, page_h // 2), angle_deg, 1.0)
    rotated = cv2.warpAffine(page, rotation, (page_w, page_h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)
    encoded_ok, png_buf = cv2.imencode(".png", rotated)
    if not encoded_ok:
        logger.warning("deskew_by_word_alignment: PNG encoding failed")
        return unchanged
    return png_buf.tobytes(), angle_deg
# =============================================================================
# Projection Gradient Scoring
# =============================================================================
def _projection_gradient_score(profile: np.ndarray) -> float:
"""Score a projection profile by the L2-norm of its first derivative."""
diff = np.diff(profile)
return float(np.sum(diff * diff))
# =============================================================================
# Iterative Deskew (Vertical-Edge Projection)
# =============================================================================
def deskew_image_iterative(
    img: np.ndarray,
    coarse_range: float = 5.0,
    coarse_step: float = 0.1,
    fine_range: float = 0.15,
    fine_step: float = 0.02,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Estimate and correct page skew with a coarse-to-fine angle search.

    A vertical-edge map (absolute horizontal Sobel response) of the page
    centre is rotated by every candidate angle; the angle whose column-sum
    profile has the highest gradient energy wins.  A coarse sweep selects
    the neighbourhood and a fine sweep around the coarse winner refines it.

    Args:
        img: BGR image (full resolution).
        coarse_range: Half-range in degrees of the coarse sweep.
        coarse_step: Step size in degrees of the coarse sweep.
        fine_range: Half-range in degrees around the coarse winner.
        fine_step: Step size in degrees of the fine sweep.

    Returns:
        ``(rotated_bgr, angle_degrees, debug_dict)``.  The input image is
        returned unchanged with angle ``0.0`` when the edge map is empty or
        the winning angle is below 0.05 degrees.  The applied angle is
        clamped to +/-5 degrees independently of ``coarse_range``.
    """
    height, width = img.shape[:2]
    debug: Dict[str, Any] = {}

    # Only the central region (15-85 % of the height, 10-90 % of the width)
    # contributes to the score.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    top, bottom = int(height * 0.15), int(height * 0.85)
    left, right = int(width * 0.10), int(width * 0.90)
    gradient_x = cv2.Sobel(gray[top:bottom, left:right], cv2.CV_64F, 1, 0, ksize=3)

    edges = np.abs(gradient_x)
    peak = edges.max()
    if not peak > 0:
        return img, 0.0, {"error": "no edges detected"}
    # Normalise to the full 8-bit range so the sweep operates on uint8 data.
    edges = (edges / peak * 255).astype(np.uint8)

    crop_h, crop_w = edges.shape[:2]
    pivot = (crop_w // 2, crop_h // 2)
    # Margin cut off after each rotation before the profile is computed.
    trim_y = max(4, int(crop_h * 0.03))
    trim_x = max(4, int(crop_w * 0.03))

    def _score_angle(angle) -> float:
        """Score one candidate angle on the (rotated) edge map."""
        if abs(angle) < 1e-6:
            candidate = edges
        else:
            rotation = cv2.getRotationMatrix2D(pivot, angle, 1.0)
            candidate = cv2.warpAffine(edges, rotation, (crop_w, crop_h),
                                       flags=cv2.INTER_NEAREST,
                                       borderMode=cv2.BORDER_REPLICATE)
        core = candidate[trim_y:-trim_y, trim_x:-trim_x]
        column_profile = np.sum(core, axis=0, dtype=np.float64)
        return _projection_gradient_score(column_profile)

    def _sweep_edges(angles: np.ndarray) -> list:
        """Return ``(angle, score)`` pairs for every candidate angle."""
        return [(float(candidate), _score_angle(candidate)) for candidate in angles]

    def _rounded(pairs: list) -> list:
        """Round sweep results for the debug dictionary."""
        return [(round(a, 2), round(s, 1)) for a, s in pairs]

    # --- coarse sweep ---
    coarse_results = _sweep_edges(
        np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
    )
    best_coarse_angle, best_coarse_score = max(coarse_results, key=lambda pair: pair[1])
    debug["coarse_best_angle"] = round(best_coarse_angle, 2)
    debug["coarse_best_score"] = round(best_coarse_score, 1)
    debug["coarse_scores"] = _rounded(coarse_results)

    # --- fine sweep around the coarse winner ---
    fine_lo = best_coarse_angle - fine_range
    fine_hi = best_coarse_angle + fine_range
    fine_results = _sweep_edges(np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step))
    best_fine_angle, best_fine_score = max(fine_results, key=lambda pair: pair[1])
    debug["fine_best_angle"] = round(best_fine_angle, 2)
    debug["fine_best_score"] = round(best_fine_score, 1)
    debug["fine_scores"] = _rounded(fine_results)

    final_angle = max(-5.0, min(5.0, best_fine_angle))
    logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}\u00b0 fine={best_fine_angle:.2f}\u00b0 -> {final_angle:.2f}\u00b0")

    if abs(final_angle) < 0.05:
        return img, 0.0, debug

    # Apply the winning angle to the full-resolution image.
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), final_angle, 1.0)
    rotated = cv2.warpAffine(img, rotation, (width, height),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)
    return rotated, final_angle, debug
# =============================================================================
# Text-Line Slope Measurement
# =============================================================================
def _measure_textline_slope(img: np.ndarray) -> float:
    """Measure the residual slope of text lines in degrees.

    Tesseract word boxes are grouped by (block, paragraph, line).  For each
    line with at least three words spanning 15 % or more of the image width,
    a least-squares line is fitted through the word centres.  The result is
    the trimmed mean of the per-line angles.

    Returns:
        The mean angle in degrees, or ``0.0`` when Tesseract/OpenCV are not
        available or fewer than three lines qualify.
    """
    import math as _math

    if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
        return 0.0

    _, page_w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(
        Image.fromarray(gray),
        output_type=pytesseract.Output.DICT,
        config="--psm 6",
    )

    # Collect word centres per text line, ignoring short or low-confidence words.
    centres_by_line: Dict[tuple, list] = {}
    for idx, raw_text in enumerate(data["text"]):
        word = (raw_text or "").strip()
        if len(word) < 2 or int(data["conf"][idx]) < 30:
            continue
        line_key = (data["block_num"][idx], data["par_num"][idx], data["line_num"][idx])
        centre_x = data["left"][idx] + data["width"][idx] / 2.0
        centre_y = data["top"][idx] + data["height"][idx] / 2.0
        centres_by_line.setdefault(line_key, []).append((centre_x, centre_y))

    line_angles: list = []
    for centres in centres_by_line.values():
        if len(centres) < 3:
            continue
        centres.sort(key=lambda point: point[0])
        xs = np.array([point[0] for point in centres], dtype=np.float64)
        ys = np.array([point[1] for point in centres], dtype=np.float64)
        # Short lines give unreliable slopes.
        if xs[-1] - xs[0] < page_w * 0.15:
            continue
        design = np.vstack([xs, np.ones_like(xs)]).T
        solution = np.linalg.lstsq(design, ys, rcond=None)
        line_angles.append(_math.degrees(_math.atan(solution[0][0])))

    if len(line_angles) < 3:
        return 0.0

    # Trimmed mean: drop the lowest and highest ~10 % (at least one each).
    line_angles.sort()
    cut = max(1, len(line_angles) // 10)
    kept = line_angles[cut:-cut] if len(line_angles) > 2 * cut else line_angles
    if not kept:
        return 0.0
    return sum(kept) / len(kept)
# =============================================================================
# Two-Pass Deskew
# =============================================================================
def deskew_two_pass(
    img: np.ndarray,
    coarse_range: float = 5.0,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Deskew with an iterative pass followed by residual checks.

    Despite the name, three stages run:

    1. ``deskew_image_iterative`` on a copy of ``img``.
    2. ``deskew_image_by_word_alignment`` on the result; its output is only
       adopted when the reported angle is at least 0.3 degrees.
    3. ``_measure_textline_slope``; a residual of at least 0.3 degrees is
       rotated away.

    Stages 2 and 3 are best-effort: any exception is logged and the stage
    contributes an angle of ``0.0``.

    Args:
        img: BGR image.
        coarse_range: Forwarded to ``deskew_image_iterative``.

    Returns:
        ``(corrected_bgr, total_angle_degrees, debug_dict)`` where the total
        is the sum of the angles applied by the three stages.
    """
    debug: Dict[str, Any] = {}

    # --- Pass 1: iterative projection ---
    corrected, angle1, pass1_debug = deskew_image_iterative(
        img.copy(), coarse_range=coarse_range,
    )
    debug["pass1_angle"] = round(angle1, 3)
    debug["pass1_method"] = "iterative"
    debug["pass1_debug"] = pass1_debug

    # --- Pass 2: word-alignment residual check ---
    angle2 = 0.0
    try:
        encoded_ok, png = cv2.imencode(".png", corrected)
        if encoded_ok:
            realigned_bytes, measured = deskew_image_by_word_alignment(png.tobytes())
            if abs(measured) >= 0.3:
                decoded = cv2.imdecode(
                    np.frombuffer(realigned_bytes, dtype=np.uint8), cv2.IMREAD_COLOR
                )
                if decoded is not None:
                    corrected = decoded
                    angle2 = measured
                    logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 applied "
                                f"(total={angle1 + angle2:.2f}\u00b0)")
            else:
                logger.info(f"deskew_two_pass: pass2 residual={measured:.2f}\u00b0 < 0.3\u00b0 -- skipped")
    except Exception as e:
        logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
        angle2 = 0.0

    # --- Pass 3: Tesseract text-line regression residual check ---
    angle3 = 0.0
    try:
        residual = _measure_textline_slope(corrected)
        debug["pass3_raw"] = round(residual, 3)
        if abs(residual) >= 0.3:
            rows, cols = corrected.shape[:2]
            rotation = cv2.getRotationMatrix2D((cols // 2, rows // 2), residual, 1.0)
            corrected = cv2.warpAffine(
                corrected, rotation, (cols, rows),
                flags=cv2.INTER_LINEAR,
                borderMode=cv2.BORDER_REPLICATE,
            )
            angle3 = residual
            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 applied", residual)
        else:
            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 < 0.3\u00b0 -- skipped", residual)
    except Exception as e:
        logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)

    total_angle = angle1 + angle2 + angle3
    debug.update({
        "pass2_angle": round(angle2, 3),
        "pass2_method": "word_alignment",
        "pass3_angle": round(angle3, 3),
        "pass3_method": "textline_regression",
        "total_angle": round(total_angle, 3),
    })
    logger.info(
        "deskew_two_pass: pass1=%.2f\u00b0 + pass2=%.2f\u00b0 + pass3=%.2f\u00b0 = %.2f\u00b0",
        angle1, angle2, angle3, total_angle,
    )
    return corrected, total_angle, debug
@@ -0,0 +1,474 @@
"""
CV Preprocessing Dewarp -- Vertical shear detection and correction.
Provides four shear detection methods (vertical edge, projection variance,
Hough lines, text-line drift), ensemble combination, quality gating,
and the main dewarp_image() function.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import math
import time
from typing import Any, Dict, List, Tuple
import numpy as np
from cv_vocab_types import (
CV2_AVAILABLE,
TESSERACT_AVAILABLE,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
import pytesseract
from PIL import Image
except ImportError:
pytesseract = None # type: ignore[assignment]
Image = None # type: ignore[assignment,misc]
# =============================================================================
# Shear Detection Methods
# =============================================================================
def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
    """Detect vertical shear by tracking the strongest vertical edge (Method A).

    The image is cut into 20 horizontal strips.  In each strip the column
    with the strongest (smoothed) vertical-edge response inside the left
    40 % of the page is located.  A straight line ``x = a*y + b`` is fitted
    through those positions; its slope gives the shear angle.

    Args:
        img: BGR image.

    Returns:
        Dict with ``method`` (``"vertical_edge"``), ``shear_degrees`` and
        ``confidence``.  Angle and confidence stay ``0.0`` when fewer than
        eight strips produce an edge, fewer than six survive outlier
        removal, or the image is too small to form strips.
    """
    h, w = img.shape[:2]
    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    # A 3x3 Sobel on 8-bit input reaches magnitudes up to 4 * 255.  Saturate
    # before the uint8 cast: a plain astype() does not clamp, so the
    # strongest edges could wrap around to small values.
    abs_sobel = np.clip(np.abs(sobel_x), 0, 255).astype(np.uint8)
    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    num_strips = 20
    strip_h = h // num_strips
    search_w = int(w * 0.4)
    if strip_h == 0 or search_w == 0:
        # Image too small to form strips or a search window.
        return result

    # The Gaussian kernel width must be odd.
    kernel_size = max(3, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1

    edge_positions = []
    for i in range(num_strips):
        y_start = i * strip_h
        y_end = min((i + 1) * strip_h, h)
        projection = np.sum(binary[y_start:y_end, :], axis=0).astype(np.float64)
        left_proj = projection[:search_w]
        if left_proj.max() == 0:
            continue
        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
        x_pos = float(np.argmax(smoothed))
        y_center = (y_start + y_end) / 2.0
        edge_positions.append((y_center, x_pos))

    if len(edge_positions) < 8:
        return result

    ys = np.array([p[0] for p in edge_positions])
    xs = np.array([p[1] for p in edge_positions])

    # Drop strips whose edge lies more than two standard deviations from the
    # median x position (the deviation is floored at one pixel).
    median_x = np.median(xs)
    std_x = max(np.std(xs), 1.0)
    inliers = np.abs(xs - median_x) < 2 * std_x
    ys = ys[inliers]
    xs = xs[inliers]
    if len(ys) < 6:
        return result

    straight_coeffs = np.polyfit(ys, xs, 1)
    slope = straight_coeffs[0]
    residuals = xs - np.polyval(straight_coeffs, ys)
    rmse = float(np.sqrt(np.mean(residuals ** 2)))
    shear_degrees = math.degrees(math.atan(slope))

    # Confidence grows with the number of supporting strips and shrinks with
    # the fit error; the error factor never drops below 0.5.
    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(float(confidence), 2)
    return result
def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear by maximising the variance of row projections (Method B).

    The Otsu-binarised, half-resolution page is sheared by a range of
    candidate angles.  For every angle the row-sum profile is computed and
    the angle with the largest profile variance is selected, first on a
    coarse grid (-3..+3 degrees, 0.5 degree steps) and then on a fine grid
    (+/-0.5 degrees around the coarse winner, 0.05 degree steps).

    Args:
        img: BGR image.

    Returns:
        Dict with ``method`` (``"projection"``), ``shear_degrees`` and
        ``confidence``.  Confidence reflects how far the best variance
        exceeds the mean variance over all tested angles.
    """
    result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}
    h, w = img.shape[:2]

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
    sh, sw = small.shape

    def _profile_variance(angle_deg) -> float:
        """Variance of the row-sum profile after shearing by ``angle_deg``."""
        if abs(angle_deg) < 0.001:
            sheared = small
        else:
            shear_tan = math.tan(math.radians(angle_deg))
            # Horizontal shear about the vertical centre of the image.
            matrix = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
            sheared = cv2.warpAffine(small, matrix, (sw, sh),
                                     flags=cv2.INTER_NEAREST,
                                     borderMode=cv2.BORDER_CONSTANT)
        row_profile = np.sum(sheared, axis=1).astype(float)
        return float(np.var(row_profile))

    def _sweep_variance(angles_list):
        """Return ``(angle, variance)`` pairs for each candidate angle."""
        return [(candidate, _profile_variance(candidate)) for candidate in angles_list]

    coarse_results = _sweep_variance([step * 0.5 for step in range(-6, 7)])
    fine_center = max(coarse_results, key=lambda pair: pair[1])[0]

    fine_results = _sweep_variance([fine_center + step * 0.05 for step in range(-10, 11)])
    best_angle, best_variance = max(fine_results, key=lambda pair: pair[1])

    variances = coarse_results + fine_results
    all_mean = sum(v for _, v in variances) / len(variances)
    if all_mean > 0 and best_variance > all_mean:
        confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
    else:
        confidence = 0.0

    result["shear_degrees"] = round(best_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear from near-horizontal Hough line segments (Method C).

    Line segments are extracted with the probabilistic Hough transform on a
    Canny edge map.  Segments within +/-5 degrees of horizontal vote for the
    page angle, weighted by their length; the length-weighted median angle,
    negated, is reported as the shear.

    Args:
        img: BGR image.

    Returns:
        Dict with ``method`` (``"hough_lines"``), ``shear_degrees`` and
        ``confidence``.  Angle and confidence stay ``0.0`` when fewer than
        three (near-horizontal) segments are found.  Confidence is the share
        of segments within one degree of the median, capped at 0.85.
    """
    result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}
    _, w = img.shape[:2]

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLinesP(
        edges, rho=1, theta=np.pi / 360,
        threshold=int(w * 0.08),
        minLineLength=int(w * 0.15),
        maxLineGap=20,
    )
    if lines is None or len(lines) < 3:
        return result

    horizontal_angles: List[Tuple[float, float]] = []
    for segment in lines:
        # Convert to Python floats so the arithmetic cannot overflow the
        # integer dtype of the Hough output.
        x1, y1, x2, y2 = (float(v) for v in segment[0])
        dx = x2 - x1
        if dx == 0:
            continue  # exactly vertical segment
        dy = y2 - y1
        # atan(dy/dx) is independent of endpoint order.  arctan2(dy, dx)
        # would report ~180 degrees for a horizontal segment stored
        # right-to-left and silently drop it.
        angle = math.degrees(math.atan(dy / dx))
        if abs(angle) <= 5.0:
            horizontal_angles.append((angle, math.hypot(dx, dy)))

    if len(horizontal_angles) < 3:
        return result

    # Length-weighted median of the segment angles.
    angles_arr = np.array([a for a, _ in horizontal_angles])
    weights_arr = np.array([length for _, length in horizontal_angles])
    order = np.argsort(angles_arr)
    s_angles = angles_arr[order]
    cum = np.cumsum(weights_arr[order])
    mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0))
    median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)])

    agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0)
    confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85

    shear_degrees = -median_angle
    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear from the horizontal drift of left-aligned word columns (Method D).

    Tesseract word boxes (half-resolution image, sparse-text mode) are
    clustered by their left edge into columns.  For every column covering at
    least 30 % of the page height a line ``x = a*y + b`` is fitted; the
    median slope over all columns gives the shear angle.

    Args:
        img: BGR image.

    Returns:
        Dict with ``method`` (``"text_lines"``), ``shear_degrees`` and
        ``confidence``.  Angle and confidence stay ``0.0`` when
        pytesseract/PIL are unavailable, OCR fails, fewer than 15 words are
        found, or fewer than two usable columns exist.
    """
    result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}

    # pytesseract and PIL are optional dependencies (both are set to None
    # when their import fails).  Without them this method abstains instead
    # of raising and taking the whole ensemble down.
    if pytesseract is None or Image is None:
        return result

    h, w = img.shape[:2]
    scale = 0.5
    small = cv2.resize(img, (int(w * scale), int(h * scale)),
                       interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    pil_img = Image.fromarray(gray)

    try:
        data = pytesseract.image_to_data(
            pil_img, lang='eng+deu', config='--psm 11 --oem 3',
            output_type=pytesseract.Output.DICT,
        )
    except Exception as exc:
        # Best-effort detector: an OCR failure means "no opinion".
        logger.debug("text_lines: Tesseract failed: %s", exc)
        return result

    # (left_x, centre_y, width) of every usable word, in downscaled pixels.
    words = []
    for i, raw_text in enumerate(data['text']):
        text = (raw_text or "").strip()
        conf = int(data['conf'][i])
        if not text or conf < 20 or len(text) < 2:
            continue
        left_x = float(data['left'][i])
        cy = data['top'][i] + data['height'][i] / 2.0
        word_w = float(data['width'][i])
        words.append((left_x, cy, word_w))

    if len(words) < 15:
        return result

    # Cluster words into columns by left edge.  The tolerance scales with
    # the average word width, and the cluster centre follows its members
    # as a running average.
    avg_w = sum(ww for _, _, ww in words) / len(words)
    x_tol = max(avg_w * 0.4, 8)
    words_by_x = sorted(words, key=lambda item: item[0])

    columns: List[List[Tuple[float, float]]] = []
    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
    cur_x = words_by_x[0][0]
    for lx, cy, _ in words_by_x[1:]:
        if abs(lx - cur_x) <= x_tol:
            cur_col.append((lx, cy))
            cur_x = cur_x * 0.8 + lx * 0.2
        else:
            if len(cur_col) >= 5:
                columns.append(cur_col)
            cur_col = [(lx, cy)]
            cur_x = lx
    if len(cur_col) >= 5:
        columns.append(cur_col)

    if len(columns) < 2:
        return result

    # Slope dx/dy of the left edge for every sufficiently tall column.
    drifts = []
    for col in columns:
        ys = np.array([p[1] for p in col])
        xs = np.array([p[0] for p in col])
        y_range = ys.max() - ys.min()
        if y_range < h * scale * 0.3:
            continue
        coeffs = np.polyfit(ys, xs, 1)
        drifts.append(coeffs[0])

    if len(drifts) < 2:
        return result

    median_drift = float(np.median(drifts))
    shear_degrees = math.degrees(math.atan(median_drift))

    # Confidence: half from the number of columns, half from their agreement.
    drift_std = float(np.std(drifts))
    consistency = max(0.0, 1.0 - drift_std * 50)
    count_factor = min(1.0, len(drifts) / 4.0)
    confidence = count_factor * 0.5 + consistency * 0.5

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
                "shear=%.3f\u00b0, conf=%.2f",
                len(columns), len(drifts), median_drift,
                shear_degrees, confidence)
    return result
# =============================================================================
# Quality Check and Shear Application
# =============================================================================
def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
    """Return True when the corrected image has the sharper row profile.

    Both images are Otsu-binarised (inverted), halved in size and summed
    along each row.  The correction counts as an improvement only when the
    variance of that row profile is strictly larger after the correction.
    """

    def _h_proj_variance(img: np.ndarray) -> float:
        """Variance of the row-sum profile of the binarised, halved image."""
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255,
                                  cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        half_size = (binary.shape[1] // 2, binary.shape[0] // 2)
        small = cv2.resize(binary, half_size, interpolation=cv2.INTER_AREA)
        return float(np.var(np.sum(small, axis=1).astype(float)))

    before = _h_proj_variance(original)
    after = _h_proj_variance(corrected)
    return after > before
def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Shear an image horizontally about its vertical centre.

    Every row is shifted along x by ``tan(shear_degrees) * (y - h / 2)``, so
    the middle row stays in place.  The output has the same size as the
    input; pixels moved in from outside replicate the border.

    Args:
        img: Image to transform.
        shear_degrees: Shear angle in degrees.

    Returns:
        The sheared image.
    """
    rows, cols = img.shape[:2]
    tangent = math.tan(math.radians(shear_degrees))
    matrix = np.float32([
        [1, tangent, -rows / 2.0 * tangent],
        [0, 1, 0],
    ])
    return cv2.warpAffine(img, matrix, (cols, rows),
                          flags=cv2.INTER_LINEAR,
                          borderMode=cv2.BORDER_REPLICATE)
# =============================================================================
# Ensemble Shear Combination
# =============================================================================
def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
"""Combine multiple shear detections into a single weighted estimate (v2)."""
_MIN_CONF = 0.35
_METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
accepted = []
for d in detections:
if d["confidence"] < _MIN_CONF:
continue
boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
effective_conf = d["confidence"] * boost
accepted.append((d["shear_degrees"], effective_conf, d["method"]))
if not accepted:
return 0.0, 0.0, "none"
if len(accepted) == 1:
deg, conf, method = accepted[0]
return deg, min(conf, 1.0), method
total_w = sum(c for _, c, _ in accepted)
w_mean = sum(d * c for d, c, _ in accepted) / total_w
filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
if not filtered:
filtered = accepted
total_w2 = sum(c for _, c, _ in filtered)
final_deg = sum(d * c for d, c, _ in filtered) / total_w2
avg_conf = total_w2 / len(filtered)
spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
agreement_bonus = 0.15 if spread < 0.5 else 0.0
ensemble_conf = min(1.0, avg_conf + agreement_bonus)
methods_str = "+".join(m for _, _, m in filtered)
return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
# =============================================================================
# Main Dewarp Function
# =============================================================================
def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Correct vertical shear after deskew (v2 with quality gate).

    Detection methods:
        A. _detect_shear_angle()         -- vertical edge profile
        B. _detect_shear_by_projection() -- horizontal text-line variance
        C. _detect_shear_by_hough()      -- Hough lines on table borders
        D. _detect_shear_by_text_lines() -- text-line straightness

    No correction is applied when OpenCV is unavailable, when the estimated
    shear is below 0.08 degrees, or when the confidence is below 0.4.
    Corrections of 0.5 degrees or more must also pass
    ``_dewarp_quality_check``; otherwise the original image is returned.

    Args:
        img: BGR image (already deskewed).
        use_ensemble: If False, fall back to single-method behaviour
            (method A only).

    Returns:
        Tuple of ``(corrected_image, dewarp_info)``.  ``dewarp_info`` holds
        ``method``, ``shear_degrees``, ``confidence`` and ``detections``
        (the raw per-method results); ``method`` is ``"none"`` when nothing
        was applied.
    """
    no_correction = {
        "method": "none",
        "shear_degrees": 0.0,
        "confidence": 0.0,
        "detections": [],
    }

    if not CV2_AVAILABLE:
        return img, no_correction

    t0 = time.time()
    if use_ensemble:
        detections = [
            _detect_shear_angle(img),
            _detect_shear_by_projection(img),
            _detect_shear_by_hough(img),
            _detect_shear_by_text_lines(img),
        ]
        shear_deg, confidence, method = _ensemble_shear(detections)
    else:
        single = _detect_shear_angle(img)
        detections = [single]
        shear_deg = single["shear_degrees"]
        confidence = single["confidence"]
        method = single["method"]
    duration = time.time() - t0

    # The log line always has four A-D slots; slots without a detection
    # (single-method mode) are reported as 0.0 / 0.0.
    per_method = [detections[0]["shear_degrees"], detections[0]["confidence"]]
    for slot in range(1, 4):
        if len(detections) > slot:
            per_method.extend((detections[slot]["shear_degrees"], detections[slot]["confidence"]))
        else:
            per_method.extend((0.0, 0.0))
    logger.info(
        "dewarp: ensemble shear=%.3f\u00b0 conf=%.2f method=%s (%.2fs) | "
        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
        shear_deg, confidence, method, duration,
        *per_method,
    )

    _all_detections = [
        {"method": det["method"], "shear_degrees": det["shear_degrees"],
         "confidence": det["confidence"]}
        for det in detections
    ]

    # Too small or too uncertain to act on.
    if abs(shear_deg) < 0.08 or confidence < 0.4:
        no_correction["detections"] = _all_detections
        return img, no_correction

    corrected = _apply_shear(img, -shear_deg)

    # Larger corrections must demonstrably improve the row profile.
    if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
        logger.info("dewarp: quality gate REJECTED correction (%.3f\u00b0) -- "
                    "projection variance did not improve", shear_deg)
        no_correction["detections"] = _all_detections
        return img, no_correction

    return corrected, {
        "method": method,
        "shear_degrees": shear_deg,
        "confidence": confidence,
        "detections": _all_detections,
    }
def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Apply a shear correction with a caller-supplied angle.

    Angles smaller than 0.001 degrees are treated as "no correction" and the
    input image object is returned as is.  Otherwise the image is sheared by
    the negated angle.
    """
    negligible = abs(shear_degrees) < 0.001
    return img if negligible else _apply_shear(img, -shear_degrees)
@@ -0,0 +1,157 @@
"""
Image I/O, orientation detection, deskew, and dewarp for the CV vocabulary pipeline.
Re-export facade -- all logic lives in the sub-modules:
cv_preprocessing_deskew -- Rotation correction (Hough, word-alignment, iterative, two-pass)
cv_preprocessing_dewarp -- Vertical shear detection and correction (4 methods + ensemble)
This file contains the image I/O and orientation detection functions.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Tuple
import numpy as np
from cv_vocab_types import (
CV2_AVAILABLE,
TESSERACT_AVAILABLE,
)
logger = logging.getLogger(__name__)
# Guarded imports
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
import pytesseract
from PIL import Image
except ImportError:
pytesseract = None # type: ignore[assignment]
Image = None # type: ignore[assignment,misc]
# Re-export all deskew functions
from cv_preprocessing_deskew import ( # noqa: F401
deskew_image,
deskew_image_by_word_alignment,
deskew_image_iterative,
deskew_two_pass,
_projection_gradient_score,
_measure_textline_slope,
)
# Re-export all dewarp functions
from cv_preprocessing_dewarp import ( # noqa: F401
_apply_shear,
_detect_shear_angle,
_detect_shear_by_hough,
_detect_shear_by_projection,
_detect_shear_by_text_lines,
_dewarp_quality_check,
_ensemble_shear,
dewarp_image,
dewarp_image_manual,
)
# =============================================================================
# Image I/O
# =============================================================================
def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
    """Render a PDF page to a high-resolution numpy array (BGR).

    Args:
        pdf_data: Raw PDF bytes.
        page_number: 0-indexed page number.
        zoom: Scale factor applied to both axes, relative to PyMuPDF's
            72-DPI base resolution (3.0 gives 216 DPI).

    Returns:
        numpy array in BGR format.

    Raises:
        ValueError: If ``page_number`` is beyond the last page.
    """
    import fitz  # PyMuPDF

    pdf_doc = fitz.open(stream=pdf_data, filetype="pdf")
    try:
        if page_number >= pdf_doc.page_count:
            raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)")

        page = pdf_doc[page_number]
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

        # View the pixmap samples as (height, width, channels).
        img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

        # cvtColor returns a new array, so the result does not depend on
        # the document or pixmap after this point.
        if pix.n == 4:  # RGBA
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
        elif pix.n == 3:  # RGB
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
        else:  # Grayscale
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
    finally:
        # Release the document even when the page check or rendering fails.
        pdf_doc.close()

    return img_bgr
def render_image_high_res(image_data: bytes) -> np.ndarray:
    """Decode encoded image bytes (PNG/JPEG) into a BGR numpy array.

    Args:
        image_data: Raw image bytes.

    Returns:
        numpy array in BGR format.

    Raises:
        ValueError: If OpenCV cannot decode the data.
    """
    decoded = cv2.imdecode(np.frombuffer(image_data, dtype=np.uint8), cv2.IMREAD_COLOR)
    if decoded is None:
        raise ValueError("Could not decode image data")
    return decoded
# =============================================================================
# Orientation Detection (0/90/180/270)
# =============================================================================
def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
    """Detect page orientation via Tesseract OSD and rotate if needed.

    The image is left untouched when pytesseract is unavailable, when OSD
    reports no rotation or an orientation confidence below 1.0, when the
    reported rotation is not 90, 180 or 270, or when OSD raises.

    Returns:
        ``(corrected_image, rotation_degrees)`` -- rotation is 0, 90, 180,
        or 270.
    """
    if pytesseract is None:
        return img_bgr, 0

    try:
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        osd = pytesseract.image_to_osd(Image.fromarray(gray), output_type=pytesseract.Output.DICT)
        rotate = osd.get("rotate", 0)
        confidence = osd.get("orientation_conf", 0.0)
        logger.info(f"OSD: orientation={rotate}\u00b0 confidence={confidence:.1f}")

        if rotate == 0 or confidence < 1.0:
            return img_bgr, 0

        # Map the OSD rotation onto the matching OpenCV rotation code.
        rotation_codes = {
            180: cv2.ROTATE_180,
            90: cv2.ROTATE_90_CLOCKWISE,
            270: cv2.ROTATE_90_COUNTERCLOCKWISE,
        }
        code = rotation_codes.get(rotate)
        # Compare with None explicitly: a valid rotation code may be 0.
        if code is None:
            return img_bgr, 0

        corrected = cv2.rotate(img_bgr, code)
        logger.info(f"OSD: rotated {rotate}\u00b0 to fix orientation")
        return corrected, rotate
    except Exception as e:
        logger.warning(f"OSD orientation detection failed: {e}")
        return img_bgr, 0
+388
View File
@@ -0,0 +1,388 @@
"""
CV Review LLM -- LLM-based OCR correction: prompt building, change detection, streaming.
Handles the LLM review path (REVIEW_ENGINE=llm) and shared utilities like
_entry_needs_review, _is_spurious_change, _diff_batch, and JSON parsing.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import json
import logging
import os
import re
import time
from typing import Dict, List, Tuple
import httpx
logger = logging.getLogger(__name__)
# Base URL of the Ollama HTTP API; override with the OLLAMA_BASE_URL env var.
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
# Default model for the LLM review path; override with OLLAMA_REVIEW_MODEL.
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
# Batch size from OLLAMA_REVIEW_BATCH_SIZE (default 20).  Parsed with int()
# at import time, so a non-numeric value raises ValueError on import.
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)
# Review backend selector, read once at import time from REVIEW_ENGINE.
REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell") # "spell" (default) | "llm"
# Regex: entry contains IPA phonetic brackets like "dance [da:ns]"
# (square brackets enclosing at least one of the listed phonetic characters).
_HAS_PHONETIC_RE = re.compile(r'\[.*?[\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u0254\u0259\u025c\u026a\u028a\u028c\u00e6].*?\]')
# Regex: digit adjacent to a letter -- OCR digit<->letter confusion.
# Matches one of 0, 1, 5, 6, 8 directly after or directly before a letter
# (ASCII letters plus German umlauts and sharp s).
_OCR_DIGIT_IN_WORD_RE = re.compile(r'(?<=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])[01568]|[01568](?=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])')
def _entry_needs_review(entry: Dict) -> bool:
    """Decide whether an entry should be sent for review.

    An entry qualifies when its ``english`` or ``german`` field contains
    non-whitespace text and neither field matches the IPA phonetic-bracket
    pattern.  Missing or ``None`` fields count as empty.
    """
    texts = [entry.get("english", "") or "", entry.get("german", "") or ""]
    if not any(text.strip() for text in texts):
        return False
    return not any(_HAS_PHONETIC_RE.search(text) for text in texts)
def _build_llm_prompt(table_lines: List[Dict]) -> str:
"""Build the LLM correction prompt for a batch of entries."""
return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).
DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.
NUR diese Korrekturen sind erlaubt:
- Ziffer 8 statt B: "8en" -> "Ben", "8uch" -> "Buch", "8all" -> "Ball"
- Ziffer 0 statt O oder o: "L0ndon" -> "London", "0ld" -> "Old"
- Ziffer 1 statt l oder I: "1ong" -> "long", "Ber1in" -> "Berlin"
- Ziffer 5 statt S oder s: "5tadt" -> "Stadt", "5ee" -> "See"
- Ziffer 6 statt G oder g: "6eld" -> "Geld"
- Senkrechter Strich | statt I oder l: "| want" -> "I want", "|ong" -> "long", "he| p" -> "help"
ABSOLUT VERBOTEN -- aendere NIEMALS:
- Woerter die korrekt geschrieben sind -- auch wenn du eine andere Schreibweise kennst
- Uebersetzungen -- du uebersetzt NICHTS, weder EN->DE noch DE->EN
- Korrekte englische Woerter (en-Spalte) -- auch wenn du eine Bedeutung kennst
- Korrekte deutsche Woerter (de-Spalte) -- auch wenn du sie anders sagen wuerdest
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
- Lautschrift in eckigen Klammern [...] -- diese NIEMALS beruehren
- Beispielsaetze in der ex-Spalte -- NIEMALS aendern
Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.
Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).
/no_think
Eingabe:
{json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
def _is_spurious_change(old_val: str, new_val: str) -> bool:
"""Detect LLM changes that are likely wrong and should be discarded.
Only digit<->letter substitutions (0->O, 1->l, 5->S, 6->G, 8->B) are
legitimate OCR corrections. Everything else is rejected.
"""
if not old_val or not new_val:
return False
if old_val.lower() == new_val.lower():
return True
old_words = old_val.split()
new_words = new_val.split()
if abs(len(old_words) - len(new_words)) > 1:
return True
_OCR_CHAR_MAP = {
'0': set('oOgG'),
'1': set('lLiI'),
'5': set('sS'),
'6': set('gG'),
'8': set('bB'),
'|': set('lLiI1'),
'l': set('iI|1'),
}
has_valid_fix = False
if len(old_val) == len(new_val):
for oc, nc in zip(old_val, new_val):
if oc != nc:
if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
has_valid_fix = True
elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
has_valid_fix = True
else:
_OCR_SUSPICIOUS_RE = re.compile(r'[|01568]')
if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
has_valid_fix = True
if not has_valid_fix:
return True
return False
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
"""Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
changes = []
entries_out = []
for i, orig in enumerate(originals):
if i < len(corrected):
c = corrected[i]
entry = dict(orig)
for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
new_val = c.get(key, "").strip()
old_val = (orig.get(field_name, "") or "").strip()
if new_val and new_val != old_val:
if _is_spurious_change(old_val, new_val):
continue
changes.append({
"row_index": orig.get("row_index", i),
"field": field_name,
"old": old_val,
"new": new_val,
})
entry[field_name] = new_val
entry["llm_corrected"] = True
entries_out.append(entry)
else:
entries_out.append(dict(orig))
return changes, entries_out
def _sanitize_for_json(text: str) -> str:
"""Remove or escape control characters that break JSON parsing."""
return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
def _parse_llm_json_array(text: str) -> List[Dict]:
    """Extract the JSON array from a raw LLM response.

    ``<think>...</think>`` sections and markdown code fences are removed,
    control characters are neutralised, and the span from the first ``[`` to
    the last ``]`` is parsed as JSON.

    Returns:
        The parsed value, or an empty list when no bracketed span exists or
        parsing fails (both cases are logged as warnings).
    """
    cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    for fence_pattern in (r'```json\s*', r'```\s*'):
        cleaned = re.sub(fence_pattern, '', cleaned)
    cleaned = _sanitize_for_json(cleaned)

    found = re.search(r'\[.*\]', cleaned, re.DOTALL)
    if not found:
        logger.warning("LLM review: no JSON array found in response (%.200s)", cleaned[:200])
        return []

    candidate = found.group()
    try:
        return json.loads(candidate)
    except (ValueError, json.JSONDecodeError) as e:
        logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, candidate[:200])
        return []
async def llm_review_entries(
    entries: List[Dict],
    model: str = None,
) -> Dict:
    """Correct OCR errors in vocabulary entries.

    With ``REVIEW_ENGINE == "spell"`` and pyspellchecker available the work
    is delegated to ``spell_review_entries_sync``. Otherwise the entries that
    ``_entry_needs_review`` selects are sent in a single request to the
    Ollama chat endpoint and the answer is diffed against the originals.

    Args:
        entries: Vocabulary entries (dicts with english/german/example).
        model: Ollama model name; falls back to ``OLLAMA_REVIEW_MODEL``.

    Returns:
        Dict with ``entries_original``, ``entries_corrected``, ``changes``,
        ``skipped_count``, ``model_used`` and ``duration_ms``.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for a non-success HTTP
        response from the Ollama endpoint.
    """
    from cv_review_spell import spell_review_entries_sync, _SPELL_AVAILABLE

    if REVIEW_ENGINE == "spell":
        if _SPELL_AVAILABLE:
            return spell_review_entries_sync(entries)
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # Keep the original list position so corrections can be merged back.
    selected = [(pos, item) for pos, item in enumerate(entries) if _entry_needs_review(item)]
    if not selected:
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [item for _, item in selected]
    table_lines = []
    for item in review_entries:
        table_lines.append({
            "row": item.get("row_index", 0),
            "en": item.get("english", ""),
            "de": item.get("german", ""),
            "ex": item.get("example", ""),
        })

    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, len(entries) - len(selected))

    prompt = _build_llm_prompt(table_lines)
    request_body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "think": False,
        "options": {"temperature": 0.1, "num_predict": 8192},
    }

    t0 = time.time()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(f"{_OLLAMA_URL}/api/chat", json=request_body)
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - t0) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))

    corrected = _parse_llm_json_array(content)
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Start from copies of every entry, then overlay the reviewed ones.
    all_corrected = [dict(e) for e in entries]
    for (orig_idx, _), fixed in zip(selected, corrected_entries):
        all_corrected[orig_idx] = fixed

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": len(entries) - len(selected),
        "model_used": model,
        "duration_ms": duration_ms,
    }
async def llm_review_entries_streaming(
    entries: List[Dict],
    model: str = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator yielding SSE-style event dicts for an OCR review run.

    Phase 0 (always): run ``_fix_character_confusion`` on ``entries`` (the
    call's return value is ignored, so it is expected to modify the entries
    in place) and emit the resulting differences as a ``batch`` event.

    Afterwards either the spell-checker stream is forwarded
    (``REVIEW_ENGINE == "spell"`` with pyspellchecker installed) or the
    entries needing review are sent to the LLM in batches of ``batch_size``.

    Character-confusion changes carry each entry's own ``row_index`` (with the
    list position as fallback), the same convention the LLM and spell-checker
    change records use.

    Args:
        entries: Vocabulary entries (dicts with english/german/example).
        model: Ollama model name; falls back to ``OLLAMA_REVIEW_MODEL``.
        batch_size: Number of entries per LLM request.

    Yields:
        Event dicts with ``type`` of ``meta``, ``batch`` or ``complete``.
    """
    from cv_ocr_engines import _fix_character_confusion
    from cv_review_spell import spell_review_entries_streaming, _SPELL_AVAILABLE

    _CONF_FIELDS = ('english', 'german', 'example')
    # Snapshot the fields first so the effect of the fixer can be diffed.
    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
    _fix_character_confusion(entries)
    char_changes = [
        {
            # Use the entry's own row index, not its list position.
            'row_index': entries[i].get('row_index', i),
            'field': f,
            'old': originals[i][f],
            'new': entries[i].get(f, ''),
        }
        for i in range(len(entries))
        for f in _CONF_FIELDS
        if originals[i][f] != entries[i].get(f, '')
    ]

    char_event = None
    if char_changes:
        char_event = {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }

    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
            # Emit the character-confusion batch once, right after the meta event.
            if not _meta_sent and event.get('type') == 'meta' and char_event is not None:
                _meta_sent = True
                yield char_event
        return

    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    # LLM path
    if char_event is not None:
        yield char_event

    model = model or OLLAMA_REVIEW_MODEL

    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]
    total_duration_ms = 0
    reviewed_count = 0

    for batch_start in range(0, total_to_review, batch_size):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d -- sending %d entries to %s",
                    batch_start // batch_size, len(batch_entries), model)

        t0 = time.time()
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
        batch_ms = int((time.time() - t0) * 1000)
        total_duration_ms += batch_ms

        corrected = _parse_llm_json_array(content)
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Merge the reviewed entries back at their original list positions.
        for batch_idx, (orig_idx, _) in enumerate(batch_items):
            if batch_idx < len(batch_corrected):
                all_corrected[orig_idx] = batch_corrected[batch_idx]

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        yield {
            "type": "batch",
            "batch_index": batch_start // batch_size,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }
@@ -0,0 +1,430 @@
"""
CV Review Pipeline — Multi-pass OCR, line alignment, LLM post-correction, and orchestration.
Stages 6-8 of the CV vocabulary pipeline plus the main orchestrator.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import time
from typing import Any, Dict, List, Optional
import numpy as np
from cv_vocab_types import (
CV_PIPELINE_AVAILABLE,
PageRegion,
PipelineResult,
VocabRow,
)
from cv_preprocessing import (
deskew_image,
dewarp_image,
render_image_high_res,
render_pdf_high_res,
)
from cv_layout import (
analyze_layout,
create_layout_image,
create_ocr_image,
)
from cv_ocr_engines import (
_group_words_into_lines,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
import pytesseract
from PIL import Image
except ImportError:
pytesseract = None # type: ignore[assignment]
Image = None # type: ignore[assignment,misc]
# =============================================================================
# Stage 6: Multi-Pass OCR
# =============================================================================
def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
               psm: int, fallback_psm: Optional[int] = None,
               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
    """Run Tesseract OCR on a specific region with given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode.
        fallback_psm: If confidence too low, retry with this PSM per line.
        min_confidence: Minimum average confidence before fallback.

    Returns:
        List of word dicts with text, position, confidence. Positions are
        shifted by the region origin, i.e. expressed in ``ocr_img``
        coordinates. Empty list when the crop is empty or Tesseract fails.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    pil_img = Image.fromarray(crop)

    config = f'--psm {psm} --oem 3'
    try:
        data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                         output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"Tesseract failed for region {region.type}: {e}")
        return []

    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        # The confidence column can arrive as int, float or a numeric string
        # (e.g. '96.06') depending on the pytesseract/Tesseract version;
        # int() alone raises on fractional strings.
        conf = int(float(data['conf'][i]))
        if not text or conf < 10:
            continue
        words.append({
            'text': text,
            'left': data['left'][i] + region.x,
            'top': data['top'][i] + region.y,
            'width': data['width'][i],
            'height': data['height'][i],
            'conf': conf,
            'region_type': region.type,
        })

    if words and fallback_psm is not None:
        avg_conf = sum(w['conf'] for w in words) / len(words)
        if avg_conf < min_confidence:
            logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
                        f"trying fallback PSM {fallback_psm}")
            # The line-by-line result replaces the first pass entirely.
            words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)

    return words
def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
                             lang: str, psm: int) -> List[Dict[str, Any]]:
    """OCR a region line by line (fallback for low-confidence regions).

    Text lines are located with a horizontal projection of the inverted
    crop: consecutive rows whose ink sum exceeds 5% of the maximum form a
    line, and lines of 5 rows or fewer are discarded. Each line is cropped
    with 3px vertical padding and passed to Tesseract separately.

    Args:
        ocr_img: Full-page image the region is cropped from.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode used for every line.

    Returns:
        List of word dicts in ``ocr_img`` coordinates. Lines for which
        Tesseract fails are skipped.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    inv = cv2.bitwise_not(crop)
    h_proj = np.sum(inv, axis=1)
    peak = np.max(h_proj)
    threshold = peak * 0.05 if peak > 0 else 0

    # Scan the projection for runs of rows above the threshold.
    lines = []
    in_text = False
    line_start = 0
    for y in range(len(h_proj)):
        if h_proj[y] > threshold and not in_text:
            line_start = y
            in_text = True
        elif h_proj[y] <= threshold and in_text:
            if y - line_start > 5:
                lines.append((line_start, y))
            in_text = False
    if in_text and len(h_proj) - line_start > 5:
        lines.append((line_start, len(h_proj)))

    all_words = []
    config = f'--psm {psm} --oem 3'

    for line_y_start, line_y_end in lines:
        pad = 3
        y1 = max(0, line_y_start - pad)
        y2 = min(crop.shape[0], line_y_end + pad)
        line_crop = crop[y1:y2, :]

        if line_crop.size == 0:
            continue

        pil_img = Image.fromarray(line_crop)
        try:
            data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                             output_type=pytesseract.Output.DICT)
        except Exception as e:
            # Best effort: one failing line must not lose the other lines,
            # but leave a trace instead of swallowing the error silently.
            logger.debug("Tesseract failed for a line of region %s: %s", region.type, e)
            continue

        for i in range(len(data['text'])):
            text = data['text'][i].strip()
            # Confidence may be int, float or a numeric string such as
            # '96.06' depending on the pytesseract/Tesseract version.
            conf = int(float(data['conf'][i]))
            if not text or conf < 10:
                continue
            all_words.append({
                'text': text,
                'left': data['left'][i] + region.x,
                # y1 maps the line crop back into region coordinates.
                'top': data['top'][i] + region.y + y1,
                'width': data['width'][i],
                'height': data['height'][i],
                'conf': conf,
                'region_type': region.type,
            })

    return all_words
def run_multi_pass_ocr(ocr_img: np.ndarray,
                       regions: List[PageRegion],
                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
    """OCR every detected region with settings chosen by region type.

    Header, footer and margin regions are skipped. English and German
    columns use a single language with PSM 4, the example column uses
    ``lang`` with PSM 6 and a per-line PSM 7 fallback, everything else uses
    ``lang`` with PSM 6.

    Returns:
        Mapping of region type to its word dicts. Results are keyed by type
        only, so a later region of the same type replaces an earlier one.
    """
    skipped_types = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    settings_by_type = {
        'column_en': {'lang': 'eng', 'psm': 4},
        'column_de': {'lang': 'deu', 'psm': 4},
        'column_example': {'lang': lang, 'psm': 6, 'fallback_psm': 7, 'min_confidence': 40.0},
    }
    default_settings = {'lang': lang, 'psm': 6}

    results: Dict[str, List[Dict]] = {}
    for region in regions:
        if region.type in skipped_types:
            continue
        settings = settings_by_type.get(region.type, default_settings)
        words = ocr_region(ocr_img, region, **settings)
        results[region.type] = words
        logger.info(f"OCR {region.type}: {len(words)} words")

    return results
# =============================================================================
# Stage 7: Line Alignment -> Vocabulary Entries
# =============================================================================
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    Every English line (2+ characters) becomes one row. The German and
    example line whose vertical centre is closest to the English line, and
    strictly within ``y_tolerance_px``, is attached to it. Example lines
    left unmatched are treated as wrapped continuations and appended to the
    nearest row above them (within three times the tolerance).

    Args:
        ocr_results: Word dicts per region type.
        regions: Detected regions (not used by the current implementation).
        y_tolerance_px: Maximum vertical distance for a line match.

    Returns:
        Vocabulary rows sorted by ``y_position``; empty when neither an
        English nor a German column is present in ``ocr_results``.
    """
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    def line_y_center(line: List[Dict]) -> float:
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    def nearest_line_index(lines: List[List[Dict]], target_y: float) -> int:
        # Index of the closest line within tolerance, or -1; first wins on ties.
        winner = -1
        winner_dist = float('inf')
        for idx, candidate in enumerate(lines):
            dist = abs(line_y_center(candidate) - target_y)
            if dist < y_tolerance_px and dist < winner_dist:
                winner_dist = dist
                winner = idx
        return winner

    vocab_rows: List[VocabRow] = []

    for en_line in en_lines:
        en_y = line_y_center(en_line)
        en_text = line_text(en_line)
        en_conf = line_confidence(en_line)

        if len(en_text.strip()) < 2:
            continue

        de_text, de_conf = "", 0.0
        de_idx = nearest_line_index(de_lines, en_y)
        if de_idx >= 0:
            de_text = line_text(de_lines[de_idx])
            de_conf = line_confidence(de_lines[de_idx])

        ex_text, ex_conf = "", 0.0
        ex_idx = nearest_line_index(ex_lines, en_y)
        if ex_idx >= 0:
            ex_text = line_text(ex_lines[ex_idx])
            ex_conf = line_confidence(ex_lines[ex_idx])

        # Average only over the columns that contributed a confidence.
        scores = [en_conf]
        if de_conf > 0:
            scores.append(de_conf)
        if ex_conf > 0:
            scores.append(ex_conf)

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=sum(scores) / len(scores),
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in example column
    matched_ex_ys = {row.y_position for row in vocab_rows if row.example}

    for ex_line in ex_lines:
        ex_y = line_y_center(ex_line)
        if any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys):
            continue

        # Closest row strictly above this line, within 3x the tolerance.
        owner = None
        owner_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < owner_dist:
                owner_dist = dist
                owner = row

        if owner is None:
            continue
        continuation = line_text(ex_line).strip()
        if continuation:
            owner.example = (owner.example + " " + continuation).strip()

    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows
# =============================================================================
# Stage 8: Optional LLM Post-Correction
# =============================================================================
async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
                           confidence_threshold: float = 50.0,
                           enabled: bool = False) -> List[VocabRow]:
    """Placeholder for LLM-based correction of low-confidence rows.

    No correction is implemented yet: the rows are returned unchanged in
    every case. When ``enabled`` is true a log line records that the step
    was skipped. ``img`` and ``confidence_threshold`` are currently unused.
    """
    if enabled:
        logger.info(f"LLM post-correction skipped (not yet implemented)")
    return vocab_rows
# =============================================================================
# Orchestrator
# =============================================================================
async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Stages: render -> deskew -> (dewarp) -> image preparation -> layout
    analysis -> multi-pass OCR -> line alignment -> (LLM correction). The
    wall-clock duration of each executed stage is stored in
    ``result.stages``.

    Args:
        pdf_data: PDF bytes; takes precedence over ``image_data``.
        image_data: Image bytes, used when ``pdf_data`` is not given.
        page_number: Page passed to the PDF renderer.
        zoom: Zoom factor passed to the PDF renderer.
        enable_dewarp: Run the dewarp stage.
        enable_llm_correction: Run the LLM post-correction stage.
        lang: Tesseract language string for mixed-language regions.

    Returns:
        A ``PipelineResult``. Failures do not raise: the exception text is
        stored in ``result.error`` and whatever was collected so far is
        returned.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    total_start = time.time()

    try:
        # Stage 1: Render
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}\u00b0 in {result.stages['deskew']}s")

        # Stage 3: Dewarp
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)

        # Stage 5: Layout analysis
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)

        # Stage 8: Optional LLM correction
        if enable_llm_correction:
            t = time.time()
            # Forward the flag: llm_post_correct defaults to enabled=False,
            # so without it the stage is timed but always runs disabled.
            vocab_rows = await llm_post_correct(img, vocab_rows, enabled=True)
            result.stages['llm_correction'] = round(time.time() - t, 2)

        # Convert to output format; rows with neither language are dropped.
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german
        ]

        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        # Top-level boundary: report the failure through the result object.
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)

    return result
@@ -0,0 +1,46 @@
"""
Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration.
Re-export facade -- all logic lives in the sub-modules:
cv_review_pipeline Stages 6-8: OCR, line alignment, orchestrator
cv_review_spell Rule-based spell-checker OCR correction
cv_review_llm LLM-based OCR correction, prompt building, streaming
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
# Re-export everything for backward compatibility
from cv_review_pipeline import ( # noqa: F401
ocr_region,
run_multi_pass_ocr,
match_lines_to_vocab,
llm_post_correct,
run_cv_pipeline,
)
from cv_review_spell import ( # noqa: F401
_SPELL_AVAILABLE,
_spell_dict_knows,
_spell_fix_field,
_spell_fix_token,
_try_split_merged_word,
_normalize_page_ref,
spell_review_entries_sync,
spell_review_entries_streaming,
)
from cv_review_llm import ( # noqa: F401
OLLAMA_REVIEW_MODEL,
REVIEW_ENGINE,
_REVIEW_BATCH_SIZE,
_build_llm_prompt,
_diff_batch,
_entry_needs_review,
_is_spurious_change,
_parse_llm_json_array,
_sanitize_for_json,
llm_review_entries,
llm_review_entries_streaming,
)
+315
View File
@@ -0,0 +1,315 @@
"""
CV Review Spell — Rule-based OCR spell correction (no LLM).
Provides dictionary-backed digit-to-letter substitution, umlaut correction,
general spell correction, merged-word splitting, and page-ref normalization.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
import time
from typing import Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# Optional dependency: pyspellchecker. When it is missing the module still
# imports; _SPELL_AVAILABLE is False and both checker handles are None.
try:
    from spellchecker import SpellChecker as _SpellChecker
    # One checker per language, both created with distance=1.
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE)")
except ImportError:
    _SPELL_AVAILABLE = False
    _en_spell = None  # type: ignore[assignment]
    _de_spell = None  # type: ignore[assignment]
    logger.warning("pyspellchecker not installed")
# ---- Page-Ref Normalization ----
# Normalizes OCR variants like "p-60", "p 61", "p60" -> "p.60"
_PAGE_REF_RE = re.compile(r'\bp[\s\-]?(\d+)', re.IGNORECASE)
def _normalize_page_ref(text: str) -> str:
"""Normalize page references: 'p-60' / 'p 61' / 'p60' -> 'p.60'."""
if not text:
return text
return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)
# Suspicious OCR chars -> ordered list of most-likely correct replacements.
# The order matters: candidates are tried left to right and the first list
# entry is also used as the default guess for a leading suspicious character.
_SPELL_SUBS: Dict[str, List[str]] = {
    '0': ['O', 'o'],
    '1': ['l', 'I'],
    '5': ['S', 's'],
    '6': ['G', 'g'],
    '8': ['B', 'b'],
    '|': ['I', 'l', '1'],
}
# Set of the characters above, for fast membership checks.
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())
# Tokenizer: word tokens (letters + pipe) alternating with separators.
# Group 1 is a run of ASCII letters, German umlauts/sharp s and '|';
# group 2 is the (possibly empty) run of all other characters that follows.
# Digits are not part of group 1, so they always land in a separator.
_SPELL_TOKEN_RE = re.compile(r'([A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df|]+)([^A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df|]*)')
def _spell_dict_knows(word: str) -> bool:
    """Tell whether the lower-cased word is in the EN or the DE dictionary.

    Always False when pyspellchecker is not available. The German
    dictionary is only consulted when the English one does not know the word.
    """
    if _SPELL_AVAILABLE:
        lowered = word.lower()
        return any(checker.known([lowered]) for checker in (_en_spell, _de_spell))
    return False
def _try_split_merged_word(token: str) -> Optional[str]:
    """Split a merged word such as 'atmyschool' into 'at my school'.

    Dynamic programming over prefix lengths: for every prefix the best
    segmentation into dictionary words is kept, where "best" means fewest
    words, then the largest sum of squared word lengths; on a complete tie
    the candidate examined later wins. Words are at most 20 characters long
    and the only one-letter words accepted are 'a' and 'i'.

    Returns the space-joined parts, sliced from the original token so its
    capitalisation is kept, or None when the spell checker is unavailable,
    the token is shorter than 4 characters, or no split into at least two
    dictionary words exists.
    """
    if not _SPELL_AVAILABLE or len(token) < 4:
        return None

    lowered = token.lower()
    total = len(lowered)

    # best[k] = (word_lengths, sum_of_squares) for lowered[:k], or None
    best: list = [None] * (total + 1)
    best[0] = ([], 0)

    for end in range(1, total + 1):
        for start in range(max(0, end - 20), end):
            prefix = best[start]
            if prefix is None:
                continue
            size = end - start
            piece = lowered[start:end]
            if size == 1 and piece not in ('a', 'i'):
                continue
            if not _spell_dict_knows(piece):
                continue
            lengths = prefix[0] + [size]
            squares = prefix[1] + size * size
            current = best[end]
            if current is None or (-len(lengths), squares) >= (-len(current[0]), current[1]):
                best[end] = (lengths, squares)

    final = best[total]
    if final is None or len(final[0]) < 2:
        return None

    # Cut the original token at the same offsets to keep its casing.
    pieces = []
    cursor = 0
    for size in final[0]:
        pieces.append(token[cursor:cursor + size])
        cursor += size

    joined = " ".join(pieces)
    logger.debug("Split merged word: %r -> %r", token, joined)
    return joined
def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
    """Return the corrected form of a token, or None if nothing applies.

    *field* is 'english' or 'german' and selects the dictionary for the
    general correction step; the umlaut step only runs for 'german'.

    The steps are tried in this order and the first hit is returned:
    known word (no fix), digit/pipe substitution, umlaut substitution,
    general spell correction, merged-word split.
    """
    suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)

    # 1. Already known word -> no fix needed
    if _spell_dict_knows(token):
        return None

    # 2. Digit/pipe substitution
    if suspicious:
        if token == '|':
            return 'I'
        # Try every single-character replacement that yields a known word.
        for idx, ch in enumerate(token):
            for replacement in _SPELL_SUBS.get(ch, ()):
                attempt = token[:idx] + replacement + token[idx + 1:]
                if _spell_dict_knows(attempt):
                    return attempt
        # Leading suspicious char before a lower-case word: take the first
        # replacement without a dictionary check.
        head = token[0]
        if head in _SPELL_SUBS and len(token) >= 2:
            tail = token[1:]
            if tail.isalpha() and tail.islower():
                attempt = _SPELL_SUBS[head][0] + tail
                if not attempt[0].isdigit():
                    return attempt

    alphabetic = token.isalpha()

    # 3. OCR umlaut confusion
    if len(token) >= 3 and alphabetic and field == "german":
        umlaut_for = {'a': '\u00e4', 'o': '\u00f6', 'u': '\u00fc', 'i': '\u00fc',
                      'A': '\u00c4', 'O': '\u00d6', 'U': '\u00dc', 'I': '\u00dc'}
        for idx, ch in enumerate(token):
            if ch not in umlaut_for:
                continue
            attempt = token[:idx] + umlaut_for[ch] + token[idx + 1:]
            if _spell_dict_knows(attempt):
                return attempt

    # 4. General spell correction for unknown words (no digits/pipes)
    if not suspicious and len(token) >= 3 and alphabetic:
        checker = {"english": _en_spell, "german": _de_spell}.get(field)
        if checker is not None:
            lowered = token.lower()
            suggestion = checker.correction(lowered)
            if suggestion and suggestion != lowered:
                if token[0].isupper():
                    suggestion = suggestion[0].upper() + suggestion[1:]
                if _spell_dict_knows(suggestion):
                    return suggestion

    # 5. Merged-word split
    if len(token) >= 4 and alphabetic:
        parts = _try_split_merged_word(token)
        if parts:
            return parts

    return None
def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
"""Apply OCR corrections to a text field. Returns (fixed_text, was_changed)."""
if not text:
return text, False
has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
if not has_suspicious and not any(c.isalpha() for c in text):
return text, False
# Pattern: | immediately before . or , -> numbered list prefix
fixed = re.sub(r'(?<!\w)\|(?=[.,])', '1', text) if has_suspicious else text
changed = fixed != text
# Tokenize and fix word by word
parts: List[str] = []
pos = 0
for m in _SPELL_TOKEN_RE.finditer(fixed):
token, sep = m.group(1), m.group(2)
correction = _spell_fix_token(token, field=field)
if correction:
parts.append(correction)
changed = True
else:
parts.append(token)
parts.append(sep)
pos = m.end()
if pos < len(fixed):
parts.append(fixed[pos:])
return ''.join(parts), changed
def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Rule-based OCR correction of vocabulary entries (no LLM).

    For every entry the ``source_page`` reference is normalized first. The
    english/german/example fields are then corrected, but only for entries
    that ``_entry_needs_review`` selects. ``SmartSpellChecker`` is used when
    it can be imported and constructed; otherwise the legacy
    ``_spell_fix_field`` path runs, treating the example field as German.

    Returns:
        Dict with ``entries_original``, ``entries_corrected`` (copies, with
        ``llm_corrected`` set on changed entries), ``changes``,
        ``skipped_count`` (always 0), ``model_used`` and ``duration_ms``.
    """
    from cv_review_llm import _entry_needs_review

    started = time.time()
    changes: List[Dict] = []
    all_corrected: List[Dict] = []

    # Use SmartSpellChecker if available
    smart_checker = None
    try:
        from smart_spell import SmartSpellChecker
        smart_checker = SmartSpellChecker()
        logger.debug("spell_review: using SmartSpellChecker")
    except Exception:
        logger.debug("spell_review: SmartSpellChecker not available, using legacy")

    lang_codes = {"english": "en", "german": "de", "example": "auto"}

    def apply_change(target: Dict, position: int, field_name: str, old: str, new: str) -> None:
        # Record the change and write it into the working copy.
        changes.append({
            "row_index": target.get("row_index", position),
            "field": field_name,
            "old": old,
            "new": new,
        })
        target[field_name] = new
        target["llm_corrected"] = True

    for position, source in enumerate(entries):
        working = dict(source)

        # Page-ref normalization
        old_ref = (working.get("source_page") or "").strip()
        if old_ref:
            new_ref = _normalize_page_ref(old_ref)
            if new_ref != old_ref:
                apply_change(working, position, "source_page", old_ref, new_ref)

        if _entry_needs_review(working):
            for field_name in ("english", "german", "example"):
                old_val = (working.get(field_name) or "").strip()
                if not old_val:
                    continue

                if smart_checker:
                    outcome = smart_checker.correct_text(
                        old_val, lang=lang_codes.get(field_name, "en"))
                    new_val = outcome.corrected
                    was_changed = outcome.changed
                else:
                    legacy_lang = "german" if field_name in ("german", "example") else "english"
                    new_val, was_changed = _spell_fix_field(old_val, field=legacy_lang)

                if was_changed and new_val != old_val:
                    apply_change(working, position, field_name, old_val, new_val)

        all_corrected.append(working)

    duration_ms = int((time.time() - started) * 1000)
    model_name = "smart-spell-checker" if smart_checker else "spell-checker"
    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": 0,
        "model_used": model_name,
        "duration_ms": duration_ms,
    }
async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Stream a spell-checker review as three SSE-compatible events.

    Emits ``meta`` first, then runs ``spell_review_entries_sync`` over all
    entries in one go and emits a single ``batch`` followed by ``complete``.
    ``batch_size`` is only echoed in the ``meta`` event.
    """
    entry_count = len(entries)

    yield {
        "type": "meta",
        "total_entries": entry_count,
        "to_review": entry_count,
        "skipped": 0,
        "model": "spell-checker",
        "batch_size": batch_size,
    }

    outcome = spell_review_entries_sync(entries)
    found = outcome["changes"]
    elapsed_ms = outcome["duration_ms"]

    reviewed_rows = []
    for position, entry in enumerate(entries):
        reviewed_rows.append(entry.get("row_index", position))

    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": reviewed_rows,
        "changes": found,
        "duration_ms": elapsed_ms,
        "progress": {"current": entry_count, "total": entry_count},
    }

    yield {
        "type": "complete",
        "changes": found,
        "model_used": "spell-checker",
        "duration_ms": elapsed_ms,
        "total_entries": entry_count,
        "reviewed": entry_count,
        "skipped": 0,
        "corrections_found": len(found),
        "entries_corrected": outcome["entries_corrected"],
    }
+215
View File
@@ -0,0 +1,215 @@
"""
Shared types, constants, and availability guards for the CV vocabulary pipeline.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import json
import logging
import os
import re # noqa: F401 — re-exported for downstream modules
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import numpy as np # noqa: F401
logger = logging.getLogger(__name__)
# --- Availability Guards ---
# Each optional dependency is imported defensively: on failure the module
# name is bound to None and the matching *_AVAILABLE flag is set to False.
try:
    import cv2  # noqa: F401
    CV2_AVAILABLE = True
except ImportError:
    cv2 = None  # type: ignore[assignment]
    CV2_AVAILABLE = False
    logger.warning("OpenCV not available — CV pipeline disabled")

try:
    import pytesseract  # noqa: F401
    from PIL import Image  # noqa: F401
    TESSERACT_AVAILABLE = True
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]
    TESSERACT_AVAILABLE = False
    logger.warning("pytesseract/Pillow not available — CV pipeline disabled")

# The pipeline needs both OpenCV and Tesseract (with Pillow).
CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
# --- IPA Dictionary ---
# IPA_AVAILABLE becomes True as soon as either source below is usable:
# the eng_to_ipa package or the bundled Britfone JSON file.
IPA_AVAILABLE = False
_ipa_convert_american = None
_britfone_dict: Dict[str, str] = {}

try:
    import eng_to_ipa as _eng_to_ipa
    _ipa_convert_american = _eng_to_ipa.convert
    IPA_AVAILABLE = True
    logger.info("eng_to_ipa available — American IPA lookup enabled")
except ImportError:
    logger.info("eng_to_ipa not installed — American IPA disabled")

# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
# from data/britfone_ipa.json next to this module; a missing or unreadable
# file only disables the British lookup.
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
if os.path.exists(_britfone_path):
    try:
        with open(_britfone_path, 'r', encoding='utf-8') as f:
            _britfone_dict = json.load(f)
        IPA_AVAILABLE = True
        logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
    except Exception as e:
        logger.warning(f"Failed to load Britfone: {e}")
else:
    logger.info("Britfone not found — British IPA disabled")

# --- German IPA Dictionary (CC-BY-SA, Wiktionary) ---
# Read from data/de_ipa.tsv: one entry per line, split at the first tab into
# key and value. Lines without a tab are ignored.
DE_IPA_AVAILABLE = False
_de_ipa_dict: Dict[str, str] = {}
_de_ipa_path = os.path.join(os.path.dirname(__file__), 'data', 'de_ipa.tsv')
if os.path.exists(_de_ipa_path):
    try:
        with open(_de_ipa_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.rstrip('\n').split('\t', 1)
                if len(parts) == 2:
                    _de_ipa_dict[parts[0]] = parts[1]
        DE_IPA_AVAILABLE = True
        logger.info(f"German IPA loaded — {len(_de_ipa_dict)} entries (CC-BY-SA, Wiktionary)")
    except Exception as e:
        logger.warning(f"Failed to load German IPA: {e}")
else:
    logger.info("German IPA not found — German IPA disabled")

# --- epitran German fallback (MIT license) ---
# _epitran_de stays None when epitran is missing or fails to initialise.
_epitran_de = None
try:
    import epitran as _epitran_module
    _epitran_de = _epitran_module.Epitran('deu-Latn')
    logger.info("epitran loaded — German rule-based IPA fallback enabled")
except ImportError:
    logger.info("epitran not installed — German IPA fallback disabled")
except Exception as e:
    logger.warning(f"Failed to init epitran: {e}")
# --- Language Detection Constants ---
# Lower-case function words per language. The German set spells umlauts in
# ASCII ('fuer', 'ueber').
GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht',
    'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird',
    'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 'noch', 'aber', 'hat', 'nur',
    'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben',
    'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'}

ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of',
    'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from',
    'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
    'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he',
    'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'}
# --- Data Classes ---
@dataclass
class PageRegion:
    """A detected region on the page.

    The rectangle is given by its top-left corner and size; ``type`` names
    what the region was classified as.
    """
    type: str  # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom', 'column_headword', 'column_article', 'column_ipa'
    x: int  # left edge
    y: int  # top edge
    width: int
    height: int
    classification_confidence: float = 1.0  # 0.0-1.0
    classification_method: str = ""  # 'content', 'position_enhanced', 'position_fallback'
@dataclass
class ColumnGeometry:
    """Geometrically detected column, before type classification."""
    index: int  # 0-based, left to right
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]  # word dicts from Tesseract (text, conf, left, top, ...)
    width_ratio: float  # width / content_width (0.0-1.0)
    is_sub_column: bool = False  # True if created by _detect_sub_columns() split
@dataclass
class RowGeometry:
    """Geometrically detected row with header/footer classification."""
    index: int  # 0-based, top to bottom
    x: int  # absolute left (= content left_x)
    y: int  # absolute y start
    width: int  # content width
    height: int  # row height in px
    word_count: int
    words: List[Dict]
    row_type: str = 'content'  # 'content' | 'header' | 'footer'
    gap_before: int = 0  # gap in px above this row
@dataclass
class VocabRow:
    """A single vocabulary entry assembled from multi-column OCR.

    Every field has a default, so ``VocabRow()`` yields an empty entry.
    """
    english: str = ""
    german: str = ""
    example: str = ""
    source_page: str = ""
    # NOTE(review): the scale (0.0-1.0 vs. 0-100) is not visible here — confirm.
    confidence: float = 0.0
    # NOTE(review): presumably the vertical pixel position of the entry on
    # the page — confirm against the code that fills it.
    y_position: int = 0
@dataclass
class PipelineResult:
    """Complete result of the CV pipeline.

    Mutable fields use ``default_factory`` so instances never share a list
    or dict.
    """
    vocabulary: List[Dict[str, Any]] = field(default_factory=list)
    word_count: int = 0
    columns_detected: int = 0
    duration_seconds: float = 0.0
    # NOTE(review): presumably stage name -> duration in seconds — confirm.
    stages: Dict[str, float] = field(default_factory=dict)
    error: Optional[str] = None  # None by default
    image_width: int = 0
    image_height: int = 0
@dataclass
class DocumentTypeResult:
    """Result of automatic document type detection.

    ``doc_type``, ``confidence`` and ``pipeline`` are required; the two
    container fields default to fresh empty objects.
    """
    doc_type: str  # 'vocab_table' | 'full_text' | 'generic_table'
    confidence: float  # 0.0-1.0
    pipeline: str  # 'cell_first' | 'full_page'
    skip_steps: List[str] = field(default_factory=list)  # e.g. ['columns', 'rows']
    features: Dict[str, Any] = field(default_factory=dict)  # debug info
@dataclass
class DetectedBox:
    """An embedded box (e.g. grammar tip, exercise) detected on the page."""
    x: int  # absolute pixel position
    y: int
    width: int
    height: int
    confidence: float  # 0.0-1.0
    # NOTE(review): presumably the border line thickness in px — confirm.
    border_thickness: int = 0
@dataclass
class PageZone:
    """A horizontal zone of the page — either normal content or a detected box."""
    index: int  # 0-based, top to bottom
    zone_type: str  # 'content' | 'box'
    y: int  # absolute pixel y
    height: int
    x: int
    width: int
    # NOTE(review): presumably set only when zone_type == 'box' — confirm.
    box: Optional[DetectedBox] = None
    columns: List[ColumnGeometry] = field(default_factory=list)
    image_overlays: List[Dict] = field(default_factory=list)
    layout_hint: Optional[str] = None  # 'left_of_vsplit', 'right_of_vsplit'
    vsplit_group: Optional[int] = None  # group ID for side-by-side rendering
+404
View File
@@ -0,0 +1,404 @@
"""
Words-First Grid Builder (Bottom-Up).
Builds a cell grid from Tesseract word_boxes directly, without requiring
pre-detected columns or rows. Algorithm:
1. Cluster words into columns by X-gap analysis
2. Cluster words into rows by Y-proximity
3. Build cells at (column, row) intersections
Returns the same (cells, columns_meta) format as build_cell_grid_v2().
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
import statistics
from typing import Any, Dict, List, Optional, Tuple
from cv_ocr_engines import (
_group_words_into_lines,
_words_to_reading_order_text,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# 1. Column clustering
# ---------------------------------------------------------------------------
def _cluster_columns(
words: List[Dict],
img_w: int,
min_gap_pct: float = 3.0,
max_columns: Optional[int] = None,
) -> List[Dict[str, Any]]:
"""Cluster words into columns by finding large horizontal gaps.
Args:
max_columns: If set, limits the number of columns by merging
the closest adjacent pairs until the count matches.
Prevents phantom columns from degraded OCR.
Returns a list of column dicts:
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
sorted left-to-right.
"""
if not words:
return []
# Sort by X center
sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
# Collect word heights to compute adaptive threshold
heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
median_h = statistics.median(heights) if heights else 30
# Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
# Find X-gap boundaries between consecutive words (sorted by X-center)
# For each word, compute right edge; for next word, compute left edge
# Collect gaps with their sizes for max_columns enforcement
gaps: List[Tuple[float, float]] = [] # (gap_size, split_x)
for i in range(len(sorted_w) - 1):
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
left_edge = sorted_w[i + 1]['left']
gap = left_edge - right_edge
if gap > min_gap_px:
split_x = (right_edge + left_edge) / 2
gaps.append((gap, split_x))
# If max_columns is set, keep only the (max_columns - 1) largest gaps
if max_columns and len(gaps) >= max_columns:
gaps.sort(key=lambda g: g[0], reverse=True)
gaps = gaps[:max_columns - 1]
logger.info(
f"_cluster_columns: limited to {max_columns} columns "
f"(removed {len(gaps) + max_columns - 1 - (max_columns - 1)} smallest gaps)"
)
boundaries = sorted(g[1] for g in gaps)
# Build column ranges from boundaries
col_edges = [0.0] + boundaries + [float(img_w)]
columns = []
for ci in range(len(col_edges) - 1):
columns.append({
'index': ci,
'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
'x_min': col_edges[ci],
'x_max': col_edges[ci + 1],
})
return columns
# ---------------------------------------------------------------------------
# 2. Row clustering
# ---------------------------------------------------------------------------
def _cluster_rows(
words: List[Dict],
) -> List[Dict[str, Any]]:
"""Cluster words into visual rows by Y-proximity.
Uses half the median word height as Y-tolerance.
Returns a list of row dicts:
[{'index': 0, 'y_min': ..., 'y_max': ..., 'y_center': ...}, ...]
sorted top-to-bottom.
"""
if not words:
return []
heights = [w['height'] for w in words if w.get('height', 0) > 0]
median_h = statistics.median(heights) if heights else 20
y_tol = max(median_h * 0.5, 5)
lines = _group_words_into_lines(words, y_tolerance_px=int(y_tol))
rows = []
for ri, line_words in enumerate(lines):
y_min = min(w['top'] for w in line_words)
y_max = max(w['top'] + w['height'] for w in line_words)
rows.append({
'index': ri,
'y_min': y_min,
'y_max': y_max,
'y_center': (y_min + y_max) / 2,
})
return rows
# ---------------------------------------------------------------------------
# 3. Build cells
# ---------------------------------------------------------------------------
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
"""Return column index for a word based on overlap, then center, then nearest.
Three-pass strategy (consistent with _assign_row_words_to_columns):
1. Overlap-based: assign to column with maximum horizontal overlap.
2. Midpoint-range: if no overlap, use midpoints between adjacent columns.
3. Nearest center: last resort fallback.
"""
w_left = word['left']
w_right = w_left + word['width']
w_center = w_left + word['width'] / 2
# Pass 1: overlap-based
best_col = -1
best_overlap = 0
for col in columns:
overlap = max(0, min(w_right, col['x_max']) - max(w_left, col['x_min']))
if overlap > best_overlap:
best_overlap = overlap
best_col = col['index']
if best_col >= 0 and best_overlap > 0:
return best_col
# Pass 2: midpoint-range (non-overlapping assignment zones)
for ci, col in enumerate(columns):
if ci == 0:
assign_left = 0
else:
assign_left = (columns[ci - 1]['x_max'] + col['x_min']) / 2
if ci == len(columns) - 1:
assign_right = float('inf')
else:
assign_right = (col['x_max'] + columns[ci + 1]['x_min']) / 2
if assign_left <= w_center < assign_right:
return col['index']
# Pass 3: nearest column center
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - w_center))['index']
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
"""Return row index for a word based on its Y-center.
When rows overlap (e.g. due to tall border-ghost characters inflating
a row's y_max), prefer the row whose y_center is closest.
"""
y_center = word['top'] + word['height'] / 2
# Find all rows whose y_range contains this word's center
matching = [r for r in rows if r['y_min'] <= y_center <= r['y_max']]
if matching:
return min(matching, key=lambda r: abs(r['y_center'] - y_center))['index']
# Fallback: nearest row by Y-center
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
def _build_cells(
words: List[Dict],
columns: List[Dict],
rows: List[Dict],
img_w: int,
img_h: int,
) -> List[Dict[str, Any]]:
"""Build cell dicts from word assignments to (column, row) pairs."""
if not columns or not rows:
return []
# Bucket words into (col_idx, row_idx)
buckets: Dict[Tuple[int, int], List[Dict]] = {}
for w in words:
ci = _assign_word_to_column(w, columns)
ri = _assign_word_to_row(w, rows)
buckets.setdefault((ci, ri), []).append(w)
cells = []
for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
col = columns[ci]
row = rows[ri]
# Compute tight bbox from actual word positions
x_min = min(w['left'] for w in cell_words)
y_min = min(w['top'] for w in cell_words)
x_max = max(w['left'] + w['width'] for w in cell_words)
y_max = max(w['top'] + w['height'] for w in cell_words)
bw = x_max - x_min
bh = y_max - y_min
# Text from words in reading order
text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))
# Average confidence
confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
avg_conf = sum(confs) / len(confs) if confs else 0.0
# Word boxes with absolute pixel coordinates (consistent with cv_cell_grid.py).
# PaddleOCR returns phrase-level boxes (e.g. "competition [kompa'tifn]"),
# but the overlay slide mechanism expects one box per word. Split multi-word
# boxes into individual word positions proportional to character length.
# Also split at "[" boundaries (IPA patterns like "badge[bxd3]").
#
# Sort in reading order: group by Y (same visual line), then sort by X.
# Simple (top, left) sort fails when words on the same line have slightly
# different top values (1-6px), causing wrong word order.
y_tol_wb = max(10, int(bh * 0.4))
reading_lines = _group_words_into_lines(cell_words, y_tolerance_px=y_tol_wb)
ordered_cell_words = [w for line in reading_lines for w in line]
word_boxes = []
for w in ordered_cell_words:
raw_text = w.get('text', '').strip()
# Split by whitespace, at "[" boundaries (IPA), and after leading "!"
# e.g. "badge[bxd3]" → ["badge", "[bxd3]"]
# e.g. "profit['proft]" → ["profit", "['proft]"]
# e.g. "!Betonung" → ["!", "Betonung"]
tokens = re.split(r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text)
tokens = [t for t in tokens if t] # remove empty strings
if len(tokens) <= 1:
# Single word — keep as-is
word_boxes.append({
'text': raw_text,
'left': w['left'],
'top': w['top'],
'width': w['width'],
'height': w['height'],
'conf': w.get('conf', 0),
})
else:
# Multi-word phrase — split proportionally by character count
total_chars = sum(len(t) for t in tokens)
if total_chars == 0:
continue
# Small gap between words (2% of box width per gap)
n_gaps = len(tokens) - 1
gap_px = w['width'] * 0.02
usable_w = w['width'] - gap_px * n_gaps
cursor = w['left']
for t in tokens:
token_w = max(1, usable_w * len(t) / total_chars)
word_boxes.append({
'text': t,
'left': round(cursor),
'top': w['top'],
'width': round(token_w),
'height': w['height'],
'conf': w.get('conf', 0),
})
cursor += token_w + gap_px
cells.append({
'cell_id': f"R{ri:02d}_C{ci}",
'row_index': ri,
'col_index': ci,
'col_type': col['type'],
'text': text,
'confidence': round(avg_conf, 1),
'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
'bbox_pct': {
'x': round(x_min / img_w * 100, 2) if img_w else 0,
'y': round(y_min / img_h * 100, 2) if img_h else 0,
'w': round(bw / img_w * 100, 2) if img_w else 0,
'h': round(bh / img_h * 100, 2) if img_h else 0,
},
'word_boxes': word_boxes,
'ocr_engine': 'words_first',
'is_bold': False,
})
return cells
# ---------------------------------------------------------------------------
# 4. Public API
# ---------------------------------------------------------------------------
def build_grid_from_words(
    word_dicts: List[Dict],
    img_w: int,
    img_h: int,
    min_confidence: int = 30,
    box_rects: Optional[List[Dict]] = None,
    max_columns: Optional[int] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build a cell grid bottom-up from OCR word boxes.

    Pipeline: confidence/text filter -> optional removal of words inside
    detected boxes -> column clustering -> row clustering -> cell building.

    Args:
        word_dicts: Flat list of word dicts with keys
            text, left, top, width, height, conf (absolute pixel coordinates).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        min_confidence: Minimum OCR confidence to keep a word; words with
            blank text are dropped as well.
        box_rects: Optional list of box dicts with keys x, y, width, height.
            Words whose center lies inside any of these boxes are left out
            of the grid (box-internal columns are detected separately).
        max_columns: Optional upper bound for the number of columns, passed
            through to ``_cluster_columns``.

    Returns:
        ``(cells, columns_meta)`` — per the module contract the same format
        as build_cell_grid_v2().
        cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
        columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
        Both lists are empty when no word survives the filters.
    """
    if not word_dicts:
        logger.info("build_grid_from_words: no words — returning empty grid")
        return [], []

    # Keep only confident words that carry visible text
    kept = [
        wd for wd in word_dicts
        if wd.get('conf', 0) >= min_confidence and wd.get('text', '').strip()
    ]
    if not kept:
        logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
        return [], []
    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(kept), len(word_dicts))

    # Drop words inside detected boxes — box columns are detected separately
    if box_rects:
        def _center_in_any_box(wd: Dict) -> bool:
            cx = wd['left'] + wd['width'] / 2
            cy = wd['top'] + wd['height'] / 2
            for rect in box_rects:
                if (rect['x'] <= cx <= rect['x'] + rect['width']
                        and rect['y'] <= cy <= rect['y'] + rect['height']):
                    return True
            return False

        outside = [wd for wd in kept if not _center_in_any_box(wd)]
        n_excluded = len(kept) - len(outside)
        if n_excluded:
            logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
                        n_excluded, len(box_rects))
        kept = outside
        if not kept:
            logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
            return [], []

    # Step 1: cluster columns
    columns = _cluster_columns(kept, img_w, max_columns=max_columns)
    logger.info("build_grid_from_words: %d column(s) detected%s",
                len(columns), f" (max={max_columns})" if max_columns else "")

    # Step 2: cluster rows
    rows = _cluster_rows(kept)
    logger.info("build_grid_from_words: %d row(s) detected", len(rows))

    # Step 3: build cells
    cells = _build_cells(kept, columns, rows, img_w, img_h)
    logger.info("build_grid_from_words: %d cells built", len(cells))

    # Column metadata: integer x/width derived from the float column edges
    columns_meta = [
        {
            'index': column['index'],
            'type': column['type'],
            'x': int(column['x_min']),
            'width': int(column['x_max'] - column['x_min']),
        }
        for column in columns
    ]

    return cells, columns_meta