Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 20s
Prevents first content row from expanding into header area (causing "ulary" from "VOCABULARY" to appear in DE column) and last content row from expanding into footer area (causing page numbers to appear as content). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
7092 lines
266 KiB
Python
7092 lines
266 KiB
Python
"""
|
||
CV-based Document Reconstruction Pipeline for Vocabulary Extraction.
|
||
|
||
Uses classical Computer Vision techniques for high-quality OCR:
|
||
- High-resolution PDF rendering (432 DPI)
|
||
- Deskew (rotation correction via Hough Lines)
|
||
- Dewarp (book curvature correction) — pass-through initially
|
||
- Dual image preparation (binarized for OCR, CLAHE for layout)
|
||
- Projection-profile layout analysis (column/row detection)
|
||
- Multi-pass Tesseract OCR with region-specific PSM settings
|
||
- Y-coordinate line alignment for vocabulary matching
|
||
- Optional LLM post-correction for low-confidence regions
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import io
|
||
import logging
|
||
import time
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from dataclasses import dataclass, field
|
||
from typing import Any, Dict, Generator, List, Optional, Tuple
|
||
|
||
import numpy as np
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# --- Availability Guards ---
|
||
|
||
try:
|
||
import cv2
|
||
CV2_AVAILABLE = True
|
||
except ImportError:
|
||
cv2 = None
|
||
CV2_AVAILABLE = False
|
||
logger.warning("OpenCV not available — CV pipeline disabled")
|
||
|
||
try:
|
||
import pytesseract
|
||
from PIL import Image
|
||
TESSERACT_AVAILABLE = True
|
||
except ImportError:
|
||
pytesseract = None
|
||
Image = None
|
||
TESSERACT_AVAILABLE = False
|
||
logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
|
||
|
||
CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
|
||
|
||
# --- IPA Dictionary ---
|
||
|
||
import json
|
||
import os
|
||
import re
|
||
|
||
IPA_AVAILABLE = False
|
||
_ipa_convert_american = None
|
||
_britfone_dict: Dict[str, str] = {}
|
||
|
||
try:
|
||
import eng_to_ipa as _eng_to_ipa
|
||
_ipa_convert_american = _eng_to_ipa.convert
|
||
IPA_AVAILABLE = True
|
||
logger.info("eng_to_ipa available — American IPA lookup enabled")
|
||
except ImportError:
|
||
logger.info("eng_to_ipa not installed — American IPA disabled")
|
||
|
||
# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
|
||
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
|
||
if os.path.exists(_britfone_path):
|
||
try:
|
||
with open(_britfone_path, 'r', encoding='utf-8') as f:
|
||
_britfone_dict = json.load(f)
|
||
IPA_AVAILABLE = True
|
||
logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
|
||
except Exception as e:
|
||
logger.warning(f"Failed to load Britfone: {e}")
|
||
else:
|
||
logger.info("Britfone not found — British IPA disabled")
|
||
|
||
# --- Language Detection Constants ---
|
||
|
||
GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht',
|
||
'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird',
|
||
'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 'noch', 'aber', 'hat', 'nur',
|
||
'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben',
|
||
'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'}
|
||
|
||
ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of',
|
||
'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from',
|
||
'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
|
||
'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he',
|
||
'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'}
|
||
|
||
|
||
# --- Data Classes ---
|
||
|
||
@dataclass
class PageRegion:
    """A detected region on the page.

    One rectangle produced by layout analysis, tagged with a semantic type
    plus the confidence and method of that classification.
    """
    # Region role, e.g. 'column_en', 'column_de', 'column_example', 'page_ref',
    # 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom'
    type: str
    x: int       # left edge in px (absolute image coordinates)
    y: int       # top edge in px
    width: int   # region width in px
    height: int  # region height in px
    classification_confidence: float = 1.0  # 0.0-1.0
    classification_method: str = ""  # 'content', 'position_enhanced', 'position_fallback'
|
||
|
||
|
||
@dataclass
class ColumnGeometry:
    """Geometrically detected column, prior to type classification."""
    index: int          # 0-based, left-to-right
    x: int              # left edge in px
    y: int              # top edge in px
    width: int          # column width in px
    height: int         # column height in px
    word_count: int     # number of OCR words inside the column
    words: List[Dict]   # word dicts from Tesseract (text, conf, left, top, ...)
    width_ratio: float  # width / content_width (0.0-1.0)
    is_sub_column: bool = False  # True if created by _detect_sub_columns() split
|
||
|
||
|
||
@dataclass
class RowGeometry:
    """Geometrically detected row with header/footer classification."""
    index: int        # 0-based, top-to-bottom
    x: int            # absolute left (= content left_x)
    y: int            # absolute y start
    width: int        # content width
    height: int       # row height in px
    word_count: int   # number of OCR words inside the row
    words: List[Dict] # word dicts from Tesseract (text, conf, left, top, ...)
    row_type: str = 'content'  # 'content' | 'header' | 'footer'
    gap_before: int = 0        # gap in px above this row
|
||
|
||
|
||
@dataclass
class VocabRow:
    """A single vocabulary entry assembled from multi-column OCR."""
    english: str = ""      # English-column text
    german: str = ""       # German-column text
    example: str = ""      # example-sentence column text, if present
    source_page: str = ""  # page reference string, if present
    confidence: float = 0.0  # aggregate OCR confidence (presumably 0.0-1.0 — set by assembler)
    y_position: int = 0    # vertical px position, used for cross-column line alignment
|
||
|
||
|
||
@dataclass
class PipelineResult:
    """Complete result of the CV pipeline."""
    vocabulary: List[Dict[str, Any]] = field(default_factory=list)  # extracted entries as dicts
    word_count: int = 0        # total words recognized
    columns_detected: int = 0  # columns found by layout analysis
    duration_seconds: float = 0.0  # total wall-clock runtime
    stages: Dict[str, float] = field(default_factory=dict)  # per-stage timings in seconds
    error: Optional[str] = None  # error message when the pipeline failed, else None
    image_width: int = 0       # processed image width in px
    image_height: int = 0      # processed image height in px
|
||
|
||
|
||
@dataclass
class DocumentTypeResult:
    """Result of automatic document type detection."""
    doc_type: str     # 'vocab_table' | 'full_text' | 'generic_table'
    confidence: float # 0.0-1.0
    pipeline: str     # 'cell_first' | 'full_page'
    skip_steps: List[str] = field(default_factory=list)  # stages to skip, e.g. ['columns', 'rows']
    features: Dict[str, Any] = field(default_factory=dict)  # debug info from the detector
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 1: High-Resolution PDF Rendering
|
||
# =============================================================================
|
||
|
||
def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
    """Render a PDF page to a high-resolution numpy array (BGR).

    Args:
        pdf_data: Raw PDF bytes.
        page_number: 0-indexed page number.
        zoom: Zoom factor (3.0 = 432 DPI).

    Returns:
        numpy array in BGR format.

    Raises:
        ValueError: If page_number is out of range for this PDF.
    """
    import fitz  # PyMuPDF

    pdf_doc = fitz.open(stream=pdf_data, filetype="pdf")
    # BUGFIX: close the document on every path — previously the out-of-range
    # ValueError (and any rendering error) leaked the open fitz document.
    try:
        if page_number >= pdf_doc.page_count:
            raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)")

        page = pdf_doc[page_number]
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        # Convert to numpy BGR; pix.n is the number of channels
        img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        if pix.n == 4:  # RGBA
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
        elif pix.n == 3:  # RGB
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
        else:  # Grayscale
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
    finally:
        pdf_doc.close()
    return img_bgr
|
||
|
||
|
||
def render_image_high_res(image_data: bytes) -> np.ndarray:
    """Decode raw image bytes (PNG/JPEG) into a BGR numpy array.

    Args:
        image_data: Raw image bytes.

    Returns:
        numpy array in BGR format.

    Raises:
        ValueError: If the bytes cannot be decoded as an image.
    """
    raw_buffer = np.frombuffer(image_data, dtype=np.uint8)
    decoded = cv2.imdecode(raw_buffer, cv2.IMREAD_COLOR)
    if decoded is None:
        raise ValueError("Could not decode image data")
    return decoded
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 2: Deskew (Rotation Correction)
|
||
# =============================================================================
|
||
|
||
def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
    """Correct rotation using Hough Line detection.

    Binarizes the page, collects long near-horizontal line segments, and
    rotates by the median of their angles. Correction is clamped to ±5°
    and skipped entirely below 0.1°.

    Args:
        img: BGR image.

    Returns:
        Tuple of (corrected image, detected angle in degrees).
        Angle is 0.0 when no reliable correction was found.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Binarize for line detection
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Detect lines; minLineLength of a quarter page width filters out
    # short text strokes that are not true layout lines
    lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
                            minLineLength=img.shape[1] // 4, maxLineGap=20)

    # Fewer than 3 lines is too little evidence — leave the image untouched
    if lines is None or len(lines) < 3:
        return img, 0.0

    # Compute angles of near-horizontal lines
    angles = []
    for line in lines:
        x1, y1, x2, y2 = line[0]
        angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
        if abs(angle) < 15:  # Only near-horizontal
            angles.append(angle)

    if not angles:
        return img, 0.0

    # Median is robust against a few mis-detected diagonal segments
    median_angle = float(np.median(angles))

    # Limit correction to ±5°
    if abs(median_angle) > 5.0:
        median_angle = 5.0 * np.sign(median_angle)

    # Below 0.1° the rotation is not worth the interpolation blur
    if abs(median_angle) < 0.1:
        return img, 0.0

    # Rotate around the image center; BORDER_REPLICATE avoids black wedges
    h, w = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
    corrected = cv2.warpAffine(img, M, (w, h),
                               flags=cv2.INTER_LINEAR,
                               borderMode=cv2.BORDER_REPLICATE)

    logger.info(f"Deskew: corrected {median_angle:.2f}° rotation")
    return corrected, median_angle
|
||
|
||
|
||
def deskew_image_by_word_alignment(
    image_data: bytes,
    lang: str = "eng+deu",
    downscale_factor: float = 0.5,
) -> Tuple[bytes, float]:
    """Correct rotation by fitting a line through left-most word starts per text line.

    More robust than Hough-based deskew for vocabulary worksheets where text lines
    have consistent left-alignment. Runs a quick Tesseract pass on a downscaled
    copy to find word positions, computes the dominant left-edge column, fits a
    line through those points and rotates the full-resolution image.

    Args:
        image_data: Raw image bytes (PNG/JPEG).
        lang: Tesseract language string for the quick pass.
        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).

    Returns:
        Tuple of (rotated image as PNG bytes, detected angle in degrees).
        Returns the input unchanged with angle 0.0 whenever detection is not
        possible (missing dependencies, decode failure, too few aligned lines).
    """
    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
        return image_data, 0.0

    # 1. Decode image
    img_array = np.frombuffer(image_data, dtype=np.uint8)
    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if img is None:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return image_data, 0.0

    orig_h, orig_w = img.shape[:2]

    # 2. Downscale for fast Tesseract pass
    small_w = int(orig_w * downscale_factor)
    small_h = int(orig_h * downscale_factor)
    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)

    # 3. Quick Tesseract — word-level positions
    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
    try:
        data = pytesseract.image_to_data(
            pil_small, lang=lang, config="--psm 6 --oem 3",
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
        return image_data, 0.0

    # 4. Per text-line, find the left-most word start
    # Group by (block_num, par_num, line_num)
    from collections import defaultdict
    line_groups: Dict[tuple, list] = defaultdict(list)
    for i in range(len(data["text"])):
        text = (data["text"][i] or "").strip()
        # BUGFIX: depending on the pytesseract version, 'conf' entries are
        # ints, floats, or float-strings like "96.000000" — int() alone
        # raises ValueError on the latter, so go through float() first.
        conf = int(float(data["conf"][i]))
        if not text or conf < 20:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        line_groups[key].append(i)

    if len(line_groups) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
        return image_data, 0.0

    # For each line, pick the word with smallest 'left' → compute (left_x, center_y)
    # Scale back to original resolution
    scale = 1.0 / downscale_factor
    points = []  # list of (x, y) in original-image coords
    for key, indices in line_groups.items():
        best_idx = min(indices, key=lambda i: data["left"][i])
        lx = data["left"][best_idx] * scale
        top = data["top"][best_idx] * scale
        h = data["height"][best_idx] * scale
        cy = top + h / 2.0
        points.append((lx, cy))

    # 5. Find dominant left-edge column + compute angle
    xs = np.array([p[0] for p in points])
    ys = np.array([p[1] for p in points])
    median_x = float(np.median(xs))
    tolerance = orig_w * 0.03  # 3% of image width

    # Keep only lines whose left edge sits near the dominant column
    mask = np.abs(xs - median_x) <= tolerance
    filtered_xs = xs[mask]
    filtered_ys = ys[mask]

    if len(filtered_xs) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
        return image_data, 0.0

    # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a)
    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
    slope = coeffs[0]  # dx/dy
    angle_rad = np.arctan(slope)
    angle_deg = float(np.degrees(angle_rad))

    # Clamp to ±5°
    angle_deg = max(-5.0, min(5.0, angle_deg))

    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points "
                f"(total lines: {len(line_groups)})")

    # Below 0.05° the rotation is not worth the interpolation blur
    if abs(angle_deg) < 0.05:
        return image_data, 0.0

    # 6. Rotate full-res image
    center = (orig_w // 2, orig_h // 2)
    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    # Encode back to PNG
    success, png_buf = cv2.imencode(".png", rotated)
    if not success:
        logger.warning("deskew_by_word_alignment: PNG encoding failed")
        return image_data, 0.0

    return png_buf.tobytes(), angle_deg
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 3: Dewarp (Book Curvature Correction)
|
||
# =============================================================================
|
||
|
||
def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
    """Detect the vertical shear angle of the page.

    After deskew (horizontal lines aligned), vertical features like column
    edges may still be tilted. This measures that tilt by tracking the
    strongest vertical edge across horizontal strips.

    The result is a shear angle in degrees: the angular difference between
    true vertical and the detected column edge.

    Args:
        img: BGR image (already deskewed).

    Returns:
        Dict with keys: method, shear_degrees, confidence.
    """
    h, w = img.shape[:2]
    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Vertical Sobel to find vertical edges
    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    # BUGFIX: Sobel magnitudes routinely exceed 255; a bare .astype(np.uint8)
    # wraps modulo 256 and turns strong edges into near-zero values. Saturate
    # at 255 before the cast so Otsu sees the true edge strength.
    abs_sobel = np.clip(np.abs(sobel_x), 0, 255).astype(np.uint8)

    # Binarize with Otsu
    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    num_strips = 20
    strip_h = h // num_strips
    edge_positions = []  # (y_center, x_position)

    for i in range(num_strips):
        y_start = i * strip_h
        y_end = min((i + 1) * strip_h, h)
        strip = binary[y_start:y_end, :]

        # Project vertically (sum along y-axis)
        projection = np.sum(strip, axis=0).astype(np.float64)
        if projection.max() == 0:
            continue

        # Find the strongest vertical edge in left 40% of image
        search_w = int(w * 0.4)
        left_proj = projection[:search_w]
        if left_proj.max() == 0:
            continue

        # Smooth and find peak; kernel must be odd for GaussianBlur
        kernel_size = max(3, w // 100)
        if kernel_size % 2 == 0:
            kernel_size += 1
        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
        x_pos = float(np.argmax(smoothed))
        y_center = (y_start + y_end) / 2.0
        edge_positions.append((y_center, x_pos))

    # Need edges detected in at least 8 of the 20 strips
    if len(edge_positions) < 8:
        return result

    ys = np.array([p[0] for p in edge_positions])
    xs = np.array([p[1] for p in edge_positions])

    # Remove outliers (> 2 std from median)
    median_x = np.median(xs)
    std_x = max(np.std(xs), 1.0)
    mask = np.abs(xs - median_x) < 2 * std_x
    ys = ys[mask]
    xs = xs[mask]

    if len(ys) < 6:
        return result

    # Fit straight line: x = slope * y + intercept
    # The slope tells us the tilt of the vertical edge
    straight_coeffs = np.polyfit(ys, xs, 1)
    slope = straight_coeffs[0]  # dx/dy in pixels
    fitted = np.polyval(straight_coeffs, ys)
    residuals = xs - fitted
    rmse = float(np.sqrt(np.mean(residuals ** 2)))

    # Convert slope to angle: arctan(dx/dy) in degrees
    import math
    shear_degrees = math.degrees(math.atan(slope))

    # Confidence grows with inlier count and shrinks with fit error (RMSE)
    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(float(confidence), 2)

    return result
|
||
|
||
|
||
def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear angle by maximising variance of horizontal text-line projections.

    Principle: horizontal text lines produce a row-projection profile with sharp
    peaks (high variance) when the image is correctly aligned. Any residual shear
    smears the peaks and reduces variance. We sweep ±3° and pick the angle whose
    corrected projection has the highest variance.

    Works best on pages with clear horizontal banding (vocabulary tables, prose).
    Complements _detect_shear_angle() which needs strong vertical edges.

    Args:
        img: BGR image (already deskewed).

    Returns:
        Dict with keys: method, shear_degrees, confidence.
    """
    import math
    result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Otsu binarisation
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Work at half resolution for speed
    small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
    sh, sw = small.shape

    # 2-pass angle sweep for 10x better precision:
    # Pass 1: Coarse sweep ±3° in 0.5° steps (13 values)
    # Pass 2: Fine sweep ±0.5° around coarse best in 0.05° steps (21 values)

    def _sweep_variance(angles_list):
        # For each candidate angle, shear the binarized page and score it by
        # row-projection variance (higher variance = sharper text-line peaks).
        results = []
        for angle_deg in angles_list:
            if abs(angle_deg) < 0.001:
                # Effectively zero shear — reuse the input untouched
                rotated = small
            else:
                shear_tan = math.tan(math.radians(angle_deg))
                # Shear about the vertical midline (same matrix form as _apply_shear)
                M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
                rotated = cv2.warpAffine(small, M, (sw, sh),
                                         flags=cv2.INTER_NEAREST,
                                         borderMode=cv2.BORDER_CONSTANT)
            profile = np.sum(rotated, axis=1).astype(float)
            results.append((angle_deg, float(np.var(profile))))
        return results

    # Pass 1: coarse
    coarse_angles = [a * 0.5 for a in range(-6, 7)]  # 13 values
    coarse_results = _sweep_variance(coarse_angles)
    coarse_best = max(coarse_results, key=lambda x: x[1])

    # Pass 2: fine around coarse best
    fine_center = coarse_best[0]
    fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)]  # 21 values
    fine_results = _sweep_variance(fine_angles)
    fine_best = max(fine_results, key=lambda x: x[1])

    best_angle = fine_best[0]
    best_variance = fine_best[1]
    variances = coarse_results + fine_results

    # Confidence: how much sharper is the best angle vs. the mean?
    all_mean = sum(v for _, v in variances) / len(variances)
    if all_mean > 0 and best_variance > all_mean:
        confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
    else:
        confidence = 0.0

    result["shear_degrees"] = round(best_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
|
||
|
||
|
||
def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear using Hough transform on printed table / ruled lines.

    Vocabulary worksheets have near-horizontal printed table borders. After
    deskew these should be exactly horizontal; any residual tilt equals the
    vertical shear angle (with inverted sign).

    The sign convention: a horizontal line tilting +α degrees (left end lower)
    means the page has vertical shear of -α degrees (left column edge drifts
    to the left going downward).

    Args:
        img: BGR image (already deskewed).

    Returns:
        Dict with keys: method, shear_degrees, confidence.
    """
    result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    # Only segments spanning at least 15% of the page width count as ruled lines
    min_len = int(w * 0.15)
    lines = cv2.HoughLinesP(
        edges, rho=1, theta=np.pi / 360,
        threshold=int(w * 0.08),
        minLineLength=min_len,
        maxLineGap=20,
    )

    if lines is None or len(lines) < 3:
        return result

    # Collect (angle, length) of near-horizontal segments; length becomes the
    # weight so long table borders dominate short text strokes
    horizontal_angles: List[Tuple[float, float]] = []
    for line in lines:
        x1, y1, x2, y2 = line[0]
        if x1 == x2:
            continue  # perfectly vertical segment — irrelevant here
        angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
        if abs(angle) <= 5.0:
            length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
            horizontal_angles.append((angle, length))

    if len(horizontal_angles) < 3:
        return result

    # Weighted median: sort by angle, then walk cumulative length-weights
    # to the halfway point
    angles_arr = np.array([a for a, _ in horizontal_angles])
    weights_arr = np.array([l for _, l in horizontal_angles])
    sorted_idx = np.argsort(angles_arr)
    s_angles = angles_arr[sorted_idx]
    s_weights = weights_arr[sorted_idx]
    cum = np.cumsum(s_weights)
    mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0))
    median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)])

    # Confidence: fraction of segments agreeing within 1° of the median,
    # capped at 0.85 so this method never fully dominates the ensemble
    agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0)
    confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85

    # Sign inversion: horizontal line tilt is complementary to vertical shear
    shear_degrees = -median_angle

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
|
||
|
||
|
||
def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
    """Detect shear by measuring text-line straightness (Method D).

    Runs a quick Tesseract scan (PSM 11, 50% downscale) to locate word
    bounding boxes, groups them into vertical columns by X-proximity,
    and measures how the left-edge X position drifts with Y (vertical
    position). The drift dx/dy is the tangent of the shear angle.

    This directly measures vertical shear (column tilt) rather than
    horizontal text-line slope, which is already corrected by deskew.

    Args:
        img: BGR image (already deskewed).

    Returns:
        Dict with keys: method, shear_degrees, confidence.
    """
    import math
    result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    # Downscale 50% for speed
    scale = 0.5
    small = cv2.resize(img, (int(w * scale), int(h * scale)),
                       interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    pil_img = Image.fromarray(gray)

    try:
        data = pytesseract.image_to_data(
            pil_img, lang='eng+deu', config='--psm 11 --oem 3',
            output_type=pytesseract.Output.DICT,
        )
    except Exception:
        return result

    # Collect word left-edges (x) and vertical centres (y)
    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        # BUGFIX: 'conf' entries may be ints, floats or float-strings like
        # "96.000000" depending on the pytesseract version; int() alone
        # raises ValueError on float-strings, so go through float() first.
        conf = int(float(data['conf'][i]))
        if not text or conf < 20 or len(text) < 2:
            continue
        left_x = float(data['left'][i])
        cy = data['top'][i] + data['height'][i] / 2.0
        word_w = float(data['width'][i])
        words.append((left_x, cy, word_w))

    if len(words) < 15:
        return result

    # --- Group words into vertical columns by left-edge X proximity ---
    # Sort by x, then cluster words whose left-edges are within x_tol
    avg_w = sum(ww for _, _, ww in words) / len(words)
    x_tol = max(avg_w * 0.4, 8)  # tolerance for "same column"

    words_by_x = sorted(words, key=lambda w: w[0])
    columns: List[List[Tuple[float, float]]] = []  # each: [(left_x, cy), ...]
    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
    cur_x = words_by_x[0][0]

    for lx, cy, _ in words_by_x[1:]:
        if abs(lx - cur_x) <= x_tol:
            cur_col.append((lx, cy))
            # Track the cluster's running left edge as an exponential moving
            # average (80% old / 20% new) so it follows slow drift
            cur_x = cur_x * 0.8 + lx * 0.2
        else:
            # Column break: keep clusters with at least 5 members
            if len(cur_col) >= 5:
                columns.append(cur_col)
            cur_col = [(lx, cy)]
            cur_x = lx
    if len(cur_col) >= 5:
        columns.append(cur_col)

    if len(columns) < 2:
        return result

    # --- For each column, measure X-drift as a function of Y ---
    # Fit: left_x = a * cy + b → a = dx/dy = tan(shear_angle)
    drifts = []
    for col in columns:
        ys = np.array([p[1] for p in col])
        xs = np.array([p[0] for p in col])
        y_range = ys.max() - ys.min()
        if y_range < h * scale * 0.3:
            continue  # column must span at least 30% of image height
        # Linear regression: x = a*y + b
        coeffs = np.polyfit(ys, xs, 1)
        drifts.append(coeffs[0])  # dx/dy

    if len(drifts) < 2:
        return result

    # Median dx/dy → shear angle
    # dx/dy > 0 means left-edges move RIGHT as we go DOWN → columns lean right
    median_drift = float(np.median(drifts))
    shear_degrees = math.degrees(math.atan(median_drift))

    # Confidence from column count + drift consistency
    drift_std = float(np.std(drifts))
    consistency = max(0.0, 1.0 - drift_std * 50)  # tighter penalty for drift variance
    count_factor = min(1.0, len(drifts) / 4.0)
    confidence = count_factor * 0.5 + consistency * 0.5

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
                "shear=%.3f°, conf=%.2f",
                len(columns), len(drifts), median_drift,
                shear_degrees, confidence)
    return result
|
||
|
||
|
||
def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
    """Decide whether a dewarp correction should be kept.

    Scores both images by the variance of their horizontal (row) projection
    profile: sharp text-line peaks mean higher variance, which indicates
    better horizontal alignment.

    Returns True when the corrected image scores higher than the original,
    False when the correction should be discarded.
    """
    def _alignment_score(image: np.ndarray) -> float:
        grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, bw = cv2.threshold(grayscale, 0, 255,
                              cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        # Half resolution is plenty for a variance score and twice as fast
        half = cv2.resize(bw, (bw.shape[1] // 2, bw.shape[0] // 2),
                          interpolation=cv2.INTER_AREA)
        row_profile = np.sum(half, axis=1).astype(float)
        return float(np.var(row_profile))

    # Keep the correction only when it sharpened the row-projection peaks,
    # even by a tiny margin.
    return _alignment_score(corrected) > _alignment_score(original)
|
||
|
||
|
||
def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Apply a vertical shear correction to an image.

    Each row is shifted horizontally in proportion to its distance from the
    vertical center, straightening tilted vertical features (column edges)
    without disturbing horizontal alignment (text lines).

    Args:
        img: BGR image.
        shear_degrees: Shear angle in degrees. Positive = shift top-right/bottom-left.

    Returns:
        Corrected image.
    """
    import math

    height, width = img.shape[:2]
    t = math.tan(math.radians(shear_degrees))

    # x' = x + t * (y - height/2): shear about the horizontal midline so the
    # vertical center row stays fixed.
    shear_matrix = np.float32([
        [1, t, -height / 2.0 * t],
        [0, 1, 0],
    ])

    return cv2.warpAffine(img, shear_matrix, (width, height),
                          flags=cv2.INTER_LINEAR,
                          borderMode=cv2.BORDER_REPLICATE)
|
||
|
||
|
||
def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
|
||
"""Combine multiple shear detections into a single weighted estimate (v2).
|
||
|
||
Ensemble v2 changes vs v1:
|
||
- Minimum confidence raised to 0.5 (was 0.3)
|
||
- text_lines method gets 1.5× weight boost (most reliable detector)
|
||
- Outlier filter at 1° from weighted mean
|
||
|
||
Returns:
|
||
(shear_degrees, ensemble_confidence, methods_used_str)
|
||
"""
|
||
# Confidence threshold — lowered from 0.5 to 0.35 to catch subtle shear
|
||
# that individual methods detect with moderate confidence.
|
||
_MIN_CONF = 0.35
|
||
|
||
# text_lines gets a weight boost as the most content-aware method
|
||
_METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
|
||
|
||
accepted = []
|
||
for d in detections:
|
||
if d["confidence"] < _MIN_CONF:
|
||
continue
|
||
boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
|
||
effective_conf = d["confidence"] * boost
|
||
accepted.append((d["shear_degrees"], effective_conf, d["method"]))
|
||
|
||
if not accepted:
|
||
return 0.0, 0.0, "none"
|
||
|
||
if len(accepted) == 1:
|
||
deg, conf, method = accepted[0]
|
||
return deg, min(conf, 1.0), method
|
||
|
||
# First pass: weighted mean
|
||
total_w = sum(c for _, c, _ in accepted)
|
||
w_mean = sum(d * c for d, c, _ in accepted) / total_w
|
||
|
||
# Outlier filter: keep results within 1° of weighted mean
|
||
filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
|
||
if not filtered:
|
||
filtered = accepted # fallback: keep all
|
||
|
||
# Second pass: weighted mean on filtered results
|
||
total_w2 = sum(c for _, c, _ in filtered)
|
||
final_deg = sum(d * c for d, c, _ in filtered) / total_w2
|
||
|
||
# Ensemble confidence: average of individual confidences, boosted when
|
||
# methods agree (all within 0.5° of each other)
|
||
avg_conf = total_w2 / len(filtered)
|
||
spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
|
||
agreement_bonus = 0.15 if spread < 0.5 else 0.0
|
||
ensemble_conf = min(1.0, avg_conf + agreement_bonus)
|
||
|
||
methods_str = "+".join(m for _, _, m in filtered)
|
||
return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
|
||
|
||
|
||
def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Correct residual vertical shear on an already-deskewed page.

    After deskew straightens horizontal text lines, vertical features
    (column edges) may still be tilted. An ensemble of four complementary
    detectors (edge profile, projection variance, Hough lines on table
    borders, text-line straightness) estimates the tilt; the fused angle
    is applied as an affine shear. A quality gate discards corrections
    >= 0.5° that fail to improve horizontal projection alignment.

    Args:
        img: BGR image (already deskewed).
        use_ensemble: When False, only method A (_detect_shear_angle) runs.

    Returns:
        Tuple of (corrected_image, dewarp_info); dewarp_info carries the
        keys: method, shear_degrees, confidence, detections.
    """
    unchanged_info: Dict[str, Any] = {
        "method": "none",
        "shear_degrees": 0.0,
        "confidence": 0.0,
        "detections": [],
    }

    if not CV2_AVAILABLE:
        return img, unchanged_info

    started = time.time()

    if use_ensemble:
        detections = [
            _detect_shear_angle(img),
            _detect_shear_by_projection(img),
            _detect_shear_by_hough(img),
            _detect_shear_by_text_lines(img),
        ]
        shear_deg, confidence, method = _ensemble_shear(detections)
    else:
        only = _detect_shear_angle(img)
        detections = [only]
        shear_deg = only["shear_degrees"]
        confidence = only["confidence"]
        method = only["method"]

    elapsed = time.time() - started

    def _slot(idx: int, key: str) -> float:
        # The log line always reports four detector slots; missing ones
        # (single-method mode) are padded with 0.0.
        return detections[idx][key] if len(detections) > idx else 0.0

    logger.info(
        "dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | "
        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
        shear_deg, confidence, method, elapsed,
        _slot(0, "shear_degrees"), _slot(0, "confidence"),
        _slot(1, "shear_degrees"), _slot(1, "confidence"),
        _slot(2, "shear_degrees"), _slot(2, "confidence"),
        _slot(3, "shear_degrees"), _slot(3, "confidence"),
    )

    # Individual detector results are reported even when no correction
    # is ultimately applied.
    detection_summaries = [
        {"method": d["method"], "shear_degrees": d["shear_degrees"],
         "confidence": d["confidence"]}
        for d in detections
    ]

    # Below 0.08° the shear is irrelevant for OCR; below 0.4 ensemble
    # confidence the detection is too uncertain to act on.
    if abs(shear_deg) < 0.08 or confidence < 0.4:
        unchanged_info["detections"] = detection_summaries
        return img, unchanged_info

    # Negate the detected angle to straighten the page.
    corrected = _apply_shear(img, -shear_deg)

    # Quality gate only for larger corrections: below 0.5° the projection
    # variance change is negligible, and a tiny wrong correction costs far
    # less than leaving real shear uncorrected (which shifts content
    # noticeably at the image edges on tall scans).
    if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
        logger.info("dewarp: quality gate REJECTED correction (%.3f°) — "
                    "projection variance did not improve", shear_deg)
        unchanged_info["detections"] = detection_summaries
        return img, unchanged_info

    return corrected, {
        "method": method,
        "shear_degrees": shear_deg,
        "confidence": confidence,
        "detections": detection_summaries,
    }
|
||
|
||
|
||
def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Apply a user-supplied shear correction.

    Args:
        img: BGR image (deskewed, before dewarp).
        shear_degrees: Shear angle in degrees to correct.

    Returns:
        The corrected image, or *img* unchanged when the angle is
        negligible (below a thousandth of a degree).
    """
    if not abs(shear_degrees) < 0.001:
        return _apply_shear(img, -shear_degrees)
    return img
|
||
|
||
|
||
# =============================================================================
|
||
# Document Type Detection
|
||
# =============================================================================
|
||
|
||
def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult:
    """Detect whether the page is a vocab table, generic table, or full text.

    Uses projection profiles and text density analysis — no OCR required.
    Runs in < 2 seconds.

    Classification features:
      1. Vertical projection gaps (column separators) inside the content area.
      2. Horizontal projection gaps (row separators).
      3. Text density spread over a 4×4 grid.

    Args:
        ocr_img: Binarized grayscale image (for projection profiles);
            dark pixels (< 128) are treated as text.
        img_bgr: BGR color image (currently unused by this detector).

    Returns:
        DocumentTypeResult with doc_type, confidence, pipeline, skip_steps.
    """
    # Degenerate input: fall back to full-text handling with low confidence.
    if ocr_img is None or ocr_img.size == 0:
        return DocumentTypeResult(
            doc_type='full_text', confidence=0.5, pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features={'error': 'empty image'},
        )

    h, w = ocr_img.shape[:2]

    # --- 1. Vertical projection profile → detect column gaps ---
    # Sum dark pixels along each column (x-axis). Gaps = valleys in the profile.
    # Invert: dark pixels on white background → high values = text.
    vert_proj = np.sum(ocr_img < 128, axis=0).astype(float)

    # Smooth the profile to avoid noise spikes (odd kernel for symmetry).
    kernel_size = max(3, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1
    vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same')

    # Find significant vertical gaps (columns of near-zero text density).
    # A gap must be at least 1% of image width and have < 5% of max density.
    max_density = max(vert_smooth.max(), 1)
    gap_threshold = max_density * 0.05
    min_gap_width = max(5, w // 100)

    in_gap = False
    gap_count = 0
    gap_start = 0
    vert_gaps = []

    # Run-length scan over the smoothed profile; note a gap still open at
    # x == w-1 is never closed, which is intentional (edge gaps are margins).
    for x in range(w):
        if vert_smooth[x] < gap_threshold:
            if not in_gap:
                in_gap = True
                gap_start = x
        else:
            if in_gap:
                gap_width = x - gap_start
                if gap_width >= min_gap_width:
                    gap_count += 1
                    vert_gaps.append((gap_start, x, gap_width))
                in_gap = False

    # Filter out margin gaps (within 10% of image edges) — only internal
    # gaps indicate real column structure.
    margin_threshold = w * 0.10
    internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold]
    internal_gap_count = len(internal_gaps)

    # --- 2. Horizontal projection profile → detect row gaps ---
    horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float)
    h_kernel = max(3, h // 200)
    if h_kernel % 2 == 0:
        h_kernel += 1
    horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same')

    h_max = max(horiz_smooth.max(), 1)
    h_gap_threshold = h_max * 0.05
    min_row_gap = max(3, h // 200)

    # Only the count matters here (gap positions are not recorded).
    row_gap_count = 0
    in_gap = False
    for y in range(h):
        if horiz_smooth[y] < h_gap_threshold:
            if not in_gap:
                in_gap = True
                gap_start = y
        else:
            if in_gap:
                if y - gap_start >= min_row_gap:
                    row_gap_count += 1
                in_gap = False

    # --- 3. Text density distribution (4×4 grid) ---
    # A uniform density (low std) suggests running text; remainder pixels
    # beyond the last full cell are ignored.
    grid_rows, grid_cols = 4, 4
    cell_h, cell_w = h // grid_rows, w // grid_cols
    densities = []
    for gr in range(grid_rows):
        for gc in range(grid_cols):
            cell = ocr_img[gr * cell_h:(gr + 1) * cell_h,
                           gc * cell_w:(gc + 1) * cell_w]
            if cell.size > 0:
                d = float(np.count_nonzero(cell < 128)) / cell.size
                densities.append(d)

    density_std = float(np.std(densities)) if densities else 0
    density_mean = float(np.mean(densities)) if densities else 0

    # Diagnostic features returned with every result (first 10 gaps only).
    features = {
        'vertical_gaps': gap_count,
        'internal_vertical_gaps': internal_gap_count,
        'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]],
        'row_gaps': row_gap_count,
        'density_mean': round(density_mean, 4),
        'density_std': round(density_std, 4),
        'image_size': (w, h),
    }

    # --- 4. Decision tree ---
    # Use internal_gap_count (excludes margin gaps) for column detection.
    if internal_gap_count >= 2 and row_gap_count >= 5:
        # Multiple internal vertical gaps + many row gaps → vocab table.
        # Confidence grows with structure strength, capped at 0.95.
        confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count >= 1 and row_gap_count >= 3:
        # Some internal structure, likely a table of some kind.
        confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01)
        return DocumentTypeResult(
            doc_type='generic_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count == 0:
        # No internal column gaps → full text (regardless of density);
        # a more uniform density distribution nudges confidence up.
        confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15)
        return DocumentTypeResult(
            doc_type='full_text',
            confidence=round(confidence, 2),
            pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features=features,
        )
    else:
        # Ambiguous — default to vocab_table (most common use case).
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=0.5,
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 4: Dual Image Preparation
|
||
# =============================================================================
|
||
|
||
def create_ocr_image(img: np.ndarray) -> np.ndarray:
    """Produce a binarized version of *img* tuned for Tesseract OCR.

    Pipeline: grayscale → illumination flattening (divide by a heavy
    Gaussian blur of the background) → adaptive Gaussian threshold →
    median-filter denoise.

    Args:
        img: BGR image.

    Returns:
        Binary image (black text on white background).
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Flatten uneven lighting: dividing by a blurred copy of the page
    # normalizes the background towards uniform white before thresholding.
    background = cv2.GaussianBlur(gray, (51, 51), 0)
    flattened = cv2.divide(gray, background, scale=255)

    # Adaptive binarization handles residual local contrast variation.
    thresholded = cv2.adaptiveThreshold(
        flattened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )

    # A 3x3 median filter removes salt-and-pepper noise without eroding glyphs.
    return cv2.medianBlur(thresholded, 3)
|
||
|
||
|
||
def create_layout_image(img: np.ndarray) -> np.ndarray:
    """Return a CLAHE-enhanced grayscale copy of *img* for layout analysis.

    Args:
        img: BGR image.

    Returns:
        Contrast-enhanced grayscale image.
    """
    # Local histogram equalization keeps faint table rules and text
    # visible for projection-based layout analysis.
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equalizer.apply(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 5: Layout Analysis (Projection Profiles)
|
||
# =============================================================================
|
||
|
||
def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
|
||
"""Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask."""
|
||
out = mask.copy()
|
||
n = len(out)
|
||
i = 0
|
||
while i < n:
|
||
if out[i]:
|
||
start = i
|
||
while i < n and out[i]:
|
||
i += 1
|
||
if (i - start) < min_width:
|
||
out[start:i] = False
|
||
else:
|
||
i += 1
|
||
return out
|
||
|
||
|
||
def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
    """Locate the bounding box of real page content inside *inv*.

    *inv* is an inverted binary image (text pixels bright). Thin scan
    artefacts at the page edges are suppressed by dropping projection
    runs narrower than ~1 % of the relevant dimension (at least 5 px).
    Row bounds are padded by 5 px, column bounds by 2 px.

    Returns:
        Tuple of (left_x, right_x, top_y, bottom_y).
    """
    h, w = inv.shape[:2]
    density_floor = 0.005

    # --- Rows containing content (top/bottom bounds) ---
    row_profile = np.sum(inv, axis=1).astype(float) / (w * 255)
    row_mask = _filter_narrow_runs(row_profile > density_floor, max(5, h // 100))

    top_y = 0
    for y in range(h):
        if row_mask[y]:
            top_y = max(0, y - 5)
            break

    bottom_y = h
    for y in range(h - 1, 0, -1):
        if row_mask[y]:
            bottom_y = min(h, y + 5)
            break

    # --- Columns containing content (left/right bounds) ---
    # Project only within the detected vertical band so header/footer
    # noise outside it cannot widen the horizontal bounds.
    band_h = bottom_y - top_y
    col_profile = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
    if band_h > 0:
        col_profile = col_profile / (band_h * 255)
    col_mask = _filter_narrow_runs(col_profile > density_floor, max(5, w // 100))

    left_x = 0
    for x in range(w):
        if col_mask[x]:
            left_x = max(0, x - 2)
            break

    right_x = w
    for x in range(w - 1, 0, -1):
        if col_mask[x]:
            right_x = min(w, x + 2)
            break

    return left_x, right_x, top_y, bottom_y
|
||
|
||
|
||
def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
    """Detect columns, header, and footer using projection profiles.

    Uses content-bounds detection to exclude page margins before searching
    for column separators within the actual text area. Two detection
    strategies run in order: (1) valley detection on the smoothed vertical
    projection; (2) a local-minima fallback over 20 segments when fewer
    than 2 valleys are found.

    Args:
        layout_img: CLAHE-enhanced grayscale image (currently unused here).
        ocr_img: Binarized image for text density analysis.

    Returns:
        List of PageRegion objects describing detected regions (columns
        plus header/footer regions added by _add_header_footer).
    """
    h, w = ocr_img.shape[:2]

    # Invert: black text on white → white text on black for projection
    inv = cv2.bitwise_not(ocr_img)

    # --- Find actual content bounds (exclude page margins) ---
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")

    # Sanity fallback: implausibly small content area (< 30% of either
    # dimension) means bounds detection failed — use the full page.
    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    # --- Vertical projection within content area to find column separators ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    v_proj = np.sum(content_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj

    # Smooth the projection profile (odd-sized box kernel)
    kernel_size = max(5, content_w // 50)
    if kernel_size % 2 == 0:
        kernel_size += 1
    v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Debug: log projection profile statistics
    p_mean = float(np.mean(v_proj_smooth))
    p_median = float(np.median(v_proj_smooth))
    p_min = float(np.min(v_proj_smooth))
    p_max = float(np.max(v_proj_smooth))
    logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
                f"mean={p_mean:.4f}, median={p_median:.4f}")

    # Find valleys using multiple threshold strategies
    # Strategy 1: relative to median (catches clear separators)
    # Strategy 2: local minima approach (catches subtle gaps)
    threshold = max(p_median * 0.3, p_mean * 0.2)
    logger.info(f"Layout: valley threshold={threshold:.4f}")

    in_valley = v_proj_smooth < threshold

    # Find contiguous valley regions as
    # (start, end, center, width, depth) tuples in content-relative x.
    all_valleys = []
    start = None
    for x in range(len(v_proj_smooth)):
        if in_valley[x] and start is None:
            start = x
        elif not in_valley[x] and start is not None:
            valley_width = x - start
            valley_depth = float(np.min(v_proj_smooth[start:x]))
            # Valley must be at least 3px wide
            if valley_width >= 3:
                all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
            start = None

    logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — "
                f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")

    # Filter: valleys must be inside the content area (not within the
    # outer 8% on either side)
    inner_margin = int(content_w * 0.08)
    valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]

    # If no valleys found with strict threshold, try local minima approach
    if len(valleys) < 2:
        logger.info("Layout: trying local minima approach for column detection")
        # Divide content into 20 segments and rank them by mean density
        seg_count = 20
        seg_width = content_w // seg_count
        seg_scores = []
        for i in range(seg_count):
            sx = i * seg_width
            ex = min((i + 1) * seg_width, content_w)
            seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
            seg_scores.append((i, sx, ex, seg_mean))

        seg_scores.sort(key=lambda s: s[3])
        logger.info(f"Layout: segment scores (lowest 5): "
                    f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")

        # Collect low-density segments that could act as separators
        candidate_valleys = []
        for seg_idx, sx, ex, seg_mean in seg_scores:
            # Must not be at the edges (first/last two segments excluded)
            if seg_idx <= 1 or seg_idx >= seg_count - 2:
                continue
            # Must be significantly lower than overall mean
            if seg_mean < p_mean * 0.6:
                center = (sx + ex) // 2
                candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))

        if len(candidate_valleys) >= 2:
            # Pick the best pair: non-adjacent, creating reasonable column widths
            candidate_valleys.sort(key=lambda v: v[2])
            best_pair = None
            best_score = float('inf')
            for i in range(len(candidate_valleys)):
                for j in range(i + 1, len(candidate_valleys)):
                    c1 = candidate_valleys[i][2]
                    c2 = candidate_valleys[j][2]
                    # Must be at least 20% apart
                    if (c2 - c1) < content_w * 0.2:
                        continue
                    col1 = c1
                    col2 = c2 - c1
                    col3 = content_w - c2
                    # Each resulting column must be at least 12% of content width
                    if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12:
                        continue
                    # Score: spread between widest and narrowest column;
                    # lower is better (more even distribution)
                    parts = sorted([col1, col2, col3])
                    score = parts[2] - parts[0]
                    if score < best_score:
                        best_score = score
                        best_pair = (candidate_valleys[i], candidate_valleys[j])

            if best_pair:
                valleys = list(best_pair)
                logger.info(f"Layout: local minima found 2 valleys: "
                            f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    logger.info(f"Layout: final {len(valleys)} valleys: "
                f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    regions = []

    if len(valleys) >= 2:
        # 3-column layout detected
        valleys.sort(key=lambda v: v[2])

        if len(valleys) == 2:
            sep1_center = valleys[0][2]
            sep2_center = valleys[1][2]
        else:
            # Pick the two valleys that best divide into 3 parts.
            # Prefer wider valleys (more likely true separators).
            best_pair = None
            best_score = float('inf')
            for i in range(len(valleys)):
                for j in range(i + 1, len(valleys)):
                    c1, c2 = valleys[i][2], valleys[j][2]
                    # Each column should be at least 15% of content width
                    col1 = c1
                    col2 = c2 - c1
                    col3 = content_w - c2
                    if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15:
                        continue
                    # Score: lower is better (more even distribution)
                    parts = sorted([col1, col2, col3])
                    score = parts[2] - parts[0]
                    # Bonus for wider valleys (subtract valley width)
                    score -= (valleys[i][3] + valleys[j][3]) * 0.5
                    if score < best_score:
                        best_score = score
                        best_pair = (c1, c2)
            if best_pair:
                sep1_center, sep2_center = best_pair
            else:
                # No pair satisfied the width constraints — take the two
                # leftmost valleys as a last resort.
                sep1_center = valleys[0][2]
                sep2_center = valleys[1][2]

        # Convert from content-relative to absolute coordinates
        abs_sep1 = sep1_center + left_x
        abs_sep2 = sep2_center + left_x

        logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")

        # Columns extend to the page edges (x=0 / x=w), not the content bounds.
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep1, y=top_y,
            width=abs_sep2 - abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_example', x=abs_sep2, y=top_y,
            width=w - abs_sep2, height=content_h
        ))

    elif len(valleys) == 1:
        # 2-column layout
        abs_sep = valleys[0][2] + left_x

        logger.info(f"Layout: 2 columns at separator x={abs_sep}")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep, y=top_y,
            width=w - abs_sep, height=content_h
        ))

    else:
        # No columns detected — run full-page OCR as single column
        logger.warning("Layout: no column separators found, using full page")
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=w, height=content_h
        ))

    # Add header/footer info (gap-based detection with fallback);
    # mutates *regions* in place.
    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
    col_count = len([r for r in regions if r.type.startswith('column')])
    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

    return regions
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
|
||
# =============================================================================
|
||
|
||
# --- Phase A: Geometry Detection ---
|
||
|
||
def _detect_columns_by_clustering(
|
||
word_dicts: List[Dict],
|
||
left_edges: List[int],
|
||
edge_word_indices: List[int],
|
||
content_w: int,
|
||
content_h: int,
|
||
left_x: int,
|
||
right_x: int,
|
||
top_y: int,
|
||
bottom_y: int,
|
||
inv: Optional[np.ndarray] = None,
|
||
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
|
||
"""Fallback: detect columns by clustering left-aligned word positions.
|
||
|
||
Used when the primary gap-based algorithm finds fewer than 2 gaps.
|
||
"""
|
||
tolerance = max(10, int(content_w * 0.01))
|
||
sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])
|
||
|
||
clusters = []
|
||
cluster_widxs = []
|
||
cur_edges = [sorted_pairs[0][0]]
|
||
cur_widxs = [sorted_pairs[0][1]]
|
||
for edge, widx in sorted_pairs[1:]:
|
||
if edge - cur_edges[-1] <= tolerance:
|
||
cur_edges.append(edge)
|
||
cur_widxs.append(widx)
|
||
else:
|
||
clusters.append(cur_edges)
|
||
cluster_widxs.append(cur_widxs)
|
||
cur_edges = [edge]
|
||
cur_widxs = [widx]
|
||
clusters.append(cur_edges)
|
||
cluster_widxs.append(cur_widxs)
|
||
|
||
MIN_Y_COVERAGE_PRIMARY = 0.30
|
||
MIN_Y_COVERAGE_SECONDARY = 0.15
|
||
MIN_WORDS_SECONDARY = 5
|
||
|
||
cluster_infos = []
|
||
for c_edges, c_widxs in zip(clusters, cluster_widxs):
|
||
if len(c_edges) < 2:
|
||
continue
|
||
y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
|
||
y_span = max(y_positions) - min(y_positions)
|
||
y_coverage = y_span / content_h if content_h > 0 else 0.0
|
||
cluster_infos.append({
|
||
'mean_x': int(np.mean(c_edges)),
|
||
'count': len(c_edges),
|
||
'min_edge': min(c_edges),
|
||
'max_edge': max(c_edges),
|
||
'y_min': min(y_positions),
|
||
'y_max': max(y_positions),
|
||
'y_coverage': y_coverage,
|
||
})
|
||
|
||
primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
|
||
primary_set = set(id(c) for c in primary)
|
||
secondary = [c for c in cluster_infos
|
||
if id(c) not in primary_set
|
||
and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
|
||
and c['count'] >= MIN_WORDS_SECONDARY]
|
||
significant = sorted(primary + secondary, key=lambda c: c['mean_x'])
|
||
|
||
if len(significant) < 3:
|
||
logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
|
||
return None
|
||
|
||
merge_distance = max(30, int(content_w * 0.06))
|
||
merged = [significant[0].copy()]
|
||
for s in significant[1:]:
|
||
if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
|
||
prev = merged[-1]
|
||
total = prev['count'] + s['count']
|
||
avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
|
||
prev['mean_x'] = avg_x
|
||
prev['count'] = total
|
||
prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
|
||
prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
|
||
else:
|
||
merged.append(s.copy())
|
||
|
||
if len(merged) < 3:
|
||
logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
|
||
return None
|
||
|
||
logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")
|
||
|
||
margin_px = max(6, int(content_w * 0.003))
|
||
return _build_geometries_from_starts(
|
||
[(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
|
||
word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
|
||
)
|
||
|
||
|
||
def _detect_sub_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    top_y: int = 0,
    header_y: Optional[int] = None,
    footer_y: Optional[int] = None,
    _edge_tolerance: int = 8,
    _min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
    """Split columns that contain internal sub-columns based on left-edge alignment.

    For each column, clusters word left-edges into alignment bins (within
    ``_edge_tolerance`` px). The leftmost bin whose word count reaches
    ``_min_col_start_ratio`` of the column total is treated as the true column
    start. Any words to the left of that bin form a sub-column, provided they
    number >= 2 and < 35 % of total.

    Word ``left`` values are relative to the content ROI (offset by *left_x*),
    while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
    bridges the two coordinate systems.

    If *header_y* / *footer_y* are provided (absolute y-coordinates), words
    in header/footer regions are excluded from alignment clustering to avoid
    polluting the bins with page numbers or chapter titles. Word ``top``
    values are relative to *top_y*.

    Returns a new list of ColumnGeometry — potentially longer than the input.
    """
    if content_w <= 0:
        return geometries

    result: List[ColumnGeometry] = []
    for geo in geometries:
        # Only consider wide-enough columns (>= 15% of content width)
        # with enough words to establish an alignment.
        if geo.width_ratio < 0.15 or geo.word_count < 5:
            result.append(geo)
            continue

        # Collect left-edges of confident words, excluding header/footer.
        # Convert header_y/footer_y from absolute to relative
        # (word 'top' is relative to top_y).
        min_top_rel = (header_y - top_y) if header_y is not None else None
        max_top_rel = (footer_y - top_y) if footer_y is not None else None

        # Only words with OCR confidence >= 30 inside the body region
        # contribute to the alignment bins.
        confident = [w for w in geo.words
                     if w.get('conf', 0) >= 30
                     and (min_top_rel is None or w['top'] >= min_top_rel)
                     and (max_top_rel is None or w['top'] <= max_top_rel)]
        if len(confident) < 3:
            result.append(geo)
            continue

        # --- Cluster left-edges into alignment bins ---
        # Greedy run-clustering over the sorted edges: edges within
        # _edge_tolerance of the previous edge join the current bin.
        sorted_edges = sorted(w['left'] for w in confident)
        bins: List[Tuple[int, int, int, int]] = []  # (center, count, min_edge, max_edge)
        cur = [sorted_edges[0]]
        for i in range(1, len(sorted_edges)):
            if sorted_edges[i] - cur[-1] <= _edge_tolerance:
                cur.append(sorted_edges[i])
            else:
                bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
                cur = [sorted_edges[i]]
        bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))

        # --- Find the leftmost bin qualifying as a real column start ---
        # (bins are already in left-to-right order because the edges
        # were sorted before clustering)
        total = len(confident)
        min_count = max(3, int(total * _min_col_start_ratio))
        col_start_bin = None
        for b in bins:
            if b[1] >= min_count:
                col_start_bin = b
                break

        if col_start_bin is None:
            result.append(geo)
            continue

        # Words to the left of the column-start bin are sub-column candidates
        split_threshold = col_start_bin[2] - _edge_tolerance
        sub_words = [w for w in geo.words if w['left'] < split_threshold]
        main_words = [w for w in geo.words if w['left'] >= split_threshold]

        # Count only body words (excluding header/footer) for the threshold check
        # so that header/footer words don't artificially trigger a split.
        sub_body = [w for w in sub_words
                    if (min_top_rel is None or w['top'] >= min_top_rel)
                    and (max_top_rel is None or w['top'] <= max_top_rel)]
        if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
            result.append(geo)
            continue

        # --- Build two sub-column geometries ---
        # Word 'left' values are relative to left_x; geo.x is absolute.
        # The split point is placed halfway between the rightmost
        # sub-column word and the main column's start bin, then
        # converted from relative to absolute coordinates.
        max_sub_left = max(w['left'] for w in sub_words)
        split_rel = (max_sub_left + col_start_bin[2]) // 2
        split_abs = split_rel + left_x

        sub_x = geo.x
        sub_width = split_abs - geo.x
        main_x = split_abs
        main_width = (geo.x + geo.width) - split_abs

        # Degenerate split (split point outside the column) — keep as is.
        if sub_width <= 0 or main_width <= 0:
            result.append(geo)
            continue

        sub_geo = ColumnGeometry(
            index=0,
            x=sub_x,
            y=geo.y,
            width=sub_width,
            height=geo.height,
            word_count=len(sub_words),
            words=sub_words,
            width_ratio=sub_width / content_w if content_w > 0 else 0.0,
            is_sub_column=True,
        )
        main_geo = ColumnGeometry(
            index=0,
            x=main_x,
            y=geo.y,
            width=main_width,
            height=geo.height,
            word_count=len(main_words),
            words=main_words,
            width_ratio=main_width / content_w if content_w > 0 else 0.0,
            is_sub_column=True,
        )

        result.append(sub_geo)
        result.append(main_geo)

        logger.info(
            f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
            f"(rel={split_rel}), sub={len(sub_words)} words, "
            f"main={len(main_words)} words, "
            f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
        )

    # Re-index by left-to-right order
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result
|
||
|
||
|
||
def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
    inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
    """Construct ColumnGeometry objects from (abs_start_x, word_count) pairs.

    Each column spans from its own start to the next column's start; the
    last column extends to ``right_x``.  A word belongs to a column when
    its ROI-relative left edge falls in the half-open span
    ``[col_left_rel, col_right_rel)``.
    """
    geometries: List[ColumnGeometry] = []
    n_cols = len(col_starts)

    for idx, (abs_start, _count) in enumerate(col_starts):
        # Right edge of this column: next column's start, or the content
        # right bound for the last one.
        abs_end = col_starts[idx + 1][0] if idx + 1 < n_cols else right_x
        span = abs_end - abs_start

        # Word boxes are relative to left_x; convert the column span once.
        rel_left = abs_start - left_x
        rel_right = rel_left + span
        assigned = [wd for wd in word_dicts
                    if rel_left <= wd['left'] < rel_right]

        ratio = span / content_w if content_w > 0 else 0.0
        geometries.append(ColumnGeometry(
            index=idx,
            x=abs_start,
            y=top_y,
            width=span,
            height=content_h,
            word_count=len(assigned),
            words=assigned,
            width_ratio=ratio,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
|
||
|
||
|
||
def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
    """Detect column geometry using whitespace-gap analysis with word validation.

    Phase A of the two-phase column detection. Uses vertical projection
    profiles to find whitespace gaps between columns, then validates that
    no gap cuts through a word bounding box.

    Falls back to clustering-based detection if fewer than 2 gaps are found.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
        or None if detection fails entirely.

    Coordinate conventions:
        left_x/right_x/top_y/bottom_y and ColumnGeometry.x are absolute image
        coordinates; the word boxes in word_dicts are relative to the content
        ROI (origin at (left_x, top_y)).
    """
    h, w = ocr_img.shape[:2]

    # --- Step 1: Find content bounds ---
    # Invert so ink is white (non-zero) — projection sums then measure ink.
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    # Implausibly small bounds (< 30% of the page in either axis) indicate a
    # failed bounds detection — fall back to the full page.
    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Step 2: Get word bounding boxes from Tesseract ---
    # Crop from left_x to full image width (not right_x) so words at the right
    # edge of the last column are included even if they extend past the detected
    # content boundary (right_x).
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
        return None

    # Keep only confident, non-empty words; 'conf' may be '-1' (non-word
    # boxes), hence the lstrip('-') digit check before int conversion.
    word_dicts = []
    left_edges = []
    edge_word_indices = []
    n_words = len(data['text'])
    for i in range(n_words):
        conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
        text = str(data['text'][i]).strip()
        if conf < 30 or not text:
            continue
        lx = int(data['left'][i])
        ty = int(data['top'][i])
        bw = int(data['width'][i])
        bh = int(data['height'][i])
        left_edges.append(lx)
        edge_word_indices.append(len(word_dicts))
        word_dicts.append({
            'text': text, 'conf': conf,
            'left': lx, 'top': ty, 'width': bw, 'height': bh,
        })

    if len(left_edges) < 5:
        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
        return None

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

    # --- Step 3: Vertical projection profile ---
    # Sum ink per image column, normalized to [0, 1] by the max possible
    # ink (content_h rows of value 255).
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    v_proj = np.sum(content_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps
    kernel_size = max(5, content_w // 80)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for symmetry
    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # --- Step 4: Find whitespace gaps ---
    # Threshold: areas with very little ink density are gaps
    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.005)

    in_gap = v_smooth < gap_threshold
    MIN_GAP_WIDTH = max(8, content_w // 200)  # min ~8px or 0.5% of content width

    # Collect contiguous gap regions
    raw_gaps = []  # (start_x_rel, end_x_rel) relative to content ROI
    gap_start = None
    for x in range(len(in_gap)):
        if in_gap[x]:
            if gap_start is None:
                gap_start = x
        else:
            if gap_start is not None:
                gap_width = x - gap_start
                if gap_width >= MIN_GAP_WIDTH:
                    raw_gaps.append((gap_start, x))
                gap_start = None
    # Handle gap at the right edge
    if gap_start is not None:
        gap_width = len(in_gap) - gap_start
        if gap_width >= MIN_GAP_WIDTH:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_width={MIN_GAP_WIDTH}px): "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")

    # --- Step 5: Validate gaps against word bounding boxes ---
    # A real inter-column gap must not intersect any word box; gaps that do
    # are either shrunk to one side of the offending words or discarded.
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        # Check if any word overlaps with this gap region
        overlapping = False
        for wd in word_dicts:
            word_left = wd['left']
            word_right = wd['left'] + wd['width']
            if word_left < gap_end_rel and word_right > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid the overlapping word(s)
            # Find the tightest word boundaries within the gap region
            min_word_left = content_w
            max_word_right = 0
            for wd in word_dicts:
                word_left = wd['left']
                word_right = wd['left'] + wd['width']
                if word_left < gap_end_rel and word_right > gap_start_rel:
                    min_word_left = min(min_word_left, word_left)
                    max_word_right = max(max_word_right, word_right)

            # Try gap before the overlapping words
            if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
                validated_gaps.append((gap_start_rel, min_word_left))
                logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
            # Try gap after the overlapping words
            elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
                validated_gaps.append((max_word_right, gap_end_rel))
                logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
            else:
                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
        return _detect_columns_by_clustering(
            word_dicts, left_edges, edge_word_indices,
            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
        )

    # --- Step 7: Derive column boundaries from gaps ---
    # Sort gaps by position
    validated_gaps.sort(key=lambda g: g[0])

    # Identify margin gaps (first and last) vs interior gaps
    # A margin gap touches the edge of the content area (within 2% tolerance)
    edge_tolerance = max(10, int(content_w * 0.02))

    is_left_margin = validated_gaps[0][0] <= edge_tolerance
    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance

    # Interior gaps define column boundaries
    # Column starts at the end of a gap, ends at the start of the next gap
    col_starts = []

    if is_left_margin:
        # First column starts after the left margin gap
        first_gap_end = validated_gaps[0][1]
        interior_gaps = validated_gaps[1:]
    else:
        # No left margin gap — first column starts at content left edge
        first_gap_end = 0
        interior_gaps = validated_gaps[:]

    if is_right_margin:
        # Last gap is right margin — don't use it as column start
        interior_gaps_for_boundaries = interior_gaps[:-1]
        right_boundary = validated_gaps[-1][0]  # last column ends at right margin gap start
    else:
        interior_gaps_for_boundaries = interior_gaps
        right_boundary = content_w
    # NOTE(review): right_boundary is computed but not referenced again in
    # this function (Step 8 extends the last column to the image width).

    # First column
    col_starts.append(left_x + first_gap_end)

    # Columns between interior gaps
    for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
        col_starts.append(left_x + gap_end_rel)

    # Count words per column region (for logging)
    col_start_counts = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            next_start = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            # The page margin contains only white space — extending the OCR
            # crop to the image edge is safe and prevents text near the right
            # border from being cut off.
            next_start = w

        col_left_rel = start_x - left_x
        col_right_rel = next_start - left_x
        n_words_in_col = sum(1 for w in word_dicts
                             if col_left_rel <= w['left'] < col_right_rel)
        col_start_counts.append((start_x, n_words_in_col))

    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
                f"{col_start_counts}")

    # --- Step 8: Build ColumnGeometry objects ---
    # Determine right edge for each column
    all_boundaries = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            end_x = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            end_x = w
        all_boundaries.append((start_x, end_x))

    geometries = []
    for i, (start_x, end_x) in enumerate(all_boundaries):
        col_width = end_x - start_x
        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [w for w in word_dicts
                     if col_left_rel <= w['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    # --- Step 9: Filter phantom narrow columns ---
    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
    # columns (< 3% of content width) with zero or no words. These are not
    # real columns — remove them and close the gap between neighbors.
    min_real_col_w = max(20, int(content_w * 0.03))
    filtered_geoms = [g for g in geometries
                      if not (g.word_count < 3 and g.width < min_real_col_w)]
    if len(filtered_geoms) < len(geometries):
        n_removed = len(geometries) - len(filtered_geoms)
        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
                    f"(width < {min_real_col_w}px and words < 3)")
        # Extend each remaining column to close gaps with its right neighbor
        for i, g in enumerate(filtered_geoms):
            if i + 1 < len(filtered_geoms):
                g.width = filtered_geoms[i + 1].x - g.x
            else:
                g.width = w - g.x
            g.index = i
            # Re-assign words to the widened column span.
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [w for w in word_dicts
                       if col_left_rel <= w['left'] < col_right_rel]
            g.word_count = len(g.words)
        geometries = filtered_geoms
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
|
||
|
||
|
||
def expand_narrow_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int,
    word_dicts: List[Dict],
) -> List[ColumnGeometry]:
    """Expand narrow columns into adjacent whitespace gaps.

    Narrow columns (marker, page_ref, < 10% content width) often lose
    content at image edges due to residual shear. This expands them toward
    the neighbouring column, but never past 40% of the gap or past the
    nearest word in the neighbour.

    Must be called AFTER _detect_sub_columns() so that sub-column splits
    (which create the narrowest columns) have already happened.

    Mutates the ColumnGeometry objects in ``geometries`` in place (x, width,
    width_ratio, words, word_count — for the narrow column AND its
    neighbours) and returns the same list.

    Args:
        geometries: Columns sorted left-to-right (mutated in place).
        content_w: Width of the content area in pixels.
        left_x: Absolute x of the content-ROI origin (word boxes in
            word_dicts are relative to this).
        word_dicts: All word boxes, used to re-assign words after resizing.
    """
    _NARROW_THRESHOLD_PCT = 10.0  # columns below this % of content width get expanded
    _MIN_WORD_MARGIN = 4          # px of clearance kept from neighbour words

    # Nothing to expand into with fewer than two columns.
    if len(geometries) < 2:
        return geometries

    logger.info("ExpandNarrowCols: input %d cols: %s",
                len(geometries),
                [(i, g.x, g.width, round(g.width / content_w * 100, 1))
                 for i, g in enumerate(geometries)])

    for i, g in enumerate(geometries):
        col_pct = g.width / content_w * 100 if content_w > 0 else 100
        if col_pct >= _NARROW_THRESHOLD_PCT:
            continue

        expanded = False
        orig_pct = col_pct

        # --- try expanding to the LEFT ---
        if i > 0:
            left_nb = geometries[i - 1]
            # Gap can be 0 if sub-column split created adjacent columns.
            # In that case, look at where the neighbor's rightmost words
            # actually are — there may be unused space we can claim.
            nb_words_right = [wd['left'] + wd.get('width', 0)
                              for wd in left_nb.words]
            if nb_words_right:
                rightmost_word_abs = left_x + max(nb_words_right)
                safe_left_abs = rightmost_word_abs + _MIN_WORD_MARGIN
            else:
                # No words in neighbor → we can take up to neighbor's start
                safe_left_abs = left_nb.x + _MIN_WORD_MARGIN

            if safe_left_abs < g.x:
                # Move our left edge leftward to the safe position.
                g.width += (g.x - safe_left_abs)
                g.x = safe_left_abs
                expanded = True

        # --- try expanding to the RIGHT ---
        if i + 1 < len(geometries):
            right_nb = geometries[i + 1]
            nb_words_left = [wd['left'] for wd in right_nb.words]
            if nb_words_left:
                leftmost_word_abs = left_x + min(nb_words_left)
                safe_right_abs = leftmost_word_abs - _MIN_WORD_MARGIN
            else:
                safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN

            cur_right = g.x + g.width
            if safe_right_abs > cur_right:
                g.width = safe_right_abs - g.x
                expanded = True

        if expanded:
            # Re-assign this column's words to the enlarged span.
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [wd for wd in word_dicts
                       if col_left_rel <= wd['left'] < col_right_rel]
            g.word_count = len(g.words)
            g.width_ratio = g.width / content_w if content_w > 0 else 0.0
            logger.info(
                "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d",
                i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count)

            # --- Shrink overlapping neighbors to match new boundaries ---
            # Left neighbor: its right edge must not exceed our new left edge
            if i > 0:
                left_nb = geometries[i - 1]
                nb_right = left_nb.x + left_nb.width
                if nb_right > g.x:
                    left_nb.width = g.x - left_nb.x
                    if left_nb.width < 0:
                        # Clamp: neighbour fully consumed by the expansion.
                        left_nb.width = 0
                    left_nb.width_ratio = left_nb.width / content_w if content_w > 0 else 0.0
                    # Re-assign words
                    nb_left_rel = left_nb.x - left_x
                    nb_right_rel = nb_left_rel + left_nb.width
                    left_nb.words = [wd for wd in word_dicts
                                     if nb_left_rel <= wd['left'] < nb_right_rel]
                    left_nb.word_count = len(left_nb.words)

            # Right neighbor: its left edge must not be before our new right edge
            if i + 1 < len(geometries):
                right_nb = geometries[i + 1]
                my_right = g.x + g.width
                if right_nb.x < my_right:
                    old_right_edge = right_nb.x + right_nb.width
                    right_nb.x = my_right
                    right_nb.width = old_right_edge - right_nb.x
                    if right_nb.width < 0:
                        # Clamp: neighbour fully consumed by the expansion.
                        right_nb.width = 0
                    right_nb.width_ratio = right_nb.width / content_w if content_w > 0 else 0.0
                    # Re-assign words
                    nb_left_rel = right_nb.x - left_x
                    nb_right_rel = nb_left_rel + right_nb.width
                    right_nb.words = [wd for wd in word_dicts
                                     if nb_left_rel <= wd['left'] < nb_right_rel]
                    right_nb.word_count = len(right_nb.words)

    return geometries
|
||
|
||
|
||
# =============================================================================
|
||
# Row Geometry Detection (horizontal whitespace-gap analysis)
|
||
# =============================================================================
|
||
|
||
def detect_row_geometry(
    inv: np.ndarray,
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
) -> List['RowGeometry']:
    """Detect row geometry using horizontal whitespace-gap analysis.

    Mirrors the vertical gap approach used for columns, but operates on
    horizontal projection profiles to find gaps between text lines.
    Also classifies header/footer rows based on gap size.

    Args:
        inv: Inverted binarized image (white text on black bg, full page).
        word_dicts: Word bounding boxes from Tesseract (relative to content ROI).
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects sorted top to bottom. Row x/y are
        absolute image coordinates; each row carries a row_type of
        'header', 'content' or 'footer'.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile (text-only, images masked out) ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]

    # Build a word-coverage mask so that image regions (high ink density but no
    # Tesseract words) are ignored. Only pixels within/near word bounding boxes
    # contribute to the projection. This prevents large illustrations from
    # merging multiple vocabulary rows into one.
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255

    masked_strip = cv2.bitwise_and(content_strip, word_mask)
    # Ink per image row, normalized to [0, 1] by the max possible ink.
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

    # --- Step 2: Smoothing + threshold ---
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for a symmetric smoothing window
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = []  # (start_y_rel, end_y_rel) relative to content ROI
    gap_start = None
    for y in range(len(in_gap)):
        if in_gap[y]:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    # Close out a gap that runs to the bottom of the content area.
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    # A real inter-row gap must not intersect any word box; gaps that do are
    # shrunk to one side of the offending words or discarded.
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        overlapping = False
        for wd in word_dicts:
            word_top = wd['top']
            word_bottom = wd['top'] + wd['height']
            if word_top < gap_end_rel and word_bottom > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid overlapping words
            min_word_top = content_h
            max_word_bottom = 0
            for wd in word_dicts:
                word_top = wd['top']
                word_bottom = wd['top'] + wd['height']
                if word_top < gap_end_rel and word_bottom > gap_start_rel:
                    min_word_top = min(min_word_top, word_top)
                    max_word_bottom = max(max_word_bottom, word_bottom)

            if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
                validated_gaps.append((gap_start_rel, min_word_top))
            elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
                validated_gaps.append((max_word_bottom, gap_end_rel))
            else:
                logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    # An unusually large gap near the top/bottom of the page separates the
    # header/footer from the body content.
    HEADER_FOOTER_ZONE = 0.15  # top/bottom 15% of the content height
    GAP_MULTIPLIER = 2.0       # "large" = > 2x the median inter-line gap

    gap_sizes = [g[1] - g[0] for g in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    header_boundary_rel = None  # y below which is header
    footer_boundary_rel = None  # y above which is footer

    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    # Find largest gap in header zone
    best_header_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]):
                best_header_gap = (gs, ge)

    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    # Find largest gap in footer zone
    best_footer_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]):
                best_footer_gap = (gs, ge)

    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between gaps
    row_boundaries = []  # (start_y_rel, end_y_rel)

    # Top of content to first gap
    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
        row_boundaries.append((0, validated_gaps[0][0]))

    # Between gaps
    for i in range(len(validated_gaps) - 1):
        row_start = validated_gaps[i][1]
        row_end = validated_gaps[i + 1][0]
        if row_end - row_start > 0:
            row_boundaries.append((row_start, row_end))

    # Last gap to bottom of content
    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Determine row type by the row's vertical midpoint relative to the
        # header/footer boundaries found in Step 5.
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # Collect words in this row (assignment by word vertical center)
        row_words = [w for w in word_dicts
                     if w['top'] + w['height'] / 2 >= row_start_rel
                     and w['top'] + w['height'] / 2 < row_end_rel]

        # Gap before this row
        gap_before = 0
        if idx == 0 and validated_gaps[0][0] > 0:
            gap_before = validated_gaps[0][0]
        elif idx > 0:
            # Find the gap just before this row boundary
            for gs, ge in validated_gaps:
                if ge == row_start_rel:
                    gap_before = ge - gs
                    break

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    # Derive precise row boundaries from word vertical centers. Detects
    # section breaks (headings, paragraphs) and builds per-section grids.
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows
|
||
|
||
|
||
def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    1. Group words into line clusters (by Y proximity).
    2. For each cluster compute center_y (median of word vertical centers)
       and letter_height (median of word heights).
    3. Compute the pitch (distance between consecutive centers).
    4. Detect section breaks where the gap is >1.8× the median pitch
       (headings, sub-headings, paragraph breaks).
    5. Within each section, use the local pitch to place row boundaries
       at the midpoints between consecutive centers.
    6. Validate that ≥85% of words land in a grid row; otherwise fall back.

    Header/footer rows from the gap-based detection are preserved.

    Args:
        rows: Gap-based RowGeometry list (content plus header/footer rows).
        word_dicts: All word dicts for the page — unused here directly;
            words are taken from rows[].words instead. Each word dict has
            'left', 'top', 'width', 'height' keys (Y values appear to be
            relative to the content ROI, since top_y is added to produce
            absolute coordinates — TODO confirm against the caller).
        left_x, right_x: Horizontal content bounds in absolute pixels.
        top_y: Top of the content area (absolute pixels).
        content_w, content_h: Content area dimensions in pixels.
        inv: Inverted binary page image — not used by this implementation;
            presumably kept for signature compatibility with the caller.

    Returns:
        Re-indexed list of RowGeometry sorted by Y (non-content rows
        preserved), or the original *rows* unchanged whenever there is too
        little data or the rebuilt grid fails validation.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']

    # Too few content rows to estimate a reliable pitch — keep gap-based rows.
    if len(content_rows) < 5:
        return rows

    # --- Step A: Group ALL words into line clusters ---
    # Collect words that belong to content rows (deduplicated by bbox:
    # the same word dict may be referenced by more than one gap-based row).
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
        for w in r.words:
            key = (w['left'], w['top'], w['width'], w['height'])
            if key not in seen_keys:
                seen_keys.add(key)
                content_words.append(w)

    if len(content_words) < 5:
        return rows

    # Compute median word height (excluding outliers like tall brackets/IPA)
    word_heights = sorted(w['height'] for w in content_words)
    median_wh = word_heights[len(word_heights) // 2]

    # Compute median gap-based row height — this is the actual line height
    # as detected by the horizontal projection. We use 40% of this as
    # grouping tolerance. This is much more reliable than using word height
    # alone, because words on the same line can have very different heights
    # (e.g. lowercase vs uppercase, brackets, phonetic symbols).
    gap_row_heights = sorted(r.height for r in content_rows)
    median_row_h = gap_row_heights[len(gap_row_heights) // 2]

    # Tolerance: 40% of row height. Words on the same line should have
    # centers within this range. Even if a word's bbox is taller/shorter,
    # its center should stay within half a row height of the line center.
    y_tol = max(10, int(median_row_h * 0.4))

    # Sort by center_y, then group by proximity
    words_by_center = sorted(content_words,
                             key=lambda w: (w['top'] + w['height'] / 2, w['left']))
    line_clusters: List[List[Dict]] = []
    current_line: List[Dict] = [words_by_center[0]]
    current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2

    # NOTE: the cluster anchor is the first word's center of the cluster,
    # not a running mean — a drift larger than y_tol starts a new cluster.
    for w in words_by_center[1:]:
        w_center = w['top'] + w['height'] / 2
        if abs(w_center - current_center) <= y_tol:
            current_line.append(w)
        else:
            current_line.sort(key=lambda w: w['left'])
            line_clusters.append(current_line)
            current_line = [w]
            current_center = w_center

    if current_line:
        current_line.sort(key=lambda w: w['left'])
        line_clusters.append(current_line)

    if len(line_clusters) < 3:
        return rows

    # --- Step B: Compute center_y per cluster ---
    # center_y = median of (word_top + word_height/2) across all words in cluster
    # letter_h = median of word heights, but excluding outlier-height words
    # (>2× median) so that tall brackets/IPA don't skew the height
    cluster_info: List[Dict] = []
    for cl_words in line_clusters:
        centers = [w['top'] + w['height'] / 2 for w in cl_words]
        # Filter outlier heights for letter_h computation
        normal_heights = [w['height'] for w in cl_words
                          if w['height'] <= median_wh * 2.0]
        if not normal_heights:
            normal_heights = [w['height'] for w in cl_words]
        center_y = float(np.median(centers))
        letter_h = float(np.median(normal_heights))
        cluster_info.append({
            'center_y_rel': center_y,  # relative to content ROI
            'center_y_abs': center_y + top_y,  # absolute
            'letter_h': letter_h,
            'words': cl_words,
        })

    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    # Even with center-based grouping, some edge cases can produce
    # spurious clusters. Merge any pair whose centers are closer
    # than 30% of the row height (they're definitely the same text line).
    merge_threshold = max(8, median_row_h * 0.3)
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            # Merge: combine words, recompute center
            combined_words = prev['words'] + cl['words']
            centers = [w['top'] + w['height'] / 2 for w in combined_words]
            normal_heights = [w['height'] for w in combined_words
                              if w['height'] <= median_wh * 2.0]
            if not normal_heights:
                normal_heights = [w['height'] for w in combined_words]
            prev['center_y_rel'] = float(np.median(centers))
            prev['center_y_abs'] = prev['center_y_rel'] + top_y
            prev['letter_h'] = float(np.median(normal_heights))
            prev['words'] = combined_words
        else:
            merged.append(cl)

    cluster_info = merged

    if len(cluster_info) < 3:
        return rows

    # --- Step C: Compute pitches and detect section breaks ---
    pitches: List[float] = []
    for i in range(1, len(cluster_info)):
        pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        pitches.append(pitch)

    if not pitches:
        return rows

    median_pitch = float(np.median(pitches))
    # Degenerate pitch (≤5 px) means the clustering collapsed — bail out.
    if median_pitch <= 5:
        return rows

    # A section break is where the gap between line centers is much larger
    # than the normal pitch (sub-headings, section titles, etc.)
    BREAK_FACTOR = 1.8

    # --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
    sections: List[List[Dict]] = []
    current_section: List[Dict] = [cluster_info[0]]

    for i in range(1, len(cluster_info)):
        gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        if gap > median_pitch * BREAK_FACTOR:
            sections.append(current_section)
            current_section = [cluster_info[i]]
        else:
            current_section.append(cluster_info[i])

    if current_section:
        sections.append(current_section)

    # --- Step E: Build row boundaries per section ---
    grid_rows: List[RowGeometry] = []

    for section in sections:
        if not section:
            continue

        if len(section) == 1:
            # Single-line section (likely a heading)
            cl = section[0]
            # Half-height: at least one letter height, at least 40% of pitch.
            half_h = max(cl['letter_h'], median_pitch * 0.4)
            row_top = cl['center_y_abs'] - half_h
            row_bot = cl['center_y_abs'] + half_h
            grid_rows.append(RowGeometry(
                index=0,  # re-indexed in Step H
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))
            continue

        # Compute local pitch for this section
        local_pitches = []
        for i in range(1, len(section)):
            local_pitches.append(
                section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
            )
        local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch

        # Row boundaries are placed at midpoints between consecutive centers.
        # First row: top = center - local_pitch/2
        # Last row: bottom = center + local_pitch/2
        for i, cl in enumerate(section):
            if i == 0:
                row_top = cl['center_y_abs'] - local_pitch / 2
            else:
                # Midpoint between this center and previous center
                prev_center = section[i - 1]['center_y_abs']
                row_top = (prev_center + cl['center_y_abs']) / 2

            if i == len(section) - 1:
                row_bot = cl['center_y_abs'] + local_pitch / 2
            else:
                next_center = section[i + 1]['center_y_abs']
                row_bot = (cl['center_y_abs'] + next_center) / 2

            # Clamp to reasonable bounds — keeps the first/last rows inside
            # the content area so they cannot expand into header/footer space.
            row_top = max(top_y, row_top)
            row_bot = min(top_y + content_h, row_bot)

            # Skip degenerate slivers produced by clamping
            if row_bot - row_top < 5:
                continue

            grid_rows.append(RowGeometry(
                index=0,  # re-indexed in Step H
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))

    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    # Words may have shifted slightly; assign each word to the row whose
    # center is closest to the word's vertical center.
    for gr in grid_rows:
        gr.words = []

    for w in content_words:
        w_center = w['top'] + top_y + w['height'] / 2  # absolute word center
        best_row = None
        best_dist = float('inf')
        for gr in grid_rows:
            row_center = gr.y + gr.height / 2
            dist = abs(w_center - row_center)
            if dist < best_dist:
                best_dist = dist
                best_row = gr
        # Words more than one median pitch from any row center are dropped
        # (they likely belonged to a sliver row removed during clamping).
        if best_row is not None and best_dist < median_pitch:
            best_row.words.append(w)

    for gr in grid_rows:
        gr.word_count = len(gr.words)

    # --- Step G: Validate ---
    words_placed = sum(gr.word_count for gr in grid_rows)
    if len(content_words) > 0:
        match_ratio = words_placed / len(content_words)
        if match_ratio < 0.85:
            logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                        f"of words, keeping gap-based rows")
            return rows

    # Remove empty grid rows (no words assigned)
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge header/footer + re-index ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights) if row_heights else 0
    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")

    return result


def _build_rows_from_word_grouping(
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
    content_w: int, content_h: int,
) -> List['RowGeometry']:
    """Fallback row builder: cluster words into lines by Y position.

    Delegates the actual clustering to _group_words_into_lines() with a
    generous tolerance, then wraps each non-empty line in a full-width
    RowGeometry. No header/footer detection happens in fallback mode.
    """
    if not word_dicts:
        return []

    # Tolerance scales with page height but never drops below 20 px.
    tolerance = max(20, content_h // 100)
    grouped_lines = _group_words_into_lines(word_dicts, y_tolerance_px=tolerance)

    rows: List['RowGeometry'] = []
    for line_idx, words_in_line in enumerate(grouped_lines):
        if not words_in_line:
            continue

        # Vertical extent of this line = union of its word bboxes.
        band_top = min(w['top'] for w in words_in_line)
        band_bottom = max(w['top'] + w['height'] for w in words_in_line)

        rows.append(RowGeometry(
            index=line_idx,
            x=left_x,
            y=top_y + band_top,
            width=content_w,
            height=band_bottom - band_top,
            word_count=len(words_in_line),
            words=words_in_line,
            row_type='content',
            gap_before=0,
        ))

    logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
    return rows


# --- Phase B: Content-Based Classification ---
|
||
|
||
def _score_language(words: List[Dict]) -> Dict[str, float]:
    """Score the language of a column's words.

    Analyzes function words, umlauts, and capitalization patterns
    to determine whether text is English or German.

    Args:
        words: List of word dicts with 'text' and 'conf' keys.

    Returns:
        Dict with 'eng' and 'deu' scores (0.0-1.0).
    """
    if not words:
        return {'eng': 0.0, 'deu': 0.0}

    # Only consider words with decent confidence
    good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0]
    if not good_words:
        return {'eng': 0.0, 'deu': 0.0}

    total = len(good_words)
    en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS)
    de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS)

    # Check for umlauts (strong German signal)
    # NOTE: unlike good_words, raw_texts is NOT filtered for empty strings.
    raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40]
    umlaut_count = sum(1 for t in raw_texts
                       for c in t if c in 'äöüÄÖÜß')

    # German capitalization: nouns are capitalized mid-sentence.
    # Count words that start with uppercase but aren't at position 0.
    # BUG FIX: check the length BEFORE indexing t[0] — a confident but
    # empty OCR word previously raised IndexError here.
    cap_words = sum(1 for t in raw_texts if len(t) > 2 and t[0].isupper())

    en_score = en_hits / total if total > 0 else 0.0
    de_score = de_hits / total if total > 0 else 0.0

    # Boost German score for umlauts
    if umlaut_count > 0:
        de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))

    # Boost German score for high capitalization ratio (typical for German nouns)
    if total > 5:
        cap_ratio = cap_words / total
        if cap_ratio > 0.3:
            de_score = min(1.0, de_score + 0.1)

    return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}


def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
    """Score the role of a column based on its geometry and content patterns.

    Args:
        geom: ColumnGeometry with words and dimensions.

    Returns:
        Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'.
    """
    scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0}

    if not geom.words:
        return scores

    # Only words with decent OCR confidence participate in the scoring.
    confident = [w['text'] for w in geom.words if w.get('conf', 0) > 40]
    if not confident:
        return scores

    n = len(confident)
    mean_len = sum(map(len, confident)) / n
    punct_count = sum(1 for t in confident if any(ch in t for ch in '.!?;:,'))
    numeric_count = sum(1 for t in confident if any(ch.isdigit() for ch in t))
    numeric_ratio = numeric_count / n if confident else 0.0

    wr = geom.width_ratio

    # Reference: narrow + mostly numbers/page references
    if wr < 0.12:
        if numeric_ratio > 0.4:
            scores['reference'] = min(1.0, 0.5 + numeric_ratio * 0.5)
        else:
            scores['reference'] = 0.5

    # Marker: narrow + few short entries
    if wr < 0.06 and geom.word_count <= 15:
        scores['marker'] = 0.9 if mean_len < 4 else 0.7
    # Very narrow non-edge column → strong marker regardless of word count
    if wr < 0.04 and geom.index > 0:
        scores['marker'] = max(scores['marker'], 0.9)

    # Sentence: longer words + punctuation present
    if wr > 0.15 and punct_count > 2:
        sentence = 0.3 + min(0.5, punct_count / n)
        if mean_len > 4:
            sentence = min(1.0, sentence + 0.2)
        scores['sentence'] = sentence

    # Vocabulary: medium width + medium word length
    if 0.10 < wr < 0.45:
        vocab = 0.4
        if 3 < mean_len < 8:
            vocab = min(1.0, vocab + 0.3)
        scores['vocabulary'] = vocab

    return {k: round(v, 3) for k, v in scores.items()}


def _build_margin_regions(
    all_regions: List[PageRegion],
    left_x: int,
    right_x: int,
    img_w: int,
    top_y: int,
    content_h: int,
) -> List[PageRegion]:
    """Create margin_left / margin_right PageRegions from content bounds.

    Margins represent the space between the image edge and the first/last
    content column. They are used downstream for faithful page
    reconstruction but are skipped during OCR.
    """
    # Gaps at or below this many pixels are not worth a margin region.
    min_gap_px = 5
    result: List[PageRegion] = []

    # Left margin: from the image edge to the first content column.
    if left_x > min_gap_px:
        result.append(PageRegion(
            type='margin_left', x=0, y=top_y,
            width=left_x, height=content_h,
            classification_confidence=1.0,
            classification_method='content_bounds',
        ))

    # Right margin: from end of last content column to image edge
    structural_types = ('margin_left', 'margin_right', 'header', 'footer',
                        'margin_top', 'margin_bottom')
    content_cols = [r for r in all_regions if r.type not in structural_types]
    # Fall back to the detected right content bound when no column exists.
    last_col_end = max((r.x + r.width for r in content_cols), default=right_x)
    right_gap = img_w - last_col_end
    if right_gap > min_gap_px:
        result.append(PageRegion(
            type='margin_right', x=last_col_end, y=top_y,
            width=right_gap, height=content_h,
            classification_confidence=1.0,
            classification_method='content_bounds',
        ))

    if result:
        logger.info(f"Margins: {[(m.type, m.x, m.width) for m in result]} "
                    f"(left_x={left_x}, right_x={right_x}, img_w={img_w})")

    return result


def classify_column_types(geometries: List[ColumnGeometry],
                          content_w: int,
                          top_y: int,
                          img_w: int,
                          img_h: int,
                          bottom_y: int,
                          left_x: int = 0,
                          right_x: int = 0,
                          inv: Optional[np.ndarray] = None) -> List[PageRegion]:
    """Classify column types using a 3-level fallback chain.

    Level 1: Content-based (language + role scoring)
    Level 2: Position + language (old rules enhanced with language detection)
    Level 3: Pure position (exact old code, no regression)

    Args:
        geometries: List of ColumnGeometry from Phase A.
        content_w: Total content width.
        top_y: Top Y of content area.
        img_w: Full image width.
        img_h: Full image height.
        bottom_y: Bottom Y of content area.
        left_x: Left content bound (from _find_content_bounds).
        right_x: Right content bound (from _find_content_bounds).
        inv: Inverted binary page image, forwarded to _add_header_footer
            (may be None).

    Returns:
        List of PageRegion with types, confidence, and method.

    Side effects:
        Mutates the .index attribute of the surviving ColumnGeometry
        objects in *geometries* (re-indexing after the edge-column filter).
    """
    content_h = bottom_y - top_y

    def _with_margins(result: List[PageRegion]) -> List[PageRegion]:
        """Append margin_left / margin_right regions to *result*."""
        margins = _build_margin_regions(result, left_x, right_x, img_w, top_y, content_h)
        return result + margins

    # Special case: single column → plain text page
    if len(geometries) == 1:
        geom = geometries[0]
        return _with_margins([PageRegion(
            type='column_text', x=geom.x, y=geom.y,
            width=geom.width, height=geom.height,
            classification_confidence=0.9,
            classification_method='content',
        )])

    # --- Pre-filter: first/last columns with very few words → column_ignore ---
    # Sub-columns from _detect_sub_columns() are exempt: they intentionally
    # have few words (page refs, markers) and should not be discarded.
    ignore_regions = []
    active_geometries = []
    for idx, g in enumerate(geometries):
        if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column:
            ignore_regions.append(PageRegion(
                type='column_ignore', x=g.x, y=g.y,
                width=g.width, height=content_h,
                classification_confidence=0.95,
                classification_method='content',
            ))
            logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) → column_ignore (edge, few words)")
        else:
            active_geometries.append(g)

    # Re-index active geometries for classification
    # (in-place mutation: downstream scorers read g.index)
    for new_idx, g in enumerate(active_geometries):
        g.index = new_idx
    geometries = active_geometries

    # Handle edge case: all columns ignored or only 1 left
    if len(geometries) == 0:
        return _with_margins(ignore_regions)
    if len(geometries) == 1:
        geom = geometries[0]
        ignore_regions.append(PageRegion(
            type='column_text', x=geom.x, y=geom.y,
            width=geom.width, height=geom.height,
            classification_confidence=0.9,
            classification_method='content',
        ))
        return _with_margins(ignore_regions)

    # --- Score all columns ---
    lang_scores = [_score_language(g.words) for g in geometries]
    role_scores = [_score_role(g) for g in geometries]

    logger.info(f"ClassifyColumns: language scores: "
                f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}")
    logger.info(f"ClassifyColumns: role scores: "
                f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}")

    # --- Level 1: Content-based classification ---
    # _add_header_footer is called for its side effect on *regions*
    # (presumably appends header/footer regions — confirm at its definition).
    regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
        _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
        return _with_margins(ignore_regions + regions)

    # --- Level 2: Position + language enhanced ---
    regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
        _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
        return _with_margins(ignore_regions + regions)

    # --- Level 3: Pure position fallback (old code, no regression) ---
    logger.info("ClassifyColumns: Level 3 (position fallback)")
    regions = _classify_by_position_fallback(geometries, content_w, content_h)
    _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
    return _with_margins(ignore_regions + regions)


def _classify_by_content(geometries: List[ColumnGeometry],
                         lang_scores: List[Dict[str, float]],
                         role_scores: List[Dict[str, float]],
                         content_w: int,
                         content_h: int) -> Optional[List[PageRegion]]:
    """Level 1: Classify columns purely by content analysis.

    Requires clear language signals to distinguish EN/DE columns.
    Returns None if language signals are too weak (caller then falls
    through to the positional levels).

    Args:
        geometries: Re-indexed active columns (parallel to the score lists).
        lang_scores: Per-column output of _score_language().
        role_scores: Per-column output of _score_role().
        content_w: Total content width in pixels.
        content_h: Content area height used for all emitted regions.

    Returns:
        PageRegion list sorted by x, or None when classification fails.
    """
    regions = []
    assigned = set()

    # Step 1: Assign structural roles first (reference, marker)
    # left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref
    left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0

    for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
        is_left_side = geom.x < left_20_threshold
        # A column with real language content should never be demoted to page_ref.
        has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
            regions.append(PageRegion(
                type='page_ref', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['reference'],
                classification_method='content',
            ))
            assigned.add(i)
        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['marker'],
                classification_method='content',
            ))
            assigned.add(i)
        elif geom.width_ratio < 0.05 and not is_left_side:
            # Narrow column on the right side → marker, not page_ref
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.8,
                classification_method='content',
            ))
            assigned.add(i)

    # Step 2: Among remaining columns, find EN and DE by language scores
    remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
                 for i in range(len(geometries)) if i not in assigned]

    if len(remaining) < 2:
        # Not enough columns for EN/DE pair — emit what we have (a lone
        # leftover column becomes generic text) and succeed.
        if len(remaining) == 1:
            i, geom, ls, rs = remaining[0]
            regions.append(PageRegion(
                type='column_text', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.6,
                classification_method='content',
            ))
        regions.sort(key=lambda r: r.x)
        return regions

    # Check if we have enough language signal
    en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
    de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]

    # Position tiebreaker: when language signals are weak, use left=EN, right=DE
    if (not en_candidates or not de_candidates) and len(remaining) >= 2:
        max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
        max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
        if max_eng < 0.15 and max_deu < 0.15:
            # Both signals weak — fall back to positional: left=EN, right=DE
            sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
            best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
            best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
            logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
            # Low confidence: the assignment is positional, not content-backed.
            en_conf = 0.4
            de_conf = 0.4

            regions.append(PageRegion(
                type='column_en', x=best_en[1].x, y=best_en[1].y,
                width=best_en[1].width, height=content_h,
                classification_confidence=en_conf,
                classification_method='content',
            ))
            assigned.add(best_en[0])

            regions.append(PageRegion(
                type='column_de', x=best_de[1].x, y=best_de[1].y,
                width=best_de[1].width, height=content_h,
                classification_confidence=de_conf,
                classification_method='content',
            ))
            assigned.add(best_de[0])

            # Assign remaining as example
            for i, geom, ls, rs in remaining:
                if i not in assigned:
                    regions.append(PageRegion(
                        type='column_example', x=geom.x, y=geom.y,
                        width=geom.width, height=content_h,
                        classification_confidence=0.4,
                        classification_method='content',
                    ))
            regions.sort(key=lambda r: r.x)
            return regions

    if not en_candidates or not de_candidates:
        # Language signals too weak for content-based classification
        logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
        return None

    # Pick the best EN and DE candidates
    best_en = max(en_candidates, key=lambda x: x[2]['eng'])
    best_de = max(de_candidates, key=lambda x: x[2]['deu'])

    if best_en[0] == best_de[0]:
        # Same column scored highest for both — ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
        return None

    # Confidence is taken directly from the winning language score.
    en_conf = best_en[2]['eng']
    de_conf = best_de[2]['deu']

    regions.append(PageRegion(
        type='column_en', x=best_en[1].x, y=best_en[1].y,
        width=best_en[1].width, height=content_h,
        classification_confidence=round(en_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_en[0])

    regions.append(PageRegion(
        type='column_de', x=best_de[1].x, y=best_de[1].y,
        width=best_de[1].width, height=content_h,
        classification_confidence=round(de_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_de[0])

    # Step 3: Remaining columns → example or text based on role scores
    # NOTE: both branches currently emit 'column_example'; only the
    # confidence differs (sentence score vs. flat 0.5).
    for i, geom, ls, rs in remaining:
        if i in assigned:
            continue
        if rs['sentence'] > 0.4:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=round(rs['sentence'], 2),
                classification_method='content',
            ))
        else:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.5,
                classification_method='content',
            ))

    regions.sort(key=lambda r: r.x)
    return regions


def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                   lang_scores: List[Dict[str, float]],
                                   content_w: int,
                                   content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: Position-based rules enhanced with language confirmation.

    Uses the old positional heuristics but confirms EN/DE assignment
    with language scores (swapping if needed).
    """
    def _region(kind: str, g: ColumnGeometry, conf: float) -> PageRegion:
        """Build a full-content-height region for column *g*."""
        return PageRegion(
            type=kind, x=g.x, y=g.y,
            width=g.width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        )

    regions: List[PageRegion] = []
    pending = list(range(len(geometries)))
    origin_x = geometries[0].x if geometries else 0
    left_zone_limit = origin_x + content_w * 0.20

    # Rule 1: Leftmost narrow column → page_ref (only if in left 20%, no strong language)
    first = geometries[0]
    first_lang = lang_scores[0]
    strong_language = first_lang['eng'] > 0.3 or first_lang['deu'] > 0.3
    if first.width_ratio < 0.12 and first.x < left_zone_limit and not strong_language:
        regions.append(_region('page_ref', first, 0.8))
        pending.remove(0)

    # Rule 2: Narrow columns with few words → marker
    for col in list(pending):
        g = geometries[col]
        if g.width_ratio < 0.06 and g.word_count <= 15:
            regions.append(_region('column_marker', g, 0.7))
            pending.remove(col)

    # Rule 3: Rightmost remaining → column_example (if 3+ remaining)
    if len(pending) >= 3:
        rightmost = pending.pop()
        regions.append(_region('column_example', geometries[rightmost], 0.7))

    # Rule 4: First two remaining → EN/DE, but check language to possibly swap
    if len(pending) >= 2:
        a, b = pending[0], pending[1]
        score_a, score_b = lang_scores[a], lang_scores[b]

        # Default: first=EN, second=DE (old behavior)
        en_col, de_col, conf = a, b, 0.7

        # Swap if language signals clearly indicate the opposite
        if score_a['deu'] > score_a['eng'] and score_b['eng'] > score_b['deu']:
            en_col, de_col, conf = b, a, 0.85
            logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores")

        regions.append(_region('column_en', geometries[en_col], conf))
        regions.append(_region('column_de', geometries[de_col], conf))
        pending = pending[2:]
    elif len(pending) == 1:
        regions.append(_region('column_en', geometries[pending[0]], 0.5))
        pending = []

    # Remaining → example
    for col in pending:
        regions.append(_region('column_example', geometries[col], 0.5))

    regions.sort(key=lambda r: r.x)
    return regions


def _classify_by_position_fallback(geometries: List[ColumnGeometry],
                                   content_w: int,
                                   content_h: int) -> List[PageRegion]:
    """Level 3: Pure position-based fallback (identical to old code).

    Assigns column types purely from geometry (x-position, width ratio,
    word count) without any content analysis. Guarantees no regression
    from the previous behavior.

    Args:
        geometries: Detected column geometries, ordered left-to-right.
        content_w: Width of the content area in pixels.
        content_h: Height of the content area in pixels.

    Returns:
        PageRegion list sorted by x. All regions carry confidence 1.0 and
        method 'position_fallback'.
    """
    # Guard: no detected columns → nothing to classify. (The previous
    # version guarded only first_x and then raised IndexError on [].)
    if not geometries:
        return []

    def _region(rtype: str, geom: ColumnGeometry) -> PageRegion:
        # All fallback regions span the full content height.
        return PageRegion(
            type=rtype, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        )

    regions: List[PageRegion] = []
    untyped = list(range(len(geometries)))
    first_x = geometries[0].x
    left_20_threshold = first_x + content_w * 0.20

    # Rule 1: Leftmost narrow column → page_ref (only if in left 20%)
    g0 = geometries[0]
    if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
        regions.append(_region('page_ref', g0))
        untyped.remove(0)

    # Rule 2: Narrow + few words → marker
    for i in list(untyped):
        geom = geometries[i]
        if geom.width_ratio < 0.06 and geom.word_count <= 15:
            regions.append(_region('column_marker', geom))
            untyped.remove(i)

    # Rule 3: Rightmost remaining → example (only when 3+ remain, so the
    # EN/DE pair is still available)
    if len(untyped) >= 3:
        regions.append(_region('column_example', geometries[untyped.pop()]))

    # Rule 4: First remaining → EN, second → DE
    if len(untyped) >= 2:
        regions.append(_region('column_en', geometries[untyped[0]]))
        regions.append(_region('column_de', geometries[untyped[1]]))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        regions.append(_region('column_en', geometries[untyped[0]]))
        untyped = []

    # Anything left over → additional example columns
    for idx in untyped:
        regions.append(_region('column_example', geometries[idx]))

    regions.sort(key=lambda r: r.x)
    return regions
|
||
|
||
|
||
def _detect_header_footer_gaps(
|
||
inv: np.ndarray,
|
||
img_w: int,
|
||
img_h: int,
|
||
) -> Tuple[Optional[int], Optional[int]]:
|
||
"""Detect header/footer boundaries via horizontal projection gap analysis.
|
||
|
||
Scans the full-page inverted image for large horizontal gaps in the top/bottom
|
||
20% that separate header/footer content from the main body.
|
||
|
||
Returns:
|
||
(header_y, footer_y) — absolute y-coordinates.
|
||
header_y = bottom edge of header region (None if no header detected).
|
||
footer_y = top edge of footer region (None if no footer detected).
|
||
"""
|
||
HEADER_FOOTER_ZONE = 0.20
|
||
GAP_MULTIPLIER = 2.0
|
||
|
||
# Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
|
||
actual_h = min(inv.shape[0], img_h)
|
||
roi = inv[:actual_h, :]
|
||
h_proj = np.sum(roi, axis=1).astype(float)
|
||
proj_w = roi.shape[1]
|
||
h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj
|
||
|
||
# Step 2: Smoothing
|
||
kernel_size = max(3, actual_h // 200)
|
||
if kernel_size % 2 == 0:
|
||
kernel_size += 1
|
||
h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
|
||
|
||
# Step 3: Gap threshold
|
||
positive = h_smooth[h_smooth > 0]
|
||
median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
|
||
gap_threshold = max(median_density * 0.15, 0.003)
|
||
|
||
in_gap = h_smooth < gap_threshold
|
||
MIN_GAP_HEIGHT = max(3, actual_h // 500)
|
||
|
||
# Step 4: Collect contiguous gaps
|
||
raw_gaps: List[Tuple[int, int]] = []
|
||
gap_start: Optional[int] = None
|
||
for y in range(len(in_gap)):
|
||
if in_gap[y]:
|
||
if gap_start is None:
|
||
gap_start = y
|
||
else:
|
||
if gap_start is not None:
|
||
gap_height = y - gap_start
|
||
if gap_height >= MIN_GAP_HEIGHT:
|
||
raw_gaps.append((gap_start, y))
|
||
gap_start = None
|
||
if gap_start is not None:
|
||
gap_height = len(in_gap) - gap_start
|
||
if gap_height >= MIN_GAP_HEIGHT:
|
||
raw_gaps.append((gap_start, len(in_gap)))
|
||
|
||
if not raw_gaps:
|
||
return None, None
|
||
|
||
# Step 5: Compute median gap size and large-gap threshold
|
||
gap_sizes = [g[1] - g[0] for g in raw_gaps]
|
||
median_gap = float(np.median(gap_sizes))
|
||
large_gap_threshold = median_gap * GAP_MULTIPLIER
|
||
|
||
# Step 6: Find largest qualifying gap in header / footer zones
|
||
# A separator gap must have content on BOTH sides — edge-touching gaps
|
||
# (e.g. dewarp padding at bottom) are not valid separators.
|
||
EDGE_MARGIN = max(5, actual_h // 400)
|
||
header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
|
||
footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
|
||
|
||
header_y: Optional[int] = None
|
||
footer_y: Optional[int] = None
|
||
|
||
best_header_size = 0
|
||
for gs, ge in raw_gaps:
|
||
if gs <= EDGE_MARGIN:
|
||
continue # skip gaps touching the top edge
|
||
gap_mid = (gs + ge) / 2
|
||
gap_size = ge - gs
|
||
if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
|
||
if gap_size > best_header_size:
|
||
best_header_size = gap_size
|
||
header_y = ge # bottom edge of gap
|
||
|
||
best_footer_size = 0
|
||
for gs, ge in raw_gaps:
|
||
if ge >= actual_h - EDGE_MARGIN:
|
||
continue # skip gaps touching the bottom edge
|
||
gap_mid = (gs + ge) / 2
|
||
gap_size = ge - gs
|
||
if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
|
||
if gap_size > best_footer_size:
|
||
best_footer_size = gap_size
|
||
footer_y = gs # top edge of gap
|
||
|
||
if header_y is not None:
|
||
logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
|
||
f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
|
||
if footer_y is not None:
|
||
logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
|
||
f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")
|
||
|
||
return header_y, footer_y
|
||
|
||
|
||
def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
|
||
min_density: float = 0.005) -> bool:
|
||
"""Check whether a horizontal strip contains meaningful ink.
|
||
|
||
Args:
|
||
inv: Inverted binarized image (white-on-black).
|
||
y_start: Top of the region (inclusive).
|
||
y_end: Bottom of the region (exclusive).
|
||
min_density: Fraction of white pixels required to count as content.
|
||
|
||
Returns:
|
||
True if the region contains text/graphics, False if empty margin.
|
||
"""
|
||
if y_start >= y_end:
|
||
return False
|
||
strip = inv[y_start:y_end, :]
|
||
density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
|
||
return density > min_density
|
||
|
||
|
||
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                       img_w: int, img_h: int,
                       inv: Optional[np.ndarray] = None) -> None:
    """Append header/footer/margin regions to *regions* in-place.

    Gap-based boundaries (computed from *inv*) take precedence; otherwise
    the plain top_y/bottom_y content bounds are used.

    Region type depends on whether the strip actually contains ink:
    - 'header' / 'footer' — strip contains text (e.g. title, page number)
    - 'margin_top' / 'margin_bottom' — strip is empty page margin
    """
    gap_header: Optional[int] = None
    gap_footer: Optional[int] = None
    if inv is not None:
        gap_header, gap_footer = _detect_header_footer_gaps(inv, img_w, img_h)

    # --- Top region ---
    if gap_header is not None and gap_header > 10:
        top_boundary: Optional[int] = gap_header
    else:
        top_boundary = top_y if top_y > 10 else None
    if top_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
        rtype = 'header' if has_content else 'margin_top'
        regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
                    f"(has_content={has_content})")

    # --- Bottom region ---
    if gap_footer is not None and gap_footer < img_h - 10:
        bottom_boundary: Optional[int] = gap_footer
    else:
        bottom_boundary = bottom_y if bottom_y < img_h - 10 else None
    if bottom_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
        rtype = 'footer' if has_content else 'margin_bottom'
        regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
                                  height=img_h - bottom_boundary))
        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
                    f"(has_content={has_content})")
|
||
|
||
|
||
# --- Main Entry Point ---
|
||
|
||
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect columns using two-phase approach: geometry then content classification.

    Phase A: detect_column_geometry() — clustering word positions into columns.
    Phase B: classify_column_types() — content-based type assignment with fallback.

    Falls back to projection-based analyze_layout() if geometry detection fails.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        List of PageRegion objects with types, confidence, and method.
    """
    page_h, page_w = ocr_img.shape[:2]

    # Phase A: geometry detection
    geometry = detect_column_geometry(ocr_img, dewarped_bgr)
    if geometry is None:
        # Projection-profile fallback keeps the pipeline alive.
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        return analyze_layout(create_layout_image(dewarped_bgr), ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = geometry
    content_w = right_x - left_x

    # Detect header/footer early so sub-column clustering ignores them
    if _inv is not None:
        header_y, footer_y = _detect_header_footer_gaps(_inv, page_w, page_h)
    else:
        header_y, footer_y = None, None

    # Split sub-columns (e.g. page references) before classification
    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)

    # Phase B: content-based classification
    regions = classify_column_types(geometries, content_w, top_y, page_w, page_h, bottom_y,
                                    left_x=left_x, right_x=right_x, inv=_inv)

    column_like = [r for r in regions if r.type.startswith('column') or r.type == 'page_ref']
    methods = set(r.classification_method for r in regions if r.classification_method)
    logger.info(f"LayoutByWords: {len(column_like)} columns detected (methods: {methods}): "
                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")

    return regions
|
||
|
||
|
||
# =============================================================================
|
||
# Pipeline Step 5: Word Grid from Columns × Rows
|
||
# =============================================================================
|
||
|
||
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Group OCR words into visual lines in reading order.

    Returns one string per visual line in the cell (empty list for no words).
    """
    if not words:
        return []
    return [
        ' '.join(token['text'] for token in line)
        for line in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    ]
|
||
|
||
|
||
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
|
||
"""Rejoin words split by line-break hyphenation.
|
||
|
||
E.g. ['Fuß-', 'boden'] → ['Fußboden']
|
||
['some text-', 'thing here'] → ['something here']
|
||
"""
|
||
if len(lines) <= 1:
|
||
return lines
|
||
|
||
result = []
|
||
i = 0
|
||
while i < len(lines):
|
||
line = lines[i]
|
||
# If line ends with '-' and there's a next line, rejoin
|
||
if i + 1 < len(lines) and line.rstrip().endswith('-'):
|
||
stripped = line.rstrip()
|
||
# Get the word fragment before hyphen (last word)
|
||
prefix = stripped[:-1] # remove trailing hyphen
|
||
next_line = lines[i + 1]
|
||
# Join: last word of this line + first word of next line
|
||
prefix_words = prefix.rsplit(' ', 1)
|
||
next_words = next_line.split(' ', 1)
|
||
if len(prefix_words) > 1:
|
||
joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
|
||
else:
|
||
joined = prefix_words[0] + next_words[0]
|
||
remainder = next_words[1] if len(next_words) > 1 else ''
|
||
if remainder:
|
||
result.append(joined + ' ' + remainder)
|
||
else:
|
||
result.append(joined)
|
||
i += 2
|
||
else:
|
||
result.append(line)
|
||
i += 1
|
||
return result
|
||
|
||
|
||
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words into newline-separated text in reading order.

    Builds visual lines (Y-tolerance grouping, X-sorted), repairs
    hyphenation across line breaks, then joins the lines with newlines.
    """
    return '\n'.join(
        _rejoin_hyphenated(_words_to_reading_order_lines(words, y_tolerance_px))
    )
|
||
|
||
|
||
# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
|
||
|
||
# Cached RapidOCR engine instance, created lazily by _get_rapid_engine().
_rapid_engine = None
# True once the optional `rapidocr` package has imported successfully.
RAPIDOCR_AVAILABLE = False

try:
    from rapidocr import RapidOCR as _RapidOCRClass
    from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
    RAPIDOCR_AVAILABLE = True
    logger.info("RapidOCR available — can be used as alternative to Tesseract")
except ImportError:
    # Optional dependency: absence simply disables the RapidOCR backend.
    logger.info("RapidOCR not installed — using Tesseract only")
|
||
|
||
|
||
def _get_rapid_engine():
    """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support."""
    global _rapid_engine
    if _rapid_engine is not None:
        return _rapid_engine

    params = {
        # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß)
        "Rec.lang_type": _LangRec.LATIN,
        "Rec.model_type": _ModelType.SERVER,
        "Rec.ocr_version": _OCRVersion.PPOCRV5,
        # Tighter detection boxes to reduce word merging
        "Det.unclip_ratio": 1.3,
        "Det.box_thresh": 0.6,
        # Silence verbose logging
        "Global.log_level": "critical",
    }
    _rapid_engine = _RapidOCRClass(params=params)
    logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
|
||
|
||
|
||
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """Run RapidOCR on a specific region, returning word dicts compatible with Tesseract format.

    Args:
        img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on color/gray).
        region: Region to crop and OCR.

    Returns:
        List of word dicts with text, left, top, width, height, conf, region_type.
        Coordinates are absolute (page space); conf is 0-100 like Tesseract.
    """
    engine = _get_rapid_engine()

    # Crop region from the BGR page image
    crop = img_bgr[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    result = engine(crop)

    # scores is now checked too: a None scores tuple alongside valid
    # boxes/txts would previously make zip() raise TypeError.
    if result is None or result.boxes is None or result.txts is None or result.scores is None:
        return []

    words = []
    # Each box is [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (clockwise from top-left)
    for box, txt, score in zip(result.boxes, result.txts, result.scores):
        if not txt or not txt.strip():
            continue

        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        left = int(min(xs))
        top = int(min(ys))
        w = int(max(xs) - left)
        h = int(max(ys) - top)

        words.append({
            'text': txt.strip(),
            'left': left + region.x,  # Absolute coords
            'top': top + region.y,
            'width': w,
            'height': h,
            'conf': int(score * 100),  # 0-100 like Tesseract
            'region_type': region.type,
        })

    return words
|
||
|
||
|
||
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """Run TrOCR on a region. Returns line-level word dicts (same format as ocr_region_rapid).

    Uses trocr_service.get_trocr_model() + _split_into_lines() for line segmentation.
    Bboxes are approximated from equal line-height distribution within the region.
    Falls back to Tesseract if TrOCR is not available.

    Args:
        img_bgr: Full-page BGR image to crop the region from.
        region: Region to crop and OCR.
        handwritten: Select the handwritten TrOCR model variant.

    Returns:
        List of line-level word dicts (text, left, top, width, height, conf,
        region_type); empty list when nothing was recognized or OCR failed.
    """
    # Local import: the service module pulls in heavy ML dependencies.
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available

    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        if region.height > 0 and region.width > 0:
            # Tesseract fallback works on a grayscale image.
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
            if ocr_img_crop is not None:
                return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        return []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        # TrOCR expects single text lines — segment the crop first.
        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        lines = _split_into_lines(pil_crop)
        if not lines:
            # No segmentation result → treat the whole crop as one line.
            lines = [pil_crop]

        # Run inference on the same device the model weights live on.
        device = next(model.parameters()).device
        all_text = []
        confidences = []
        for line_img in lines:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if text_line:
                all_text.append(text_line)
                # Heuristic confidence: very short outputs are less reliable.
                confidences.append(0.85 if len(text_line) > 3 else 0.5)

        if not all_text:
            return []

        # Approximate bboxes: distribute lines evenly over the region height.
        avg_conf = int(sum(confidences) / len(confidences) * 100)
        line_h = region.height // max(len(all_text), 1)
        words = []
        for i, line in enumerate(all_text):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            })
        return words

    except Exception as e:
        # Best-effort OCR backend: log and return empty instead of raising.
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
|
||
|
||
|
||
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region. Returns line-level word dicts (same format as ocr_region_rapid).

    The model emits free-form text; each output line is mapped to an
    approximate bbox by evenly dividing the region height.

    Falls back to RapidOCR or Tesseract if LightOnOCR is not available.

    Args:
        img_bgr: Full-page BGR image to crop the region from.
        region: Region to crop and OCR.

    Returns:
        List of line-level word dicts (text, left, top, width, height, conf,
        region_type) with a fixed confidence of 85; empty list on failure.
    """
    # Local import: the service module pulls in heavy ML dependencies.
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        # NOTE: the previous version also did `import io` here — unused,
        # and it shadowed the module-level io import; removed.
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        # Approximate per-line bboxes: evenly split the region height.
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        line_h = region.height // max(len(lines), 1)
        words = []
        for i, line in enumerate(lines):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,
                "region_type": region.type,
            })
        return words

    except Exception as e:
        # Best-effort OCR backend: log and return empty instead of raising.
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
|
||
|
||
|
||
# =============================================================================
|
||
# Post-Processing: Deterministic Quality Fixes
|
||
# =============================================================================
|
||
|
||
# --- A. Character Confusion Fix (I/1/l) ---
|
||
|
||
# Common OCR confusion pairs in vocabulary context
|
||
_CHAR_CONFUSION_RULES = [
|
||
# "1" at word start followed by lowercase → likely "I" or "l"
|
||
# Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
|
||
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
|
||
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
|
||
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
|
||
# "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
|
||
(re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
|
||
]
|
||
|
||
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
|
||
_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
|
||
|
||
|
||
def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||
"""Fix common OCR character confusions using context.
|
||
|
||
Deterministic rules:
|
||
- "1" at word start → "I" or "l" based on context
|
||
- Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I"
|
||
- "y " artifact at word boundaries → remove (e.g. "y you" → "you")
|
||
"""
|
||
for entry in entries:
|
||
en = entry.get('english', '') or ''
|
||
de = entry.get('german', '') or ''
|
||
ex = entry.get('example', '') or ''
|
||
|
||
# Apply general rules to all fields
|
||
for pattern, replacement in _CHAR_CONFUSION_RULES:
|
||
en = pattern.sub(replacement, en)
|
||
de = pattern.sub(replacement, de)
|
||
ex = pattern.sub(replacement, ex)
|
||
|
||
# Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
|
||
de_lower_words = set(de.lower().replace(',', ' ').split())
|
||
if de_lower_words & _DE_INDICATORS_FOR_EN_I:
|
||
# Any remaining "1" in EN that looks like "I"
|
||
en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
|
||
|
||
# Fix "y " artifact before repeated word: "y you" → "you"
|
||
en = re.sub(r'\by\s+([a-z])', r'\1', en)
|
||
ex = re.sub(r'\by\s+([a-z])', r'\1', ex)
|
||
|
||
entry['english'] = en.strip()
|
||
entry['german'] = de.strip()
|
||
entry['example'] = ex.strip()
|
||
|
||
return entries
|
||
|
||
|
||
# --- B. Comma-Separated Word Form Splitting ---
|
||
|
||
def _is_singular_plural_pair(parts: List[str]) -> bool:
|
||
"""Detect if comma-separated parts are singular/plural forms of the same word.
|
||
|
||
E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
|
||
"break, broke, broken" → False (different verb forms, OK to split).
|
||
|
||
Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
|
||
OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
|
||
"""
|
||
if len(parts) != 2:
|
||
return False
|
||
|
||
a, b = parts[0].lower().strip(), parts[1].lower().strip()
|
||
if not a or not b:
|
||
return False
|
||
|
||
# Common prefix heuristic: if words share >= 50% of the shorter word,
|
||
# they are likely forms of the same word (Maus/Mäuse, child/children).
|
||
min_len = min(len(a), len(b))
|
||
common = 0
|
||
for ca, cb in zip(a, b):
|
||
if ca == cb:
|
||
common += 1
|
||
else:
|
||
break
|
||
if common >= max(2, min_len * 0.5):
|
||
return True
|
||
|
||
# Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
|
||
umlaut_map = str.maketrans('aou', 'äöü')
|
||
if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
|
||
return True
|
||
|
||
return False
|
||
|
||
|
||
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Split entries with comma-separated word forms into individual entries.

    E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
    → 3 entries: break/brechen, broke/brach, broken/gebrochen

    Singular/plural pairs like "mouse, mice" / "Maus, Mäuse" are kept as
    one entry, because they are forms of the same vocabulary item.

    Splitting only happens when EN and DE have the same number of comma
    parts and every part is short (a word form, not a sentence).
    """
    output: List[Dict[str, Any]] = []

    for entry in entries:
        en_text = (entry.get('english', '') or '').strip()
        de_text = (entry.get('german', '') or '').strip()

        # Commas inside [...] or (...) do not separate word forms.
        en_parts = _split_by_comma(en_text)
        de_parts = _split_by_comma(de_text)

        splittable = (
            len(en_parts) > 1
            and len(de_parts) == len(en_parts)
            # Only short fragments qualify (word forms, not sentences).
            and all(len(p.split()) <= 3 for p in en_parts)
            and all(len(p.split()) <= 3 for p in de_parts)
            # Keep singular/plural pairs together — one vocabulary item.
            and not _is_singular_plural_pair(en_parts)
            and not _is_singular_plural_pair(de_parts)
        )

        if not splittable:
            output.append(entry)
            continue

        # Fan out into one entry per aligned EN/DE form.
        for en_form, de_form in zip(en_parts, de_parts):
            piece = dict(entry)  # shallow copy
            piece['english'] = en_form.strip()
            piece['german'] = de_form.strip()
            piece['example'] = ''  # examples get attached later
            piece['split_from_comma'] = True
            output.append(piece)

    # Re-number rows after splitting.
    for row_index, row in enumerate(output):
        row['row_index'] = row_index

    return output
|
||
|
||
|
||
def _split_by_comma(text: str) -> List[str]:
|
||
"""Split text by commas, but not inside brackets [...] or parens (...)."""
|
||
if ',' not in text:
|
||
return [text]
|
||
|
||
parts = []
|
||
depth_bracket = 0
|
||
depth_paren = 0
|
||
current = []
|
||
|
||
for ch in text:
|
||
if ch == '[':
|
||
depth_bracket += 1
|
||
elif ch == ']':
|
||
depth_bracket = max(0, depth_bracket - 1)
|
||
elif ch == '(':
|
||
depth_paren += 1
|
||
elif ch == ')':
|
||
depth_paren = max(0, depth_paren - 1)
|
||
elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
|
||
parts.append(''.join(current).strip())
|
||
current = []
|
||
continue
|
||
current.append(ch)
|
||
|
||
if current:
|
||
parts.append(''.join(current).strip())
|
||
|
||
# Filter empty parts
|
||
return [p for p in parts if p]
|
||
|
||
|
||
# --- C. Example Sentence Attachment ---
|
||
|
||
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
|
||
"""Find the vocab entry whose English word(s) best match the example sentence.
|
||
|
||
Returns index into vocab_entries, or -1 if no match found.
|
||
Uses word stem overlap: "a broken arm" matches "broken" or "break".
|
||
"""
|
||
if not vocab_entries or not example_text:
|
||
return -1
|
||
|
||
example_lower = example_text.lower()
|
||
example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
|
||
|
||
best_idx = -1
|
||
best_score = 0
|
||
|
||
for i, entry in enumerate(vocab_entries):
|
||
en = (entry.get('english', '') or '').lower()
|
||
if not en:
|
||
continue
|
||
|
||
# Extract vocab words (split on space, comma, newline)
|
||
vocab_words = set(re.findall(r'[a-zäöüß]+', en))
|
||
|
||
# Score: how many vocab words appear in the example?
|
||
# Also check if example words share a common stem (first 4 chars)
|
||
direct_matches = vocab_words & example_words
|
||
score = len(direct_matches) * 10
|
||
|
||
# Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
|
||
if score == 0:
|
||
for vw in vocab_words:
|
||
if len(vw) < 3:
|
||
continue
|
||
stem = vw[:4] if len(vw) >= 4 else vw[:3]
|
||
for ew in example_words:
|
||
if len(ew) >= len(stem) and ew[:len(stem)] == stem:
|
||
score += 5
|
||
break
|
||
|
||
if score > best_score:
|
||
best_score = score
|
||
best_idx = i
|
||
|
||
return best_idx if best_score > 0 else -1
|
||
|
||
|
||
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach rows with EN text but no DE translation as examples to matching vocab entries.

    Vocabulary worksheets often have:
        Row 1: break, broke, broken / brechen, brach, gebrochen
        Row 2: a broken arm       (no DE → example for "broken")
        Row 3: a broken plate     (no DE → example for "broken")
        Row 4: egg / Ei           (has DE → new vocab entry)

    Rules (deterministic, generic):
      - A row is an "example row" if it has EN text but NO DE text
        (or very short DE ≤ 1 char, treated as OCR noise)
      - AND its EN text looks like a sentence (>= 4 words or ends in
        sentence punctuation); short EN rows are more likely vocab
        entries whose DE was missed by OCR
      - Find the best matching vocab entry via word overlap
        (_find_best_vocab_match); fall back to the nearest preceding entry
      - Multiple examples get joined with " | "

    Args:
        entries: Grid rows in reading order, each with 'english'/'german'/
            'example' keys.

    Returns:
        The vocab entries (example rows folded in), re-numbered via
        'row_index'.
    """
    if not entries:
        return entries

    # Separate into vocab entries (have DE) and example candidates (no DE)
    vocab_entries: List[Dict[str, Any]] = []
    examples_for: Dict[int, List[str]] = {}  # vocab_index → list of example texts

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()

        # Treat single-char DE as OCR noise, not real translation.
        # "Ei" (2 chars) is a valid German word, so threshold is 1.
        has_de = len(de) > 1
        has_en = bool(en)

        # Heuristic: a row without DE is an "example sentence" only if
        # the EN text looks like a sentence (>= 4 words, or contains
        # typical sentence punctuation). Short EN text (1-3 words) is
        # more likely a vocab entry whose DE was missed by OCR.
        _looks_like_sentence = (
            len(en.split()) >= 4
            or en.rstrip().endswith(('.', '!', '?'))
        )
        is_example_candidate = (
            has_en and not has_de and _looks_like_sentence and vocab_entries
        )

        if is_example_candidate:
            # This is an example sentence — find best matching vocab entry
            match_idx = _find_best_vocab_match(en, vocab_entries)
            if match_idx < 0:
                # No word match → fall back to last entry
                match_idx = len(vocab_entries) - 1

            examples_for.setdefault(match_idx, []).append(en)
        else:
            vocab_entries.append(entry)

    # Attach examples to their matched vocab entries
    for idx, example_list in examples_for.items():
        if 0 <= idx < len(vocab_entries):
            entry = vocab_entries[idx]
            existing_ex = (entry.get('example', '') or '').strip()
            new_examples = ' | '.join(example_list)
            entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples

    # Re-number
    for i, e in enumerate(vocab_entries):
        e['row_index'] = i

    return vocab_entries
|
||
|
||
|
||
# --- D. Phonetic Bracket IPA Replacement ---
|
||
|
||
# Pattern: word [phonetic] — capture the Latin/umlaut word immediately
# before a square-bracketed transcription, e.g. "dance [du:ns]".
# Group 1 = headword, group 2 = bracket content (may be empty).
# NOTE(review): only square brackets are matched; a parenthesised form
# "word (phonetic)" is NOT handled by this pattern.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*\[([^\]]*)\]'
)
|
||
|
||
|
||
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up the IPA transcription of *word*.

    Args:
        word: English word to look up (lower-cased and stripped internally).
        pronunciation: 'british' prefers the Britfone dictionary (MIT),
            'american' prefers eng_to_ipa/CMU (MIT). Either preference
            falls back to the other source when its own has no entry.

    Returns:
        IPA string, or None when no available source knows the word.
    """
    key = word.lower().strip()
    if not key:
        return None

    def _from_britfone() -> Optional[str]:
        # None when the Britfone dict is unavailable or has no entry.
        return _britfone_dict.get(key) if _britfone_dict else None

    def _from_american() -> Optional[str]:
        # eng_to_ipa marks unknown words with '*' — treat those as misses.
        if not _ipa_convert_american:
            return None
        converted = _ipa_convert_american(key)
        return converted if converted and '*' not in converted else None

    if pronunciation == 'british' and _britfone_dict:
        # Britfone first, American as fallback.
        return _from_britfone() or _from_american()

    if pronunciation == 'american' and _ipa_convert_american:
        # CMU first, Britfone as fallback.
        return _from_american() or _from_britfone()

    # Preferred source unavailable — try whatever exists.
    return _from_britfone() or _from_american()
|
||
|
||
|
||
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Replace OCR'd phonetic transcriptions with dictionary IPA.

    Detects patterns like "dance [du:ns]" and replaces with correct IPA:
      - British: "dance [dˈɑːns]" (Britfone, MIT)
      - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    Only replaces if the word before brackets is found in the dictionary
    (see _replace_phonetics_in_text). Mutates *entries* in place and
    returns the same list. No-op when IPA support is unavailable.
    """
    if not IPA_AVAILABLE:
        return entries

    for entry in entries:
        # Loop variable renamed from `field` — it shadowed the module-level
        # `dataclasses.field` import.
        for field_name in ('english', 'german', 'example'):
            text = entry.get(field_name, '') or ''
            # Cheap pre-check: no opening bracket → nothing to replace.
            if '[' not in text:
                continue
            entry[field_name] = _replace_phonetics_in_text(text, pronunciation)

    return entries
|
||
|
||
|
||
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
    """Swap "[phonetic]" segments after words for dictionary IPA.

    Each "word [...]" occurrence keeps its original bracket content unless
    the headword is found in the selected IPA dictionary. Bracket content
    of more than three whitespace-separated tokens is assumed to be
    regular prose and left untouched.
    """
    if not IPA_AVAILABLE:
        return text

    def _substitute(m):
        headword = m.group(1)
        bracket_body = m.group(2)

        # Bracket content with > 3 tokens is prose, not a transcription.
        if len(bracket_body.split()) > 3:
            return m.group(0)

        dictionary_ipa = _lookup_ipa(headword, pronunciation)
        if not dictionary_ipa:
            return m.group(0)  # unknown word → keep OCR'd original

        return f"{headword} [{dictionary_ipa}]"

    return _PHONETIC_BRACKET_RE.sub(_substitute, text)
|
||
|
||
|
||
def _assign_row_words_to_columns(
    row: RowGeometry,
    columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
    """Assign each word in a row to exactly one column.

    Three-stage strategy:
      1. Overlap: assign the word to the column with the largest
         horizontal overlap with the word's box (robust for narrow
         columns such as page_ref, where a character's center may fall
         into the next column).
      2. Range: for words overlapping no column, use midpoint-bounded
         assignment ranges built around each column.
      3. Nearest center: last resort for anything still unassigned.

    This prevents long sentences in wide columns (e.g. example) from having
    their rightmost words stolen by an adjacent column.

    Args:
        row: Row with words (word boxes in row-relative coordinates).
        columns: Sorted list of columns (absolute coordinates).

    Returns:
        Dict mapping col_index → list of words assigned to that column.
    """
    result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))}

    if not row.words or not columns:
        return result

    left_x = row.x  # content ROI left (absolute)

    # Build non-overlapping column assignment ranges using midpoints.
    # For adjacent columns, the boundary is the midpoint between them.
    # This prevents words near column borders from being assigned to
    # the wrong column (e.g. "We" at the start of an example sentence
    # being stolen by the preceding DE column).
    n = len(columns)
    col_ranges_rel = []  # (assign_left, assign_right) per column
    for ci, col in enumerate(columns):
        col_left_rel = col.x - left_x
        col_right_rel = col_left_rel + col.width

        # Left boundary: midpoint to previous column, or 0
        if ci == 0:
            assign_left = 0
        else:
            prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
            assign_left = (prev_right + col_left_rel) / 2

        # Right boundary: midpoint to next column, or infinity (row width)
        if ci == n - 1:
            assign_right = row.width + 100  # generous for last column
        else:
            next_left = columns[ci + 1].x - left_x
            assign_right = (col_right_rel + next_left) / 2

        col_ranges_rel.append((assign_left, assign_right))

    for w in row.words:
        w_left = w['left']
        w_right = w_left + w['width']
        w_center_x = w_left + w['width'] / 2

        # Primary: overlap-based matching — assign to column with most overlap.
        # This is more robust than center-based for narrow columns (page_ref)
        # where the last character's center may fall into the next column.
        best_col = -1
        best_overlap = 0
        for ci, col in enumerate(columns):
            col_left_rel = col.x - left_x
            col_right_rel = col_left_rel + col.width
            # Standard 1-D interval intersection; 0 when disjoint.
            overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
            if overlap > best_overlap:
                best_overlap = overlap
                best_col = ci

        if best_col >= 0 and best_overlap > 0:
            result[best_col].append(w)
        else:
            # Fallback: center-based range matching
            assigned = False
            for ci, (al, ar) in enumerate(col_ranges_rel):
                if al <= w_center_x < ar:
                    result[ci].append(w)
                    assigned = True
                    break

            if not assigned:
                # Last resort: nearest column center
                best_col = 0
                col_left_0 = columns[0].x - left_x
                best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
                for ci in range(1, n):
                    col_left = columns[ci].x - left_x
                    dist = abs(w_center_x - (col_left + columns[ci].width / 2))
                    if dist < best_dist:
                        best_dist = dist
                        best_col = ci
                result[best_col].append(w)

    return result
|
||
|
||
|
||
# Regex: at least 2 consecutive letters (Latin + German umlauts/ß +
# common French accents). Used as the "looks like a real word" probe.
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
# Same alphabet, single character — used to extract the alpha characters
# of a token for dictionary lookups and noise-ratio checks.
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')
|
||
|
||
# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
# that do NOT appear here are treated as trailing OCR noise.
# A few words ('was', 'man', 'hat', 'her', 'rat', 'rot') appear in both
# the EN and DE groups below; duplicates are harmless in a set literal.
_COMMON_SHORT_WORDS: set = {
    # EN 1-2 letter
    'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
    'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
    'or', 'so', 'to', 'up', 'us', 'we',
    # EN 3 letter
    'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
    'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
    'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
    'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
    'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
    'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
    'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
    'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
    'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
    'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
    'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
    'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
    'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
    'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
    'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
    'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
    'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
    'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
    'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
    'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
    'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
    'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
    'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
    'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
    'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
    'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
    'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
    'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
    'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
    'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
    'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
    'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
    'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
    'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
    'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
    'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
    'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
    'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
    'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
    'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
    'zap', 'zip', 'zoo',
    # DE 2-3 letter
    'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
    'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
    'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
    'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
    'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
    'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
    'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
    'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
    'wut', 'zum', 'zur',
}
|
||
|
||
# Known abbreviations found in EN/DE textbooks and dictionaries.
# Stored WITHOUT trailing period (the noise filter strips periods),
# lower-cased; lookups must lower-case and strip their token first.
# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
_KNOWN_ABBREVIATIONS: set = {
    # EN dictionary meta-words
    'sth', 'sb', 'smth', 'smb', 'sbd',
    # EN general
    'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
    'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
    # EN references / textbook
    'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
    'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
    'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
    'ans', 'wb', 'tb', 'vocab',
    # EN parts of speech / grammar
    'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
    'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
    'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
    'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
    'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
    'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
    'syn', 'ant', 'opp', 'var', 'orig',
    # EN titles
    'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
    # EN pronunciation
    'br', 'am', 'brit', 'amer',
    # EN units
    'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
    # DE general
    'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
    'bes', 'insb', 'insbes', 'bspw', 'ca',
    'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
    'inkl', 'exkl', 'zzgl', 'abzgl',
    # DE references
    'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
    'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
    's', 'sp', 'zit', 'zs', 'vlg',
    # DE grammar
    'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
    'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
    'trennb', 'untrennb', 'ugs', 'geh', 'pej',
    # DE regional
    'nordd', 'österr', 'schweiz',
    # Linguistic
    'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
    'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
    'count', 'uncount', 'indef', 'def', 'poss', 'demon',
}
|
||
|
||
|
||
def _is_noise_tail_token(token: str) -> bool:
    """Decide whether *token* (the last token of a cell) is trailing OCR noise.

    Edge/border artifacts routinely leave short junk at the end of a
    cell. The filter is deliberately aggressive: short tokens survive
    only as dictionary words or known abbreviations, longer tokens only
    when they carry no internal non-alpha noise. Ellipses and phonetic
    brackets are always kept.

    Examples of noise: "Es)", "3", "ee", "B"
    Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]"
    """
    candidate = token.strip()
    if not candidate:
        return True

    # Ellipsis is legitimate content.
    if candidate in ('...', '…'):
        return False

    # Phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
    if candidate.startswith('[') or candidate.endswith(']'):
        return False

    letters = _RE_ALPHA.findall(candidate)
    if not letters:
        return True  # pure punctuation/digits: "3", ")", "|"

    letters_only = ''.join(letters)

    # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep.
    if letters_only.lower() in _KNOWN_ABBREVIATIONS:
        return False

    # Strip ordinary sentence punctuation ("cupcakes." → "cupcakes")
    # before probing for internal noise.
    without_tail_punct = re.sub(r'[.,;:!?]+$', '', candidate) or candidate

    # Dictionary punctuation (parens, hyphens, slashes, dots) is normal
    # in entries like "(Salat-)Gurke", "Tanz(veranstaltung)" or
    # "(zer)brechen" — remove it, then see whether non-alpha characters
    # remain ("3d", "B|", "x7" → noise).
    residue = re.sub(r'[()\-/.,;:!?]', '', without_tail_punct)
    residue_letters = ''.join(_RE_ALPHA.findall(residue))
    noisy_inside = bool(residue) and len(residue) > len(residue_letters)

    if not noisy_inside:
        # Long clean alpha words (4+ chars) are presumed real...
        if len(letters_only) >= 4:
            return False
        # ...short ones must be in the common-word dictionary.
        if letters_only.lower() in _COMMON_SHORT_WORDS:
            return False

    # Default: short or suspicious → noise
    return True
|
||
|
||
|
||
def _is_garbage_text(text: str) -> bool:
    """Return True when *text* contains no recognizable word at all.

    Catches OCR output from image areas such as "(ci]oeu" or "uanoaain.".
    A cell is NOT garbage as soon as one token is a known short word, a
    known abbreviation, or a 4+-letter word with a plausible vowel ratio.
    """
    candidates = _RE_REAL_WORD.findall(text)

    if not candidates:
        # Dotted abbreviations like "e.g." contain no 2-letter run but
        # are still real content.
        condensed = ''.join(_RE_ALPHA.findall(text)).lower()
        return condensed not in _KNOWN_ABBREVIATIONS

    for word in candidates:
        lowered = word.lower()
        # Known short word or abbreviation → not garbage.
        if lowered in _COMMON_SHORT_WORDS or lowered in _KNOWN_ABBREVIATIONS:
            return False
        # Real EN/DE words carry roughly 20-60% vowels; garbage like
        # "uanoaain" or "cioeu" falls outside that band.
        if len(lowered) >= 4:
            vowel_share = sum(ch in 'aeiouäöü' for ch in lowered) / len(lowered)
            if 0.15 <= vowel_share <= 0.65:
                return False  # plausible vowel ratio → real word

    return True
|
||
|
||
|
||
def _clean_cell_text(text: str) -> str:
    """Strip OCR noise from cell text using three generic filters.

    1. Text without any real alphabetic word (>= 2 letters) — and which
       is not a dotted abbreviation like "e.g." or "z.B." — becomes empty.
    2. Text judged garbage by _is_garbage_text becomes empty.
    3. Trailing noise tokens are popped off the end of the text.
    """
    cleaned = text.strip()
    if not cleaned:
        return ''

    # --- Filter 1: No real word at all ---
    if not _RE_REAL_WORD.search(cleaned):
        # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e."
        condensed = ''.join(_RE_ALPHA.findall(cleaned)).lower()
        if condensed not in _KNOWN_ABBREVIATIONS:
            return ''

    # --- Filter 2: Entire text is garbage ---
    if _is_garbage_text(cleaned):
        return ''

    # --- Filter 3: Strip trailing noise tokens ---
    tokens = cleaned.split()
    while tokens and _is_noise_tail_token(tokens[-1]):
        tokens.pop()

    return ' '.join(tokens) if tokens else ''
|
||
|
||
|
||
def _clean_cell_text_lite(text: str) -> str:
    """Noise filter for cell-first OCR, where each cell crop is isolated.

    With no neighbour content bleeding into the crop, trailing-noise
    stripping is unnecessary; only the two whole-text filters remain:

    1. No real alphabetic word (>= 2 letters) and not a known dotted
       abbreviation → empty string.
    2. Entire text is garbage per _is_garbage_text → empty string.
    """
    cleaned = text.strip()
    if not cleaned:
        return ''

    # --- Filter 1: No real word at all ---
    if not _RE_REAL_WORD.search(cleaned):
        condensed = ''.join(_RE_ALPHA.findall(cleaned)).lower()
        if condensed not in _KNOWN_ABBREVIATIONS:
            return ''

    # --- Filter 2: Entire text is garbage ---
    if _is_garbage_text(cleaned):
        return ''

    return cleaned
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Cell-First OCR (v2) — each cell cropped and OCR'd in isolation
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _ocr_cell_crop(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell by cropping the exact column×row intersection.

    No padding beyond cell boundaries → no neighbour bleeding.

    Args:
        row_idx: Row index within the content grid (used in cell_id).
        col_idx: Column index (used in cell_id).
        row: Row geometry — supplies the crop's y/height.
        col: Column region — supplies the crop's x/width and column type.
        ocr_img: Preprocessed image used for the density check and for
            Tesseract. Assumed single-channel with dark text on light
            background (the density check counts pixels < 180).
        img_bgr: Color image for the TrOCR/LightOn/Rapid engines; those
            branches are skipped when this is None.
        img_w: Full image width in pixels (clamping + percent bbox).
        img_h: Full image height in pixels.
        engine_name: 'tesseract', 'rapid', 'lighton', 'trocr-printed'
            or 'trocr-handwritten'.
        lang: Default Tesseract language string (fallback for unmapped
            column types).
        lang_map: col.type → Tesseract language overrides.

    Returns:
        Cell dict with text, confidence, pixel/percent bboxes, and the
        engine label actually used ('cell_crop_v2', or
        'cell_crop_v2_psm7' when the PSM-7 fallback produced the text).
    """
    # Display bbox: exact column × row intersection
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # Crop boundaries (clamped to image)
    cx = max(0, disp_x)
    cy = max(0, disp_y)
    cw = min(disp_w, img_w - cx)
    ch = min(disp_h, img_h - cy)

    # Template result; also returned as-is for degenerate/blank cells.
    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': '',
        'confidence': 0.0,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': 'cell_crop_v2',
    }

    if cw <= 0 or ch <= 0:
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
    # Fewer than 0.5% dark pixels (value < 180) → treat as blank and
    # avoid an expensive OCR call entirely.
    if ocr_img is not None:
        crop = ocr_img[cy:cy + ch, cx:cx + cw]
        if crop.size > 0:
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
                return empty_cell

    # --- Prepare crop for OCR ---
    cell_lang = lang_map.get(col.type, lang)
    psm = _select_psm_for_column(col.type, col.width, row.height)
    text = ''
    avg_conf = 0.0
    used_engine = 'cell_crop_v2'

    # Engine dispatch. Non-Tesseract engines work on the color image and
    # absolute coordinates; Tesseract works on an upscaled crop whose
    # word boxes are remapped back afterwards.
    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_trocr(img_bgr, cell_region,
                                 handwritten=(engine_name == "trocr-handwritten"))
    elif engine_name == "lighton" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_rapid(img_bgr, cell_region)
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
            crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
            upscaled = _ensure_minimum_crop_size(crop_slice)
            up_h, up_w = upscaled.shape[:2]
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=psm)
            # Remap word positions back to original image coordinates
            if words and (up_w != cw or up_h != ch):
                sx = cw / max(up_w, 1)
                sy = ch / max(up_h, 1)
                for w in words:
                    w['left'] = int(w['left'] * sx) + cx
                    w['top'] = int(w['top'] * sy) + cy
                    w['width'] = int(w['width'] * sx)
                    w['height'] = int(w['height'] * sy)
            elif words:
                # No upscale happened — only translate to absolute coords.
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
        else:
            words = []

    # Filter low-confidence words
    _MIN_WORD_CONF = 30
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # y-tolerance at least the cell height → single-line cells never
        # get split into multiple text lines.
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    # PSM 7 = "treat the image as a single text line"; often recovers
    # cells where the primary PSM found nothing.
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
        crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
        upscaled = _ensure_minimum_crop_size(crop_slice)
        up_h, up_w = upscaled.shape[:2]
        tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
        psm7_words = ocr_region(upscaled, tmp_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if psm7_words:
                p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
                if p7_text.strip():
                    text = p7_text
                    avg_conf = round(
                        sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                    )
                    used_engine = 'cell_crop_v2_psm7'

    # --- Noise filter ---
    if text.strip():
        text = _clean_cell_text_lite(text)
        if not text:
            avg_conf = 0.0

    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine
    return result
|
||
|
||
|
||
def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Cell-First Grid: crop each cell in isolation, then OCR.

    Drop-in replacement for build_cell_grid() — same signature & return type.
    No full-page word assignment; each cell is OCR'd from its own crop
    (see _ocr_cell_crop), in parallel via a thread pool.

    Args:
        ocr_img: Preprocessed image used for density checks and Tesseract.
        column_regions: Detected columns; header/footer/margin/ignore
            types are filtered out.
        row_geometries: Detected rows; only 'content' rows are OCR'd,
            'header'/'footer' rows bound the gap-healing so content
            crops never expand into header/footer areas.
        img_w: Full image width in pixels.
        img_h: Full image height in pixels.
        lang: Default Tesseract language string.
        ocr_engine: 'auto', 'tesseract', 'rapid', 'lighton',
            'trocr-printed' or 'trocr-handwritten'. 'auto' prefers
            RapidOCR when available and img_bgr is provided.
        img_bgr: Color image required by the non-Tesseract engines.

    Returns:
        (cells, columns_meta): cell dicts sorted by (row, col), and the
        per-column metadata list. Both empty when nothing usable remains.
    """
    # Resolve engine
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}'")

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    # Filter phantom rows (word_count=0) and artifact rows
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # Filter columns
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # Heal row gaps — use header/footer boundaries (NOT column bounds!)
    # In Cell-First OCR, the crop IS the OCR input, so extending into
    # header/footer means OCR'ing header/footer text ("VOCABULARY", page nums).
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    # Per-column-type Tesseract language overrides; unmapped types use `lang`.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- Parallel OCR with ThreadPoolExecutor ---
    # Tesseract is single-threaded per call, so we benefit from parallelism.
    # ~40 rows × 4 cols = 160 cells, ~50% empty (density skip) → ~80 OCR calls.
    cells: List[Dict[str, Any]] = []
    cell_tasks = []

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell_tasks.append((row_idx, col_idx, row, col))

    # Heavier model-based engines get fewer workers to limit memory/CPU.
    max_workers = 4 if engine_name == "tesseract" else 2

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(
                _ocr_cell_crop,
                ri, ci, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            ): (ri, ci)
            for ri, ci, row, col in cell_tasks
        }

        for future in as_completed(futures):
            try:
                cell = future.result()
                cells.append(cell)
            except Exception as e:
                # One failed cell must not abort the page — log and continue.
                ri, ci = futures[future]
                logger.error(f"build_cell_grid_v2: cell R{ri:02d}_C{ci} failed: {e}")

    # Sort cells by (row_index, col_index) since futures complete out of order
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
|
||
|
||
|
||
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 — yields each cell as OCR'd.

    Applies the same row/column filtering and gap-healing as the batch
    variant, then OCRs cells one at a time via _ocr_cell_crop so callers
    can report progress incrementally. Note: unlike the batch variant,
    no post-OCR all-empty-row removal happens here, because cells are
    emitted before the full grid is known.

    Args:
        ocr_img: Binarized full-page image (used by Tesseract).
        column_regions: Classified column regions for this page.
        row_geometries: Detected rows (header/content/footer).
        img_w: Page image width in pixels.
        img_h: Page image height in pixels.
        lang: Default Tesseract language string.
        ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed',
            'trocr-handwritten', or 'lighton'.
        img_bgr: BGR color image (required for RapidOCR/TrOCR/LightOn).

    Yields:
        (cell_dict, columns_meta, total_cells)
    """
    # Resolve engine (mirrors build_cell_grid_v2 / build_cell_grid).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    # Keep only content rows; header/footer rows are never OCR'd as cells.
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    # Drop phantom rows (no words assigned): inter-line whitespace gaps.
    content_rows = [r for r in content_rows if r.word_count > 0]
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    # Drop artifact rows (only single-character detections → scanner noise).
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    if not content_rows:
        return

    # Use header/footer boundaries for heal_row_gaps (same as build_cell_grid_v2).
    # This keeps the first content row from expanding up into the header area
    # and the last content row from expanding down into the footer area.
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    # Columns in left-to-right reading order.
    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    # Per-column-type Tesseract language override.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(content_rows) * len(relevant_cols)

    # Sequential OCR, one cell per yield (no thread pool in streaming mode).
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_cell_crop(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                engine_name, lang, lang_map,
            )
            yield cell, columns_meta, total_cells
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Narrow-column OCR helpers (Proposal B) — DEPRECATED (kept for legacy build_cell_grid)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _compute_cell_padding(col_width: int, img_w: int) -> int:
|
||
"""Adaptive padding for OCR crops based on column width.
|
||
|
||
Narrow columns (page_ref, marker) need more surrounding context so
|
||
Tesseract can segment characters correctly. Wide columns keep the
|
||
minimal 4 px padding to avoid pulling in neighbours.
|
||
"""
|
||
col_pct = col_width / img_w * 100 if img_w > 0 else 100
|
||
if col_pct < 5:
|
||
return max(20, col_width // 2)
|
||
if col_pct < 10:
|
||
return max(12, col_width // 4)
|
||
if col_pct < 15:
|
||
return 8
|
||
return 4
|
||
|
||
|
||
def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
|
||
max_scale: int = 3) -> np.ndarray:
|
||
"""Upscale tiny crops so Tesseract gets enough pixel data.
|
||
|
||
If either dimension is below *min_dim*, the crop is bicubic-upscaled
|
||
so the smallest dimension reaches *min_dim* (capped at *max_scale* ×).
|
||
"""
|
||
h, w = crop.shape[:2]
|
||
if h >= min_dim and w >= min_dim:
|
||
return crop
|
||
scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
|
||
if scale <= 1.0:
|
||
return crop
|
||
new_w = int(w * scale)
|
||
new_h = int(h * scale)
|
||
return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
|
||
|
||
|
||
def _select_psm_for_column(col_type: str, col_width: int,
|
||
row_height: int) -> int:
|
||
"""Choose the best Tesseract PSM for a given column geometry.
|
||
|
||
- page_ref columns are almost always single short tokens → PSM 8
|
||
- Very narrow or short cells → PSM 7 (single text line)
|
||
- Everything else → PSM 6 (uniform block)
|
||
"""
|
||
if col_type in ('page_ref', 'marker'):
|
||
return 8 # single word
|
||
if col_width < 100 or row_height < 30:
|
||
return 7 # single line
|
||
return 6 # uniform block
|
||
|
||
|
||
def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection) via word lookup.

    Strategy, in order:
      1. PRIMARY — use *preassigned_words* from the full-page OCR pass
         (confidence-filtered, joined in reading order).
      2. FALLBACK — if the cell is still empty but its crop has enough
         dark pixels, re-OCR just the (padded) cell crop; narrow columns
         are upscaled first.
      3. SECONDARY FALLBACK — retry the crop with Tesseract PSM 7.
      4. TERTIARY FALLBACK — for narrow columns, RapidOCR the whole row
         strip and keep words overlapping this column's X-range.
    Finally, the text is noise-filtered via _clean_cell_text.

    Args:
        row_idx/col_idx: Grid coordinates (used for cell_id and sorting).
        row: Row geometry (y/height and pre-detected words).
        col: Column region (x/width and column type).
        ocr_img: Binarized full-page image for Tesseract.
        img_bgr: BGR color image for RapidOCR/TrOCR/LightOn fallbacks.
        img_w/img_h: Page dimensions in pixels.
        use_rapid: Whether RapidOCR is the resolved default engine.
        engine_name: Resolved engine name for alternative-engine fallbacks.
        lang: Default Tesseract language.
        lang_map: col_type → Tesseract language override.
        preassigned_words: Words already assigned to this cell by the
            full-page pass; empty list is assumed when None.

    Returns:
        Cell dict with cell_id, grid indices, col_type, text, confidence,
        pixel and percent bboxes, and the 'ocr_engine' that produced the
        final text.
    """
    # Display bbox: exact column × row intersection (no padding)
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # OCR crop: adaptive padding — narrow columns get more context
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    # "Narrow" = column occupies < 15% of the page width.
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False

    # Degenerate geometry → return an empty cell immediately.
    if disp_w <= 0 or disp_h <= 0:
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': '',
            'confidence': 0.0,
            'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
            'bbox_pct': {
                'x': round(col.x / img_w * 100, 2),
                'y': round(row.y / img_h * 100, 2),
                'w': round(col.width / img_w * 100, 2),
                'h': round(row.height / img_h * 100, 2),
            },
            'ocr_engine': 'word_lookup',
        }

    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

    # Filter low-confidence words (OCR noise from images/artifacts).
    # Tesseract gives low confidence to misread image edges, borders,
    # and other non-text elements.
    _MIN_WORD_CONF = 30
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Use row height as Y-tolerance so all words within a single row
        # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
        # across two lines due to slight vertical offset).
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0

    # --- FALLBACK: Cell-OCR for empty cells ---
    # Full-page Tesseract can miss small or isolated words (e.g. "Ei").
    # Re-run OCR on the cell crop to catch what word-lookup missed.
    # To avoid wasting time on truly empty cells, check pixel density first:
    # only run Tesseract if the cell crop contains enough dark pixels to
    # plausibly contain text.
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                # Threshold: pixels darker than 180 (on 0-255 grayscale).
                # Use 0.5% to catch even small text like "Ei" (2 chars)
                # in an otherwise empty cell.
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005
        if _run_fallback:
            # For narrow columns, upscale the crop before OCR
            if is_narrow and ocr_img is not None:
                _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
                _upscaled = _ensure_minimum_crop_size(_crop_slice)
                if _upscaled is not _crop_slice:
                    # Build a temporary full-size image with the upscaled crop
                    # placed at origin so ocr_region can crop it cleanly.
                    _up_h, _up_w = _upscaled.shape[:2]
                    _tmp_region = PageRegion(
                        type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                    )
                    _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                    cell_lang = lang_map.get(col.type, lang)
                    fallback_words = ocr_region(_upscaled, _tmp_region,
                                                lang=cell_lang, psm=_cell_psm)
                    # Remap word positions back to original image coordinates
                    _sx = cell_w / max(_up_w, 1)
                    _sy = cell_h / max(_up_h, 1)
                    for _fw in (fallback_words or []):
                        _fw['left'] = int(_fw['left'] * _sx) + cell_x
                        _fw['top'] = int(_fw['top'] * _sy) + cell_y
                        _fw['width'] = int(_fw['width'] * _sx)
                        _fw['height'] = int(_fw['height'] * _sy)
                else:
                    # No upscaling needed, use adaptive PSM
                    cell_region = PageRegion(
                        type=col.type, x=cell_x, y=cell_y,
                        width=cell_w, height=cell_h,
                    )
                    _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                    cell_lang = lang_map.get(col.type, lang)
                    fallback_words = ocr_region(ocr_img, cell_region,
                                                lang=cell_lang, psm=_cell_psm)
            else:
                # Wide columns: dispatch to the configured engine.
                cell_region = PageRegion(
                    type=col.type,
                    x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                    fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
                elif engine_name == "lighton" and img_bgr is not None:
                    fallback_words = ocr_region_lighton(img_bgr, cell_region)
                elif use_rapid and img_bgr is not None:
                    fallback_words = ocr_region_rapid(img_bgr, cell_region)
                else:
                    _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                    cell_lang = lang_map.get(col.type, lang)
                    fallback_words = ocr_region(ocr_img, cell_region,
                                                lang=cell_lang, psm=_cell_psm)

            if fallback_words:
                # Apply same confidence filter to fallback words
                fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
                if fallback_words:
                    # Y-tolerance scaled to the detected word height here,
                    # since fallback words may not span the full row.
                    fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
                    fb_y_tol = max(10, int(fb_avg_h * 0.5))
                    fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
                    if fb_text.strip():
                        text = fb_text
                        avg_conf = round(
                            sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                        )
                        used_engine = 'cell_ocr_fallback'

    # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
    if not text.strip() and _run_fallback and not use_rapid:
        _fb_region = PageRegion(
            type=col.type, x=cell_x, y=cell_y,
            width=cell_w, height=cell_h,
        )
        cell_lang = lang_map.get(col.type, lang)
        psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if psm7_words:
                p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
                if p7_text.strip():
                    text = p7_text
                    avg_conf = round(
                        sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                    )
                    used_engine = 'cell_ocr_psm7'

    # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
    # If a narrow cell is still empty, OCR the entire row strip with
    # RapidOCR (which handles small text better) and assign words by
    # X-position overlap with this column.
    if not text.strip() and is_narrow and img_bgr is not None:
        row_region = PageRegion(
            type='_row_strip', x=0, y=row.y,
            width=img_w, height=row.height,
        )
        strip_words = ocr_region_rapid(img_bgr, row_region)
        if strip_words:
            # Filter to words overlapping this column's X-range
            col_left = col.x
            col_right = col.x + col.width
            col_words = []
            for sw in strip_words:
                sw_left = sw.get('left', 0)
                sw_right = sw_left + sw.get('width', 0)
                overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                # Keep a word if >30% of its width falls inside the column.
                if overlap > sw.get('width', 1) * 0.3:
                    col_words.append(sw)
            if col_words:
                col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
                if col_words:
                    rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                    if rs_text.strip():
                        text = rs_text
                        avg_conf = round(
                            sum(w['conf'] for w in col_words) / len(col_words), 1
                        )
                        used_engine = 'row_strip_rapid'

    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0

    return {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': text,
        'confidence': avg_conf,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2),
            'y': round(disp_y / img_h * 100, 2),
            'w': round(disp_w / img_w * 100, 2),
            'h': round(disp_h / img_h * 100, 2),
        },
        'ocr_engine': used_engine,
    }
|
||
|
||
|
||
def _is_artifact_row(row: RowGeometry) -> bool:
|
||
"""Return True if this row contains only scan artifacts, not real text.
|
||
|
||
Artifact rows (scanner shadows, noise) typically produce only single-character
|
||
detections. A real content row always has at least one token with 2+ characters.
|
||
"""
|
||
if row.word_count == 0:
|
||
return True
|
||
texts = [w.get('text', '').strip() for w in row.words]
|
||
return all(len(t) <= 1 for t in texts)
|
||
|
||
|
||
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Expand row y/height to fill vertical gaps caused by removed adjacent rows.

    After filtering out empty or artifact rows, remaining content rows may have
    gaps between them where the removed rows used to be. This function mutates
    each row (in place) to extend upward/downward to the midpoint of such gaps
    so that OCR crops cover the full available content area.

    The first row always extends to *top_bound*; the last row to *bottom_bound*.
    Callers therefore control how far the outermost rows grow — passing the
    header's bottom edge as top_bound and the footer's top edge as bottom_bound
    keeps row crops out of the header/footer areas.

    Args:
        rows: Content rows to heal; sorted by y and MUTATED in place.
        top_bound: Y coordinate the first row's top is snapped to.
        bottom_bound: Y coordinate the last row's bottom is snapped to.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    orig = [(r.y, r.y + r.height) for r in rows]  # snapshot before mutation

    for i, row in enumerate(rows):
        # New top: midpoint between previous row's bottom and this row's top
        if i == 0:
            new_top = top_bound
        else:
            prev_bot = orig[i - 1][1]
            my_top = orig[i][0]
            gap = my_top - prev_bot
            # Only split gaps wider than 1 px; tiny gaps keep the original edge.
            new_top = prev_bot + gap // 2 if gap > 1 else my_top

        # New bottom: midpoint between this row's bottom and next row's top
        if i == n - 1:
            new_bottom = bottom_bound
        else:
            my_bot = orig[i][1]
            next_top = orig[i + 1][0]
            gap = next_top - my_bot
            new_bottom = my_bot + gap // 2 if gap > 1 else my_bot

        row.y = new_top
        # Clamp to a minimum height so degenerate bounds never produce
        # a zero/negative-height row.
        row.height = max(5, new_bottom - new_top)

    logger.debug(
        f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
|
||
|
||
|
||
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Generic Cell-Grid: Columns × Rows → cells with OCR text.

    This is the layout-agnostic foundation. Every column (except column_ignore)
    is intersected with every content row to produce numbered cells. After the
    per-cell pass, a batch fallback re-OCRs whole column strips for cells that
    stayed empty, and rows where every cell is empty are dropped.

    NOTE: legacy path — newer callers use build_cell_grid_v2. Unlike v2, the
    gap-healing bounds here come from the column extents, not from detected
    header/footer rows.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed', 'trocr-handwritten', or 'lighton'.
        img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR).

    Returns:
        (cells, columns_meta) where cells is a list of cell dicts and
        columns_meta describes the columns used.
    """
    # Resolve engine choice
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    # Filter phantom rows: rows with no Tesseract words assigned are
    # inter-line whitespace gaps that would produce garbage OCR.
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    # Use columns only — skip ignore, header, footer, page_ref
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    # Filter artifact rows: rows whose detected words are all single characters
    # are caused by scanner shadows or noise, not real text.
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid: skipped {artifact_skipped} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # Heal row gaps: rows removed above leave vertical gaps; expand adjacent rows
    # to fill the space so OCR crops are not artificially narrow.
    # (Legacy behavior: bounds are the column extents, not header/footer edges.)
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Sort columns left-to-right
    relevant_cols.sort(key=lambda c: c.x)

    # Build columns_meta
    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    # Choose OCR language per column type (Tesseract only)
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Pre-assign each word to exactly one column (nearest center)
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            cells.append(cell)

    # --- BATCH FALLBACK: re-OCR empty cells by column strip ---
    # Collect cells that are still empty but have visible pixels.
    # Instead of calling Tesseract once per cell (expensive), crop an entire
    # column strip and run OCR once, then assign words to cells by Y position.
    empty_by_col: Dict[int, List[int]] = {}  # col_idx → [cell list indices]
    for ci, cell in enumerate(cells):
        # Skip cells already retried at PSM 7 — they had their chance.
        if not cell['text'].strip() and cell.get('ocr_engine') != 'cell_ocr_psm7':
            bpx = cell['bbox_px']
            x, y, w, h = bpx['x'], bpx['y'], bpx['w'], bpx['h']
            if w > 0 and h > 0 and ocr_img is not None:
                crop = ocr_img[y:y + h, x:x + w]
                if crop.size > 0:
                    # Same dark-pixel density gate as the per-cell fallback.
                    dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                    if dark_ratio > 0.005:
                        empty_by_col.setdefault(cell['col_index'], []).append(ci)

    for col_idx, cell_indices in empty_by_col.items():
        if len(cell_indices) < 3:
            continue  # Not worth batching for < 3 cells

        # Find the column strip bounding box (union of all empty cell bboxes)
        min_y = min(cells[ci]['bbox_px']['y'] for ci in cell_indices)
        max_y_h = max(cells[ci]['bbox_px']['y'] + cells[ci]['bbox_px']['h'] for ci in cell_indices)
        col_x = cells[cell_indices[0]]['bbox_px']['x']
        col_w = cells[cell_indices[0]]['bbox_px']['w']

        strip_region = PageRegion(
            type=relevant_cols[col_idx].type,
            x=col_x, y=min_y,
            width=col_w, height=max_y_h - min_y,
        )
        strip_lang = lang_map.get(relevant_cols[col_idx].type, lang)

        # Dispatch the strip to the configured engine.
        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        # Same minimum word confidence as _ocr_single_cell (_MIN_WORD_CONF).
        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
        if not strip_words:
            continue

        # Assign words to cells by Y overlap
        for ci in cell_indices:
            cell_y = cells[ci]['bbox_px']['y']
            cell_h = cells[ci]['bbox_px']['h']
            cell_mid_y = cell_y + cell_h / 2

            # A word belongs to a cell when its vertical center lies within
            # 80% of the cell height of the cell's center.
            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if matched_words:
                matched_words.sort(key=lambda w: w['left'])
                batch_text = ' '.join(w['text'] for w in matched_words)
                batch_text = _clean_cell_text(batch_text)
                if batch_text.strip():
                    cells[ci]['text'] = batch_text
                    cells[ci]['confidence'] = round(
                        sum(w['conf'] for w in matched_words) / len(matched_words), 1
                    )
                    cells[ci]['ocr_engine'] = 'batch_column_ocr'

        batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
        if batch_filled > 0:
            logger.info(
                f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
                f"empty cells in column {col_idx}"
            )

    # Post-OCR: remove rows where ALL cells are empty (inter-row gaps
    # that had stray Tesseract artifacts giving word_count > 0).
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
|
||
|
||
|
||
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    Same filtering and gap-healing as build_cell_grid; the batch column-strip
    fallback and the all-empty-row removal are NOT performed here, because
    cells are emitted before the full grid is known.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns (PageRegion list).
        row_geometries: Detected rows (RowGeometry list).
        img_w/img_h: Page dimensions in pixels.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed',
            'trocr-handwritten', or 'lighton'.
        img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOn).

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    # Resolve engine choice (same as build_cell_grid)
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if not RAPIDOCR_AVAILABLE:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        else:
            use_rapid = True
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        return

    # Filter phantom rows: rows with no Tesseract words assigned are
    # inter-line whitespace gaps that would produce garbage OCR.
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        return

    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        return

    # Filter artifact rows + heal gaps (same logic as build_cell_grid)
    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_streaming: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        return
    # Legacy bounds: column extents (v2 streaming uses header/footer edges).
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Columns in left-to-right reading order.
    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {
            'index': col_idx,
            'type': col.type,
            'x': col.x,
            'width': col.width,
        }
        for col_idx, col in enumerate(relevant_cols)
    ]

    # Per-column-type Tesseract language override.
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        # Pre-assign each word to exactly one column (nearest center)
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cell = _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            )
            yield cell, columns_meta, total_cells
|
||
|
||
|
||
def _cells_to_vocab_entries(
|
||
cells: List[Dict[str, Any]],
|
||
columns_meta: List[Dict[str, Any]],
|
||
) -> List[Dict[str, Any]]:
|
||
"""Map generic cells to vocab entries with english/german/example fields.
|
||
|
||
Groups cells by row_index, maps col_type → field name, and produces
|
||
one entry per row (only rows with at least one non-empty field).
|
||
"""
|
||
# Determine image dimensions from first cell (for row-level bbox)
|
||
col_type_to_field = {
|
||
'column_en': 'english',
|
||
'column_de': 'german',
|
||
'column_example': 'example',
|
||
'page_ref': 'source_page',
|
||
'column_marker': 'marker',
|
||
}
|
||
bbox_key_map = {
|
||
'column_en': 'bbox_en',
|
||
'column_de': 'bbox_de',
|
||
'column_example': 'bbox_ex',
|
||
'page_ref': 'bbox_ref',
|
||
'column_marker': 'bbox_marker',
|
||
}
|
||
|
||
# Group cells by row_index
|
||
rows: Dict[int, List[Dict]] = {}
|
||
for cell in cells:
|
||
ri = cell['row_index']
|
||
rows.setdefault(ri, []).append(cell)
|
||
|
||
entries: List[Dict[str, Any]] = []
|
||
for row_idx in sorted(rows.keys()):
|
||
row_cells = rows[row_idx]
|
||
entry: Dict[str, Any] = {
|
||
'row_index': row_idx,
|
||
'english': '',
|
||
'german': '',
|
||
'example': '',
|
||
'source_page': '',
|
||
'marker': '',
|
||
'confidence': 0.0,
|
||
'bbox': None,
|
||
'bbox_en': None,
|
||
'bbox_de': None,
|
||
'bbox_ex': None,
|
||
'bbox_ref': None,
|
||
'bbox_marker': None,
|
||
'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
|
||
}
|
||
|
||
confidences = []
|
||
for cell in row_cells:
|
||
col_type = cell['col_type']
|
||
field = col_type_to_field.get(col_type)
|
||
if field:
|
||
entry[field] = cell['text']
|
||
bbox_field = bbox_key_map.get(col_type)
|
||
if bbox_field:
|
||
entry[bbox_field] = cell['bbox_pct']
|
||
if cell['confidence'] > 0:
|
||
confidences.append(cell['confidence'])
|
||
|
||
# Compute row-level bbox as union of all cell bboxes
|
||
all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
|
||
if all_bboxes:
|
||
min_x = min(b['x'] for b in all_bboxes)
|
||
min_y = min(b['y'] for b in all_bboxes)
|
||
max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
|
||
max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
|
||
entry['bbox'] = {
|
||
'x': round(min_x, 2),
|
||
'y': round(min_y, 2),
|
||
'w': round(max_x2 - min_x, 2),
|
||
'h': round(max_y2 - min_y, 2),
|
||
}
|
||
|
||
entry['confidence'] = round(
|
||
sum(confidences) / len(confidences), 1
|
||
) if confidences else 0.0
|
||
|
||
# Only include if at least one mapped field has text
|
||
has_content = any(
|
||
entry.get(f)
|
||
for f in col_type_to_field.values()
|
||
)
|
||
if has_content:
|
||
entries.append(entry)
|
||
|
||
return entries
|
||
|
||
|
||
# Regex: line starts with phonetic bracket content only (no real word before it)
# NOTE(review): _is_phonetic_only_text below does its own re.sub-based check
# and does not reference this pattern; it looks like a leftover — confirm
# there is no other caller before removing.
_PHONETIC_ONLY_RE = re.compile(
    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)
|
||
|
||
|
||
def _is_phonetic_only_text(text: str) -> bool:
|
||
"""Check if text consists only of phonetic transcription.
|
||
|
||
Phonetic-only patterns:
|
||
['mani serva] → True
|
||
[dɑːns] → True
|
||
["a:mand] → True
|
||
almond ['a:mand] → False (has real word before bracket)
|
||
Mandel → False
|
||
"""
|
||
t = text.strip()
|
||
if not t:
|
||
return False
|
||
# Must contain at least one bracket
|
||
if '[' not in t and ']' not in t:
|
||
return False
|
||
# Remove all bracket content and surrounding punctuation/whitespace
|
||
without_brackets = re.sub(r"\[.*?\]", '', t)
|
||
without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
|
||
# If nothing meaningful remains, it's phonetic-only
|
||
alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
|
||
return len(alpha_remaining) < 2
|
||
|
||
|
||
def _merge_phonetic_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge rows that contain only phonetic transcription into previous entry.

    In dictionary pages, phonetic transcription sometimes wraps to the next
    row. E.g.:
        Row 28: EN="it's a money-saver" DE="es spart Kosten"
        Row 29: EN="['mani serva]" DE=""

    Row 29 is phonetic-only → merge into row 28's EN field.
    """
    if len(entries) < 2:
        return entries

    result: List[Dict[str, Any]] = []
    for item in entries:
        en_text = (item.get('english') or '').strip()
        de_text = (item.get('german') or '').strip()
        ex_text = (item.get('example') or '').strip()

        # A row is mergeable when a previous entry exists, DE is empty,
        # and EN carries nothing but phonetic transcription.
        mergeable = bool(result) and _is_phonetic_only_text(en_text) and not de_text
        if not mergeable:
            result.append(item)
            continue

        target = result[-1]
        target_en = (target.get('english') or '').strip()
        # Append the phonetic fragment to the previous entry's EN.
        target['english'] = (target_en + ' ' + en_text) if target_en else en_text
        # Carry over any example text as well.
        if ex_text:
            target_ex = (target.get('example') or '').strip()
            target['example'] = (target_ex + ' ' + ex_text).strip() if target_ex else ex_text
        logger.debug(
            f"Merged phonetic row {item.get('row_index')} "
            f"into previous entry: {target['english']!r}"
        )

    return result
|
||
|
||
|
||
def _merge_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge multi-line vocabulary entries where text wraps to the next row.

    A row is a continuation of the previous entry when:
    - EN has text, but DE is empty
    - EN starts with a lowercase letter (not a new vocab entry)
    - Previous entry's EN does NOT end with a sentence terminator (.!?)
    - The continuation text has fewer than 4 words (not an example sentence)
    - The row was not already merged as phonetic

    Example:
        Row 5: EN="to put up" DE="aufstellen"
        Row 6: EN="with sth." DE=""
        → Merged: EN="to put up with sth." DE="aufstellen"
    """
    if len(entries) < 2:
        return entries

    result: List[Dict[str, Any]] = []
    for item in entries:
        en_text = (item.get('english') or '').strip()
        de_text = (item.get('german') or '').strip()

        # Candidate only when a previous entry exists, EN present, DE empty.
        if not (result and en_text and not de_text):
            result.append(item)
            continue

        # Phonetic-only rows are handled by _merge_phonetic_continuation_rows.
        if _is_phonetic_only_text(en_text):
            result.append(item)
            continue

        # Continuation heuristics: lowercase start + short fragment.
        first_letter = next((ch for ch in en_text if ch.isalpha()), '')
        begins_lowercase = first_letter and first_letter.islower()
        few_words = len(en_text.split()) < 4

        target = result[-1]
        target_en = (target.get('english') or '').strip()
        ends_sentence = target_en and target_en[-1] in '.!?'

        if not (begins_lowercase and few_words and not ends_sentence):
            result.append(item)
            continue

        # Fold the fragment into the previous entry.
        target['english'] = (target_en + ' ' + en_text).strip()
        extra = (item.get('example') or '').strip()
        if extra:
            target_ex = (target.get('example') or '').strip()
            target['example'] = (target_ex + ' ' + extra).strip() if target_ex else extra
        logger.debug(
            f"Merged continuation row {item.get('row_index')} "
            f"into previous entry: {target['english']!r}"
        )

    return result
|
||
|
||
|
||
def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.

    Wrapper around build_cell_grid() that adds vocabulary-specific logic:
    - Maps cells to english/german/example entries
    - Applies character confusion fixes, IPA lookup, comma splitting, etc.
    - Falls back to returning raw cells if no vocab columns detected.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w, img_h: Image dimensions.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', or 'auto'.
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        List of entry dicts with english/german/example text and bbox info (percent).
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )

    if not cells:
        return []

    # Check if vocab layout is present
    col_types = {c['type'] for c in columns_meta}
    if not (col_types & {'column_en', 'column_de'}):
        logger.info("build_word_grid: no vocab columns — returning raw cells")
        return cells

    # Vocab mapping: cells → entries
    entries = _cells_to_vocab_entries(cells, columns_meta)

    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)

    # 0a. Merge phonetic-only continuation rows into previous entry
    entries = _merge_phonetic_continuation_rows(entries)

    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
    entries = _merge_continuation_rows(entries)

    # 1. Character confusion (| → I, 1 → I, 8 → B) is now run in
    #    llm_review_entries_streaming so changes are visible to the user in Step 6.

    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)

    # 3. Split comma-separated word forms (break, broke, broken → 3 entries)
    entries = _split_comma_entries(entries)

    # 4. Attach example sentences (rows without DE → examples for preceding entry)
    entries = _attach_example_sentences(entries)

    # cells is guaranteed non-empty here (early return above), so the
    # former "if cells else 'unknown'" guard was dead code.
    engine_name = cells[0].get('ocr_engine', 'unknown')
    # Fixed log message: previously printed len(entries) twice instead of
    # the raw → post-processed progression.
    logger.info(f"build_word_grid: {n_raw} raw → {len(entries)} entries "
                f"after post-processing (engine={engine_name})")

    return entries
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 6: Multi-Pass OCR
|
||
# =============================================================================
|
||
|
||
def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
               psm: int, fallback_psm: Optional[int] = None,
               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
    """Run Tesseract OCR on a specific region with given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode.
        fallback_psm: If confidence too low, retry with this PSM per line.
        min_confidence: Minimum average confidence before fallback.

    Returns:
        List of word dicts with text, position, confidence.
    """
    # Crop the region out of the full page.
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]
    if crop.size == 0:
        return []

    # Single Tesseract pass over the crop with the requested PSM.
    tess_config = f'--psm {psm} --oem 3'
    try:
        data = pytesseract.image_to_data(
            Image.fromarray(crop), lang=lang, config=tess_config,
            output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"Tesseract failed for region {region.type}: {e}")
        return []

    words = []
    for idx, raw_text in enumerate(data['text']):
        token = raw_text.strip()
        confidence = int(data['conf'][idx])
        # Drop empty tokens and near-noise detections.
        if not token or confidence < 10:
            continue
        words.append({
            'text': token,
            'left': data['left'][idx] + region.x,  # Absolute coords
            'top': data['top'][idx] + region.y,
            'width': data['width'][idx],
            'height': data['height'][idx],
            'conf': confidence,
            'region_type': region.type,
        })

    # If the whole-region pass is weak, retry line-by-line with the fallback PSM.
    if words and fallback_psm is not None:
        avg_conf = sum(w['conf'] for w in words) / len(words)
        if avg_conf < min_confidence:
            logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
                        f"trying fallback PSM {fallback_psm}")
            words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)

    return words
|
||
|
||
|
||
def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
                             lang: str, psm: int) -> List[Dict[str, Any]]:
    """OCR a region line by line (fallback for low-confidence regions).

    Splits the region into horizontal strips based on text density,
    then OCRs each strip individually with the given PSM.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]
    if crop.size == 0:
        return []

    # Horizontal ink projection: high values mark rows containing text.
    projection = np.sum(cv2.bitwise_not(crop), axis=1)
    peak = np.max(projection)
    threshold = peak * 0.05 if peak > 0 else 0

    # Walk the projection and collect (start, end) bands of text.
    bands = []
    inside = False
    band_start = 0
    for y, value in enumerate(projection):
        if value > threshold and not inside:
            band_start = y
            inside = True
        elif value <= threshold and inside:
            if y - band_start > 5:  # Minimum line height
                bands.append((band_start, y))
            inside = False
    if inside and len(projection) - band_start > 5:
        bands.append((band_start, len(projection)))

    collected: List[Dict[str, Any]] = []
    tess_config = f'--psm {psm} --oem 3'

    for band_top, band_bottom in bands:
        # Pad each strip slightly so ascenders/descenders aren't clipped.
        pad = 3
        y1 = max(0, band_top - pad)
        y2 = min(crop.shape[0], band_bottom + pad)
        strip = crop[y1:y2, :]
        if strip.size == 0:
            continue

        try:
            data = pytesseract.image_to_data(Image.fromarray(strip), lang=lang,
                                             config=tess_config,
                                             output_type=pytesseract.Output.DICT)
        except Exception:
            continue

        for idx, raw_text in enumerate(data['text']):
            token = raw_text.strip()
            confidence = int(data['conf'][idx])
            if not token or confidence < 10:
                continue
            collected.append({
                'text': token,
                'left': data['left'][idx] + region.x,
                # y1 restores strip-local coordinates to region space.
                'top': data['top'][idx] + region.y + y1,
                'width': data['width'][idx],
                'height': data['height'][idx],
                'conf': confidence,
                'region_type': region.type,
            })

    return collected
|
||
|
||
|
||
def run_multi_pass_ocr(ocr_img: np.ndarray,
                       regions: List[PageRegion],
                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
    """Run OCR on each detected region with optimized settings.

    Args:
        ocr_img: Binarized full-page image.
        regions: Detected page regions.
        lang: Default language.

    Returns:
        Dict mapping region type to list of word dicts.
    """
    results: Dict[str, List[Dict]] = {}

    # Headers/footers/margins carry no vocabulary content — never OCR them.
    non_content = {'header', 'footer', 'margin_top', 'margin_bottom',
                   'margin_left', 'margin_right'}

    for region in regions:
        if region.type in non_content:
            continue

        if region.type == 'column_en':
            # Single-language column, sparse-text PSM.
            words = ocr_region(ocr_img, region, lang='eng', psm=4)
        elif region.type == 'column_de':
            words = ocr_region(ocr_img, region, lang='deu', psm=4)
        elif region.type == 'column_example':
            # Example sentences: block PSM with per-line fallback when weak.
            words = ocr_region(ocr_img, region, lang=lang, psm=6,
                              fallback_psm=7, min_confidence=40.0)
        else:
            words = ocr_region(ocr_img, region, lang=lang, psm=6)

        results[region.type] = words
        logger.info(f"OCR {region.type}: {len(words)} words")

    return results
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 7: Line Alignment → Vocabulary Entries
|
||
# =============================================================================
|
||
|
||
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
|
||
"""Group words by Y position into lines, sorted by X within each line."""
|
||
if not words:
|
||
return []
|
||
|
||
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
|
||
lines: List[List[Dict]] = []
|
||
current_line: List[Dict] = [sorted_words[0]]
|
||
current_y = sorted_words[0]['top']
|
||
|
||
for word in sorted_words[1:]:
|
||
if abs(word['top'] - current_y) <= y_tolerance_px:
|
||
current_line.append(word)
|
||
else:
|
||
current_line.sort(key=lambda w: w['left'])
|
||
lines.append(current_line)
|
||
current_line = [word]
|
||
current_y = word['top']
|
||
|
||
if current_line:
|
||
current_line.sort(key=lambda w: w['left'])
|
||
lines.append(current_line)
|
||
|
||
return lines
|
||
|
||
|
||
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    Uses Y-coordinate matching to pair English words, German translations,
    and example sentences that appear on the same line. The EN column is
    the primary reference: one VocabRow is created per EN line, and the
    nearest DE/example lines (within tolerance) are attached to it.

    Args:
        ocr_results: Dict mapping region type to word lists.
        regions: Detected regions (for reference).
        y_tolerance_px: Max Y-distance to consider words on the same row.

    Returns:
        List of VocabRow objects, sorted by vertical position.
    """
    # If no vocabulary columns detected (e.g. plain text page), return empty
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    # Group words into lines per column
    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    def line_y_center(line: List[Dict]) -> float:
        # Mean vertical center of all words in the line.
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    # Build EN entries as the primary reference
    vocab_rows: List[VocabRow] = []

    for en_line in en_lines:
        en_y = line_y_center(en_line)
        en_text = line_text(en_line)
        en_conf = line_confidence(en_line)

        # Skip very short or likely header content
        if len(en_text.strip()) < 2:
            continue

        # Find matching DE line: nearest line within tolerance of the EN center.
        de_text = ""
        de_conf = 0.0
        best_de_dist = float('inf')
        best_de_idx = -1
        for idx, de_line in enumerate(de_lines):
            dist = abs(line_y_center(de_line) - en_y)
            if dist < y_tolerance_px and dist < best_de_dist:
                best_de_dist = dist
                best_de_idx = idx

        if best_de_idx >= 0:
            de_text = line_text(de_lines[best_de_idx])
            de_conf = line_confidence(de_lines[best_de_idx])

        # Find matching example line (same nearest-within-tolerance scheme).
        ex_text = ""
        ex_conf = 0.0
        best_ex_dist = float('inf')
        best_ex_idx = -1
        for idx, ex_line in enumerate(ex_lines):
            dist = abs(line_y_center(ex_line) - en_y)
            if dist < y_tolerance_px and dist < best_ex_dist:
                best_ex_dist = dist
                best_ex_idx = idx

        if best_ex_idx >= 0:
            ex_text = line_text(ex_lines[best_ex_idx])
            ex_conf = line_confidence(ex_lines[best_ex_idx])

        # Average confidence over only the columns that produced text
        # (zero-confidence columns don't drag the average down).
        avg_conf = en_conf
        conf_count = 1
        if de_conf > 0:
            avg_conf += de_conf
            conf_count += 1
        if ex_conf > 0:
            avg_conf += ex_conf
            conf_count += 1

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=avg_conf / conf_count,
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in example column:
    # If an example line has no matching EN/DE, append to previous entry.
    # NOTE(review): matched_ex_ys records the EN row's y_position, not the
    # matched example line's own y — the "already matched" test below is
    # therefore approximate; confirm the tolerance covers the offset.
    matched_ex_ys = set()
    for row in vocab_rows:
        if row.example:
            matched_ex_ys.add(row.y_position)

    for ex_line in ex_lines:
        ex_y = line_y_center(ex_line)
        # Check if already matched
        already_matched = any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys)
        if already_matched:
            continue

        # Find nearest vocab row strictly above, within 3x tolerance.
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row

        if best_row:
            continuation = line_text(ex_line).strip()
            if continuation:
                best_row.example = (best_row.example + " " + continuation).strip()

    # Sort by Y position
    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 8: Optional LLM Post-Correction
|
||
# =============================================================================
|
||
|
||
async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
                           confidence_threshold: float = 50.0,
                           enabled: bool = False) -> List[VocabRow]:
    """Optionally send low-confidence regions to Qwen-VL for correction.

    Default: disabled. Enable per parameter. Currently a stub: even when
    enabled, the rows are returned unchanged (see TODO below).

    Args:
        img: Original BGR image.
        vocab_rows: Current vocabulary rows.
        confidence_threshold: Rows below this get LLM correction
            (unused until the correction is implemented).
        enabled: Whether to actually run LLM correction.

    Returns:
        Corrected vocabulary rows (currently always the input, unchanged).
    """
    if not enabled:
        return vocab_rows

    # TODO: Implement Qwen-VL correction for low-confidence entries
    # For each row with confidence < threshold:
    #   1. Crop the relevant region from img
    #   2. Send crop + OCR text to Qwen-VL
    #   3. Replace text if LLM provides a confident correction
    # Plain string (was an f-string with no placeholders — F541).
    logger.info("LLM post-correction skipped (not yet implemented)")
    return vocab_rows
|
||
|
||
|
||
# =============================================================================
|
||
# Orchestrator
|
||
# =============================================================================
|
||
|
||
async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Stages: render → deskew → dewarp (optional) → image prep → layout
    analysis → multi-pass OCR → line alignment → optional LLM correction.
    Per-stage timings are recorded in result.stages.

    Args:
        pdf_data: Raw PDF bytes (mutually exclusive with image_data).
        image_data: Raw image bytes (mutually exclusive with pdf_data).
        page_number: 0-indexed page number (for PDF).
        zoom: PDF rendering zoom factor.
        enable_dewarp: Whether to run dewarp stage.
        enable_llm_correction: Whether to run LLM post-correction.
        lang: Tesseract language string.

    Returns:
        PipelineResult with vocabulary and timing info. On failure,
        result.error is set and vocabulary stays empty.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    total_start = time.time()

    try:
        # Stage 1: Render
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s")

        # Stage 3: Dewarp
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)

        # Stage 5: Layout analysis
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)

        # Stage 8: Optional LLM correction
        if enable_llm_correction:
            t = time.time()
            # BUGFIX: llm_post_correct defaults to enabled=False, so the
            # previous call without the flag was a silent no-op even when
            # the caller requested LLM correction.
            vocab_rows = await llm_post_correct(img, vocab_rows, enabled=True)
            result.stages['llm_correction'] = round(time.time() - t, 2)

        # Convert to output format
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german  # Skip empty rows
        ]

        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)

    return result
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# LLM-based OCR Correction (Step 6)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
import httpx
|
||
import os
|
||
import json as _json
|
||
import re as _re
|
||
|
||
# Ollama endpoint and review-model configuration (overridable via env vars).
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
# Number of entries sent per LLM review request.
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)

# Regex: entry contains IPA phonetic brackets like "dance [dɑːns]".
# Matches only when at least one IPA-specific character appears inside [...],
# so plain bracketed text does not count as phonetics.
_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]')

# Regex: digit adjacent to a letter — the hallmark of OCR digit↔letter confusion.
# Matches digits 0,1,5,6,8 (common OCR confusions: 0→O, 1→l/I, 5→S, 6→G, 8→B)
# when they appear inside or next to a word character.
# NOTE(review): not referenced in the visible portion of this module —
# confirm it has callers before removing.
_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])')
|
||
|
||
|
||
def _entry_needs_review(entry: Dict) -> bool:
    """Check if an entry should be sent to the LLM for review.

    Sends all non-empty entries that don't have IPA phonetic transcriptions.
    The LLM prompt and _is_spurious_change() guard against unwanted changes.
    """
    en = entry.get("english", "") or ""
    de = entry.get("german", "") or ""

    # Nothing to review in a completely empty entry.
    if not (en.strip() or de.strip()):
        return False

    # Entries carrying IPA brackets were already dictionary-corrected —
    # the LLM must not touch them.
    has_ipa = _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de)
    return not has_ipa
|
||
|
||
|
||
def _build_llm_prompt(table_lines: List[Dict]) -> str:
    """Build the LLM correction prompt for a batch of entries.

    The prompt (German, addressed to a small local review model) restricts
    the model to single-character digit↔letter OCR fixes and explicitly
    forbids translation, rewriting, or touching IPA brackets / example
    sentences. The batch is appended as a JSON array.

    Args:
        table_lines: Batch of entry dicts (keys like "en"/"de"/"ex").

    Returns:
        The complete prompt string ending with the JSON-serialized batch.
    """
    # NOTE: the prompt body below is runtime data sent to the model —
    # do not translate or reformat it.
    return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).

DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.

NUR diese Korrekturen sind erlaubt:
- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball"
- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old"
- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin"
- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See"
- Ziffer 6 statt G oder g: "6eld" → "Geld"
- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help"

ABSOLUT VERBOTEN — aendere NIEMALS:
- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst
- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN
- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst
- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren
- Beispielsaetze in der ex-Spalte — NIEMALS aendern

Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.

Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).

/no_think

Eingabe:
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
|
||
|
||
|
||
def _is_spurious_change(old_val: str, new_val: str) -> bool:
|
||
"""Detect LLM changes that are likely wrong and should be discarded.
|
||
|
||
Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are
|
||
legitimate OCR corrections. Everything else is rejected.
|
||
|
||
Filters out:
|
||
- Case-only changes
|
||
- Changes that don't contain any digit→letter fix
|
||
- Completely different words (LLM translating or hallucinating)
|
||
- Additions or removals of whole words (count changed)
|
||
"""
|
||
if not old_val or not new_val:
|
||
return False
|
||
|
||
# Case-only change — never a real OCR error
|
||
if old_val.lower() == new_val.lower():
|
||
return True
|
||
|
||
# If the word count changed significantly, the LLM rewrote rather than fixed
|
||
old_words = old_val.split()
|
||
new_words = new_val.split()
|
||
if abs(len(old_words) - len(new_words)) > 1:
|
||
return True
|
||
|
||
# Core rule: a legitimate correction replaces a digit with the corresponding
|
||
# letter. If the change doesn't include such a substitution, reject it.
|
||
# Build a set of (old_char, new_char) pairs that differ between old and new.
|
||
# Use character-level diff heuristic: if lengths are close, zip and compare.
|
||
# Map of characters that OCR commonly misreads → set of correct replacements
|
||
_OCR_CHAR_MAP = {
|
||
# Digits mistaken for letters
|
||
'0': set('oOgG'),
|
||
'1': set('lLiI'),
|
||
'5': set('sS'),
|
||
'6': set('gG'),
|
||
'8': set('bB'),
|
||
# Non-letter symbols mistaken for letters
|
||
'|': set('lLiI1'), # pipe → lowercase l, capital I, or digit 1
|
||
'l': set('iI|1'), # lowercase l → capital I (and reverse)
|
||
}
|
||
has_valid_fix = False
|
||
if len(old_val) == len(new_val):
|
||
for oc, nc in zip(old_val, new_val):
|
||
if oc != nc:
|
||
if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
|
||
has_valid_fix = True
|
||
elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
|
||
# Reverse check (e.g. l→I where new is the "correct" char)
|
||
has_valid_fix = True
|
||
else:
|
||
# Length changed by 1: accept if old had a suspicious char sequence
|
||
_OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]')
|
||
if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
|
||
has_valid_fix = True
|
||
|
||
if not has_valid_fix:
|
||
return True # Reject — looks like translation or hallucination
|
||
|
||
return False
|
||
|
||
|
||
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
    changes: List[Dict] = []
    entries_out: List[Dict] = []
    # Entry field name → short key used in the LLM payload.
    field_map = [("english", "en"), ("german", "de"), ("example", "ex")]

    for i, orig in enumerate(originals):
        if i >= len(corrected):
            # LLM returned fewer items than sent — pass the original through.
            entries_out.append(dict(orig))
            continue

        llm_entry = corrected[i]
        merged = dict(orig)
        for field_name, key in field_map:
            new_val = llm_entry.get(key, "").strip()
            old_val = (orig.get(field_name, "") or "").strip()
            if not new_val or new_val == old_val:
                continue
            # Filter spurious LLM changes (translations, hallucinations).
            if _is_spurious_change(old_val, new_val):
                continue
            changes.append({
                "row_index": orig.get("row_index", i),
                "field": field_name,
                "old": old_val,
                "new": new_val,
            })
            merged[field_name] = new_val
            merged["llm_corrected"] = True
        entries_out.append(merged)

    return changes, entries_out
|
||
|
||
|
||
# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────

# Which review engine to use for Step 6; "spell" is deterministic and local.
REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")  # "spell" (default) | "llm"

try:
    from spellchecker import SpellChecker as _SpellChecker
    # distance=1: only single-edit candidates are needed, keeps lookups fast.
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
except ImportError:
    _SPELL_AVAILABLE = False
    logger.warning("pyspellchecker not installed — falling back to LLM review")

# Suspicious OCR chars → ordered list of most-likely correct replacements
# (first element is the preferred guess for the structural rule).
_SPELL_SUBS: Dict[str, List[str]] = {
    '0': ['O', 'o'],
    '1': ['l', 'I'],
    '5': ['S', 's'],
    '6': ['G', 'g'],
    '8': ['B', 'b'],
    '|': ['I', 'l', '1'],
}
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())

# Tokenizer: word tokens (letters + pipe) alternating with separators.
# NOTE(review): digits are not token characters here, so a token like "8en"
# is split at the digit by this pattern — confirm that is intended.
_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')
|
||
|
||
|
||
def _spell_dict_knows(word: str) -> bool:
    """True if word is known in EN or DE dictionary."""
    if not _SPELL_AVAILABLE:
        return False
    lowered = word.lower()
    # Check English first, then German; either suffices.
    if _en_spell.known([lowered]):
        return True
    return bool(_de_spell.known([lowered]))
|
||
|
||
|
||
def _spell_fix_token(token: str) -> Optional[str]:
    """Return corrected form of token, or None if no fix needed/possible."""
    # Fast path: nothing suspicious in the token at all.
    if not any(ch in _SPELL_SUSPICIOUS for ch in token):
        return None

    # Standalone pipe → capital I
    if token == '|':
        return 'I'

    # Original is already a valid word → leave it
    if _spell_dict_knows(token):
        return None

    # Dictionary-backed single-char substitution: try each suspicious
    # position with its candidate replacements, keep the first known word.
    for idx, ch in enumerate(token):
        replacements = _SPELL_SUBS.get(ch)
        if not replacements:
            continue
        for sub in replacements:
            candidate = token[:idx] + sub + token[idx + 1:]
            if _spell_dict_knows(candidate):
                return candidate

    # Structural rule: suspicious char at position 0 + rest is all lowercase
    # letters, e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld"
    lead = token[0]
    if lead in _SPELL_SUBS and len(token) >= 2:
        tail = token[1:]
        if tail.isalpha() and tail.islower():
            guess = _SPELL_SUBS[lead][0] + tail
            if not guess[0].isdigit():
                return guess

    return None
|
||
|
||
|
||
def _spell_fix_field(text: str) -> Tuple[str, bool]:
    """Apply OCR corrections to a text field. Returns (fixed_text, was_changed).

    Only runs when *text* contains at least one suspicious character;
    separators and unrecognized spans are preserved verbatim.
    """
    if not text or not any(ch in text for ch in _SPELL_SUSPICIOUS):
        return text, False
    # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
    fixed = _re.sub(r'(?<!\w)\|(?=[.,])', '1', text)
    changed = fixed != text
    # Tokenize and fix word by word
    parts: List[str] = []
    pos = 0
    for m in _SPELL_TOKEN_RE.finditer(fixed):
        # BUG FIX: the token pattern only starts matching at a letter/pipe,
        # so any non-word prefix before the first match (e.g. "1. " or a
        # leading digit) was silently dropped from the rebuilt string.
        # Keep such gaps verbatim.
        if m.start() > pos:
            parts.append(fixed[pos:m.start()])
        token, sep = m.group(1), m.group(2)
        correction = _spell_fix_token(token)
        if correction:
            parts.append(correction)
            changed = True
        else:
            parts.append(token)
        parts.append(sep)
        pos = m.end()
    # Trailing text after the last match (cannot be reached by the tokenizer).
    if pos < len(fixed):
        parts.append(fixed[pos:])
    return ''.join(parts), changed
def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Rule-based OCR correction: spell-checker + structural heuristics.

    Deterministic — never translates, never touches IPA, never hallucinates.
    """
    started = time.time()
    changes: List[Dict] = []
    corrected_entries: List[Dict] = []
    for idx, entry in enumerate(entries):
        item = dict(entry)
        # Entries without the digit-pattern heuristic are passed through as-is.
        if _entry_needs_review(item):
            for field_name in ("english", "german"):
                current = (item.get(field_name) or "").strip()
                if not current:
                    continue
                fixed, did_change = _spell_fix_field(current)
                if did_change and fixed != current:
                    changes.append({
                        "row_index": item.get("row_index", idx),
                        "field": field_name,
                        "old": current,
                        "new": fixed,
                    })
                    item[field_name] = fixed
                    item["llm_corrected"] = True
        corrected_entries.append(item)
    return {
        "entries_original": entries,
        "entries_corrected": corrected_entries,
        "changes": changes,
        "skipped_count": 0,
        "model_used": "spell-checker",
        "duration_ms": int((time.time() - started) * 1000),
    }
async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Async generator yielding SSE-compatible events for spell-checker review."""
    total = len(entries)
    # The spell checker is fast enough to run in a single pass, so the
    # stream is exactly three events: meta → one batch → complete.
    yield {
        "type": "meta",
        "total_entries": total,
        "to_review": total,
        "skipped": 0,
        "model": "spell-checker",
        "batch_size": batch_size,
    }
    result = spell_review_entries_sync(entries)
    found = result["changes"]
    reviewed_rows = [entry.get("row_index", idx) for idx, entry in enumerate(entries)]
    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": reviewed_rows,
        "changes": found,
        "duration_ms": result["duration_ms"],
        "progress": {"current": total, "total": total},
    }
    yield {
        "type": "complete",
        "changes": found,
        "model_used": "spell-checker",
        "duration_ms": result["duration_ms"],
        "total_entries": total,
        "reviewed": total,
        "skipped": 0,
        "corrections_found": len(found),
        "entries_corrected": result["entries_corrected"],
    }
|
||
|
||
# ─── End Spell-Checker ────────────────────────────────────────────────────────
|
||
|
||
|
||
async def llm_review_entries(
    entries: List[Dict],
    model: str = None,
) -> Dict:
    """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
    if REVIEW_ENGINE == "spell":
        if _SPELL_AVAILABLE:
            return spell_review_entries_sync(entries)
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # Only entries matching the digit-pattern heuristic are sent to the LLM.
    reviewable = [(idx, entry) for idx, entry in enumerate(entries) if _entry_needs_review(entry)]

    if not reviewable:
        return {
            "entries_original": entries,
            "entries_corrected": [dict(entry) for entry in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [entry for _, entry in reviewable]
    table_lines = [
        {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
        for e in review_entries
    ]

    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, len(entries) - len(reviewable))
    logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))

    prompt = _build_llm_prompt(table_lines)

    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "think": False,  # qwen3: disable chain-of-thought (Ollama >=0.6)
        "options": {"temperature": 0.1, "num_predict": 8192},
    }

    started = time.time()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(f"{_OLLAMA_URL}/api/chat", json=payload)
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - started) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
    logger.debug("LLM review raw response (first 500): %.500s", content)

    corrected = _parse_llm_json_array(content)
    logger.info("LLM review: parsed %d corrected entries, applying diff...", len(corrected))
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Splice the corrected entries back into a copy of the full list.
    all_corrected = [dict(entry) for entry in entries]
    for batch_idx, (orig_idx, _) in enumerate(reviewable):
        if batch_idx < len(corrected_entries):
            all_corrected[orig_idx] = corrected_entries[batch_idx]

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": len(entries) - len(reviewable),
        "model_used": model,
        "duration_ms": duration_ms,
    }
async def llm_review_entries_streaming(
    entries: List[Dict],
    model: str = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.

    Phase 0 (always): Run _fix_character_confusion and emit any changes so they are
    visible in the UI — this is the only place the fix now runs (removed from Step 1
    of build_vocab_pipeline_streaming).

    Args:
        entries: Vocabulary rows as dicts; fields read here are "english",
            "german", "example" and "row_index". Mutated in place by Phase 0.
        model: LLM model name; defaults to OLLAMA_REVIEW_MODEL (LLM path only).
        batch_size: Entries per LLM request (LLM path only).

    Yields:
        Event dicts of type "batch", "meta" and finally "complete".
    """
    # --- Phase 0: Character confusion fix (| → I, 1 → I, 8 → B, etc.) ---
    # Snapshot the relevant fields first so the diff below can be computed
    # against the pre-fix values.
    _CONF_FIELDS = ('english', 'german', 'example')
    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
    _fix_character_confusion(entries)  # modifies in-place, returns same list
    char_changes = [
        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
        for i in range(len(entries))
        for f in _CONF_FIELDS
        if originals[i][f] != entries[i].get(f, '')
    ]

    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        # Inject char_changes as a batch right after the meta event from the spell checker
        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
            if not _meta_sent and event.get('type') == 'meta' and char_changes:
                _meta_sent = True
                yield {
                    'type': 'batch',
                    'changes': char_changes,
                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
                    'progress': {'current': 0, 'total': len(entries)},
                }
        return

    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    # LLM path: emit char_changes first (before meta) so they appear in the UI
    if char_changes:
        yield {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }

    model = model or OLLAMA_REVIEW_MODEL

    # Separate reviewable from skipped entries
    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    # meta event
    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]
    total_duration_ms = 0
    reviewed_count = 0

    # Process in batches
    for batch_start in range(0, total_to_review, batch_size):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        # Compact table representation keeps the prompt small.
        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d — sending %d entries to %s",
                    batch_start // batch_size, len(batch_entries), model)

        t0 = time.time()
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,  # qwen3: disable chain-of-thought
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
        batch_ms = int((time.time() - t0) * 1000)
        total_duration_ms += batch_ms

        logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content))
        logger.debug("LLM review streaming raw (first 500): %.500s", content)

        corrected = _parse_llm_json_array(content)
        logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected))
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Merge back
        for batch_idx, (orig_idx, _) in enumerate(batch_items):
            if batch_idx < len(batch_corrected):
                all_corrected[orig_idx] = batch_corrected[batch_idx]

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        # Yield batch result
        yield {
            "type": "batch",
            "batch_index": batch_start // batch_size,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    # Complete event
    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }
def _sanitize_for_json(text: str) -> str:
|
||
"""Remove or escape control characters that break JSON parsing.
|
||
|
||
Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid
|
||
JSON whitespace. Removes all other ASCII control characters (0x00-0x1f)
|
||
that are only valid inside JSON strings when properly escaped.
|
||
"""
|
||
# Replace literal control chars (except \\t \\n \\r) with a space
|
||
return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
|
||
|
||
|
||
def _parse_llm_json_array(text: str) -> List[Dict]:
    """Extract JSON array from LLM response (handles markdown fences and qwen3 think-tags)."""
    # qwen3 sometimes emits <think>...</think> even when think=False
    cleaned = _re.sub(r'<think>.*?</think>', '', text, flags=_re.DOTALL)
    # Drop markdown code fences around the payload
    cleaned = _re.sub(r'```json\s*', '', cleaned)
    cleaned = _re.sub(r'```\s*', '', cleaned)
    # Strip raw control characters that would make json.loads fail
    cleaned = _sanitize_for_json(cleaned)
    # Greedy: first '[' through last ']'
    match = _re.search(r'\[.*\]', cleaned, _re.DOTALL)
    if match is None:
        logger.warning("LLM review: no JSON array found in response (%.200s)", cleaned[:200])
        return []
    snippet = match.group()
    try:
        return _json.loads(snippet)
    except (ValueError, _json.JSONDecodeError) as e:
        logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, snippet[:200])
    return []