_CHAR_CONFUSION_RULES: standalone "1" → "I" now skips "1." and "1," Cross-language fallback rule: same lookahead (?![\d.,]) added Fixes: "cross = 1. Kreuz" being converted to "cross = I. Kreuz" in Step 1 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
6050 lines
225 KiB
Python
6050 lines
225 KiB
Python
"""
|
||
CV-based Document Reconstruction Pipeline for Vocabulary Extraction.
|
||
|
||
Uses classical Computer Vision techniques for high-quality OCR:
|
||
- High-resolution PDF rendering (zoom 3.0, i.e. 216 DPI from the 72 DPI PDF base)
|
||
- Deskew (rotation correction via Hough Lines)
|
||
- Dewarp (vertical shear correction applied after deskew)
|
||
- Dual image preparation (binarized for OCR, CLAHE for layout)
|
||
- Projection-profile layout analysis (column/row detection)
|
||
- Multi-pass Tesseract OCR with region-specific PSM settings
|
||
- Y-coordinate line alignment for vocabulary matching
|
||
- Optional LLM post-correction for low-confidence regions
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import io
|
||
import logging
|
||
import time
|
||
from dataclasses import dataclass, field
|
||
from typing import Any, Dict, Generator, List, Optional, Tuple
|
||
|
||
import numpy as np
|
||
|
||
logger = logging.getLogger(__name__)

# --- Availability Guards ---
# The heavy optional dependencies are probed once at import time. Each probe
# leaves a module-level name (set to None when missing) plus a boolean flag,
# so functions in this module can test the flag instead of catching
# ImportError themselves.

try:
    import cv2
except ImportError:
    cv2 = None
    CV2_AVAILABLE = False
    logger.warning("OpenCV not available — CV pipeline disabled")
else:
    CV2_AVAILABLE = True

try:
    import pytesseract
    from PIL import Image
except ImportError:
    # Either package missing disables both names: they are only used together.
    pytesseract = None
    Image = None
    TESSERACT_AVAILABLE = False
    logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
else:
    TESSERACT_AVAILABLE = True

# The full pipeline needs image processing AND OCR.
CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
|
||
|
||
# --- IPA Dictionary ---
|
||
|
||
import json
|
||
import os
|
||
import re
|
||
|
||
# IPA_AVAILABLE becomes True as soon as at least one of the two sources
# below (eng_to_ipa package, bundled Britfone JSON) is usable.
IPA_AVAILABLE = False
_ipa_convert_american = None
_britfone_dict: Dict[str, str] = {}

# Source 1: optional eng_to_ipa package (American English).
try:
    import eng_to_ipa as _eng_to_ipa
except ImportError:
    logger.info("eng_to_ipa not installed — American IPA disabled")
else:
    _ipa_convert_american = _eng_to_ipa.convert
    IPA_AVAILABLE = True
    logger.info("eng_to_ipa available — American IPA lookup enabled")

# Source 2: Britfone dictionary (MIT license, ~15k British English IPA entries),
# expected next to this module under data/britfone_ipa.json.
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
if not os.path.exists(_britfone_path):
    logger.info("Britfone not found — British IPA disabled")
else:
    # Best-effort load: a broken file must not prevent the module from importing.
    try:
        with open(_britfone_path, 'r', encoding='utf-8') as f:
            _britfone_dict = json.load(f)
        IPA_AVAILABLE = True
        logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
    except Exception as e:
        logger.warning(f"Failed to load Britfone: {e}")
|
||
|
||
# --- Language Detection Constants ---
|
||
|
||
# High-frequency German function words. Umlauts are spelled in ASCII
# transliteration here ('fuer', 'ueber').
GERMAN_FUNCTION_WORDS = {
    # articles / demonstratives
    'der', 'die', 'das', 'den', 'dem', 'ein', 'eine', 'dieser', 'diese', 'diesem',
    # pronouns
    'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'sich',
    # prepositions
    'von', 'zu', 'mit', 'auf', 'fuer', 'nach', 'bei', 'aus', 'ueber',
    # conjunctions / particles
    'und', 'oder', 'aber', 'wenn', 'wie', 'als', 'nicht', 'auch', 'noch', 'nur',
    # verb forms
    'ist', 'wird', 'hat', 'kann', 'haben', 'sein', 'werden', 'war', 'sind',
    'muss', 'soll',
}

# High-frequency English function words.
ENGLISH_FUNCTION_WORDS = {
    # articles / determiners
    'the', 'a', 'an', 'this', 'that', 'which',
    # pronouns / possessives
    'it', 'they', 'you', 'he', 'she', 'we',
    'my', 'your', 'his', 'her', 'its', 'our', 'their',
    # prepositions
    'to', 'of', 'in', 'for', 'on', 'with', 'as', 'at', 'by', 'from',
    # conjunctions / negation
    'and', 'or', 'but', 'not',
    # verb forms / modals
    'is', 'are', 'was', 'were', 'be', 'have', 'has', 'had', 'do', 'does', 'did',
    'will', 'would', 'can', 'could', 'should', 'may', 'might',
}
|
||
|
||
|
||
# --- Data Classes ---
|
||
|
||
@dataclass
class PageRegion:
    """Rectangular area of the page together with its assigned layout role."""

    # Role label. Values listed by the original author: 'column_en',
    # 'column_de', 'column_example', 'page_ref', 'column_marker',
    # 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom'.
    type: str
    # Rectangle as origin (x, y) plus extent (width, height).
    # presumably in image pixels — confirm against the producing code.
    x: int
    y: int
    width: int
    height: int
    # Confidence of the role assignment, 0.0-1.0.
    classification_confidence: float = 1.0
    # How the role was assigned: 'content', 'position_enhanced' or
    # 'position_fallback'.
    classification_method: str = ""
|
||
|
||
|
||
@dataclass
class ColumnGeometry:
    """Geometrically detected column, before any type classification."""

    # 0-based position, counted left to right.
    index: int
    x: int
    y: int
    width: int
    height: int
    word_count: int
    # Word dicts from Tesseract (text, conf, left, top, ...).
    words: List[Dict]
    # width / content_width (0.0-1.0).
    width_ratio: float
    # True if created by a _detect_sub_columns() split.
    is_sub_column: bool = False
|
||
|
||
|
||
@dataclass
class RowGeometry:
    """Geometrically detected row, including header/footer classification."""

    # 0-based position, counted top to bottom.
    index: int
    # Absolute left edge (= content left_x).
    x: int
    # Absolute y start.
    y: int
    # Content width.
    width: int
    # Row height in px.
    height: int
    word_count: int
    words: List[Dict]
    # One of 'content' | 'header' | 'footer'.
    row_type: str = 'content'
    # Gap in px above this row.
    gap_before: int = 0
|
||
|
||
|
||
@dataclass
class VocabRow:
    """One vocabulary entry put together from the OCR output of several columns."""

    # Text fields; an empty string means nothing was captured for that field.
    english: str = ""
    german: str = ""
    example: str = ""
    source_page: str = ""
    # Confidence for this entry; scale is not established in this block.
    confidence: float = 0.0
    # presumably the row's pixel y on the page — confirm against the assembler.
    y_position: int = 0
|
||
|
||
|
||
@dataclass
class PipelineResult:
    """Everything a run of the CV pipeline reports back to its caller."""

    # Extracted entries; each instance gets its own fresh list.
    vocabulary: List[Dict[str, Any]] = field(default_factory=list)
    word_count: int = 0
    columns_detected: int = 0
    duration_seconds: float = 0.0
    # presumably stage name -> elapsed seconds; confirm against the pipeline driver.
    stages: Dict[str, float] = field(default_factory=dict)
    # None unless the run failed.
    error: Optional[str] = None
    image_width: int = 0
    image_height: int = 0
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 1: High-Resolution PDF Rendering
|
||
# =============================================================================
|
||
|
||
def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
    """Render a PDF page to a high-resolution numpy array (BGR).

    Args:
        pdf_data: Raw PDF bytes.
        page_number: 0-indexed page number.
        zoom: Zoom factor applied to both axes. PDF user space is 72 units
            per inch, so the default 3.0 renders at 216 DPI.

    Returns:
        numpy array in BGR format.

    Raises:
        ValueError: If ``page_number`` lies beyond the last page.
    """
    import fitz  # PyMuPDF

    pdf_doc = fitz.open(stream=pdf_data, filetype="pdf")
    # try/finally so the document is released on every exit path, including
    # the page-range error and any failure inside rendering/conversion.
    try:
        if page_number >= pdf_doc.page_count:
            raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)")

        page = pdf_doc[page_number]
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

        # View the pixmap's sample buffer as (rows, cols, channels).
        img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        if pix.n == 4:  # RGBA
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
        elif pix.n == 3:  # RGB
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
        else:  # Grayscale
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)

        return img_bgr
    finally:
        pdf_doc.close()
|
||
|
||
|
||
def render_image_high_res(image_data: bytes) -> np.ndarray:
    """Decode encoded image bytes (PNG/JPEG) into a BGR numpy array.

    Args:
        image_data: Raw image bytes.

    Returns:
        numpy array in BGR format.

    Raises:
        ValueError: If OpenCV cannot decode the bytes.
    """
    decoded = cv2.imdecode(np.frombuffer(image_data, dtype=np.uint8), cv2.IMREAD_COLOR)
    # imdecode signals failure by returning None instead of raising.
    if decoded is None:
        raise ValueError("Could not decode image data")
    return decoded
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 2: Deskew (Rotation Correction)
|
||
# =============================================================================
|
||
|
||
def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
    """Straighten a slightly rotated page using probabilistic Hough lines.

    Long near-horizontal line segments are detected on an Otsu-binarized
    copy; the median of their angles is taken as the page rotation and
    undone with an affine rotation about the image centre.

    Args:
        img: BGR image.

    Returns:
        Tuple of (corrected image, detected angle in degrees). The input
        image and 0.0 are returned when fewer than 3 segments are found,
        none is near-horizontal, or the rotation is below 0.1°.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Inverted Otsu threshold: ink becomes white for line detection.
    _, ink = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Only segments spanning at least a quarter of the page width count.
    segments = cv2.HoughLinesP(ink, 1, np.pi / 180, threshold=100,
                               minLineLength=img.shape[1] // 4, maxLineGap=20)
    if segments is None or len(segments) < 3:
        return img, 0.0

    # Angle of every segment, then keep only the near-horizontal ones.
    tilts = [
        np.degrees(np.arctan2(y2 - y1, x2 - x1))
        for x1, y1, x2, y2 in (seg[0] for seg in segments)
    ]
    near_horizontal = [tilt for tilt in tilts if abs(tilt) < 15]
    if not near_horizontal:
        return img, 0.0

    rotation = float(np.median(near_horizontal))

    # Limit correction to ±5°
    if abs(rotation) > 5.0:
        rotation = 5.0 * np.sign(rotation)

    # Below 0.1° a resampling pass is not worth it.
    if abs(rotation) < 0.1:
        return img, 0.0

    rows, cols = img.shape[:2]
    pivot = (cols // 2, rows // 2)
    matrix = cv2.getRotationMatrix2D(pivot, rotation, 1.0)
    straightened = cv2.warpAffine(img, matrix, (cols, rows),
                                  flags=cv2.INTER_LINEAR,
                                  borderMode=cv2.BORDER_REPLICATE)

    logger.info(f"Deskew: corrected {rotation:.2f}° rotation")
    return straightened, rotation
|
||
|
||
|
||
def deskew_image_by_word_alignment(
    image_data: bytes,
    lang: str = "eng+deu",
    downscale_factor: float = 0.5,
) -> Tuple[bytes, float]:
    """Correct rotation by fitting a line through left-most word starts per text line.

    Runs a quick Tesseract pass on a downscaled copy to find word positions,
    takes the left-most word of every text line, keeps the points that sit
    near the median left edge, fits a straight line through them and rotates
    the full-resolution image by the fitted angle.

    The function is best-effort: whenever a step cannot be carried out
    (missing dependencies, undecodable image, Tesseract failure, too few
    lines, encoding failure) the input bytes are returned unchanged with an
    angle of 0.0.

    Args:
        image_data: Raw image bytes (PNG/JPEG).
        lang: Tesseract language string for the quick pass.
        downscale_factor: Shrink factor for the quick Tesseract pass
            (0.5 = 50%). Must be positive; otherwise the image is returned
            unchanged.

    Returns:
        Tuple of (rotated image as PNG bytes, detected angle in degrees).
    """
    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
        return image_data, 0.0

    # A non-positive (or NaN) factor would break both the resize and the
    # 1/factor rescale below, so treat it like any other "cannot deskew" case.
    if not downscale_factor > 0:
        logger.warning(f"deskew_by_word_alignment: invalid downscale_factor {downscale_factor!r}, skipping")
        return image_data, 0.0

    # 1. Decode image
    img = cv2.imdecode(np.frombuffer(image_data, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return image_data, 0.0

    orig_h, orig_w = img.shape[:2]

    # 2. Downscale for fast Tesseract pass (never below 1 px per side, a
    #    zero-sized target makes cv2.resize raise)
    small_w = max(1, int(orig_w * downscale_factor))
    small_h = max(1, int(orig_h * downscale_factor))
    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)

    # 3. Quick Tesseract — word-level positions
    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
    try:
        data = pytesseract.image_to_data(
            pil_small, lang=lang, config="--psm 6 --oem 3",
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
        return image_data, 0.0

    # 4. Per text-line, find the left-most word start
    #    Group by (block_num, par_num, line_num)
    from collections import defaultdict
    line_groups: Dict[tuple, list] = defaultdict(list)
    for i, raw_text in enumerate(data["text"]):
        text = (raw_text or "").strip()
        if not text:
            continue
        # Depending on the Tesseract/pytesseract versions the confidence can
        # arrive as int, float or a float-formatted string ("95.37"); a plain
        # int() would raise on the latter.
        try:
            conf = int(float(data["conf"][i]))
        except (TypeError, ValueError):
            continue
        if conf < 20:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        line_groups[key].append(i)

    if len(line_groups) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
        return image_data, 0.0

    # For each line, pick the word with smallest 'left' → (left_x, center_y),
    # scaled back to original resolution.
    scale = 1.0 / downscale_factor
    points = []  # list of (x, y) in original-image coords
    for indices in line_groups.values():
        best_idx = min(indices, key=lambda idx: data["left"][idx])
        lx = data["left"][best_idx] * scale
        top = data["top"][best_idx] * scale
        word_h = data["height"][best_idx] * scale
        points.append((lx, top + word_h / 2.0))

    # 5. Find dominant left-edge column + compute angle
    xs = np.array([p[0] for p in points])
    ys = np.array([p[1] for p in points])
    median_x = float(np.median(xs))
    tolerance = orig_w * 0.03  # 3% of image width

    # Drop indented lines / other columns: keep points near the median edge.
    mask = np.abs(xs - median_x) <= tolerance
    filtered_xs = xs[mask]
    filtered_ys = ys[mask]

    if len(filtered_xs) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
        return image_data, 0.0

    # polyfit: x = a*y + b  →  a = dx/dy  →  angle = arctan(a)
    slope = np.polyfit(filtered_ys, filtered_xs, 1)[0]  # dx/dy
    angle_deg = float(np.degrees(np.arctan(slope)))

    # Clamp to ±5°
    angle_deg = max(-5.0, min(5.0, angle_deg))

    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points "
                f"(total lines: {len(line_groups)})")

    if abs(angle_deg) < 0.05:
        return image_data, 0.0

    # 6. Rotate full-res image
    # NOTE(review): deskew_image() passes the dy/dx angle of horizontal lines
    # to getRotationMatrix2D, while this passes arctan(dx/dy) of a vertical
    # edge unchanged. For one and the same page rotation those two angles
    # have opposite signs — confirm the rotation direction on a known-skewed
    # sample before relying on it.
    center = (orig_w // 2, orig_h // 2)
    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    # Encode back to PNG
    success, png_buf = cv2.imencode(".png", rotated)
    if not success:
        logger.warning("deskew_by_word_alignment: PNG encoding failed")
        return image_data, 0.0

    return png_buf.tobytes(), angle_deg
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 3: Dewarp (Vertical Shear Correction)
|
||
# =============================================================================
|
||
|
||
def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
    """Detect the vertical shear angle of the page.

    After deskew (horizontal lines aligned), vertical features like column
    edges may still be tilted. This measures that tilt by locating the
    strongest vertical edge in the left 40% of each of 20 horizontal strips
    and fitting a straight line x = slope * y + intercept through those
    positions.

    Args:
        img: BGR image.

    Returns:
        Dict with keys: method, shear_degrees, confidence. Angle and
        confidence stay 0.0 when the image is too narrow or too few strips
        yield a usable edge.
    """
    import math

    h, w = img.shape[:2]
    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}

    # Only the left 40% of the image is searched; on degenerate widths the
    # window would be empty and max() below would raise.
    search_w = int(w * 0.4)
    if search_w < 1:
        return result

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Horizontal-gradient Sobel responds to vertical edges.
    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    # Gradient magnitudes can exceed 255. Saturate before the uint8 cast;
    # a bare astype() wraps around and turns the strongest edges into
    # arbitrary small values.
    abs_sobel = np.clip(np.abs(sobel_x), 0, 255).astype(np.uint8)

    # Binarize with Otsu
    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    num_strips = 20
    strip_h = h // num_strips

    # Odd smoothing width of roughly 1% of the image width (loop-invariant).
    kernel_size = max(3, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1

    edge_positions = []  # (y_center, x_position)
    for i in range(num_strips):
        y_start = i * strip_h
        y_end = min((i + 1) * strip_h, h)

        # Column sums of the strip, restricted to the search window.
        left_proj = np.sum(binary[y_start:y_end, :search_w], axis=0).astype(np.float64)
        if left_proj.max() == 0:
            continue  # no edge pixels in this strip

        # Smooth and find peak
        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
        x_pos = float(np.argmax(smoothed))
        y_center = (y_start + y_end) / 2.0
        edge_positions.append((y_center, x_pos))

    if len(edge_positions) < 8:
        return result

    ys = np.array([p[0] for p in edge_positions])
    xs = np.array([p[1] for p in edge_positions])

    # Remove outliers (> 2 std from median); the std is floored at 1 px so a
    # perfectly straight edge does not discard every point.
    median_x = np.median(xs)
    std_x = max(np.std(xs), 1.0)
    mask = np.abs(xs - median_x) < 2 * std_x
    ys = ys[mask]
    xs = xs[mask]

    if len(ys) < 6:
        return result

    # Fit straight line: x = slope * y + intercept.
    # The slope tells us the tilt of the vertical edge.
    straight_coeffs = np.polyfit(ys, xs, 1)
    slope = straight_coeffs[0]  # dx/dy in pixels
    residuals = xs - np.polyval(straight_coeffs, ys)
    rmse = float(np.sqrt(np.mean(residuals ** 2)))

    # Convert slope to angle: arctan(dx/dy) in degrees
    shear_degrees = math.degrees(math.atan(slope))

    # More supporting strips raise confidence (saturating at 15); fit error
    # lowers it, but by at most a factor of 0.5.
    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(float(confidence), 2)

    return result
|
||
|
||
|
||
def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
    """Estimate shear by sweeping angles and scoring the row-projection variance.

    The image is Otsu-binarized (ink = white), halved in size, and warped
    with a shear matrix for every angle from -3° to +3° in 0.25° steps. For
    each warp the variance of the per-row pixel sums is recorded; the angle
    with the highest variance wins (the first one on ties).

    Intended for pages with clear horizontal banding (vocabulary tables,
    prose) and as a complement to _detect_shear_angle(), which needs strong
    vertical edges.

    NOTE(review): the warp matrix only moves pixels along x, so a row's sum
    can change only through pixels pushed past the image border — confirm
    that this sweep really discriminates between angles as intended.

    Args:
        img: BGR image.

    Returns:
        Dict with keys: method, shear_degrees, confidence.
    """
    import math

    result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}

    full_h, full_w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Half resolution keeps the 25 warps cheap.
    small = cv2.resize(binary, (full_w // 2, full_h // 2), interpolation=cv2.INTER_AREA)
    sh, sw = small.shape

    def _row_variance(angle_deg: float) -> float:
        # The zero angle needs no warp at all.
        if abs(angle_deg) < 0.01:
            warped = small
        else:
            shear_tan = math.tan(math.radians(angle_deg))
            M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
            warped = cv2.warpAffine(small, M, (sw, sh),
                                    flags=cv2.INTER_NEAREST,
                                    borderMode=cv2.BORDER_CONSTANT)
        return float(np.var(np.sum(warped, axis=1).astype(float)))

    # Angle sweep: ±3° in 0.25° steps (25 values)
    variances: List[Tuple[float, float]] = [
        (step * 0.25, _row_variance(step * 0.25)) for step in range(-12, 13)
    ]

    # max() returns the first maximal element, i.e. the earliest angle on ties.
    best_angle, best_variance = max(variances, key=lambda pair: pair[1])

    # Confidence: how much sharper is the best angle than the sweep average?
    all_mean = sum(v for _, v in variances) / len(variances)
    confidence = 0.0
    if all_mean > 0 and best_variance > all_mean:
        confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)

    result["shear_degrees"] = round(best_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
|
||
|
||
|
||
def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
    """Estimate shear from near-horizontal Hough line segments.

    Canny edges are fed to a probabilistic Hough transform (half-degree
    angular resolution, segments at least 15% of the image width). Segments
    within ±5° of horizontal vote with their length; the length-weighted
    median of their angles, with inverted sign, is reported as the shear.

    The original author's rationale: printed table borders should be exactly
    horizontal after deskew, so any residual tilt reflects the vertical
    shear with opposite sign.

    Args:
        img: BGR image.

    Returns:
        Dict with keys: method, shear_degrees, confidence. Angle and
        confidence stay 0.0 when fewer than 3 usable segments are found.
    """
    result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}

    _, img_w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    segments = cv2.HoughLinesP(
        edges, rho=1, theta=np.pi / 360,
        threshold=int(img_w * 0.08),
        minLineLength=int(img_w * 0.15),
        maxLineGap=20,
    )
    if segments is None or len(segments) < 3:
        return result

    # Collect (angle, length) of every segment within ±5° of horizontal.
    horizontal_angles: List[Tuple[float, float]] = []
    for seg in segments:
        x1, y1, x2, y2 = seg[0]
        if x1 == x2:
            continue  # exactly vertical
        tilt = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
        if abs(tilt) > 5.0:
            continue
        seg_len = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
        horizontal_angles.append((tilt, seg_len))

    if len(horizontal_angles) < 3:
        return result

    # Weighted median: sort by angle, then take the first angle at which the
    # cumulative length reaches half of the total length.
    angle_values = np.array([a for a, _ in horizontal_angles])
    length_weights = np.array([l for _, l in horizontal_angles])
    order = np.argsort(angle_values)
    ranked_angles = angle_values[order]
    running = np.cumsum(length_weights[order])
    half_pos = int(np.searchsorted(running, running[-1] / 2.0))
    median_angle = float(ranked_angles[min(half_pos, len(ranked_angles) - 1)])

    # Share of segments within 1° of the median, capped at 0.85 overall.
    agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0)
    confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85

    # Sign inversion: reported shear is the negated horizontal tilt.
    result["shear_degrees"] = round(-median_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
|
||
|
||
|
||
def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Shear an image horizontally about its vertical centre.

    Every row is shifted along x in proportion to its distance from the
    row at h/2, which stays in place; y coordinates are left untouched.
    The affine matrix handed to warpAffine is

        [1  tan(shear)  -h/2 * tan(shear)]
        [0  1            0               ]

    Args:
        img: BGR image.
        shear_degrees: Shear angle in degrees; its sign selects the shift
            direction.

    Returns:
        Sheared image of the same size; uncovered border pixels replicate
        the nearest edge pixel.
    """
    import math

    rows, cols = img.shape[:2]
    k = math.tan(math.radians(shear_degrees))

    # The translation term cancels the shift at y = h/2.
    matrix = np.float32([
        [1, k, -rows / 2.0 * k],
        [0, 1, 0],
    ])

    return cv2.warpAffine(img, matrix, (cols, rows),
                          flags=cv2.INTER_LINEAR,
                          borderMode=cv2.BORDER_REPLICATE)
|
||
|
||
|
||
def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
|
||
"""Combine multiple shear detections into a single weighted estimate.
|
||
|
||
Only methods with confidence >= 0.3 are considered.
|
||
Results are outlier-filtered: if any accepted result differs by more than
|
||
1° from the weighted mean, it is discarded.
|
||
|
||
Returns:
|
||
(shear_degrees, ensemble_confidence, methods_used_str)
|
||
"""
|
||
accepted = [(d["shear_degrees"], d["confidence"], d["method"])
|
||
for d in detections if d["confidence"] >= 0.3]
|
||
|
||
if not accepted:
|
||
return 0.0, 0.0, "none"
|
||
|
||
if len(accepted) == 1:
|
||
deg, conf, method = accepted[0]
|
||
return deg, conf, method
|
||
|
||
# First pass: weighted mean
|
||
total_w = sum(c for _, c, _ in accepted)
|
||
w_mean = sum(d * c for d, c, _ in accepted) / total_w
|
||
|
||
# Outlier filter: keep results within 1° of weighted mean
|
||
filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
|
||
if not filtered:
|
||
filtered = accepted # fallback: keep all
|
||
|
||
# Second pass: weighted mean on filtered results
|
||
total_w2 = sum(c for _, c, _ in filtered)
|
||
final_deg = sum(d * c for d, c, _ in filtered) / total_w2
|
||
|
||
# Ensemble confidence: average of individual confidences, boosted when
|
||
# methods agree (all within 0.5° of each other)
|
||
avg_conf = total_w2 / len(filtered)
|
||
spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
|
||
agreement_bonus = 0.15 if spread < 0.5 else 0.0
|
||
ensemble_conf = min(1.0, avg_conf + agreement_bonus)
|
||
|
||
methods_str = "+".join(m for _, _, m in filtered)
|
||
return round(final_deg, 3), round(ensemble_conf, 2), methods_str
|
||
|
||
|
||
def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Detect and remove residual vertical shear from a deskewed page.

    Detectors, run in this order:
      A. _detect_shear_angle()         — vertical edge profile
      B. _detect_shear_by_projection() — row-projection variance sweep
      C. _detect_shear_by_hough()      — Hough lines on table borders

    With use_ensemble=True all three run and _ensemble_shear() combines
    them; otherwise only A runs and its result is used directly. The image
    is corrected only when the resulting shear is at least 0.05° and its
    confidence at least 0.3.

    Args:
        img: BGR image (already deskewed).
        use_ensemble: If False, fall back to single-method behaviour (method A only).

    Returns:
        Tuple of (corrected_image, dewarp_info).
        dewarp_info keys: method, shear_degrees, confidence, detections.
        When no correction is applied the input image is returned together
        with method "none", zeros and an empty detections list.
    """
    skipped = {
        "method": "none",
        "shear_degrees": 0.0,
        "confidence": 0.0,
        "detections": [],
    }
    if not CV2_AVAILABLE:
        return img, skipped

    started = time.time()

    # Method A always runs; B and C join only in ensemble mode.
    detections = [_detect_shear_angle(img)]
    if use_ensemble:
        detections.append(_detect_shear_by_projection(img))
        detections.append(_detect_shear_by_hough(img))
        shear_deg, confidence, method = _ensemble_shear(detections)
    else:
        only = detections[0]
        shear_deg = only["shear_degrees"]
        confidence = only["confidence"]
        method = only["method"]

    duration = time.time() - started

    # The log line always has three detector slots; missing ones print as 0.
    slot_values = []
    for slot in range(3):
        if slot < len(detections):
            slot_values.append(detections[slot]["shear_degrees"])
            slot_values.append(detections[slot]["confidence"])
        else:
            slot_values.extend((0.0, 0.0))

    logger.info(
        "dewarp: ensemble shear=%.3f° conf=%.2f method=%s (%.2fs) | "
        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f",
        shear_deg, confidence, method, duration,
        *slot_values,
    )

    # Skip negligible or low-confidence shear.
    if abs(shear_deg) < 0.05 or confidence < 0.3:
        return img, skipped

    # Apply correction (negate the detected shear to straighten)
    corrected = _apply_shear(img, -shear_deg)

    # Report a trimmed copy of each detection (three known keys only).
    report_keys = ("method", "shear_degrees", "confidence")
    info = {
        "method": method,
        "shear_degrees": shear_deg,
        "confidence": confidence,
        "detections": [{key: det[key] for key in report_keys} for det in detections],
    }
    return corrected, info
|
||
|
||
|
||
def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Undo a shear whose angle is supplied by the caller.

    Args:
        img: BGR image (deskewed, before dewarp).
        shear_degrees: Shear angle in degrees to correct. It is negated
            before being handed to _apply_shear().

    Returns:
        Corrected image, or the very same input object when the angle is
        below 0.001° in magnitude.
    """
    # Angles this small are treated as "no shear": skip the resampling pass.
    return img if abs(shear_degrees) < 0.001 else _apply_shear(img, -shear_degrees)
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 4: Dual Image Preparation
|
||
# =============================================================================
|
||
|
||
def create_ocr_image(img: np.ndarray) -> np.ndarray:
    """Produce the binarized image that is handed to Tesseract.

    Steps: Grayscale → Background normalization → Adaptive threshold → Denoise.

    Args:
        img: BGR image.

    Returns:
        Single-channel image containing only the values 0 and 255.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Dividing by a heavily blurred copy flattens uneven illumination.
    background = cv2.GaussianBlur(gray, (51, 51), 0)
    flattened = cv2.divide(gray, background, scale=255)

    # Gaussian-weighted local threshold: 31 px neighbourhood, offset 10.
    thresholded = cv2.adaptiveThreshold(
        flattened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )

    # A 3x3 median removes isolated speckles.
    return cv2.medianBlur(thresholded, 3)
|
||
|
||
|
||
def create_layout_image(img: np.ndarray) -> np.ndarray:
    """Produce the contrast-enhanced grayscale image used for layout analysis.

    Args:
        img: BGR image.

    Returns:
        Grayscale image after CLAHE (clip limit 2.0, 8x8 tile grid).
    """
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equalizer.apply(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 5: Layout Analysis (Projection Profiles)
|
||
# =============================================================================
|
||
|
||
def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
|
||
"""Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask."""
|
||
out = mask.copy()
|
||
n = len(out)
|
||
i = 0
|
||
while i < n:
|
||
if out[i]:
|
||
start = i
|
||
while i < n and out[i]:
|
||
i += 1
|
||
if (i - start) < min_width:
|
||
out[start:i] = False
|
||
else:
|
||
i += 1
|
||
return out
|
||
|
||
|
||
def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
    """Locate the bounding box of the text content, ignoring page margins.

    Rows and columns count as "inked" when more than 0.5% of their pixels
    (measured as sum / (length * 255)) are set. Inked runs narrower than
    1% of the respective image dimension (min 5 px) are discarded first, so
    thin scan artefacts at the page edges do not stretch the box.

    Args:
        inv: Inverted binary image (ink = 255).

    Returns:
        Tuple of (left_x, right_x, top_y, bottom_y). Top/bottom carry a 5 px
        margin and left/right a 2 px margin, clamped to the image. An axis
        with no inked run falls back to the full image extent.
    """
    height, width = inv.shape[:2]
    ink_fraction = 0.005

    # --- Rows: top / bottom ---
    row_density = np.sum(inv, axis=1).astype(float) / (width * 255)
    inked_rows = np.flatnonzero(
        _filter_narrow_runs(row_density > ink_fraction, max(5, height // 100))
    )
    if inked_rows.size:
        # Surviving runs are at least 5 long, so first/last index give the
        # same answer as scanning from each end.
        top_y = max(0, int(inked_rows[0]) - 5)
        bottom_y = min(height, int(inked_rows[-1]) + 5)
    else:
        top_y, bottom_y = 0, height

    # --- Columns: left / right, measured inside the vertical band only ---
    band_h = bottom_y - top_y
    col_sum = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
    col_density = col_sum / (band_h * 255) if band_h > 0 else col_sum
    inked_cols = np.flatnonzero(
        _filter_narrow_runs(col_density > ink_fraction, max(5, width // 100))
    )
    if inked_cols.size:
        left_x = max(0, int(inked_cols[0]) - 2)
        right_x = min(width, int(inked_cols[-1]) + 2)
    else:
        left_x, right_x = 0, width

    return left_x, right_x, top_y, bottom_y
|
||
|
||
|
||
def _pick_separator_pair(
    candidates: List[Tuple[int, int, int, int, float]],
    content_w: int,
    min_col_ratio: float,
    min_gap_ratio: float = 0.0,
    width_bonus: float = 0.0,
) -> Optional[Tuple[Tuple[int, int, int, int, float], Tuple[int, int, int, int, float]]]:
    """Pick the two valleys that split the content into the most even thirds.

    Args:
        candidates: Valley tuples ``(start, end, center, width, depth)`` in
            content-relative coordinates, sorted by center.
        content_w: Width of the content area in pixels.
        min_col_ratio: Each of the three resulting columns must be at least
            this fraction of *content_w* wide.
        min_gap_ratio: The two separator centers must be at least this
            fraction of *content_w* apart.
        width_bonus: Factor applied to the summed valley widths and
            subtracted from the score, so wider valleys are preferred.

    Returns:
        The best ``(left_valley, right_valley)`` pair, or None when no pair
        satisfies the constraints. Lower score (more even columns) wins; the
        first pair found wins ties.
    """
    min_col_w = content_w * min_col_ratio
    min_gap = content_w * min_gap_ratio
    best_pair = None
    best_score = float('inf')
    for i, first in enumerate(candidates):
        for second in candidates[i + 1:]:
            c1, c2 = first[2], second[2]
            if (c2 - c1) < min_gap:
                continue
            widths = (c1, c2 - c1, content_w - c2)
            if min(widths) < min_col_w:
                continue
            score = (max(widths) - min(widths)) - (first[3] + second[3]) * width_bonus
            if score < best_score:
                best_score = score
                best_pair = (first, second)
    return best_pair


def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
    """Detect columns, header, and footer using projection profiles.

    Uses content-bounds detection to exclude page margins before searching
    for column separators within the actual text area.

    Args:
        layout_img: CLAHE-enhanced grayscale image (not read by this function).
        ocr_img: Binarized image for text density analysis.

    Returns:
        List of PageRegion objects describing detected regions: one, two or
        three column regions, plus whatever ``_add_header_footer`` appends.
    """
    h, w = ocr_img.shape[:2]

    # Invert: black text on white → white text on black for projection
    inv = cv2.bitwise_not(ocr_img)

    # --- Find actual content bounds (exclude page margins) ---
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")

    if content_w < w * 0.3 or content_h < h * 0.3:
        # Fallback if detection seems wrong: use the whole page
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    # --- Vertical projection within content area to find column separators ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    v_proj = np.sum(content_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj

    # Smooth the projection profile with an odd-sized box filter
    kernel_size = max(5, content_w // 50)
    if kernel_size % 2 == 0:
        kernel_size += 1
    v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Debug: log projection profile statistics
    p_mean = float(np.mean(v_proj_smooth))
    p_median = float(np.median(v_proj_smooth))
    p_min = float(np.min(v_proj_smooth))
    p_max = float(np.max(v_proj_smooth))
    logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
                f"mean={p_mean:.4f}, median={p_median:.4f}")

    # Strategy 1: threshold relative to median/mean (catches clear separators).
    # Strategy 2 (further below): lowest-density segments (catches subtle gaps).
    threshold = max(p_median * 0.3, p_mean * 0.2)
    logger.info(f"Layout: valley threshold={threshold:.4f}")

    in_valley = v_proj_smooth < threshold

    # Find contiguous valley regions as (start, end, center, width, depth).
    # A valley still open at the right edge is never closed and thus dropped.
    all_valleys = []
    start = None
    for x in range(len(v_proj_smooth)):
        if in_valley[x] and start is None:
            start = x
        elif not in_valley[x] and start is not None:
            valley_width = x - start
            valley_depth = float(np.min(v_proj_smooth[start:x]))
            # Valley must be at least 3px wide
            if valley_width >= 3:
                all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
            start = None

    logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — "
                f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")

    # Filter: valleys must be inside the content area (not at edges)
    inner_margin = int(content_w * 0.08)
    valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]

    # If the threshold approach found fewer than two valleys, try local minima
    if len(valleys) < 2:
        logger.info("Layout: trying local minima approach for column detection")
        # Divide content into 20 segments and rank them by mean density
        seg_count = 20
        seg_width = content_w // seg_count
        seg_scores = []
        for i in range(seg_count):
            sx = i * seg_width
            ex = min((i + 1) * seg_width, content_w)
            seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
            seg_scores.append((i, sx, ex, seg_mean))

        seg_scores.sort(key=lambda s: s[3])
        logger.info(f"Layout: segment scores (lowest 5): "
                    f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")

        # Candidates: non-edge segments significantly below the overall mean
        candidate_valleys = []
        for seg_idx, sx, ex, seg_mean in seg_scores:
            # Must not be at the edges
            if seg_idx <= 1 or seg_idx >= seg_count - 2:
                continue
            # Must be significantly lower than overall mean
            if seg_mean < p_mean * 0.6:
                center = (sx + ex) // 2
                candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))

        if len(candidate_valleys) >= 2:
            # Best pair: centers at least 20% apart, each column at least 12%
            candidate_valleys.sort(key=lambda v: v[2])
            best_pair = _pick_separator_pair(
                candidate_valleys, content_w, min_col_ratio=0.12, min_gap_ratio=0.2,
            )
            if best_pair:
                valleys = list(best_pair)
                logger.info(f"Layout: local minima found 2 valleys: "
                            f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    logger.info(f"Layout: final {len(valleys)} valleys: "
                f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    regions = []

    if len(valleys) >= 2:
        # 3-column layout detected
        valleys.sort(key=lambda v: v[2])

        # Default: the two leftmost valleys
        sep1_center = valleys[0][2]
        sep2_center = valleys[1][2]
        if len(valleys) > 2:
            # Pick the two valleys that best divide into 3 parts: each column
            # at least 15% of content width, with a bonus for wider valleys
            # (more likely true separators).
            best_pair = _pick_separator_pair(
                valleys, content_w, min_col_ratio=0.15, width_bonus=0.5,
            )
            if best_pair:
                sep1_center, sep2_center = best_pair[0][2], best_pair[1][2]

        # Convert from content-relative to absolute coordinates
        abs_sep1 = sep1_center + left_x
        abs_sep2 = sep2_center + left_x

        logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")

        # Outer columns extend to the image edges, not just the content bounds
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep1, y=top_y,
            width=abs_sep2 - abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_example', x=abs_sep2, y=top_y,
            width=w - abs_sep2, height=content_h
        ))

    elif len(valleys) == 1:
        # 2-column layout
        abs_sep = valleys[0][2] + left_x

        logger.info(f"Layout: 2 columns at separator x={abs_sep}")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep, y=top_y,
            width=w - abs_sep, height=content_h
        ))

    else:
        # No columns detected — run full-page OCR as single column
        logger.warning("Layout: no column separators found, using full page")
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=w, height=content_h
        ))

    # Add header/footer info (gap-based detection with fallback)
    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
    col_count = sum(1 for r in regions if r.type.startswith('column'))
    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

    return regions
|
||
|
||
|
||
# =============================================================================
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
# =============================================================================

# --- Phase A: Geometry Detection ---
|
||
|
||
def _detect_columns_by_clustering(
    word_dicts: List[Dict],
    left_edges: List[int],
    edge_word_indices: List[int],
    content_w: int,
    content_h: int,
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used when the primary gap-based algorithm finds fewer than 2 gaps.

    Args:
        word_dicts: Word boxes with content-relative ``left`` / ``top`` keys.
        left_edges: Left edge of each word considered for clustering.
        edge_word_indices: Index into *word_dicts* for each entry of
            *left_edges* (same length and order).
        content_w, content_h: Size of the content area in pixels.
        left_x, right_x, top_y, bottom_y: Absolute bounds of the content area.
        inv: Inverted page image; only passed through to the result.

    Returns:
        The tuple produced by ``_build_geometries_from_starts``, or None when
        there is nothing to cluster or fewer than three column-like clusters
        remain.
    """
    sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])
    if not sorted_pairs:
        # Without this guard the seed lookup below raises IndexError.
        logger.info("ColumnGeometry clustering fallback: no word edges to cluster")
        return None

    # --- Chain left edges into clusters: a new cluster starts whenever the
    # distance to the previous edge exceeds the tolerance ---
    tolerance = max(10, int(content_w * 0.01))
    clusters = []
    cluster_widxs = []
    cur_edges = [sorted_pairs[0][0]]
    cur_widxs = [sorted_pairs[0][1]]
    for edge, widx in sorted_pairs[1:]:
        if edge - cur_edges[-1] <= tolerance:
            cur_edges.append(edge)
            cur_widxs.append(widx)
        else:
            clusters.append(cur_edges)
            cluster_widxs.append(cur_widxs)
            cur_edges = [edge]
            cur_widxs = [widx]
    clusters.append(cur_edges)
    cluster_widxs.append(cur_widxs)

    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5

    # --- Describe each cluster with at least two words ---
    cluster_infos = []
    for c_edges, c_widxs in zip(clusters, cluster_widxs):
        if len(c_edges) < 2:
            continue
        y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
        y_span = max(y_positions) - min(y_positions)
        y_coverage = y_span / content_h if content_h > 0 else 0.0
        cluster_infos.append({
            'mean_x': int(np.mean(c_edges)),
            'count': len(c_edges),
            'min_edge': min(c_edges),
            'max_edge': max(c_edges),
            'y_min': min(y_positions),
            'y_max': max(y_positions),
            'y_coverage': y_coverage,
        })

    # Primary clusters span much of the page height; secondary clusters span
    # less but must contain enough words to count as a column.
    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    secondary = [c for c in cluster_infos
                 if MIN_Y_COVERAGE_SECONDARY <= c['y_coverage'] < MIN_Y_COVERAGE_PRIMARY
                 and c['count'] >= MIN_WORDS_SECONDARY]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])

    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    # --- Merge clusters whose mean positions are close together ---
    # Only mean_x, count and the edge extremes are updated on merge; the
    # y_* statistics are not used after this point.
    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        prev = merged[-1]
        if s['mean_x'] - prev['mean_x'] < merge_distance:
            total = prev['count'] + s['count']
            prev['mean_x'] = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
            prev['count'] = total
            prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
            prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
        else:
            merged.append(s.copy())

    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")

    # Start each column slightly left of its leftmost word edge
    margin_px = max(6, int(content_w * 0.003))
    return _build_geometries_from_starts(
        [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
    )
|
||
|
||
|
||
def _detect_sub_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    top_y: int = 0,
    header_y: Optional[int] = None,
    footer_y: Optional[int] = None,
    _edge_tolerance: int = 8,
    _min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
    """Split columns that contain internal sub-columns based on left-edge alignment.

    For each column, clusters word left-edges into alignment bins (within
    ``_edge_tolerance`` px). The leftmost bin whose word count reaches
    ``_min_col_start_ratio`` of the column's confident words (at least 3) is
    treated as the true column start. Any words to the left of that bin form
    a sub-column, provided they number >= 2 and < 35 % of the column's words.

    Word ``left`` values are relative to the content ROI (offset by *left_x*),
    while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
    bridges the two coordinate systems.

    If *header_y* / *footer_y* are provided (absolute y-coordinates), words
    in header/footer regions are excluded from alignment clustering to avoid
    polluting the bins with page numbers or chapter titles. Word ``top``
    values are relative to *top_y*.

    Returns:
        The input list itself when *content_w* <= 0. Otherwise a new list of
        ColumnGeometry — potentially longer than the input — sorted left to
        right. Unsplit columns are the original objects, and every object in
        the result has its ``index`` reassigned in place.
    """
    if content_w <= 0:
        return geometries

    # Header/footer limits converted from absolute to word-relative
    # coordinates once; they are the same for every column.
    min_top_rel = (header_y - top_y) if header_y is not None else None
    max_top_rel = (footer_y - top_y) if footer_y is not None else None

    def in_body(word: Dict) -> bool:
        """Return True when *word* lies between the header and footer limits."""
        return ((min_top_rel is None or word['top'] >= min_top_rel)
                and (max_top_rel is None or word['top'] <= max_top_rel))

    result: List[ColumnGeometry] = []
    for geo in geometries:
        # Only consider wide-enough columns with enough words
        if geo.width_ratio < 0.15 or geo.word_count < 5:
            result.append(geo)
            continue

        # Left-edges of confident body words drive the alignment analysis
        confident = [wd for wd in geo.words
                     if wd.get('conf', 0) >= 30 and in_body(wd)]
        if len(confident) < 3:
            result.append(geo)
            continue

        # --- Cluster left-edges into alignment bins ---
        sorted_edges = sorted(wd['left'] for wd in confident)
        bins: List[Tuple[int, int, int, int]] = []  # (center, count, min_edge, max_edge)
        cur = [sorted_edges[0]]
        for edge in sorted_edges[1:]:
            if edge - cur[-1] <= _edge_tolerance:
                cur.append(edge)
            else:
                bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
                cur = [edge]
        bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))

        # --- Find the leftmost bin qualifying as a real column start ---
        min_count = max(3, int(len(confident) * _min_col_start_ratio))
        col_start_bin = next((b for b in bins if b[1] >= min_count), None)
        if col_start_bin is None:
            result.append(geo)
            continue

        # Words to the left of the column-start bin are sub-column candidates
        split_threshold = col_start_bin[2] - _edge_tolerance
        sub_words = [wd for wd in geo.words if wd['left'] < split_threshold]
        main_words = [wd for wd in geo.words if wd['left'] >= split_threshold]

        # Count only body words (excluding header/footer) for the threshold
        # check so that header/footer words don't artificially trigger a split.
        sub_body = [wd for wd in sub_words if in_body(wd)]
        if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
            result.append(geo)
            continue

        # --- Build two sub-column geometries ---
        # Word 'left' values are relative to left_x; geo.x is absolute.
        # The split sits midway between the rightmost sub-column word edge and
        # the column-start bin, converted to absolute coordinates.
        max_sub_left = max(wd['left'] for wd in sub_words)
        split_rel = (max_sub_left + col_start_bin[2]) // 2
        split_abs = split_rel + left_x

        sub_width = split_abs - geo.x
        main_width = (geo.x + geo.width) - split_abs
        if sub_width <= 0 or main_width <= 0:
            result.append(geo)
            continue

        # Indices are placeholders here; they are reassigned after sorting.
        result.append(ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=sub_width,
            height=geo.height,
            word_count=len(sub_words),
            words=sub_words,
            width_ratio=sub_width / content_w,
            is_sub_column=True,
        ))
        result.append(ColumnGeometry(
            index=0,
            x=split_abs,
            y=geo.y,
            width=main_width,
            height=geo.height,
            word_count=len(main_words),
            words=main_words,
            width_ratio=main_width / content_w,
            is_sub_column=True,
        ))

        logger.info(
            f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
            f"(rel={split_rel}), sub={len(sub_words)} words, "
            f"main={len(main_words)} words, "
            f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
        )

    # Re-index by left-to-right order
    result.sort(key=lambda g: g.x)
    for i, g in enumerate(result):
        g.index = i

    return result
|
||
|
||
|
||
def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
    inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
    """Turn (abs_start_x, word_count) pairs into ColumnGeometry objects.

    Each column reaches from its own start to the next column's start; the
    last column ends at *right_x*. Words are assigned by their left edge,
    which is relative to *left_x*. The word count carried in *col_starts* is
    not used — counts are recomputed from the assigned words.

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv);
        everything except the geometries is passed through unchanged.
    """
    start_xs = [pair[0] for pair in col_starts]
    end_xs = start_xs[1:] + [right_x]

    geometries = []
    for col_idx, (col_x, col_end) in enumerate(zip(start_xs, end_xs)):
        span = col_end - col_x
        rel_lo = col_x - left_x
        rel_hi = rel_lo + span
        members = [wd for wd in word_dicts if rel_lo <= wd['left'] < rel_hi]
        ratio = span / content_w if content_w > 0 else 0.0

        geometries.append(ColumnGeometry(
            index=col_idx,
            x=col_x,
            y=top_y,
            width=span,
            height=content_h,
            word_count=len(members),
            words=members,
            width_ratio=ratio,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
|
||
|
||
|
||
def _coerce_tesseract_conf(raw: Any) -> int:
    """Convert a Tesseract ``conf`` field to an int, or -1 if it is not numeric.

    Accepts ints, floats and numeric strings (including float-formatted ones
    such as ``"96.43"``), truncating towards zero.
    """
    try:
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return -1


def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
    """Detect column geometry using whitespace-gap analysis with word validation.

    Phase A of the two-phase column detection. Uses vertical projection
    profiles to find whitespace gaps between columns, then validates that
    no gap cuts through a word bounding box.

    Falls back to clustering-based detection if fewer than 2 gaps are found.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
        or None if detection fails entirely (Tesseract error, fewer than 5
        confident words, or the clustering fallback finds no columns).
        Word boxes in *word_dicts* are relative to the content ROI, while
        geometry ``x`` / ``y`` are absolute image coordinates.
    """
    h, w = ocr_img.shape[:2]

    # --- Step 1: Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_w < w * 0.3 or content_h < h * 0.3:
        # Implausibly small content area: fall back to the whole page
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Step 2: Get word bounding boxes from Tesseract ---
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:right_x]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
        return None

    # Keep only non-empty words with confidence >= 30
    word_dicts = []
    left_edges = []
    edge_word_indices = []
    for raw_conf, raw_text, raw_left, raw_top, raw_width, raw_height in zip(
        data['conf'], data['text'], data['left'], data['top'], data['width'], data['height'],
    ):
        conf = _coerce_tesseract_conf(raw_conf)
        text = str(raw_text).strip()
        if conf < 30 or not text:
            continue
        lx = int(raw_left)
        left_edges.append(lx)
        edge_word_indices.append(len(word_dicts))
        word_dicts.append({
            'text': text, 'conf': conf,
            'left': lx, 'top': int(raw_top),
            'width': int(raw_width), 'height': int(raw_height),
        })

    if len(left_edges) < 5:
        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
        return None

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

    def words_starting_in(abs_start: int, abs_end: int) -> List[Dict]:
        """Return the words whose left edge lies in [abs_start, abs_end)."""
        rel_lo = abs_start - left_x
        rel_hi = abs_end - left_x
        return [wd for wd in word_dicts if rel_lo <= wd['left'] < rel_hi]

    # --- Step 3: Vertical projection profile ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    v_proj = np.sum(content_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps
    kernel_size = max(5, content_w // 80)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for symmetry
    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # --- Step 4: Find whitespace gaps ---
    # Threshold: areas with very little ink density are gaps
    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.005)

    in_gap = v_smooth < gap_threshold
    min_gap_width = max(8, content_w // 200)  # min ~8px or 0.5% of content width

    # Collect contiguous gap regions
    raw_gaps = []  # (start_x_rel, end_x_rel) relative to content ROI
    gap_start = None
    for x in range(len(in_gap)):
        if in_gap[x]:
            if gap_start is None:
                gap_start = x
        elif gap_start is not None:
            if x - gap_start >= min_gap_width:
                raw_gaps.append((gap_start, x))
            gap_start = None
    # Handle gap at the right edge
    if gap_start is not None and len(in_gap) - gap_start >= min_gap_width:
        raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_width={min_gap_width}px): "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")

    # --- Step 5: Validate gaps against word bounding boxes ---
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        blockers = [wd for wd in word_dicts
                    if wd['left'] < gap_end_rel and wd['left'] + wd['width'] > gap_start_rel]
        if not blockers:
            validated_gaps.append((gap_start_rel, gap_end_rel))
            continue

        # Try to shrink the gap so it avoids the overlapping word(s)
        min_word_left = min(content_w, min(wd['left'] for wd in blockers))
        max_word_right = max(0, max(wd['left'] + wd['width'] for wd in blockers))

        if min_word_left - gap_start_rel >= min_gap_width:
            # Enough room before the overlapping words
            validated_gaps.append((gap_start_rel, min_word_left))
            logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
        elif gap_end_rel - max_word_right >= min_gap_width:
            # Enough room after the overlapping words
            validated_gaps.append((max_word_right, gap_end_rel))
            logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
        else:
            logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                         f"discarded (word overlap, no room to shift)")

    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
        return _detect_columns_by_clustering(
            word_dicts, left_edges, edge_word_indices,
            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
        )

    # --- Step 7: Derive column boundaries from gaps ---
    validated_gaps.sort(key=lambda g: g[0])

    # A margin gap touches the edge of the content area (within 2% tolerance)
    edge_tolerance = max(10, int(content_w * 0.02))
    is_left_margin = validated_gaps[0][0] <= edge_tolerance
    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance

    if is_left_margin:
        # First column starts after the left margin gap
        first_col_rel = validated_gaps[0][1]
        interior_gaps = validated_gaps[1:]
    else:
        # No left margin gap — first column starts at content left edge
        first_col_rel = 0
        interior_gaps = validated_gaps[:]

    if is_right_margin:
        # Last gap is the right margin — it does not start a column
        interior_gaps = interior_gaps[:-1]

    # A column starts where a gap ends
    col_starts = [left_x + first_col_rel]
    col_starts.extend(left_x + gap_end_rel for _, gap_end_rel in interior_gaps)

    # Each column ends where the next one starts. The rightmost column always
    # extends to the full image width (w): the page margin contains only
    # white space, so extending the OCR crop to the image edge is safe and
    # prevents text near the right border from being cut off.
    all_boundaries = list(zip(col_starts, col_starts[1:] + [w]))

    # Count words per column region (for logging)
    col_start_counts = [(start_x, len(words_starting_in(start_x, end_x)))
                        for start_x, end_x in all_boundaries]

    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
                f"{col_start_counts}")

    # --- Step 8: Build ColumnGeometry objects ---
    geometries = []
    for i, (start_x, end_x) in enumerate(all_boundaries):
        col_width = end_x - start_x
        col_words = words_starting_in(start_x, end_x)
        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    # --- Step 9: Filter phantom narrow columns ---
    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
    # columns (< 3% of content width) with fewer than 3 words. These are not
    # real columns — remove them and close the gap between neighbors.
    min_real_col_w = max(20, int(content_w * 0.03))
    filtered_geoms = [g for g in geometries
                      if not (g.word_count < 3 and g.width < min_real_col_w)]
    if len(filtered_geoms) < len(geometries):
        n_removed = len(geometries) - len(filtered_geoms)
        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
                    f"(width < {min_real_col_w}px and words < 3)")
        # Extend each remaining column to close gaps with its right neighbor
        # and reassign the words accordingly.
        for i, g in enumerate(filtered_geoms):
            if i + 1 < len(filtered_geoms):
                g.width = filtered_geoms[i + 1].x - g.x
            else:
                g.width = w - g.x
            g.index = i
            g.words = words_starting_in(g.x, g.x + g.width)
            g.word_count = len(g.words)
        geometries = filtered_geoms
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
|
||
|
||
|
||
# =============================================================================
# Row Geometry Detection (horizontal whitespace-gap analysis)
# =============================================================================
|
||
|
||
def detect_row_geometry(
    inv: np.ndarray,
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
) -> List['RowGeometry']:
    """Detect row geometry using horizontal whitespace-gap analysis.

    Mirrors the vertical gap approach used for columns, but operates on a
    horizontal projection profile to find gaps between text lines. Rows in
    the top/bottom 15% of the content area that are separated by an unusually
    large gap are classified as header/footer.

    Args:
        inv: Inverted binarized image (white text on black bg, full page).
        word_dicts: Word bounding boxes with 'left', 'top', 'width', 'height'
            keys, relative to the content ROI.
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects sorted top to bottom. Empty if the content
        area is smaller than 10px in either dimension. If fewer than two gaps
        survive validation, the result of the word-grouping fallback is
        returned instead.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile (text-only, images masked out) ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]

    # Build a word-coverage mask so that image regions (high ink density but no
    # Tesseract words) are ignored. Only pixels within/near word bounding boxes
    # contribute to the projection. This prevents large illustrations from
    # merging multiple vocabulary rows into one.
    word_pad_y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - word_pad_y)
        y2 = min(content_h, wd['top'] + wd['height'] + word_pad_y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255

    masked_strip = cv2.bitwise_and(content_strip, word_mask)
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    # content_w >= 10 is guaranteed by the guard above.
    h_proj_norm = h_proj / (content_w * 255)

    # --- Step 2: Smoothing + threshold ---
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep the box filter symmetric
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    has_ink = h_smooth > 0
    median_density = float(np.median(h_smooth[has_ink])) if np.any(has_ink) else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    min_gap_height = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    # Pad with False on both sides so every run of True has a rising and a
    # falling edge; the edges are the (start, exclusive end) of each gap.
    edges = np.diff(np.concatenate(([False], in_gap, [False])).astype(np.int8))
    run_starts = np.flatnonzero(edges == 1)
    run_ends = np.flatnonzero(edges == -1)
    # (start_y_rel, end_y_rel) relative to content ROI
    raw_gaps = [(int(s), int(e)) for s, e in zip(run_starts, run_ends)
                if e - s >= min_gap_height]

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={min_gap_height}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        # Vertical extents of every word that intersects this gap.
        hits = [(wd['top'], wd['top'] + wd['height']) for wd in word_dicts
                if wd['top'] < gap_end_rel and wd['top'] + wd['height'] > gap_start_rel]

        if not hits:
            validated_gaps.append((gap_start_rel, gap_end_rel))
            continue

        # Try to shrink the gap so it no longer overlaps any word: keep the
        # part above the topmost word, else the part below the bottommost one.
        min_word_top = min(top for top, _ in hits)
        max_word_bottom = max(bottom for _, bottom in hits)

        if min_word_top - gap_start_rel >= min_gap_height:
            validated_gaps.append((gap_start_rel, min_word_top))
        elif gap_end_rel - max_word_bottom >= min_gap_height:
            validated_gaps.append((max_word_bottom, gap_end_rel))
        else:
            logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                         f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    header_footer_zone = 0.15
    gap_multiplier = 2.0

    def _gap_size(gap):
        return gap[1] - gap[0]

    def _gap_mid(gap):
        return (gap[0] + gap[1]) / 2

    median_gap = float(np.median([_gap_size(g) for g in validated_gaps]))
    large_gap_threshold = median_gap * gap_multiplier

    header_boundary_rel = None  # y below which is header
    footer_boundary_rel = None  # y above which is footer

    header_zone_limit = int(content_h * header_footer_zone)
    footer_zone_start = int(content_h * (1.0 - header_footer_zone))

    # Largest unusually big gap in the header zone (first one wins on ties).
    best_header_gap = max(
        (g for g in validated_gaps
         if _gap_mid(g) < header_zone_limit and _gap_size(g) > large_gap_threshold),
        key=_gap_size, default=None,
    )
    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    # Largest unusually big gap in the footer zone (first one wins on ties).
    best_footer_gap = max(
        (g for g in validated_gaps
         if _gap_mid(g) > footer_zone_start and _gap_size(g) > large_gap_threshold),
        key=_gap_size, default=None,
    )
    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between gaps.
    row_boundaries = []  # (start_y_rel, end_y_rel)

    # Top of content to first gap
    if validated_gaps[0][0] > min_gap_height:
        row_boundaries.append((0, validated_gaps[0][0]))

    # Between gaps
    for (_, row_start), (row_end, _) in zip(validated_gaps, validated_gaps[1:]):
        if row_end - row_start > 0:
            row_boundaries.append((row_start, row_end))

    # Last gap to bottom of content
    if validated_gaps[-1][1] < content_h - min_gap_height:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    # Size of the gap that ends exactly where a row starts. A row that starts
    # at the top of the content area has no preceding gap and gets 0.
    gap_size_by_end = {}
    for gs, ge in validated_gaps:
        gap_size_by_end.setdefault(ge, ge - gs)

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Determine row type from the row's vertical midpoint.
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # A word belongs to the row that contains its vertical center.
        row_words = [w for w in word_dicts
                     if row_start_rel <= w['top'] + w['height'] / 2 < row_end_rel]

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_size_by_end.get(row_start_rel, 0),
        ))

    # --- Step 7: Word-center grid regularization ---
    # Derive precise row boundaries from word vertical centers. Detects
    # section breaks (headings, paragraphs) and builds per-section grids.
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows
|
||
|
||
|
||
def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    1. Group words into line clusters (by Y proximity).
    2. For each cluster compute center_y (median of word vertical centers)
       and letter_height (median of word heights).
    3. Compute the pitch (distance between consecutive centers).
    4. Detect section breaks where the gap is >1.8× the median pitch
       (headings, sub-headings, paragraph breaks).
    5. Within each section, use the local pitch to place row boundaries
       at the midpoints between consecutive centers.
    6. Validate that ≥85% of words land in a grid row; otherwise fall back.

    Header/footer rows from the gap-based detection are preserved.

    Args:
        rows: Gap-based rows; only those with row_type 'content' are rebuilt.
        word_dicts: Not used by the current implementation (the words are
            taken from ``rows``); kept for signature compatibility.
        left_x: Absolute X of the content area, used as x of every grid row.
        right_x: Not used by the current implementation.
        top_y: Absolute Y of the content area; word 'top' values are relative
            to it.
        content_w, content_h: Size of the content area in pixels.
        inv: Not used by the current implementation.

    Returns:
        The regularized rows (header/footer + grid rows, sorted by y and
        re-indexed), or the unchanged ``rows`` when there is too little data
        or the grid does not capture at least 85% of the content words.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']

    if len(content_rows) < 5:
        return rows

    def _center(word: Dict) -> float:
        """Vertical center of a word box, relative to the content ROI."""
        return word['top'] + word['height'] / 2

    # --- Step A: Group ALL words into line clusters ---
    # Collect words that belong to content rows (deduplicated by bounding box).
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
        for w in r.words:
            key = (w['left'], w['top'], w['width'], w['height'])
            if key not in seen_keys:
                seen_keys.add(key)
                content_words.append(w)

    if len(content_words) < 5:
        return rows

    # Median word height; used below to ignore outliers like tall brackets/IPA.
    word_heights = sorted(w['height'] for w in content_words)
    median_wh = word_heights[len(word_heights) // 2]

    # Median gap-based row height — the actual line height as detected by the
    # horizontal projection. This is more reliable than word height alone,
    # because words on the same line can have very different heights
    # (e.g. lowercase vs uppercase, brackets, phonetic symbols).
    gap_row_heights = sorted(r.height for r in content_rows)
    median_row_h = gap_row_heights[len(gap_row_heights) // 2]

    # Tolerance: 40% of row height. Words on the same line should have
    # centers within this range of the first word of the cluster.
    y_tol = max(10, int(median_row_h * 0.4))

    # Sort by center_y, then group by proximity to the cluster's first word.
    words_by_center = sorted(content_words, key=lambda w: (_center(w), w['left']))
    line_clusters: List[List[Dict]] = []
    current_line: List[Dict] = [words_by_center[0]]
    current_center = _center(words_by_center[0])

    for w in words_by_center[1:]:
        w_center = _center(w)
        if abs(w_center - current_center) <= y_tol:
            current_line.append(w)
        else:
            current_line.sort(key=lambda w: w['left'])
            line_clusters.append(current_line)
            current_line = [w]
            current_center = w_center

    current_line.sort(key=lambda w: w['left'])
    line_clusters.append(current_line)

    if len(line_clusters) < 3:
        return rows

    # --- Step B: Compute center_y per cluster ---
    def _summarize(cl_words: List[Dict]) -> Dict:
        """Describe a cluster: median center, median letter height, words.

        Heights above 2× the global median are ignored for letter_h so that
        tall brackets/IPA don't skew it (unless that leaves nothing).
        """
        normal_heights = [w['height'] for w in cl_words
                          if w['height'] <= median_wh * 2.0]
        if not normal_heights:
            normal_heights = [w['height'] for w in cl_words]
        center_y = float(np.median([_center(w) for w in cl_words]))
        return {
            'center_y_rel': center_y,          # relative to content ROI
            'center_y_abs': center_y + top_y,  # absolute
            'letter_h': float(np.median(normal_heights)),
            'words': cl_words,
        }

    cluster_info: List[Dict] = [_summarize(cl_words) for cl_words in line_clusters]
    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    # Even with center-based grouping, some edge cases can produce spurious
    # clusters. Merge any pair whose centers are closer than 30% of the row
    # height (they're definitely the same text line).
    merge_threshold = max(8, median_row_h * 0.3)
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            merged[-1] = _summarize(prev['words'] + cl['words'])
        else:
            merged.append(cl)

    cluster_info = merged

    if len(cluster_info) < 3:
        return rows

    # --- Step C: Compute pitches and detect section breaks ---
    pitches: List[float] = [
        nxt['center_y_rel'] - cur['center_y_rel']
        for cur, nxt in zip(cluster_info, cluster_info[1:])
    ]

    median_pitch = float(np.median(pitches))
    if median_pitch <= 5:
        return rows

    # A section break is where the gap between line centers is much larger
    # than the normal pitch (sub-headings, section titles, etc.)
    break_factor = 1.8

    # --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
    sections: List[List[Dict]] = []
    current_section: List[Dict] = [cluster_info[0]]

    for cl, gap in zip(cluster_info[1:], pitches):
        if gap > median_pitch * break_factor:
            sections.append(current_section)
            current_section = [cl]
        else:
            current_section.append(cl)

    sections.append(current_section)

    # --- Step E: Build row boundaries per section ---
    content_bottom = top_y + content_h

    def _new_row(row_top: float, row_bot: float, cl: Dict) -> 'RowGeometry':
        """Create a content row; index is assigned later in Step H."""
        return RowGeometry(
            index=0,
            x=left_x,
            y=round(row_top),
            width=content_w,
            height=round(row_bot - row_top),
            word_count=len(cl['words']),
            words=cl['words'],
            row_type='content',
            gap_before=0,
        )

    grid_rows: List[RowGeometry] = []

    for section in sections:
        if len(section) == 1:
            # Single-line section (likely a heading): extend by the letter
            # height (or 40% of the pitch) on both sides of the center.
            cl = section[0]
            half_h = max(cl['letter_h'], median_pitch * 0.4)
            # Clamp to the content area, same as for multi-line sections, so
            # a heading at the very top/bottom cannot produce a row that
            # starts above top_y (possibly negative) or ends below the content.
            row_top = max(top_y, cl['center_y_abs'] - half_h)
            row_bot = min(content_bottom, cl['center_y_abs'] + half_h)
            grid_rows.append(_new_row(row_top, row_bot, cl))
            continue

        # Local pitch for this section (median distance between centers).
        local_pitch = float(np.median([
            nxt['center_y_rel'] - cur['center_y_rel']
            for cur, nxt in zip(section, section[1:])
        ]))

        # Row boundaries are placed at midpoints between consecutive centers.
        # First row: top = center - local_pitch/2
        # Last row: bottom = center + local_pitch/2
        last = len(section) - 1
        for i, cl in enumerate(section):
            if i == 0:
                row_top = cl['center_y_abs'] - local_pitch / 2
            else:
                row_top = (section[i - 1]['center_y_abs'] + cl['center_y_abs']) / 2

            if i == last:
                row_bot = cl['center_y_abs'] + local_pitch / 2
            else:
                row_bot = (cl['center_y_abs'] + section[i + 1]['center_y_abs']) / 2

            # Clamp to the content area.
            row_top = max(top_y, row_top)
            row_bot = min(content_bottom, row_bot)

            if row_bot - row_top < 5:
                continue  # degenerate sliver after clamping

            grid_rows.append(_new_row(row_top, row_bot, cl))

    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    # Assign each word to the row whose center is closest to the word's
    # vertical center (first row wins on ties), provided it is closer than
    # one median pitch.
    for gr in grid_rows:
        gr.words = []

    row_centers = np.array([gr.y + gr.height / 2 for gr in grid_rows], dtype=float)
    for w in content_words:
        dists = np.abs(row_centers - (w['top'] + top_y + w['height'] / 2))
        nearest = int(dists.argmin())
        if dists[nearest] < median_pitch:
            grid_rows[nearest].words.append(w)

    for gr in grid_rows:
        gr.word_count = len(gr.words)

    # --- Step G: Validate ---
    words_placed = sum(gr.word_count for gr in grid_rows)
    match_ratio = words_placed / len(content_words)
    if match_ratio < 0.85:
        logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                    f"of words, keeping gap-based rows")
        return rows

    # Remove empty grid rows (no words assigned)
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge header/footer + re-index ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights) if row_heights else 0
    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")

    return result
|
||
|
||
|
||
def _build_rows_from_word_grouping(
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
    content_w: int, content_h: int,
) -> List['RowGeometry']:
    """Fallback: build rows by grouping words by Y position.

    Uses _group_words_into_lines() with a generous tolerance
    (at least 20px, or 1% of the content height if that is larger).
    No header/footer detection in fallback mode: every row is 'content'.

    Args:
        word_dicts: Word boxes with 'top' and 'height' keys, relative to the
            content ROI.
        left_x: Absolute X of the content area, used as x of every row.
        right_x, bottom_y: Not used by the current implementation.
        top_y: Absolute Y of the content area, added to each row's top.
        content_w: Content width, used as width of every row.
        content_h: Content height, used to derive the grouping tolerance.

    Returns:
        One RowGeometry per non-empty line, tightly wrapping its words
        vertically. Empty list if there are no words.
    """
    if not word_dicts:
        return []

    y_tolerance = max(20, content_h // 100)
    lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance)

    rows = []
    for line_words in lines:
        if not line_words:
            continue
        min_top = min(w['top'] for w in line_words)
        max_bottom = max(w['top'] + w['height'] for w in line_words)

        rows.append(RowGeometry(
            # Number by emitted rows so indices stay contiguous even when an
            # empty line was skipped above.
            index=len(rows),
            x=left_x,
            y=top_y + min_top,
            width=content_w,
            height=max_bottom - min_top,
            word_count=len(line_words),
            words=line_words,
            row_type='content',
            gap_before=0,
        ))

    logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
    return rows
|
||
|
||
|
||
# --- Phase B: Content-Based Classification ---
|
||
|
||
def _score_language(words: List[Dict]) -> Dict[str, float]:
|
||
"""Score the language of a column's words.
|
||
|
||
Analyzes function words, umlauts, and capitalization patterns
|
||
to determine whether text is English or German.
|
||
|
||
Args:
|
||
words: List of word dicts with 'text' and 'conf' keys.
|
||
|
||
Returns:
|
||
Dict with 'eng' and 'deu' scores (0.0-1.0).
|
||
"""
|
||
if not words:
|
||
return {'eng': 0.0, 'deu': 0.0}
|
||
|
||
# Only consider words with decent confidence
|
||
good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0]
|
||
if not good_words:
|
||
return {'eng': 0.0, 'deu': 0.0}
|
||
|
||
total = len(good_words)
|
||
en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS)
|
||
de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS)
|
||
|
||
# Check for umlauts (strong German signal)
|
||
raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40]
|
||
umlaut_count = sum(1 for t in raw_texts
|
||
for c in t if c in 'äöüÄÖÜß')
|
||
|
||
# German capitalization: nouns are capitalized mid-sentence
|
||
# Count words that start with uppercase but aren't at position 0
|
||
cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2)
|
||
|
||
en_score = en_hits / total if total > 0 else 0.0
|
||
de_score = de_hits / total if total > 0 else 0.0
|
||
|
||
# Boost German score for umlauts
|
||
if umlaut_count > 0:
|
||
de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))
|
||
|
||
# Boost German score for high capitalization ratio (typical for German nouns)
|
||
if total > 5:
|
||
cap_ratio = cap_words / total
|
||
if cap_ratio > 0.3:
|
||
de_score = min(1.0, de_score + 0.1)
|
||
|
||
return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}
|
||
|
||
|
||
def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
    """Rate how well a column fits each structural role.

    The heuristics combine the column's relative width with simple text
    statistics (average word length, punctuation and digit frequency) of
    its words whose confidence is above 40.

    Args:
        geom: ColumnGeometry with words and dimensions.

    Returns:
        Dict with the keys 'reference', 'marker', 'sentence' and
        'vocabulary', each rounded to 3 decimals. All scores are 0.0 if the
        column has no sufficiently confident words.
    """
    reference = marker = sentence = vocabulary = 0.0

    confident = []
    if geom.words:
        confident = [wd['text'] for wd in geom.words if wd.get('conf', 0) > 40]

    if confident:
        n_texts = len(confident)
        mean_len = sum(map(len, confident)) / n_texts
        punct_hits = sum(1 for txt in confident
                         if any(ch in txt for ch in '.!?;:,'))
        digit_hits = sum(1 for txt in confident
                         if any(ch.isdigit() for ch in txt))
        digit_share = digit_hits / n_texts
        ratio = geom.width_ratio

        # Reference: narrow + mostly numbers/page references
        if ratio < 0.12:
            reference = min(1.0, 0.5 + digit_share * 0.5) if digit_share > 0.4 else 0.5

        # Marker: narrow + few short entries
        if ratio < 0.06 and geom.word_count <= 15:
            marker = 0.9 if mean_len < 4 else 0.7
        # Very narrow column that is not the first one → strong marker
        # regardless of word count
        if ratio < 0.04 and geom.index > 0:
            marker = max(marker, 0.9)

        # Sentence: wide column with punctuation; bonus for longer words
        if ratio > 0.15 and punct_hits > 2:
            sentence = 0.3 + min(0.5, punct_hits / n_texts)
            if mean_len > 4:
                sentence = min(1.0, sentence + 0.2)

        # Vocabulary: medium width + medium word length
        if 0.10 < ratio < 0.45:
            vocabulary = 0.4
            if 3 < mean_len < 8:
                vocabulary = min(1.0, vocabulary + 0.3)

    return {
        'reference': round(reference, 3),
        'marker': round(marker, 3),
        'sentence': round(sentence, 3),
        'vocabulary': round(vocabulary, 3),
    }
|
||
|
||
|
||
def _build_margin_regions(
    all_regions: List[PageRegion],
    left_x: int,
    right_x: int,
    img_w: int,
    top_y: int,
    content_h: int,
) -> List[PageRegion]:
    """Derive margin_left / margin_right PageRegions from the content bounds.

    The left margin spans from the image's left edge to ``left_x``. The right
    margin spans from the right end of the right-most column region (or
    ``right_x`` when there is none) to the image's right edge. A margin is
    only emitted when it is wider than 5px.

    Args:
        all_regions: Regions classified so far; margin, header and footer
            types are ignored when locating the right-most column.
        left_x: Left content bound.
        right_x: Right content bound, used only when no column region exists.
        img_w: Full image width.
        top_y: Top Y of the content area, used as y of both margins.
        content_h: Content height, used as height of both margins.

    Returns:
        Zero, one or two margin PageRegions (left first).
    """
    min_margin_px = 5  # narrower strips are not worth a region
    not_columns = ('margin_left', 'margin_right', 'header', 'footer',
                   'margin_top', 'margin_bottom')

    def _margin(kind: str, x: int, width: int) -> PageRegion:
        return PageRegion(
            type=kind, x=x, y=top_y,
            width=width, height=content_h,
            classification_confidence=1.0,
            classification_method='content_bounds',
        )

    margins: List[PageRegion] = []

    if left_x > min_margin_px:
        margins.append(_margin('margin_left', 0, left_x))

    # Right margin: from end of last content column to image edge
    last_col_end = max(
        (reg.x + reg.width for reg in all_regions if reg.type not in not_columns),
        default=right_x,
    )
    right_gap = img_w - last_col_end
    if right_gap > min_margin_px:
        margins.append(_margin('margin_right', last_col_end, right_gap))

    if margins:
        logger.info(f"Margins: {[(m.type, m.x, m.width) for m in margins]} "
                    f"(left_x={left_x}, right_x={right_x}, img_w={img_w})")

    return margins
|
||
|
||
|
||
def classify_column_types(geometries: List[ColumnGeometry],
                          content_w: int,
                          top_y: int,
                          img_w: int,
                          img_h: int,
                          bottom_y: int,
                          left_x: int = 0,
                          right_x: int = 0,
                          inv: Optional[np.ndarray] = None) -> List[PageRegion]:
    """Classify column types using a 3-level fallback chain.

    Level 1: Content-based (language + role scoring)
    Level 2: Position + language (old rules enhanced with language detection)
    Level 3: Pure position (exact old code, no regression)

    Before the chain runs, a first/last column with fewer than 8 words that
    is not a sub-column is set aside as 'column_ignore'. The remaining
    columns are re-indexed in place (their ``index`` attribute is rewritten).
    Margin regions are appended to every result.

    Args:
        geometries: List of ColumnGeometry from Phase A.
        content_w: Total content width.
        top_y: Top Y of content area.
        img_w: Full image width.
        img_h: Full image height.
        bottom_y: Bottom Y of content area.
        left_x: Left content bound (from _find_content_bounds).
        right_x: Right content bound (from _find_content_bounds).
        inv: Optional image array that is passed through unchanged to
            _add_header_footer().

    Returns:
        List of PageRegion with types, confidence, and method.
    """
    content_h = bottom_y - top_y

    def _with_margins(result: List[PageRegion]) -> List[PageRegion]:
        """Append margin_left / margin_right regions to *result*."""
        return result + _build_margin_regions(
            result, left_x, right_x, img_w, top_y, content_h)

    def _as_text_column(col: ColumnGeometry) -> PageRegion:
        """Region for a lone column, treated as plain running text."""
        return PageRegion(
            type='column_text', x=col.x, y=col.y,
            width=col.width, height=col.height,
            classification_confidence=0.9,
            classification_method='content',
        )

    # Special case: single column → plain text page
    if len(geometries) == 1:
        return _with_margins([_as_text_column(geometries[0])])

    # --- Pre-filter: first/last columns with very few words → column_ignore ---
    # Sub-columns from _detect_sub_columns() are exempt: they intentionally
    # have few words (page refs, markers) and should not be discarded.
    last_idx = len(geometries) - 1
    ignore_regions = []
    kept = []
    for idx, col in enumerate(geometries):
        on_edge = idx in (0, last_idx)
        if not (on_edge and col.word_count < 8 and not col.is_sub_column):
            kept.append(col)
            continue
        ignore_regions.append(PageRegion(
            type='column_ignore', x=col.x, y=col.y,
            width=col.width, height=content_h,
            classification_confidence=0.95,
            classification_method='content',
        ))
        logger.info(f"ClassifyColumns: column {idx} (x={col.x}, words={col.word_count}) → column_ignore (edge, few words)")

    # Re-index active geometries for classification
    for new_idx, col in enumerate(kept):
        col.index = new_idx
    geometries = kept

    # Handle edge case: all columns ignored or only 1 left
    if not geometries:
        return _with_margins(ignore_regions)
    if len(geometries) == 1:
        ignore_regions.append(_as_text_column(geometries[0]))
        return _with_margins(ignore_regions)

    # --- Score all columns ---
    lang_scores = [_score_language(col.words) for col in geometries]
    role_scores = [_score_role(col) for col in geometries]

    logger.info(f"ClassifyColumns: language scores: "
                f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}")
    logger.info(f"ClassifyColumns: role scores: "
                f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}")

    def _finish(classified: List[PageRegion]) -> List[PageRegion]:
        """Add header/footer, prepend ignored columns, append margins."""
        _add_header_footer(classified, top_y, bottom_y, img_w, img_h, inv=inv)
        return _with_margins(ignore_regions + classified)

    # --- Level 1: Content-based classification ---
    by_content = _classify_by_content(
        geometries, lang_scores, role_scores, content_w, content_h)
    if by_content is not None:
        logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
        return _finish(by_content)

    # --- Level 2: Position + language enhanced ---
    by_position = _classify_by_position_enhanced(
        geometries, lang_scores, content_w, content_h)
    if by_position is not None:
        logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
        return _finish(by_position)

    # --- Level 3: Pure position fallback (old code, no regression) ---
    logger.info("ClassifyColumns: Level 3 (position fallback)")
    return _finish(_classify_by_position_fallback(geometries, content_w, content_h))
|
||
|
||
|
||
def _classify_by_content(geometries: List[ColumnGeometry],
                         lang_scores: List[Dict[str, float]],
                         role_scores: List[Dict[str, float]],
                         content_w: int,
                         content_h: int) -> Optional[List[PageRegion]]:
    """Level 1: assign column types from content scores alone.

    Narrow structural columns (page references, markers) are typed first
    from the role scores.  The columns left over are then split into EN/DE
    by their language scores.  When every language score is weak, the two
    leftmost leftover columns are taken as EN and DE by position instead.

    Args:
        geometries: Detected columns, index-aligned with both score lists.
        lang_scores: Per-column dicts, read through the 'eng'/'deu' keys.
        role_scores: Per-column dicts, read through the 'reference',
            'marker' and 'sentence' keys.
        content_w: Content-area width; drives the "leftmost 20%" test.
        content_h: Height given to every region emitted here.

    Returns:
        Regions sorted by x, or None if no unambiguous EN/DE split exists.
    """
    def _region(kind, col, confidence):
        # Every region produced at this level shares height and method.
        return PageRegion(
            type=kind, x=col.x, y=col.y,
            width=col.width, height=content_h,
            classification_confidence=confidence,
            classification_method='content',
        )

    def _sorted_by_x(found):
        found.sort(key=lambda r: r.x)
        return found

    regions = []
    taken = set()

    # Step 1: structural roles.  Only the leftmost ~20% of the content area
    # may hold a page_ref column.
    left_limit = geometries[0].x + content_w * 0.20 if geometries else 0

    for idx, (col, roles, langs) in enumerate(zip(geometries, role_scores, lang_scores)):
        on_left = col.x < left_limit
        strong_lang = langs['eng'] > 0.3 or langs['deu'] > 0.3
        if roles['reference'] >= 0.5 and col.width_ratio < 0.12 and on_left and not strong_lang:
            structural = _region('page_ref', col, roles['reference'])
        elif roles['marker'] >= 0.7 and col.width_ratio < 0.06:
            structural = _region('column_marker', col, roles['marker'])
        elif col.width_ratio < 0.05 and not on_left:
            # Narrow column away from the left edge → marker, not page_ref
            structural = _region('column_marker', col, 0.8)
        else:
            continue
        regions.append(structural)
        taken.add(idx)

    # Step 2: look for the EN/DE pair among everything not typed yet.
    remaining = [(idx, col, lang_scores[idx], role_scores[idx])
                 for idx, col in enumerate(geometries) if idx not in taken]

    if len(remaining) < 2:
        # Too few columns for an EN/DE pair; a lone column is generic text.
        if remaining:
            regions.append(_region('column_text', remaining[0][1], 0.6))
        return _sorted_by_x(regions)

    en_candidates = [(idx, col, langs) for idx, col, langs, _ in remaining
                     if langs['eng'] > langs['deu'] and langs['eng'] > 0.05]
    de_candidates = [(idx, col, langs) for idx, col, langs, _ in remaining
                     if langs['deu'] > langs['eng'] and langs['deu'] > 0.05]

    if not en_candidates or not de_candidates:
        strongest_eng = max(langs['eng'] for _, _, langs, _ in remaining)
        strongest_deu = max(langs['deu'] for _, _, langs, _ in remaining)
        if strongest_eng < 0.15 and strongest_deu < 0.15:
            # Both signals weak — positional tiebreaker: left=EN, right=DE
            by_x = sorted(remaining, key=lambda item: item[1].x)
            en_idx, en_col = by_x[0][0], by_x[0][1]
            de_idx, de_col = by_x[1][0], by_x[1][1]
            logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")

            regions.append(_region('column_en', en_col, 0.4))
            taken.add(en_idx)
            regions.append(_region('column_de', de_col, 0.4))
            taken.add(de_idx)

            # Whatever is still untyped becomes an example column.
            regions.extend(_region('column_example', col, 0.4)
                           for idx, col, _, _ in remaining if idx not in taken)
            return _sorted_by_x(regions)

        # One language is missing although some signal is not weak.
        logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
        return None

    en_idx, en_col, en_langs = max(en_candidates, key=lambda cand: cand[2]['eng'])
    de_idx, de_col, de_langs = max(de_candidates, key=lambda cand: cand[2]['deu'])

    if en_idx == de_idx:
        # Same column scored highest for both — ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
        return None

    regions.append(_region('column_en', en_col, round(en_langs['eng'], 2)))
    taken.add(en_idx)
    regions.append(_region('column_de', de_col, round(de_langs['deu'], 2)))
    taken.add(de_idx)

    # Step 3: leftovers.  Both outcomes are typed column_example; only the
    # confidence depends on the 'sentence' role score.
    for idx, col, _, roles in remaining:
        if idx in taken:
            continue
        confidence = round(roles['sentence'], 2) if roles['sentence'] > 0.4 else 0.5
        regions.append(_region('column_example', col, confidence))

    return _sorted_by_x(regions)
|
||
|
||
|
||
def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                   lang_scores: List[Dict[str, float]],
                                   content_w: int,
                                   content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: positional rules, cross-checked against language scores.

    Applies the positional heuristics (page_ref, marker, example, EN, DE in
    that order) and swaps the EN/DE assignment when the language scores of
    the two candidate columns both point the other way.

    Args:
        geometries: Detected columns; the first one is treated as leftmost.
        lang_scores: Per-column dicts, read through the 'eng'/'deu' keys.
        content_w: Content-area width; drives the "leftmost 20%" test.
        content_h: Height given to every region emitted here.

    Returns:
        Regions sorted by x.
    """
    def _region(kind, col, confidence):
        return PageRegion(
            type=kind, x=col.x, y=col.y,
            width=col.width, height=content_h,
            classification_confidence=confidence,
            classification_method='position_enhanced',
        )

    regions = []
    untyped = list(range(len(geometries)))

    # Rule 1: leftmost narrow column → page_ref (only inside the left 20%
    # and only without a strong language signal).
    leftmost = geometries[0]
    left_limit = leftmost.x + content_w * 0.20
    leftmost_langs = lang_scores[0]
    leftmost_has_language = leftmost_langs['eng'] > 0.3 or leftmost_langs['deu'] > 0.3
    if leftmost.width_ratio < 0.12 and leftmost.x < left_limit and not leftmost_has_language:
        regions.append(_region('page_ref', leftmost, 0.8))
        untyped.remove(0)

    # Rule 2: narrow columns with few words → marker
    marker_ids = [idx for idx in untyped
                  if geometries[idx].width_ratio < 0.06 and geometries[idx].word_count <= 15]
    regions.extend(_region('column_marker', geometries[idx], 0.7) for idx in marker_ids)
    untyped = [idx for idx in untyped if idx not in marker_ids]

    # Rule 3: with 3+ columns left, the rightmost one holds examples
    if len(untyped) >= 3:
        regions.append(_region('column_example', geometries[untyped.pop()], 0.7))

    # Rule 4: next two → EN/DE, swapped if the language scores disagree
    if len(untyped) >= 2:
        first, second, *untyped = untyped
        first_langs = lang_scores[first]
        second_langs = lang_scores[second]

        # Default: first=EN, second=DE (old behavior)
        en_idx, de_idx = first, second
        pair_conf = 0.7

        if first_langs['deu'] > first_langs['eng'] and second_langs['eng'] > second_langs['deu']:
            en_idx, de_idx = second, first
            pair_conf = 0.85
            logger.info("ClassifyColumns: Level 2 swapped EN/DE based on language scores")

        regions.append(_region('column_en', geometries[en_idx], pair_conf))
        regions.append(_region('column_de', geometries[de_idx], pair_conf))
    elif len(untyped) == 1:
        # A single leftover column is assumed to be EN.
        regions.append(_region('column_en', geometries[untyped[0]], 0.5))
        untyped = []

    # Anything still untyped → example
    regions.extend(_region('column_example', geometries[idx], 0.5) for idx in untyped)

    regions.sort(key=lambda r: r.x)
    return regions
|
||
|
||
|
||
def _classify_by_position_fallback(geometries: List[ColumnGeometry],
                                   content_w: int,
                                   content_h: int) -> List[PageRegion]:
    """Level 3: purely positional column typing (the legacy rule set).

    No content scores are consulted; every region is emitted with
    confidence 1.0 and method 'position_fallback'.

    Args:
        geometries: Detected columns; the first one is treated as leftmost.
        content_w: Content-area width; drives the "leftmost 20%" test.
        content_h: Height given to every region emitted here.

    Returns:
        Regions sorted by x.
    """
    def _region(kind, col):
        return PageRegion(
            type=kind, x=col.x, y=col.y,
            width=col.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        )

    regions = []
    untyped = list(range(len(geometries)))

    # Rule 1: leftmost narrow column → page_ref (only inside the left 20%)
    leftmost = geometries[0]
    left_limit = leftmost.x + content_w * 0.20
    if leftmost.width_ratio < 0.12 and leftmost.x < left_limit:
        regions.append(_region('page_ref', leftmost))
        untyped.remove(0)

    # Rule 2: narrow + few words → marker
    marker_ids = [idx for idx in untyped
                  if geometries[idx].width_ratio < 0.06 and geometries[idx].word_count <= 15]
    regions.extend(_region('column_marker', geometries[idx]) for idx in marker_ids)
    untyped = [idx for idx in untyped if idx not in marker_ids]

    # Rule 3: with 3+ columns left, the rightmost one holds examples
    if len(untyped) >= 3:
        regions.append(_region('column_example', geometries[untyped.pop()]))

    # Rule 4: first leftover → EN, second → DE
    if len(untyped) >= 2:
        en_idx, de_idx, *untyped = untyped
        regions.append(_region('column_en', geometries[en_idx]))
        regions.append(_region('column_de', geometries[de_idx]))
    elif len(untyped) == 1:
        regions.append(_region('column_en', geometries[untyped[0]]))
        untyped = []

    # Anything still untyped → example
    regions.extend(_region('column_example', geometries[idx]) for idx in untyped)

    regions.sort(key=lambda r: r.x)
    return regions
|
||
|
||
|
||
def _detect_header_footer_gaps(
|
||
inv: np.ndarray,
|
||
img_w: int,
|
||
img_h: int,
|
||
) -> Tuple[Optional[int], Optional[int]]:
|
||
"""Detect header/footer boundaries via horizontal projection gap analysis.
|
||
|
||
Scans the full-page inverted image for large horizontal gaps in the top/bottom
|
||
20% that separate header/footer content from the main body.
|
||
|
||
Returns:
|
||
(header_y, footer_y) — absolute y-coordinates.
|
||
header_y = bottom edge of header region (None if no header detected).
|
||
footer_y = top edge of footer region (None if no footer detected).
|
||
"""
|
||
HEADER_FOOTER_ZONE = 0.20
|
||
GAP_MULTIPLIER = 2.0
|
||
|
||
# Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
|
||
actual_h = min(inv.shape[0], img_h)
|
||
roi = inv[:actual_h, :]
|
||
h_proj = np.sum(roi, axis=1).astype(float)
|
||
proj_w = roi.shape[1]
|
||
h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj
|
||
|
||
# Step 2: Smoothing
|
||
kernel_size = max(3, actual_h // 200)
|
||
if kernel_size % 2 == 0:
|
||
kernel_size += 1
|
||
h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
|
||
|
||
# Step 3: Gap threshold
|
||
positive = h_smooth[h_smooth > 0]
|
||
median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
|
||
gap_threshold = max(median_density * 0.15, 0.003)
|
||
|
||
in_gap = h_smooth < gap_threshold
|
||
MIN_GAP_HEIGHT = max(3, actual_h // 500)
|
||
|
||
# Step 4: Collect contiguous gaps
|
||
raw_gaps: List[Tuple[int, int]] = []
|
||
gap_start: Optional[int] = None
|
||
for y in range(len(in_gap)):
|
||
if in_gap[y]:
|
||
if gap_start is None:
|
||
gap_start = y
|
||
else:
|
||
if gap_start is not None:
|
||
gap_height = y - gap_start
|
||
if gap_height >= MIN_GAP_HEIGHT:
|
||
raw_gaps.append((gap_start, y))
|
||
gap_start = None
|
||
if gap_start is not None:
|
||
gap_height = len(in_gap) - gap_start
|
||
if gap_height >= MIN_GAP_HEIGHT:
|
||
raw_gaps.append((gap_start, len(in_gap)))
|
||
|
||
if not raw_gaps:
|
||
return None, None
|
||
|
||
# Step 5: Compute median gap size and large-gap threshold
|
||
gap_sizes = [g[1] - g[0] for g in raw_gaps]
|
||
median_gap = float(np.median(gap_sizes))
|
||
large_gap_threshold = median_gap * GAP_MULTIPLIER
|
||
|
||
# Step 6: Find largest qualifying gap in header / footer zones
|
||
# A separator gap must have content on BOTH sides — edge-touching gaps
|
||
# (e.g. dewarp padding at bottom) are not valid separators.
|
||
EDGE_MARGIN = max(5, actual_h // 400)
|
||
header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
|
||
footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
|
||
|
||
header_y: Optional[int] = None
|
||
footer_y: Optional[int] = None
|
||
|
||
best_header_size = 0
|
||
for gs, ge in raw_gaps:
|
||
if gs <= EDGE_MARGIN:
|
||
continue # skip gaps touching the top edge
|
||
gap_mid = (gs + ge) / 2
|
||
gap_size = ge - gs
|
||
if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
|
||
if gap_size > best_header_size:
|
||
best_header_size = gap_size
|
||
header_y = ge # bottom edge of gap
|
||
|
||
best_footer_size = 0
|
||
for gs, ge in raw_gaps:
|
||
if ge >= actual_h - EDGE_MARGIN:
|
||
continue # skip gaps touching the bottom edge
|
||
gap_mid = (gs + ge) / 2
|
||
gap_size = ge - gs
|
||
if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
|
||
if gap_size > best_footer_size:
|
||
best_footer_size = gap_size
|
||
footer_y = gs # top edge of gap
|
||
|
||
if header_y is not None:
|
||
logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
|
||
f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
|
||
if footer_y is not None:
|
||
logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
|
||
f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")
|
||
|
||
return header_y, footer_y
|
||
|
||
|
||
def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
|
||
min_density: float = 0.005) -> bool:
|
||
"""Check whether a horizontal strip contains meaningful ink.
|
||
|
||
Args:
|
||
inv: Inverted binarized image (white-on-black).
|
||
y_start: Top of the region (inclusive).
|
||
y_end: Bottom of the region (exclusive).
|
||
min_density: Fraction of white pixels required to count as content.
|
||
|
||
Returns:
|
||
True if the region contains text/graphics, False if empty margin.
|
||
"""
|
||
if y_start >= y_end:
|
||
return False
|
||
strip = inv[y_start:y_end, :]
|
||
density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
|
||
return density > min_density
|
||
|
||
|
||
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                       img_w: int, img_h: int,
                       inv: Optional[np.ndarray] = None) -> None:
    """Append the top and bottom page regions to *regions* in place.

    With *inv* given, boundaries come from gap-based detection; otherwise
    (or when detection finds nothing usable) the plain top_y/bottom_y
    content bounds are used.  A boundary closer than 10px to the page edge
    is ignored.

    The region type reflects whether the strip holds ink:
    - 'header' / 'footer' — strip contains text (e.g. title, page number)
    - 'margin_top' / 'margin_bottom' — strip is empty page margin
    Without *inv* the strip is always reported as a margin.
    """
    gap_header_y: Optional[int] = None
    gap_footer_y: Optional[int] = None
    if inv is not None:
        gap_header_y, gap_footer_y = _detect_header_footer_gaps(inv, img_w, img_h)

    # --- Top region ---
    if gap_header_y is not None and gap_header_y > 10:
        top_boundary = gap_header_y
    elif top_y > 10:
        top_boundary = top_y
    else:
        top_boundary = None

    if top_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
        rtype = 'header' if has_content else 'margin_top'
        regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
                    f"(has_content={has_content})")

    # --- Bottom region ---
    if gap_footer_y is not None and gap_footer_y < img_h - 10:
        bottom_boundary = gap_footer_y
    elif bottom_y < img_h - 10:
        bottom_boundary = bottom_y
    else:
        bottom_boundary = None

    if bottom_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
        rtype = 'footer' if has_content else 'margin_bottom'
        regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
                                  height=img_h - bottom_boundary))
        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
                    f"height={img_h - bottom_boundary}px (has_content={has_content})")
|
||
|
||
|
||
# --- Main Entry Point ---
|
||
|
||
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect and type page columns in two phases.

    Phase A: detect_column_geometry() finds the column geometry.
    Phase B: classify_column_types() assigns a type to each column.

    If Phase A yields nothing, the projection-based analyze_layout() is
    used instead.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        List of PageRegion objects with types, confidence, and method.
    """
    page_h, page_w = ocr_img.shape[:2]

    # Phase A: Geometry detection
    geometry = detect_column_geometry(ocr_img, dewarped_bgr)
    if geometry is None:
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        return analyze_layout(create_layout_image(dewarped_bgr), ocr_img)

    columns, left_x, right_x, top_y, bottom_y, _word_dicts, inverted = geometry
    content_w = right_x - left_x

    # Header/footer are located first so sub-column splitting can skip them
    if inverted is not None:
        header_y, footer_y = _detect_header_footer_gaps(inverted, page_w, page_h)
    else:
        header_y, footer_y = None, None

    # Split sub-columns (e.g. page references) before classification
    columns = _detect_sub_columns(columns, content_w, left_x=left_x,
                                  top_y=top_y, header_y=header_y, footer_y=footer_y)

    # Phase B: Content-based classification
    regions = classify_column_types(columns, content_w, top_y, page_w, page_h, bottom_y,
                                    left_x=left_x, right_x=right_x, inv=inverted)

    col_count = sum(1 for r in regions
                    if r.type.startswith('column') or r.type == 'page_ref')
    methods = {r.classification_method for r in regions if r.classification_method}
    summary = [(r.type, r.x, r.width, r.classification_confidence)
               for r in regions
               if r.type not in ('header', 'footer', 'margin_top', 'margin_bottom')]
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{summary}")

    return regions
|
||
|
||
|
||
# =============================================================================
|
||
# Pipeline Step 5: Word Grid from Columns × Rows
|
||
# =============================================================================
|
||
|
||
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
|
||
"""Group OCR words into visual lines in reading order.
|
||
|
||
Returns a list of line strings (one per visual line in the cell).
|
||
"""
|
||
if not words:
|
||
return []
|
||
|
||
lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
|
||
return [' '.join(w['text'] for w in line) for line in lines]
|
||
|
||
|
||
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
|
||
"""Rejoin words split by line-break hyphenation.
|
||
|
||
E.g. ['Fuß-', 'boden'] → ['Fußboden']
|
||
['some text-', 'thing here'] → ['something here']
|
||
"""
|
||
if len(lines) <= 1:
|
||
return lines
|
||
|
||
result = []
|
||
i = 0
|
||
while i < len(lines):
|
||
line = lines[i]
|
||
# If line ends with '-' and there's a next line, rejoin
|
||
if i + 1 < len(lines) and line.rstrip().endswith('-'):
|
||
stripped = line.rstrip()
|
||
# Get the word fragment before hyphen (last word)
|
||
prefix = stripped[:-1] # remove trailing hyphen
|
||
next_line = lines[i + 1]
|
||
# Join: last word of this line + first word of next line
|
||
prefix_words = prefix.rsplit(' ', 1)
|
||
next_words = next_line.split(' ', 1)
|
||
if len(prefix_words) > 1:
|
||
joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
|
||
else:
|
||
joined = prefix_words[0] + next_words[0]
|
||
remainder = next_words[1] if len(next_words) > 1 else ''
|
||
if remainder:
|
||
result.append(joined + ' ' + remainder)
|
||
else:
|
||
result.append(joined)
|
||
i += 2
|
||
else:
|
||
result.append(line)
|
||
i += 1
|
||
return result
|
||
|
||
|
||
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Render OCR words as newline-separated text in reading order.

    Builds the visual lines with _words_to_reading_order_lines(), merges
    line-break hyphenation with _rejoin_hyphenated(), and joins the result
    with newlines.
    """
    visual_lines = _words_to_reading_order_lines(words, y_tolerance_px)
    return '\n'.join(_rejoin_hyphenated(visual_lines))
|
||
|
||
|
||
# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
|
||
|
||
# Engine singleton; stays None until _get_rapid_engine() first builds it.
_rapid_engine = None
RAPIDOCR_AVAILABLE = False

try:
    from rapidocr import (
        RapidOCR as _RapidOCRClass,
        LangRec as _LangRec,
        OCRVersion as _OCRVersion,
        ModelType as _ModelType,
    )
except ImportError:
    # Optional dependency: the pipeline keeps working with Tesseract alone.
    logger.info("RapidOCR not installed — using Tesseract only")
else:
    RAPIDOCR_AVAILABLE = True
    logger.info("RapidOCR available — can be used as alternative to Tesseract")
|
||
|
||
|
||
def _get_rapid_engine():
    """Return the shared RapidOCR engine, building it on first use.

    The engine is configured for the PP-OCRv5 Latin recognition model
    (server variant) and cached in the module-level ``_rapid_engine``.
    """
    global _rapid_engine
    if _rapid_engine is not None:
        return _rapid_engine

    engine_params = {
        # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß)
        "Rec.lang_type": _LangRec.LATIN,
        "Rec.model_type": _ModelType.SERVER,
        "Rec.ocr_version": _OCRVersion.PPOCRV5,
        # Tighter detection boxes to reduce word merging
        "Det.unclip_ratio": 1.3,
        "Det.box_thresh": 0.6,
        # Silence verbose logging
        "Global.log_level": "critical",
    }
    _rapid_engine = _RapidOCRClass(params=engine_params)
    logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
|
||
|
||
|
||
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """OCR one page region with RapidOCR and return Tesseract-style dicts.

    Args:
        img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on color/gray).
        region: Region to crop and OCR.

    Returns:
        One dict per detected text box with the keys text, left, top,
        width, height, conf and region_type.  left/top are absolute page
        coordinates; conf is scaled to 0-100.
    """
    engine = _get_rapid_engine()

    crop = img_bgr[region.y:region.y + region.height,
                   region.x:region.x + region.width]
    if crop.size == 0:
        return []

    result = engine(crop)
    if result is None or result.boxes is None or result.txts is None:
        return []

    words = []
    # boxes: 4 corner points per detection; txts/scores: parallel sequences
    for box, txt, score in zip(result.boxes, result.txts, result.scores):
        cleaned = txt.strip() if txt else ''
        if not cleaned:
            continue

        # Axis-aligned bounding box around the four corner points
        xs = [corner[0] for corner in box]
        ys = [corner[1] for corner in box]
        left = int(min(xs))
        top = int(min(ys))

        words.append({
            'text': cleaned,
            'left': left + region.x,  # crop-relative → absolute
            'top': top + region.y,
            'width': int(max(xs) - left),
            'height': int(max(ys) - top),
            'conf': int(score * 100),  # 0-100 like Tesseract
            'region_type': region.type,
        })

    return words
|
||
|
||
|
||
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """OCR one page region with TrOCR; returns line-level dicts.

    The crop is split into line images with the TrOCR service's
    _split_into_lines() and each line is decoded separately.  Line boxes
    are approximated: the region height is divided evenly among the
    recognized lines.  Output format matches ocr_region_rapid().

    Falls back to Tesseract (ocr_region) when TrOCR or its model is not
    available.  Any exception during recognition is logged and results in
    an empty list.
    """
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available

    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        region_is_valid = region.height > 0 and region.width > 0
        if region_is_valid and img_bgr is not None:
            gray_page = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(gray_page, region, lang="eng+deu", psm=6)
        return []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            gray_page = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(gray_page, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        # If line splitting yields nothing, treat the whole crop as one line
        line_images = _split_into_lines(pil_crop) or [pil_crop]

        device = next(model.parameters()).device
        recognized = []
        confidences = []
        for line_img in line_images:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if not decoded:
                continue
            recognized.append(decoded)
            # Fixed heuristic confidence: very short outputs rate lower
            confidences.append(0.85 if len(decoded) > 3 else 0.5)

        if not recognized:
            return []

        avg_conf = int(sum(confidences) / len(confidences) * 100)
        line_h = region.height // max(len(recognized), 1)
        return [
            {
                "text": text_line,
                "left": region.x,
                "top": region.y + row * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            }
            for row, text_line in enumerate(recognized)
        ]

    except Exception as e:
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
|
||
|
||
|
||
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """OCR one page region with LightOnOCR-2-1B; returns line-level dicts.

    The model output is split on newlines and the region height is divided
    evenly among the non-empty lines to approximate their boxes.  Output
    format matches ocr_region_rapid(); every line gets conf 85.

    Falls back to RapidOCR (if installed) or Tesseract when LightOnOCR or
    its model is not available.  Any exception during recognition is
    logged and results in an empty list.
    """
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if img_bgr is None:
            return []
        if RAPIDOCR_AVAILABLE:
            return ocr_region_rapid(img_bgr, region)
        gray_page = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        return ocr_region(gray_page, region, lang="eng+deu", psm=6)

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            gray_page = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(gray_page, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        # Image-only user turn; the chat template supplies the prompt
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        text_lines = [piece.strip() for piece in text.split("\n") if piece.strip()]
        line_h = region.height // max(len(text_lines), 1)
        return [
            {
                "text": text_line,
                "left": region.x,
                "top": region.y + row * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,
                "region_type": region.type,
            }
            for row, text_line in enumerate(text_lines)
        ]

    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
|
||
|
||
|
||
# =============================================================================
|
||
# Post-Processing: Deterministic Quality Fixes
|
||
# =============================================================================
|
||
|
||
# --- A. Character Confusion Fix (I/1/l) ---
|
||
|
||
# Common OCR confusion pairs in vocabulary context
|
||
_CHAR_CONFUSION_RULES = [
|
||
# "1" at word start followed by lowercase → likely "I" or "l"
|
||
# Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
|
||
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
|
||
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
|
||
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
|
||
# "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
|
||
(re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
|
||
]
|
||
|
||
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
|
||
_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
|
||
|
||
|
||
def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Repair typical I/1/| OCR confusions in vocabulary entries, in place.

    For every entry the 'english', 'german' and 'example' fields are read
    (missing or None values count as ''), corrected and written back
    stripped:

    1. Every rule in _CHAR_CONFUSION_RULES is applied to all three fields.
    2. If the German text contains a first-person word from
       _DE_INDICATORS_FOR_EN_I, any standalone "1" still present in the
       English text becomes "I".
    3. A standalone "y" followed by whitespace and a lowercase letter is
       dropped from the English and example text ("y you" → "you").

    Args:
        entries: Vocabulary entry dicts; they are mutated.

    Returns:
        The same list object that was passed in.
    """
    field_names = ('english', 'german', 'example')

    for entry in entries:
        texts = {name: entry.get(name, '') or '' for name in field_names}

        # The rules are independent per field, so each field can be run
        # through the whole rule list on its own.
        for name in field_names:
            for pattern, replacement in _CHAR_CONFUSION_RULES:
                texts[name] = pattern.sub(replacement, texts[name])

        # Cross-language hint: commas are treated as word separators so
        # that "ich, mich" still yields the individual indicator words.
        german_words = set(texts['german'].lower().replace(',', ' ').split())
        if not _DE_INDICATORS_FOR_EN_I.isdisjoint(german_words):
            texts['english'] = re.sub(r'\b1\b(?![\d.,])', 'I', texts['english'])

        # Stray "y " artifact in front of a lowercase word.
        for name in ('english', 'example'):
            texts[name] = re.sub(r'\by\s+([a-z])', r'\1', texts[name])

        for name in field_names:
            entry[name] = texts[name].strip()

    return entries
|
||
|
||
|
||
# --- B. Comma-Separated Word Form Splitting ---
|
||
|
||
def _is_singular_plural_pair(parts: List[str]) -> bool:
|
||
"""Detect if comma-separated parts are singular/plural forms of the same word.
|
||
|
||
E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
|
||
"break, broke, broken" → False (different verb forms, OK to split).
|
||
|
||
Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
|
||
OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
|
||
"""
|
||
if len(parts) != 2:
|
||
return False
|
||
|
||
a, b = parts[0].lower().strip(), parts[1].lower().strip()
|
||
if not a or not b:
|
||
return False
|
||
|
||
# Common prefix heuristic: if words share >= 50% of the shorter word,
|
||
# they are likely forms of the same word (Maus/Mäuse, child/children).
|
||
min_len = min(len(a), len(b))
|
||
common = 0
|
||
for ca, cb in zip(a, b):
|
||
if ca == cb:
|
||
common += 1
|
||
else:
|
||
break
|
||
if common >= max(2, min_len * 0.5):
|
||
return True
|
||
|
||
# Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
|
||
umlaut_map = str.maketrans('aou', 'äöü')
|
||
if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
|
||
return True
|
||
|
||
return False
|
||
|
||
|
||
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Expand rows that list several word forms into one entry per form.

    Example: EN "break, broke, broken" / DE "brechen, brach, gebrochen"
    becomes three entries: break/brechen, broke/brach, broken/gebrochen.

    A row is split only when all of the following hold:
    - EN and DE both have more than one comma-part and the same number
      of parts (commas inside [...] or (...) do not count),
    - every part is at most 3 words long (word forms, not sentences),
    - neither side is recognised as a singular/plural pair by
      _is_singular_plural_pair (those stay together as one entry).

    Split entries are shallow copies of the source row with 'english' and
    'german' replaced, 'example' cleared and 'split_from_comma' set to
    True. Finally 'row_index' is rewritten on every entry of the result,
    which also mutates the unsplit input dicts.

    Args:
        entries: Vocabulary entry dicts.

    Returns:
        A new list containing unsplit entries (same objects) and the
        copies produced by splitting, in original order.
    """
    expanded: List[Dict[str, Any]] = []

    for entry in entries:
        en_forms = _split_by_comma((entry.get('english', '') or '').strip())
        de_forms = _split_by_comma((entry.get('german', '') or '').strip())

        splittable = (
            len(en_forms) > 1
            and len(en_forms) == len(de_forms)
            and all(len(form.split()) <= 3 for form in en_forms)
            and all(len(form.split()) <= 3 for form in de_forms)
            and not _is_singular_plural_pair(en_forms)
            and not _is_singular_plural_pair(de_forms)
        )
        if not splittable:
            expanded.append(entry)
            continue

        for en_form, de_form in zip(en_forms, de_forms):
            piece = dict(entry)  # shallow copy: nested values stay shared
            piece['english'] = en_form.strip()
            piece['german'] = de_form.strip()
            piece['example'] = ''  # examples get attached later
            piece['split_from_comma'] = True
            expanded.append(piece)

    # Re-number so row_index is contiguous again after the expansion.
    for position, item in enumerate(expanded):
        item['row_index'] = position

    return expanded
|
||
|
||
|
||
def _split_by_comma(text: str) -> List[str]:
|
||
"""Split text by commas, but not inside brackets [...] or parens (...)."""
|
||
if ',' not in text:
|
||
return [text]
|
||
|
||
parts = []
|
||
depth_bracket = 0
|
||
depth_paren = 0
|
||
current = []
|
||
|
||
for ch in text:
|
||
if ch == '[':
|
||
depth_bracket += 1
|
||
elif ch == ']':
|
||
depth_bracket = max(0, depth_bracket - 1)
|
||
elif ch == '(':
|
||
depth_paren += 1
|
||
elif ch == ')':
|
||
depth_paren = max(0, depth_paren - 1)
|
||
elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
|
||
parts.append(''.join(current).strip())
|
||
current = []
|
||
continue
|
||
current.append(ch)
|
||
|
||
if current:
|
||
parts.append(''.join(current).strip())
|
||
|
||
# Filter empty parts
|
||
return [p for p in parts if p]
|
||
|
||
|
||
# --- C. Example Sentence Attachment ---
|
||
|
||
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
|
||
"""Find the vocab entry whose English word(s) best match the example sentence.
|
||
|
||
Returns index into vocab_entries, or -1 if no match found.
|
||
Uses word stem overlap: "a broken arm" matches "broken" or "break".
|
||
"""
|
||
if not vocab_entries or not example_text:
|
||
return -1
|
||
|
||
example_lower = example_text.lower()
|
||
example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
|
||
|
||
best_idx = -1
|
||
best_score = 0
|
||
|
||
for i, entry in enumerate(vocab_entries):
|
||
en = (entry.get('english', '') or '').lower()
|
||
if not en:
|
||
continue
|
||
|
||
# Extract vocab words (split on space, comma, newline)
|
||
vocab_words = set(re.findall(r'[a-zäöüß]+', en))
|
||
|
||
# Score: how many vocab words appear in the example?
|
||
# Also check if example words share a common stem (first 4 chars)
|
||
direct_matches = vocab_words & example_words
|
||
score = len(direct_matches) * 10
|
||
|
||
# Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
|
||
if score == 0:
|
||
for vw in vocab_words:
|
||
if len(vw) < 3:
|
||
continue
|
||
stem = vw[:4] if len(vw) >= 4 else vw[:3]
|
||
for ew in example_words:
|
||
if len(ew) >= len(stem) and ew[:len(stem)] == stem:
|
||
score += 5
|
||
break
|
||
|
||
if score > best_score:
|
||
best_score = score
|
||
best_idx = i
|
||
|
||
return best_idx if best_score > 0 else -1
|
||
|
||
|
||
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fold example-sentence rows into the vocab entries they illustrate.

    Vocabulary worksheets often look like this:
        Row 1: break, broke, broken / brechen, brach, gebrochen
        Row 2: He has a broken arm.   (no DE → example for "broken")
        Row 3: egg / Ei               (has DE → new vocab entry)

    A row is treated as an example sentence when ALL of these hold:
    - it has English text,
    - its German text is empty or a single character (treated as OCR
      noise; "Ei" with 2 characters is a valid word and counts as DE),
    - the English text looks like a sentence: at least 4 words, or it
      ends with '.', '!' or '?',
    - at least one vocab entry precedes it.

    Every other row is kept as a vocab entry; in particular a short
    English phrase without German is kept, because its translation was
    more likely missed by OCR.

    Each example is attached to the entry chosen by
    _find_best_vocab_match, falling back to the most recent vocab entry.
    Examples are appended to the entry's 'example' field, joined with
    " | ". Finally 'row_index' is rewritten on the kept entries.

    Args:
        entries: Entry dicts in reading order; kept entries are mutated.

    Returns:
        The list of vocab entries (example rows removed). An empty input
        is returned as-is.
    """
    if not entries:
        return entries

    vocab_entries: List[Dict[str, Any]] = []
    examples_for: Dict[int, List[str]] = {}  # vocab index → example texts

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()

        has_de = len(de) > 1
        looks_like_sentence = (
            len(en.split()) >= 4 or en.endswith(('.', '!', '?'))
        )
        is_example = bool(en) and not has_de and looks_like_sentence and bool(vocab_entries)

        if not is_example:
            vocab_entries.append(entry)
            continue

        match_idx = _find_best_vocab_match(en, vocab_entries)
        if match_idx < 0:
            # No word overlap → attach to the nearest preceding entry.
            match_idx = len(vocab_entries) - 1
        examples_for.setdefault(match_idx, []).append(en)

    # Indices stay valid because vocab_entries only ever grows.
    for idx, example_list in examples_for.items():
        target = vocab_entries[idx]
        existing = (target.get('example', '') or '').strip()
        joined = ' | '.join(example_list)
        target['example'] = f"{existing} | {joined}" if existing else joined

    # Re-number so row_index is contiguous after removing example rows.
    for position, kept in enumerate(vocab_entries):
        kept['row_index'] = position

    return vocab_entries
|
||
|
||
|
||
# --- D. Phonetic Bracket IPA Replacement ---
|
||
|
||
# Pattern: word [phonetic] or word (phonetic) — capture the word before brackets
|
||
_PHONETIC_BRACKET_RE = re.compile(
|
||
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*\[([^\]]*)\]'
|
||
)
|
||
|
||
|
||
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up the IPA transcription of an English word.

    Two module-level sources are consulted, each only if it is available
    (truthy): the `_britfone_dict` lookup table and the
    `_ipa_convert_american` converter. The American source is tried first
    only when pronunciation is 'american' and the converter is available;
    in every other case the Britfone table is tried first. Whichever
    source is not first serves as the fallback.

    A converter result containing '*' is treated as "not found"
    (presumably the converter's marker for unknown words — confirm
    against the library).

    Args:
        word: English word to look up (case-insensitive, stripped).
        pronunciation: 'british' or 'american'; other values behave like
            'british'.

    Returns:
        IPA string, or None if the word is empty or no source knows it.
    """
    key = word.lower().strip()
    if not key:
        return None

    def _from_britfone() -> Optional[str]:
        if not _britfone_dict:
            return None
        return _britfone_dict.get(key)

    def _from_american() -> Optional[str]:
        if not _ipa_convert_american:
            return None
        converted = _ipa_convert_american(key)
        if converted and '*' not in converted:
            return converted
        return None

    american_first = pronunciation == 'american' and bool(_ipa_convert_american)
    if american_first:
        sources = (_from_american, _from_britfone)
    else:
        sources = (_from_britfone, _from_american)

    for source in sources:
        ipa = source()
        if ipa:
            return ipa

    return None
|
||
|
||
|
||
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Swap OCR'd phonetic transcriptions for dictionary IPA, in place.

    The 'english', 'german' and 'example' fields of every entry are
    checked; a field is rewritten via _replace_phonetics_in_text only if
    it contains a '['. Fields without brackets are left untouched.

    Does nothing when IPA_AVAILABLE is false.

    Args:
        entries: Vocabulary entry dicts; they are mutated.
        pronunciation: Passed through to the IPA lookup
            ('british' or 'american').

    Returns:
        The same list object that was passed in.
    """
    if not IPA_AVAILABLE:
        return entries

    for entry in entries:
        for key in ('english', 'german', 'example'):
            value = entry.get(key, '') or ''
            if '[' in value:
                entry[key] = _replace_phonetics_in_text(value, pronunciation)

    return entries
|
||
|
||
|
||
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
    """Replace every "word [phonetic]" in *text* with "word [IPA]".

    For each match of _PHONETIC_BRACKET_RE the word before the brackets
    is looked up with _lookup_ipa. The match is left unchanged when the
    bracket content has more than 3 whitespace-separated tokens (it is
    then assumed to be ordinary text) or when no IPA is found. A
    successful replacement always uses a single space between word and
    bracket.

    Returns *text* unchanged when IPA_AVAILABLE is false.
    """
    if not IPA_AVAILABLE:
        return text

    def _swap(match):
        headword = match.group(1)
        bracket_content = match.group(2)

        if len(bracket_content.split()) > 3:
            return match.group(0)

        ipa = _lookup_ipa(headword, pronunciation)
        return f"{headword} [{ipa}]" if ipa else match.group(0)

    return _PHONETIC_BRACKET_RE.sub(_swap, text)
|
||
|
||
|
||
def _assign_row_words_to_columns(
|
||
row: RowGeometry,
|
||
columns: List[PageRegion],
|
||
) -> Dict[int, List[Dict]]:
|
||
"""Assign each word in a row to exactly one column.
|
||
|
||
Uses a two-pass strategy:
|
||
1. Containment: if a word's center falls within a column's horizontal
|
||
bounds (with padding), assign it to that column.
|
||
2. Nearest center: for words not contained by any column, fall back to
|
||
nearest column center distance.
|
||
|
||
This prevents long sentences in wide columns (e.g. example) from having
|
||
their rightmost words stolen by an adjacent column.
|
||
|
||
Args:
|
||
row: Row with words (relative coordinates).
|
||
columns: Sorted list of columns (absolute coordinates).
|
||
|
||
Returns:
|
||
Dict mapping col_index → list of words assigned to that column.
|
||
"""
|
||
result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))}
|
||
|
||
if not row.words or not columns:
|
||
return result
|
||
|
||
left_x = row.x # content ROI left (absolute)
|
||
|
||
# Build non-overlapping column assignment ranges using midpoints.
|
||
# For adjacent columns, the boundary is the midpoint between them.
|
||
# This prevents words near column borders from being assigned to
|
||
# the wrong column (e.g. "We" at the start of an example sentence
|
||
# being stolen by the preceding DE column).
|
||
n = len(columns)
|
||
col_ranges_rel = [] # (assign_left, assign_right) per column
|
||
for ci, col in enumerate(columns):
|
||
col_left_rel = col.x - left_x
|
||
col_right_rel = col_left_rel + col.width
|
||
|
||
# Left boundary: midpoint to previous column, or 0
|
||
if ci == 0:
|
||
assign_left = 0
|
||
else:
|
||
prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
|
||
assign_left = (prev_right + col_left_rel) / 2
|
||
|
||
# Right boundary: midpoint to next column, or infinity (row width)
|
||
if ci == n - 1:
|
||
assign_right = row.width + 100 # generous for last column
|
||
else:
|
||
next_left = columns[ci + 1].x - left_x
|
||
assign_right = (col_right_rel + next_left) / 2
|
||
|
||
col_ranges_rel.append((assign_left, assign_right))
|
||
|
||
for w in row.words:
|
||
w_left = w['left']
|
||
w_right = w_left + w['width']
|
||
w_center_x = w_left + w['width'] / 2
|
||
|
||
# Primary: overlap-based matching — assign to column with most overlap.
|
||
# This is more robust than center-based for narrow columns (page_ref)
|
||
# where the last character's center may fall into the next column.
|
||
best_col = -1
|
||
best_overlap = 0
|
||
for ci, col in enumerate(columns):
|
||
col_left_rel = col.x - left_x
|
||
col_right_rel = col_left_rel + col.width
|
||
overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
|
||
if overlap > best_overlap:
|
||
best_overlap = overlap
|
||
best_col = ci
|
||
|
||
if best_col >= 0 and best_overlap > 0:
|
||
result[best_col].append(w)
|
||
else:
|
||
# Fallback: center-based range matching
|
||
assigned = False
|
||
for ci, (al, ar) in enumerate(col_ranges_rel):
|
||
if al <= w_center_x < ar:
|
||
result[ci].append(w)
|
||
assigned = True
|
||
break
|
||
|
||
if not assigned:
|
||
# Last resort: nearest column center
|
||
best_col = 0
|
||
col_left_0 = columns[0].x - left_x
|
||
best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
|
||
for ci in range(1, n):
|
||
col_left = columns[ci].x - left_x
|
||
dist = abs(w_center_x - (col_left + columns[ci].width / 2))
|
||
if dist < best_dist:
|
||
best_dist = dist
|
||
best_col = ci
|
||
result[best_col].append(w)
|
||
|
||
return result
|
||
|
||
|
||
# Regex: at least 2 consecutive letters (Latin + umlauts + accents)
|
||
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
|
||
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')
|
||
|
||
# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
|
||
# that do NOT appear here are treated as trailing OCR noise.
|
||
_COMMON_SHORT_WORDS: set = {
|
||
# EN 1-2 letter
|
||
'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
|
||
'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
|
||
'or', 'so', 'to', 'up', 'us', 'we',
|
||
# EN 3 letter
|
||
'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
|
||
'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
|
||
'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
|
||
'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
|
||
'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
|
||
'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
|
||
'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
|
||
'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
|
||
'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
|
||
'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
|
||
'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
|
||
'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
|
||
'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
|
||
'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
|
||
'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
|
||
'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
|
||
'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
|
||
'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
|
||
'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
|
||
'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
|
||
'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
|
||
'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
|
||
'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
|
||
'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
|
||
'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
|
||
'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
|
||
'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
|
||
'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
|
||
'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
|
||
'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
|
||
'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
|
||
'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
|
||
'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
|
||
'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
|
||
'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
|
||
'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
|
||
'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
|
||
'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
|
||
'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
|
||
'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
|
||
'zap', 'zip', 'zoo',
|
||
# DE 2-3 letter
|
||
'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
|
||
'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
|
||
'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
|
||
'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
|
||
'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
|
||
'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
|
||
'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
|
||
'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
|
||
'wut', 'zum', 'zur',
|
||
}
|
||
|
||
# Known abbreviations found in EN/DE textbooks and dictionaries.
|
||
# Stored WITHOUT trailing period (the noise filter strips periods).
|
||
# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
|
||
_KNOWN_ABBREVIATIONS: set = {
|
||
# EN dictionary meta-words
|
||
'sth', 'sb', 'smth', 'smb', 'sbd',
|
||
# EN general
|
||
'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
|
||
'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
|
||
# EN references / textbook
|
||
'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
|
||
'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
|
||
'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
|
||
'ans', 'wb', 'tb', 'vocab',
|
||
# EN parts of speech / grammar
|
||
'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
|
||
'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
|
||
'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
|
||
'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
|
||
'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
|
||
'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
|
||
'syn', 'ant', 'opp', 'var', 'orig',
|
||
# EN titles
|
||
'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
|
||
# EN pronunciation
|
||
'br', 'am', 'brit', 'amer',
|
||
# EN units
|
||
'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
|
||
# DE general
|
||
'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
|
||
'bes', 'insb', 'insbes', 'bspw', 'ca',
|
||
'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
|
||
'inkl', 'exkl', 'zzgl', 'abzgl',
|
||
# DE references
|
||
'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
|
||
'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
|
||
's', 'sp', 'zit', 'zs', 'vlg',
|
||
# DE grammar
|
||
'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
|
||
'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
|
||
'trennb', 'untrennb', 'ugs', 'geh', 'pej',
|
||
# DE regional
|
||
'nordd', 'österr', 'schweiz',
|
||
# Linguistic
|
||
'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
|
||
'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
|
||
'count', 'uncount', 'indef', 'def', 'poss', 'demon',
|
||
}
|
||
|
||
|
||
def _is_noise_tail_token(token: str) -> bool:
    """Decide whether a token at the END of a cell is trailing OCR noise.

    Trailing fragments are common OCR artifacts, so this check is
    deliberately aggressive. Decision order:

    1. Empty token → noise.
    2. Ellipsis ("..." or "…") → keep.
    3. Starts with "[" or ends with "]" (phonetic brackets, also
       half-recognised ones like "serva]") → keep.
    4. No letter at all ("3", ")", "|") → noise.
    5. Letters form a known abbreviation ("sth.", "usw.") → keep.
    6. After removing dictionary punctuation (parentheses, hyphen, slash
       and . , ; : ! ?) anything that is not a letter counts as internal
       noise ("3d", "B|", "x7") → noise.
    7. Otherwise keep tokens with 4+ letters and tokens whose letters
       are listed in _COMMON_SHORT_WORDS; everything else ("ee", "B")
       is noise.

    Returns:
        True if the token should be stripped, False if it should stay.
    """
    tok = token.strip()
    if not tok:
        return True

    if tok in ('...', '…'):
        return False

    if tok.startswith('[') or tok.endswith(']'):
        return False

    letters = ''.join(_RE_ALPHA.findall(tok))
    if not letters:
        return True

    letters_lower = letters.lower()
    if letters_lower in _KNOWN_ABBREVIATIONS:
        return False

    # Drop sentence punctuation at the end first ("cupcakes." →
    # "cupcakes"); fall back to the token itself if nothing would remain.
    core = re.sub(r'[.,;:!?]+$', '', tok) or tok
    # Then remove punctuation that is normal inside dictionary entries:
    # "(Salat-)Gurke", "Tanz(veranstaltung)", "wir/uns", "e.g."
    core = re.sub(r'[()\-/.,;:!?]', '', core)

    # Whatever is left and is not a letter (digits, "|", ...) is noise.
    core_letters = ''.join(_RE_ALPHA.findall(core))
    if len(core) > len(core_letters):
        return True

    looks_real = len(letters) >= 4 or letters_lower in _COMMON_SHORT_WORDS
    return not looks_real
|
||
|
||
|
||
def _is_garbage_text(text: str) -> bool:
    """Check whether an entire cell text is OCR garbage (e.g. from images).

    The text is NOT garbage as soon as one of its words (runs of 2+
    letters) is plausible:
    - it is listed in _COMMON_SHORT_WORDS or _KNOWN_ABBREVIATIONS, or
    - it has 4+ letters and a vowel share between 15% and 65%.

    Vowels include the accented vowels that _RE_REAL_WORD accepts as
    letters, so words like "déjà" are judged on their real vowel count.
    For words without any such vowel, "y" is counted instead, so that
    "myth" or "rhythm" are not mistaken for consonant junk.

    If the text contains no word of 2+ letters at all, it is garbage
    unless its letters spell a known abbreviation (e.g. "e.g.").

    Catches garbage such as "(ci]oeu" or "uanoaain.".

    Returns:
        True if no plausible word was found.
    """
    # Plain, umlauted and accented vowels (lowercase; words are lowered).
    vowel_chars = 'aeiouäöüéèêëàâîïôûù'

    words = _RE_REAL_WORD.findall(text)
    if not words:
        alpha_only = ''.join(_RE_ALPHA.findall(text)).lower()
        return alpha_only not in _KNOWN_ABBREVIATIONS

    for word in words:
        wl = word.lower()
        if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS:
            return False

        if len(wl) >= 4:
            # "y" only acts as a vowel when the word has no other vowel;
            # counting it always would push words like "yeah" over the
            # upper bound.
            vowels = sum(ch in vowel_chars for ch in wl) or wl.count('y')
            if 0.15 <= vowels / len(wl) <= 0.65:
                return False  # plausible vowel ratio → real word

    return True
|
||
|
||
|
||
def _clean_cell_text(text: str) -> str:
    """Remove OCR noise from a cell's text.

    Three generic filters are applied in order:
    1. No word of 2+ letters and not a known abbreviation → ''.
    2. The whole text is garbage according to _is_garbage_text → ''.
    3. Noise tokens are stripped from the END of the text, one at a
       time, until a token passes _is_noise_tail_token.

    The surviving whitespace-separated tokens are re-joined with single
    spaces, so inner whitespace runs (including newlines) are collapsed.

    Returns:
        The cleaned text, or '' if nothing plausible remains.
    """
    body = text.strip()
    if not body:
        return ''

    # Filter 1: dotted abbreviations like "e.g." have no 2-letter run
    # but must survive, hence the abbreviation lookup on the bare letters.
    if not _RE_REAL_WORD.search(body):
        bare_letters = ''.join(_RE_ALPHA.findall(body)).lower()
        if bare_letters not in _KNOWN_ABBREVIATIONS:
            return ''

    # Filter 2
    if _is_garbage_text(body):
        return ''

    # Filter 3
    kept = body.split()
    while kept and _is_noise_tail_token(kept[-1]):
        kept.pop()

    return ' '.join(kept)
|
||
|
||
|
||
def _ocr_single_cell(
|
||
row_idx: int,
|
||
col_idx: int,
|
||
row: RowGeometry,
|
||
col: PageRegion,
|
||
ocr_img: np.ndarray,
|
||
img_bgr: Optional[np.ndarray],
|
||
img_w: int,
|
||
img_h: int,
|
||
use_rapid: bool,
|
||
engine_name: str,
|
||
lang: str,
|
||
lang_map: Dict[str, str],
|
||
preassigned_words: Optional[List[Dict]] = None,
|
||
) -> Dict[str, Any]:
|
||
"""Populate a single cell (column x row intersection) via word lookup."""
|
||
# Display bbox: exact column × row intersection (no padding)
|
||
disp_x = col.x
|
||
disp_y = row.y
|
||
disp_w = col.width
|
||
disp_h = row.height
|
||
|
||
# OCR crop: slightly wider to catch edge characters (internal only)
|
||
pad = 4
|
||
cell_x = max(0, col.x - pad)
|
||
cell_y = max(0, row.y - pad)
|
||
cell_w = min(col.width + 2 * pad, img_w - cell_x)
|
||
cell_h = min(row.height + 2 * pad, img_h - cell_y)
|
||
|
||
if disp_w <= 0 or disp_h <= 0:
|
||
return {
|
||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||
'row_index': row_idx,
|
||
'col_index': col_idx,
|
||
'col_type': col.type,
|
||
'text': '',
|
||
'confidence': 0.0,
|
||
'bbox_px': {'x': col.x, 'y': row.y, 'w': col.width, 'h': row.height},
|
||
'bbox_pct': {
|
||
'x': round(col.x / img_w * 100, 2),
|
||
'y': round(row.y / img_h * 100, 2),
|
||
'w': round(col.width / img_w * 100, 2),
|
||
'h': round(row.height / img_h * 100, 2),
|
||
},
|
||
'ocr_engine': 'word_lookup',
|
||
}
|
||
|
||
# --- PRIMARY: Word-lookup from full-page Tesseract ---
|
||
words = preassigned_words if preassigned_words is not None else []
|
||
used_engine = 'word_lookup'
|
||
|
||
# Filter low-confidence words (OCR noise from images/artifacts).
|
||
# Tesseract gives low confidence to misread image edges, borders,
|
||
# and other non-text elements.
|
||
_MIN_WORD_CONF = 30
|
||
if words:
|
||
words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]
|
||
|
||
if words:
|
||
# Use row height as Y-tolerance so all words within a single row
|
||
# are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
|
||
# across two lines due to slight vertical offset).
|
||
y_tol = max(15, row.height)
|
||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
|
||
else:
|
||
text = ''
|
||
avg_conf = 0.0
|
||
|
||
# --- FALLBACK: Cell-OCR for empty cells ---
|
||
# Full-page Tesseract can miss small or isolated words (e.g. "Ei").
|
||
# Re-run OCR on the cell crop to catch what word-lookup missed.
|
||
# To avoid wasting time on truly empty cells, check pixel density first:
|
||
# only run Tesseract if the cell crop contains enough dark pixels to
|
||
# plausibly contain text.
|
||
_run_fallback = False
|
||
if not text.strip() and cell_w > 0 and cell_h > 0:
|
||
if ocr_img is not None:
|
||
crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
|
||
if crop.size > 0:
|
||
# Threshold: pixels darker than 180 (on 0-255 grayscale).
|
||
# Use 0.5% to catch even small text like "Ei" (2 chars)
|
||
# in an otherwise empty cell.
|
||
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
|
||
_run_fallback = dark_ratio > 0.005
|
||
if _run_fallback:
|
||
cell_region = PageRegion(
|
||
type=col.type,
|
||
x=cell_x, y=cell_y,
|
||
width=cell_w, height=cell_h,
|
||
)
|
||
if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
|
||
fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
|
||
elif engine_name == "lighton" and img_bgr is not None:
|
||
fallback_words = ocr_region_lighton(img_bgr, cell_region)
|
||
elif use_rapid and img_bgr is not None:
|
||
fallback_words = ocr_region_rapid(img_bgr, cell_region)
|
||
else:
|
||
cell_lang = lang_map.get(col.type, lang)
|
||
fallback_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
||
|
||
if fallback_words:
|
||
# Apply same confidence filter to fallback words
|
||
fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
|
||
if fallback_words:
|
||
fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
|
||
fb_y_tol = max(10, int(fb_avg_h * 0.5))
|
||
fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
|
||
if fb_text.strip():
|
||
text = fb_text
|
||
avg_conf = round(
|
||
sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
|
||
)
|
||
used_engine = 'cell_ocr_fallback'
|
||
|
||
# --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
|
||
if not text.strip() and _run_fallback and not use_rapid:
|
||
cell_lang = lang_map.get(col.type, lang)
|
||
psm7_words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7)
|
||
if psm7_words:
|
||
psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
|
||
if psm7_words:
|
||
p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
|
||
if p7_text.strip():
|
||
text = p7_text
|
||
avg_conf = round(
|
||
sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
|
||
)
|
||
used_engine = 'cell_ocr_psm7'
|
||
|
||
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
|
||
if text.strip():
|
||
text = _clean_cell_text(text)
|
||
if not text:
|
||
avg_conf = 0.0
|
||
|
||
return {
|
||
'cell_id': f"R{row_idx:02d}_C{col_idx}",
|
||
'row_index': row_idx,
|
||
'col_index': col_idx,
|
||
'col_type': col.type,
|
||
'text': text,
|
||
'confidence': avg_conf,
|
||
'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
|
||
'bbox_pct': {
|
||
'x': round(disp_x / img_w * 100, 2),
|
||
'y': round(disp_y / img_h * 100, 2),
|
||
'w': round(disp_w / img_w * 100, 2),
|
||
'h': round(disp_h / img_h * 100, 2),
|
||
},
|
||
'ocr_engine': used_engine,
|
||
}
|
||
|
||
|
||
def _is_artifact_row(row: RowGeometry) -> bool:
|
||
"""Return True if this row contains only scan artifacts, not real text.
|
||
|
||
Artifact rows (scanner shadows, noise) typically produce only single-character
|
||
detections. A real content row always has at least one token with 2+ characters.
|
||
"""
|
||
if row.word_count == 0:
|
||
return True
|
||
texts = [w.get('text', '').strip() for w in row.words]
|
||
return all(len(t) <= 1 for t in texts)
|
||
|
||
|
||
def _heal_row_gaps(
|
||
rows: List[RowGeometry],
|
||
top_bound: int,
|
||
bottom_bound: int,
|
||
) -> None:
|
||
"""Expand row y/height to fill vertical gaps caused by removed adjacent rows.
|
||
|
||
After filtering out empty or artifact rows, remaining content rows may have
|
||
gaps between them where the removed rows used to be. This function mutates
|
||
each row to extend upward/downward to the midpoint of such gaps so that
|
||
OCR crops cover the full available content area.
|
||
|
||
The first row always extends to top_bound; the last row to bottom_bound.
|
||
"""
|
||
if not rows:
|
||
return
|
||
rows.sort(key=lambda r: r.y)
|
||
n = len(rows)
|
||
orig = [(r.y, r.y + r.height) for r in rows] # snapshot before mutation
|
||
|
||
for i, row in enumerate(rows):
|
||
# New top: midpoint between previous row's bottom and this row's top
|
||
if i == 0:
|
||
new_top = top_bound
|
||
else:
|
||
prev_bot = orig[i - 1][1]
|
||
my_top = orig[i][0]
|
||
gap = my_top - prev_bot
|
||
new_top = prev_bot + gap // 2 if gap > 1 else my_top
|
||
|
||
# New bottom: midpoint between this row's bottom and next row's top
|
||
if i == n - 1:
|
||
new_bottom = bottom_bound
|
||
else:
|
||
my_bot = orig[i][1]
|
||
next_top = orig[i + 1][0]
|
||
gap = next_top - my_bot
|
||
new_bottom = my_bot + gap // 2 if gap > 1 else my_bot
|
||
|
||
row.y = new_top
|
||
row.height = max(5, new_bottom - new_top)
|
||
|
||
logger.debug(
|
||
f"_heal_row_gaps: {n} rows → y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
|
||
f"(bounds: top={top_bound}, bottom={bottom_bound})"
|
||
)
|
||
|
||
|
||
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build the generic cell grid: columns × rows → cells with OCR text.

    Layout-agnostic foundation: every usable column is intersected with
    every surviving content row, and each intersection is OCR'd into one
    numbered cell.

    Processing order:
      1. Resolve the OCR engine.
      2. Keep only ``'content'`` rows that have words and are not artifacts.
      3. Heal vertical gaps left by the removed rows.
      4. OCR every (row, column) cell.
      5. Batch fallback: for a column with at least three still-empty cells
         that contain dark pixels, OCR the column strip once and distribute
         the words to those cells by vertical position.
      6. Drop rows whose cells are all empty.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', 'auto', 'trocr-printed',
            'trocr-handwritten', or 'lighton'.
        img_bgr: BGR color image (required for RapidOCR / TrOCR / LightOnOCR).

    Returns:
        ``(cells, columns_meta)``: the list of cell dicts and the description
        of the columns used. Both lists are empty when no usable rows or
        columns remain.
    """
    # --- Engine resolution ---
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if RAPIDOCR_AVAILABLE:
            use_rapid = True
        else:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # --- Row selection: content rows only ---
    candidate_rows = [r for r in row_geometries if r.row_type == 'content']
    if not candidate_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    # Phantom rows (no words assigned) are inter-line whitespace that would
    # only yield garbage OCR.
    content_rows = [r for r in candidate_rows if r.word_count > 0]
    phantom_count = len(candidate_rows) - len(content_rows)
    if phantom_count > 0:
        logger.info(f"build_cell_grid: skipped {phantom_count} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    # --- Column selection: drop ignore/header/footer/margin regions ---
    non_content_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in non_content_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    # Artifact rows (only single-character detections) stem from scanner
    # shadows or noise, not real text.
    rows_before_artifacts = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_count = rows_before_artifacts - len(content_rows)
    if artifact_count > 0:
        logger.info(f"build_cell_grid: skipped {artifact_count} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # Removed rows leave vertical holes; widen the neighbours so that the
    # OCR crops are not artificially narrow.
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    # Columns are processed left-to-right.
    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': position, 'type': column.type, 'x': column.x, 'width': column.width}
        for position, column in enumerate(relevant_cols)
    ]

    # Per-column-type OCR language (only consulted on Tesseract paths).
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- Per-cell OCR ---
    cells: List[Dict[str, Any]] = []
    for row_idx, row in enumerate(content_rows):
        # Each word of the row is pre-assigned to exactly one column.
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            cells.append(_ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                use_rapid, engine_name, lang, lang_map,
                preassigned_words=col_words[col_idx],
            ))

    # --- Batch fallback: re-OCR empty cells one column strip at a time ---
    # Rather than one OCR call per empty cell, crop a whole column strip,
    # OCR it once and hand the words to cells by Y position.
    empty_by_col: Dict[int, List[int]] = {}  # col_idx → positions in `cells`
    for cell_pos, cell in enumerate(cells):
        if cell['text'].strip() or cell.get('ocr_engine') == 'cell_ocr_psm7':
            continue
        box = cell['bbox_px']
        bx, by, bw, bh = box['x'], box['y'], box['w'], box['h']
        if not (bw > 0 and bh > 0 and ocr_img is not None):
            continue
        patch = ocr_img[by:by + bh, bx:bx + bw]
        if patch.size == 0:
            continue
        # Only cells that visibly contain ink are worth a second attempt.
        dark_ratio = float(np.count_nonzero(patch < 180)) / patch.size
        if dark_ratio > 0.005:
            empty_by_col.setdefault(cell['col_index'], []).append(cell_pos)

    for col_idx, cell_indices in empty_by_col.items():
        if len(cell_indices) < 3:
            continue  # Not worth batching for < 3 cells

        # Strip = vertical union of all empty cells of this column.
        boxes = [cells[ci]['bbox_px'] for ci in cell_indices]
        strip_top = min(b['y'] for b in boxes)
        strip_bottom = max(b['y'] + b['h'] for b in boxes)
        strip_type = relevant_cols[col_idx].type

        strip_region = PageRegion(
            type=strip_type,
            x=boxes[0]['x'], y=strip_top,
            width=boxes[0]['w'], height=strip_bottom - strip_top,
        )
        strip_lang = lang_map.get(strip_type, lang)

        if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
            strip_words = ocr_region_trocr(img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten"))
        elif engine_name == "lighton" and img_bgr is not None:
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif use_rapid and img_bgr is not None:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        strip_words = [w for w in strip_words if w.get('conf', 0) >= 30]
        if not strip_words:
            continue

        # A word belongs to a cell when its vertical centre lies within
        # 0.8 × cell height of the cell's vertical centre.
        for ci in cell_indices:
            box = cells[ci]['bbox_px']
            cell_h = box['h']
            cell_mid_y = box['y'] + cell_h / 2

            matched_words = [
                w for w in strip_words
                if abs((w['top'] + w['height'] / 2) - cell_mid_y) < cell_h * 0.8
            ]
            if not matched_words:
                continue
            matched_words.sort(key=lambda w: w['left'])
            batch_text = _clean_cell_text(' '.join(w['text'] for w in matched_words))
            if not batch_text.strip():
                continue
            cells[ci]['text'] = batch_text
            cells[ci]['confidence'] = round(
                sum(w['conf'] for w in matched_words) / len(matched_words), 1
            )
            cells[ci]['ocr_engine'] = 'batch_column_ocr'

        batch_filled = sum(1 for ci in cell_indices if cells[ci]['text'].strip())
        if batch_filled > 0:
            logger.info(
                f"build_cell_grid: batch OCR filled {batch_filled}/{len(cell_indices)} "
                f"empty cells in column {col_idx}"
            )

    # --- Post-OCR: discard rows in which every cell stayed empty ---
    # (inter-row gaps whose stray detections gave them word_count > 0).
    rows_with_text = {c['row_index'] for c in cells if c['text'].strip()}
    cell_total_before = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (cell_total_before - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
|
||
|
||
|
||
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid(): yield each cell once OCR'd.

    Rows and columns are filtered the same way (content rows with words,
    no artifact rows, no ignore/header/footer/margin columns) and row gaps
    are healed. This generator does not run a column-strip batch fallback
    and does not drop all-empty rows.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns (PageRegion list).
        row_geometries: Detected rows (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default Tesseract language.
        ocr_engine: Engine selector, resolved as in build_cell_grid().
        img_bgr: Optional BGR color image for the non-Tesseract engines.

    Yields:
        ``(cell_dict, columns_meta, total_cells)`` for each cell, where
        ``total_cells`` is rows × columns after filtering. Nothing is
        yielded when no usable rows or columns remain.
    """
    # Engine resolution (same rules as build_cell_grid).
    use_rapid = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "auto":
        use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
        engine_name = "rapid" if use_rapid else "tesseract"
    elif ocr_engine == "rapid":
        if RAPIDOCR_AVAILABLE:
            use_rapid = True
        else:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        engine_name = "rapid" if use_rapid else "tesseract"
    else:
        engine_name = "tesseract"

    candidate_rows = [r for r in row_geometries if r.row_type == 'content']
    if not candidate_rows:
        return

    # Phantom rows (no words assigned) are inter-line whitespace that would
    # only yield garbage OCR.
    content_rows = [r for r in candidate_rows if r.word_count > 0]
    phantom_count = len(candidate_rows) - len(content_rows)
    if phantom_count > 0:
        logger.info(f"build_cell_grid_streaming: skipped {phantom_count} phantom rows (word_count=0)")
    if not content_rows:
        return

    non_content_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in non_content_types]
    if not relevant_cols:
        return

    # Drop artifact rows, then close the vertical gaps they leave behind.
    rows_before_artifacts = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_count = rows_before_artifacts - len(content_rows)
    if artifact_count > 0:
        logger.info(f"build_cell_grid_streaming: skipped {artifact_count} artifact rows")
    if not content_rows:
        return
    _heal_row_gaps(
        content_rows,
        top_bound=min(c.y for c in relevant_cols),
        bottom_bound=max(c.y + c.height for c in relevant_cols),
    )

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': position, 'type': column.type, 'x': column.x, 'width': column.width}
        for position, column in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        # Each word of the row is pre-assigned to exactly one column.
        col_words = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            yield (
                _ocr_single_cell(
                    row_idx, col_idx, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    use_rapid, engine_name, lang, lang_map,
                    preassigned_words=col_words[col_idx],
                ),
                columns_meta,
                total_cells,
            )
|
||
|
||
|
||
def _cells_to_vocab_entries(
|
||
cells: List[Dict[str, Any]],
|
||
columns_meta: List[Dict[str, Any]],
|
||
) -> List[Dict[str, Any]]:
|
||
"""Map generic cells to vocab entries with english/german/example fields.
|
||
|
||
Groups cells by row_index, maps col_type → field name, and produces
|
||
one entry per row (only rows with at least one non-empty field).
|
||
"""
|
||
# Determine image dimensions from first cell (for row-level bbox)
|
||
col_type_to_field = {
|
||
'column_en': 'english',
|
||
'column_de': 'german',
|
||
'column_example': 'example',
|
||
'page_ref': 'source_page',
|
||
'column_marker': 'marker',
|
||
}
|
||
bbox_key_map = {
|
||
'column_en': 'bbox_en',
|
||
'column_de': 'bbox_de',
|
||
'column_example': 'bbox_ex',
|
||
'page_ref': 'bbox_ref',
|
||
'column_marker': 'bbox_marker',
|
||
}
|
||
|
||
# Group cells by row_index
|
||
rows: Dict[int, List[Dict]] = {}
|
||
for cell in cells:
|
||
ri = cell['row_index']
|
||
rows.setdefault(ri, []).append(cell)
|
||
|
||
entries: List[Dict[str, Any]] = []
|
||
for row_idx in sorted(rows.keys()):
|
||
row_cells = rows[row_idx]
|
||
entry: Dict[str, Any] = {
|
||
'row_index': row_idx,
|
||
'english': '',
|
||
'german': '',
|
||
'example': '',
|
||
'source_page': '',
|
||
'marker': '',
|
||
'confidence': 0.0,
|
||
'bbox': None,
|
||
'bbox_en': None,
|
||
'bbox_de': None,
|
||
'bbox_ex': None,
|
||
'bbox_ref': None,
|
||
'bbox_marker': None,
|
||
'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
|
||
}
|
||
|
||
confidences = []
|
||
for cell in row_cells:
|
||
col_type = cell['col_type']
|
||
field = col_type_to_field.get(col_type)
|
||
if field:
|
||
entry[field] = cell['text']
|
||
bbox_field = bbox_key_map.get(col_type)
|
||
if bbox_field:
|
||
entry[bbox_field] = cell['bbox_pct']
|
||
if cell['confidence'] > 0:
|
||
confidences.append(cell['confidence'])
|
||
|
||
# Compute row-level bbox as union of all cell bboxes
|
||
all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
|
||
if all_bboxes:
|
||
min_x = min(b['x'] for b in all_bboxes)
|
||
min_y = min(b['y'] for b in all_bboxes)
|
||
max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
|
||
max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
|
||
entry['bbox'] = {
|
||
'x': round(min_x, 2),
|
||
'y': round(min_y, 2),
|
||
'w': round(max_x2 - min_x, 2),
|
||
'h': round(max_y2 - min_y, 2),
|
||
}
|
||
|
||
entry['confidence'] = round(
|
||
sum(confidences) / len(confidences), 1
|
||
) if confidences else 0.0
|
||
|
||
# Only include if at least one mapped field has text
|
||
has_content = any(
|
||
entry.get(f)
|
||
for f in col_type_to_field.values()
|
||
)
|
||
if has_content:
|
||
entries.append(entry)
|
||
|
||
return entries
|
||
|
||
|
||
# Regex: line starts with phonetic bracket content only (no real word before it)
|
||
_PHONETIC_ONLY_RE = re.compile(
|
||
r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
|
||
)
|
||
|
||
|
||
def _is_phonetic_only_text(text: str) -> bool:
|
||
"""Check if text consists only of phonetic transcription.
|
||
|
||
Phonetic-only patterns:
|
||
['mani serva] → True
|
||
[dɑːns] → True
|
||
["a:mand] → True
|
||
almond ['a:mand] → False (has real word before bracket)
|
||
Mandel → False
|
||
"""
|
||
t = text.strip()
|
||
if not t:
|
||
return False
|
||
# Must contain at least one bracket
|
||
if '[' not in t and ']' not in t:
|
||
return False
|
||
# Remove all bracket content and surrounding punctuation/whitespace
|
||
without_brackets = re.sub(r"\[.*?\]", '', t)
|
||
without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
|
||
# If nothing meaningful remains, it's phonetic-only
|
||
alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
|
||
return len(alpha_remaining) < 2
|
||
|
||
|
||
def _merge_phonetic_continuation_rows(
|
||
entries: List[Dict[str, Any]],
|
||
) -> List[Dict[str, Any]]:
|
||
"""Merge rows that contain only phonetic transcription into previous entry.
|
||
|
||
In dictionary pages, phonetic transcription sometimes wraps to the next
|
||
row. E.g.:
|
||
Row 28: EN="it's a money-saver" DE="es spart Kosten"
|
||
Row 29: EN="['mani serva]" DE=""
|
||
|
||
Row 29 is phonetic-only → merge into row 28's EN field.
|
||
"""
|
||
if len(entries) < 2:
|
||
return entries
|
||
|
||
merged: List[Dict[str, Any]] = []
|
||
for entry in entries:
|
||
en = (entry.get('english') or '').strip()
|
||
de = (entry.get('german') or '').strip()
|
||
ex = (entry.get('example') or '').strip()
|
||
|
||
# Check if this entry is phonetic-only (EN has only phonetics, DE empty)
|
||
if merged and _is_phonetic_only_text(en) and not de:
|
||
prev = merged[-1]
|
||
prev_en = (prev.get('english') or '').strip()
|
||
# Append phonetic to previous entry's EN
|
||
if prev_en:
|
||
prev['english'] = prev_en + ' ' + en
|
||
else:
|
||
prev['english'] = en
|
||
# If there was an example, append to previous too
|
||
if ex:
|
||
prev_ex = (prev.get('example') or '').strip()
|
||
prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
|
||
logger.debug(
|
||
f"Merged phonetic row {entry.get('row_index')} "
|
||
f"into previous entry: {prev['english']!r}"
|
||
)
|
||
continue
|
||
|
||
merged.append(entry)
|
||
|
||
return merged
|
||
|
||
|
||
def _merge_continuation_rows(
|
||
entries: List[Dict[str, Any]],
|
||
) -> List[Dict[str, Any]]:
|
||
"""Merge multi-line vocabulary entries where text wraps to the next row.
|
||
|
||
A row is a continuation of the previous entry when:
|
||
- EN has text, but DE is empty
|
||
- EN starts with a lowercase letter (not a new vocab entry)
|
||
- Previous entry's EN does NOT end with a sentence terminator (.!?)
|
||
- The continuation text has fewer than 4 words (not an example sentence)
|
||
- The row was not already merged as phonetic
|
||
|
||
Example:
|
||
Row 5: EN="to put up" DE="aufstellen"
|
||
Row 6: EN="with sth." DE=""
|
||
→ Merged: EN="to put up with sth." DE="aufstellen"
|
||
"""
|
||
if len(entries) < 2:
|
||
return entries
|
||
|
||
merged: List[Dict[str, Any]] = []
|
||
for entry in entries:
|
||
en = (entry.get('english') or '').strip()
|
||
de = (entry.get('german') or '').strip()
|
||
|
||
if merged and en and not de:
|
||
# Check: not phonetic (already handled)
|
||
if _is_phonetic_only_text(en):
|
||
merged.append(entry)
|
||
continue
|
||
|
||
# Check: starts with lowercase
|
||
first_alpha = next((c for c in en if c.isalpha()), '')
|
||
starts_lower = first_alpha and first_alpha.islower()
|
||
|
||
# Check: fewer than 4 words (not an example sentence)
|
||
word_count = len(en.split())
|
||
is_short = word_count < 4
|
||
|
||
# Check: previous entry doesn't end with sentence terminator
|
||
prev = merged[-1]
|
||
prev_en = (prev.get('english') or '').strip()
|
||
prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
|
||
|
||
if starts_lower and is_short and not prev_ends_sentence:
|
||
# Merge into previous entry
|
||
prev['english'] = (prev_en + ' ' + en).strip()
|
||
# Merge example if present
|
||
ex = (entry.get('example') or '').strip()
|
||
if ex:
|
||
prev_ex = (prev.get('example') or '').strip()
|
||
prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
|
||
logger.debug(
|
||
f"Merged continuation row {entry.get('row_index')} "
|
||
f"into previous entry: {prev['english']!r}"
|
||
)
|
||
continue
|
||
|
||
merged.append(entry)
|
||
|
||
return merged
|
||
|
||
|
||
def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocabulary-specific wrapper: cell grid + vocab mapping + clean-up.

    Runs build_cell_grid() and then:
      - maps cells to english/german/example entries,
      - applies the deterministic post-processing steps (continuation-row
        merging, character-confusion fixes, IPA replacement, comma
        splitting, example attachment),
      - returns the raw cells unchanged when neither a ``column_en`` nor a
        ``column_de`` column is present.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Default Tesseract language.
        ocr_engine: Engine selector, forwarded unchanged to build_cell_grid().
        img_bgr: BGR color image, forwarded unchanged to build_cell_grid().
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        List of entry dicts with english/german/example text and bbox info
        (percent); an empty list when no cells were produced; the raw cell
        dicts when the page has no vocabulary columns.
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )
    if not cells:
        return []

    # Without an EN or DE column there is no vocabulary layout to map.
    present_types = {meta['type'] for meta in columns_meta}
    if not (present_types & {'column_en', 'column_de'}):
        logger.info("build_word_grid: no vocab columns — returning raw cells")
        return cells

    entries = _cells_to_vocab_entries(cells, columns_meta)
    n_raw = len(entries)

    # Deterministic post-processing (no LLM); the order of steps matters.
    post_processing_steps = (
        # 0a. phonetic-only continuation rows → previous entry
        _merge_phonetic_continuation_rows,
        # 0b. multi-line continuation rows (lowercase EN, empty DE)
        _merge_continuation_rows,
        # 1. character confusion (I/1/l based on context)
        _fix_character_confusion,
        # 2. OCR'd phonetics → dictionary IPA
        lambda rows: _fix_phonetic_brackets(rows, pronunciation=pronunciation),
        # 3. comma-separated word forms (break, broke, broken → 3 entries)
        _split_comma_entries,
        # 4. rows without DE become examples of the preceding entry
        _attach_example_sentences,
    )
    for step in post_processing_steps:
        entries = step(entries)

    engine_name = cells[0].get('ocr_engine', 'unknown') if cells else 'unknown'
    logger.info(f"build_word_grid: {len(entries)} entries from "
                f"{n_raw} raw → {len(entries)} after post-processing "
                f"(engine={engine_name})")

    return entries
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 6: Multi-Pass OCR
|
||
# =============================================================================
|
||
|
||
def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
               psm: int, fallback_psm: Optional[int] = None,
               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
    """Run Tesseract OCR on one page region with the given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR (uses x, y, width, height, type).
        lang: Tesseract language string.
        psm: Page Segmentation Mode.
        fallback_psm: If the average confidence is too low, the region is
            re-read line by line with this PSM and that result is returned.
        min_confidence: Minimum average confidence before the fallback runs.

    Returns:
        List of word dicts ('text', 'left', 'top', 'width', 'height', 'conf',
        'region_type') with page-absolute 'left'/'top'. Empty when the crop
        is empty or Tesseract raises.
    """
    # Clamp the origin to the image: a negative start would otherwise be
    # interpreted by NumPy as an offset from the far edge and select the
    # wrong pixels (or nothing at all).
    x0 = max(0, region.x)
    y0 = max(0, region.y)
    x1 = max(x0, region.x + region.width)
    y1 = max(y0, region.y + region.height)
    crop = ocr_img[y0:y1, x0:x1]

    if crop.size == 0:
        return []

    # pytesseract consumes PIL images.
    pil_img = Image.fromarray(crop)

    config = f'--psm {psm} --oem 3'
    try:
        data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                         output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"Tesseract failed for region {region.type}: {e}")
        return []

    words: List[Dict[str, Any]] = []
    for i, raw_text in enumerate(data['text']):
        text = str(raw_text).strip()
        # The confidence may arrive as an int, a float or a float-formatted
        # string (e.g. '95.3'); plain int() would raise on the latter.
        try:
            conf = int(float(data['conf'][i]))
        except (TypeError, ValueError, OverflowError):
            continue
        if not text or conf < 10:
            continue
        words.append({
            'text': text,
            'left': data['left'][i] + x0,  # crop-relative → page-absolute
            'top': data['top'][i] + y0,
            'width': data['width'][i],
            'height': data['height'][i],
            'conf': conf,
            'region_type': region.type,
        })

    # Low average confidence: retry line by line with the fallback PSM.
    if words and fallback_psm is not None:
        avg_conf = sum(w['conf'] for w in words) / len(words)
        if avg_conf < min_confidence:
            logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
                        f"trying fallback PSM {fallback_psm}")
            words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)

    return words
|
||
|
||
|
||
def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
|
||
lang: str, psm: int) -> List[Dict[str, Any]]:
|
||
"""OCR a region line by line (fallback for low-confidence regions).
|
||
|
||
Splits the region into horizontal strips based on text density,
|
||
then OCRs each strip individually with the given PSM.
|
||
"""
|
||
crop = ocr_img[region.y:region.y + region.height,
|
||
region.x:region.x + region.width]
|
||
|
||
if crop.size == 0:
|
||
return []
|
||
|
||
# Find text lines via horizontal projection
|
||
inv = cv2.bitwise_not(crop)
|
||
h_proj = np.sum(inv, axis=1)
|
||
threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0
|
||
|
||
# Find line boundaries
|
||
lines = []
|
||
in_text = False
|
||
line_start = 0
|
||
for y in range(len(h_proj)):
|
||
if h_proj[y] > threshold and not in_text:
|
||
line_start = y
|
||
in_text = True
|
||
elif h_proj[y] <= threshold and in_text:
|
||
if y - line_start > 5: # Minimum line height
|
||
lines.append((line_start, y))
|
||
in_text = False
|
||
if in_text and len(h_proj) - line_start > 5:
|
||
lines.append((line_start, len(h_proj)))
|
||
|
||
all_words = []
|
||
config = f'--psm {psm} --oem 3'
|
||
|
||
for line_y_start, line_y_end in lines:
|
||
# Add small padding
|
||
pad = 3
|
||
y1 = max(0, line_y_start - pad)
|
||
y2 = min(crop.shape[0], line_y_end + pad)
|
||
line_crop = crop[y1:y2, :]
|
||
|
||
if line_crop.size == 0:
|
||
continue
|
||
|
||
pil_img = Image.fromarray(line_crop)
|
||
try:
|
||
data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
|
||
output_type=pytesseract.Output.DICT)
|
||
except Exception:
|
||
continue
|
||
|
||
for i in range(len(data['text'])):
|
||
text = data['text'][i].strip()
|
||
conf = int(data['conf'][i])
|
||
if not text or conf < 10:
|
||
continue
|
||
all_words.append({
|
||
'text': text,
|
||
'left': data['left'][i] + region.x,
|
||
'top': data['top'][i] + region.y + y1,
|
||
'width': data['width'][i],
|
||
'height': data['height'][i],
|
||
'conf': conf,
|
||
'region_type': region.type,
|
||
})
|
||
|
||
return all_words
|
||
|
||
|
||
def run_multi_pass_ocr(ocr_img: np.ndarray,
                       regions: List[PageRegion],
                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
    """OCR every content region with settings chosen by region type.

    Settings per type:
      - ``column_en``: language 'eng', PSM 4
      - ``column_de``: language 'deu', PSM 4
      - ``column_example``: default language, PSM 6, with a PSM 7 fallback
        below 40.0 average confidence
      - anything else: default language, PSM 6
    Header, footer and margin regions are skipped.

    Args:
        ocr_img: Binarized full-page image.
        regions: Detected page regions.
        lang: Default language.

    Returns:
        Dict mapping region type to list of word dicts. The result is keyed
        by type, so when several regions share a type the last one processed
        replaces the earlier ones.
    """
    non_content_types = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    settings_by_type = {
        'column_en': {'lang': 'eng', 'psm': 4},
        'column_de': {'lang': 'deu', 'psm': 4},
        'column_example': {'lang': lang, 'psm': 6,
                           'fallback_psm': 7, 'min_confidence': 40.0},
    }
    default_settings = {'lang': lang, 'psm': 6}

    results: Dict[str, List[Dict]] = {}
    for region in regions:
        region_type = region.type
        if region_type in non_content_types:
            continue  # Skip non-content regions

        settings = settings_by_type.get(region_type, default_settings)
        words = ocr_region(ocr_img, region, **settings)

        results[region_type] = words
        logger.info(f"OCR {region_type}: {len(words)} words")

    return results
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 7: Line Alignment → Vocabulary Entries
|
||
# =============================================================================
|
||
|
||
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
|
||
"""Group words by Y position into lines, sorted by X within each line."""
|
||
if not words:
|
||
return []
|
||
|
||
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
|
||
lines: List[List[Dict]] = []
|
||
current_line: List[Dict] = [sorted_words[0]]
|
||
current_y = sorted_words[0]['top']
|
||
|
||
for word in sorted_words[1:]:
|
||
if abs(word['top'] - current_y) <= y_tolerance_px:
|
||
current_line.append(word)
|
||
else:
|
||
current_line.sort(key=lambda w: w['left'])
|
||
lines.append(current_line)
|
||
current_line = [word]
|
||
current_y = word['top']
|
||
|
||
if current_line:
|
||
current_line.sort(key=lambda w: w['left'])
|
||
lines.append(current_line)
|
||
|
||
return lines
|
||
|
||
|
||
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    English lines are the primary reference. For every English line the
    closest German line and the closest example line (by vertical centre,
    strictly within ``y_tolerance_px``) are attached. Example lines that were
    not attached to any row are treated as wrapped continuations and appended
    to the nearest row above them.

    Args:
        ocr_results: Dict mapping region type (``'column_en'``,
            ``'column_de'``, ``'column_example'``) to word dicts. The word
            keys ``'top'``, ``'left'``, ``'height'``, ``'text'`` and
            ``'conf'`` are read.
        regions: Detected regions. Not read by this function; kept so the
            signature stays stable for callers.
        y_tolerance_px: Max Y-distance to consider lines on the same row.
            Also passed to the word-to-line grouping.

    Returns:
        List of VocabRow objects sorted by Y position. Empty when neither
        ``'column_en'`` nor ``'column_de'`` is present in ``ocr_results``.
    """
    # If no vocabulary columns detected (e.g. plain text page), return empty
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    # Group words into lines per column
    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    def line_y_center(line: List[Dict]) -> float:
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    def closest_line_index(centers: List[float], target_y: float) -> int:
        """Index of the centre nearest to target_y within tolerance, else -1."""
        best_idx = -1
        best_dist = float('inf')
        for idx, center in enumerate(centers):
            dist = abs(center - target_y)
            if dist < y_tolerance_px and dist < best_dist:
                best_dist = dist
                best_idx = idx
        return best_idx

    # Line centres do not change, so compute them once instead of per EN line.
    de_centers = [line_y_center(line) for line in de_lines]
    ex_centers = [line_y_center(line) for line in ex_lines]

    # Indices of example lines that were attached to a row. Tracking the
    # actual lines (rather than testing Y-proximity to matched rows) keeps a
    # second example line near the same row from being silently discarded.
    matched_ex_indices = set()

    # Build EN entries as the primary reference
    vocab_rows: List[VocabRow] = []

    for en_line in en_lines:
        en_text = line_text(en_line)

        # Skip very short or likely header content
        if len(en_text.strip()) < 2:
            continue

        en_y = line_y_center(en_line)
        # Only confidences > 0 of matched DE/example lines enter the average.
        confidences = [line_confidence(en_line)]

        # Find matching DE line
        de_text = ""
        de_idx = closest_line_index(de_centers, en_y)
        if de_idx >= 0:
            de_text = line_text(de_lines[de_idx])
            de_conf = line_confidence(de_lines[de_idx])
            if de_conf > 0:
                confidences.append(de_conf)

        # Find matching example line
        ex_text = ""
        ex_idx = closest_line_index(ex_centers, en_y)
        if ex_idx >= 0:
            matched_ex_indices.add(ex_idx)
            ex_text = line_text(ex_lines[ex_idx])
            ex_conf = line_confidence(ex_lines[ex_idx])
            if ex_conf > 0:
                confidences.append(ex_conf)

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=sum(confidences) / len(confidences),
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in example column:
    # an example line that was not attached to any row is appended to the
    # nearest row that lies above it (within three tolerances).
    for idx, ex_line in enumerate(ex_lines):
        if idx in matched_ex_indices:
            continue

        continuation = line_text(ex_line).strip()
        if not continuation:
            continue

        ex_y = ex_centers[idx]
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row

        if best_row is not None:
            best_row.example = (best_row.example + " " + continuation).strip()

    # Sort by Y position
    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows
|
||
|
||
|
||
# =============================================================================
|
||
# Stage 8: Optional LLM Post-Correction
|
||
# =============================================================================
|
||
|
||
async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
                           confidence_threshold: float = 50.0,
                           enabled: bool = False) -> List[VocabRow]:
    """Placeholder for LLM (Qwen-VL) correction of low-confidence rows.

    The correction is not implemented yet: ``vocab_rows`` is always returned
    unchanged. When ``enabled`` is true, an info log line records that the
    step was skipped.

    Args:
        img: Original BGR image (currently unused).
        vocab_rows: Current vocabulary rows.
        confidence_threshold: Rows below this are meant to get LLM
            correction (currently unused).
        enabled: Whether the correction step was requested. Default: off.

    Returns:
        The vocabulary rows that were passed in.
    """
    if enabled:
        # TODO: Implement Qwen-VL correction for low-confidence entries
        # For each row with confidence < threshold:
        #   1. Crop the relevant region from img
        #   2. Send crop + OCR text to Qwen-VL
        #   3. Replace text if LLM provides a confident correction
        logger.info("LLM post-correction skipped (not yet implemented)")

    return vocab_rows
|
||
|
||
|
||
# =============================================================================
|
||
# Orchestrator
|
||
# =============================================================================
|
||
|
||
async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Stages: render, deskew, optional dewarp, dual image preparation, layout
    analysis, multi-pass OCR, line alignment, optional LLM correction. The
    wall-clock duration of each executed stage is stored in
    ``result.stages``.

    Args:
        pdf_data: Raw PDF bytes. Takes precedence when both inputs are given.
        image_data: Raw image bytes, used when ``pdf_data`` is empty.
        page_number: 0-indexed page number (for PDF).
        zoom: PDF rendering zoom factor.
        enable_dewarp: Whether to run dewarp stage.
        enable_llm_correction: Whether to run LLM post-correction.
        lang: Tesseract language string.

    Returns:
        PipelineResult with vocabulary and timing info. Exceptions raised by
        any stage are caught, logged, and reported via ``result.error``
        instead of propagating.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    total_start = time.time()

    try:
        # Stage 1: Render
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}° in {result.stages['deskew']}s")

        # Stage 3: Dewarp
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)

        # Stage 5: Layout analysis
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = len([r for r in regions if r.type.startswith('column')])
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)

        # Stage 8: Optional LLM correction
        if enable_llm_correction:
            t = time.time()
            # llm_post_correct defaults to enabled=False and returns early, so
            # the caller's opt-in has to be forwarded explicitly.
            vocab_rows = await llm_post_correct(img, vocab_rows, enabled=True)
            result.stages['llm_correction'] = round(time.time() - t, 2)

        # Convert to output format
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german  # Skip empty rows
        ]

        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        # Top-level boundary: report the failure in the result rather than
        # raising, so callers always receive a PipelineResult.
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)

    return result
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# LLM-based OCR Correction (Step 6)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
import httpx
|
||
import os
|
||
import json as _json
|
||
import re as _re
|
||
|
||
# Ollama endpoint, review model and batch size are read from the environment
# once, at import time; the literals below are the fallbacks.
_OLLAMA_URL = os.environ.get("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_REVIEW_MODEL = os.environ.get("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
_REVIEW_BATCH_SIZE = int(os.environ.get("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)

# Matches a square-bracket group containing at least one IPA symbol,
# e.g. "dance [dɑːns]".
_HAS_PHONETIC_RE = _re.compile(r'\[.*?[ˈˌːʃʒθðŋɑɒɔəɜɪʊʌæ].*?\]')

# Matches one of the digits 0, 1, 5, 6, 8 directly after or directly before a
# letter (ASCII or German umlaut/ß) — the typical trace of OCR digit↔letter
# confusion (0→O, 1→l/I, 5→S, 6→G, 8→B).
_OCR_DIGIT_IN_WORD_RE = _re.compile(r'(?<=[A-Za-zÄÖÜäöüß])[01568]|[01568](?=[A-Za-zÄÖÜäöüß])')
|
||
|
||
|
||
def _entry_needs_review(entry: Dict) -> bool:
    """Decide whether an entry is passed on to the OCR review step.

    An entry qualifies when its ``"english"`` or ``"german"`` field contains
    non-whitespace text and neither field matches ``_HAS_PHONETIC_RE`` (IPA
    transcription in square brackets). Entries with phonetic brackets are
    left out because they are dictionary-corrected and must not be altered.

    Args:
        entry: Vocabulary entry; missing or ``None`` fields count as empty.

    Returns:
        True if the entry should be reviewed, False otherwise.
    """
    english = entry.get("english", "") or ""
    german = entry.get("german", "") or ""

    has_text = bool(english.strip() or german.strip())
    if not has_text:
        return False

    has_phonetics = any(_HAS_PHONETIC_RE.search(value) for value in (english, german))
    return not has_phonetics
|
||
|
||
|
||
def _build_llm_prompt(table_lines: List[Dict]) -> str:
|
||
"""Build the LLM correction prompt for a batch of entries."""
|
||
return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).
|
||
|
||
DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.
|
||
|
||
NUR diese Korrekturen sind erlaubt:
|
||
- Ziffer 8 statt B: "8en" → "Ben", "8uch" → "Buch", "8all" → "Ball"
|
||
- Ziffer 0 statt O oder o: "L0ndon" → "London", "0ld" → "Old"
|
||
- Ziffer 1 statt l oder I: "1ong" → "long", "Ber1in" → "Berlin"
|
||
- Ziffer 5 statt S oder s: "5tadt" → "Stadt", "5ee" → "See"
|
||
- Ziffer 6 statt G oder g: "6eld" → "Geld"
|
||
- Senkrechter Strich | statt I oder l: "| want" → "I want", "|ong" → "long", "he| p" → "help"
|
||
|
||
ABSOLUT VERBOTEN — aendere NIEMALS:
|
||
- Woerter die korrekt geschrieben sind — auch wenn du eine andere Schreibweise kennst
|
||
- Uebersetzungen — du uebersetzt NICHTS, weder EN→DE noch DE→EN
|
||
- Korrekte englische Woerter (en-Spalte) — auch wenn du eine Bedeutung kennst
|
||
- Korrekte deutsche Woerter (de-Spalte) — auch wenn du sie anders sagen wuerdest
|
||
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
|
||
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
|
||
- Lautschrift in eckigen Klammern [...] — diese NIEMALS beruehren
|
||
- Beispielsaetze in der ex-Spalte — NIEMALS aendern
|
||
|
||
Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.
|
||
|
||
Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
|
||
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).
|
||
|
||
/no_think
|
||
|
||
Eingabe:
|
||
{_json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
|
||
|
||
|
||
def _is_spurious_change(old_val: str, new_val: str) -> bool:
|
||
"""Detect LLM changes that are likely wrong and should be discarded.
|
||
|
||
Only digit↔letter substitutions (0→O, 1→l, 5→S, 6→G, 8→B) are
|
||
legitimate OCR corrections. Everything else is rejected.
|
||
|
||
Filters out:
|
||
- Case-only changes
|
||
- Changes that don't contain any digit→letter fix
|
||
- Completely different words (LLM translating or hallucinating)
|
||
- Additions or removals of whole words (count changed)
|
||
"""
|
||
if not old_val or not new_val:
|
||
return False
|
||
|
||
# Case-only change — never a real OCR error
|
||
if old_val.lower() == new_val.lower():
|
||
return True
|
||
|
||
# If the word count changed significantly, the LLM rewrote rather than fixed
|
||
old_words = old_val.split()
|
||
new_words = new_val.split()
|
||
if abs(len(old_words) - len(new_words)) > 1:
|
||
return True
|
||
|
||
# Core rule: a legitimate correction replaces a digit with the corresponding
|
||
# letter. If the change doesn't include such a substitution, reject it.
|
||
# Build a set of (old_char, new_char) pairs that differ between old and new.
|
||
# Use character-level diff heuristic: if lengths are close, zip and compare.
|
||
# Map of characters that OCR commonly misreads → set of correct replacements
|
||
_OCR_CHAR_MAP = {
|
||
# Digits mistaken for letters
|
||
'0': set('oOgG'),
|
||
'1': set('lLiI'),
|
||
'5': set('sS'),
|
||
'6': set('gG'),
|
||
'8': set('bB'),
|
||
# Non-letter symbols mistaken for letters
|
||
'|': set('lLiI1'), # pipe → lowercase l, capital I, or digit 1
|
||
'l': set('iI|1'), # lowercase l → capital I (and reverse)
|
||
}
|
||
has_valid_fix = False
|
||
if len(old_val) == len(new_val):
|
||
for oc, nc in zip(old_val, new_val):
|
||
if oc != nc:
|
||
if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
|
||
has_valid_fix = True
|
||
elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
|
||
# Reverse check (e.g. l→I where new is the "correct" char)
|
||
has_valid_fix = True
|
||
else:
|
||
# Length changed by 1: accept if old had a suspicious char sequence
|
||
_OCR_SUSPICIOUS_RE = _re.compile(r'[|01568]')
|
||
if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
|
||
has_valid_fix = True
|
||
|
||
if not has_valid_fix:
|
||
return True # Reject — looks like translation or hallucination
|
||
|
||
return False
|
||
|
||
|
||
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
    """Compare original entries with LLM-corrected ones.

    Entries are paired by position. For each of the fields
    english/german/example the LLM value (keys ``"en"``/``"de"``/``"ex"``)
    replaces the original when it is a non-empty string, differs from the
    original, and is not rejected by ``_is_spurious_change``.

    Args:
        originals: Entries that were sent for review.
        corrected: Parsed LLM output. May be shorter than ``originals`` and
            may contain malformed items, since it is model-generated.

    Returns:
        ``(changes, corrected_entries)``. ``changes`` lists every applied
        replacement with ``row_index``, ``field``, ``old`` and ``new``.
        ``corrected_entries`` holds one shallow copy per original, in order;
        modified copies carry ``"llm_corrected": True``.
    """
    field_keys = (("english", "en"), ("german", "de"), ("example", "ex"))
    changes = []
    entries_out = []
    for i, orig in enumerate(originals):
        entry = dict(orig)
        candidate = corrected[i] if i < len(corrected) else None
        # The LLM may return null/strings/numbers instead of objects; such
        # items carry no usable correction and leave the entry untouched.
        if isinstance(candidate, dict):
            for field_name, key in field_keys:
                raw_new = candidate.get(key)
                # Non-string values (null, numbers, nested data) are ignored.
                new_val = raw_new.strip() if isinstance(raw_new, str) else ""
                old_val = (orig.get(field_name, "") or "").strip()
                if not new_val or new_val == old_val:
                    continue
                # Filter spurious LLM changes
                if _is_spurious_change(old_val, new_val):
                    continue
                changes.append({
                    "row_index": orig.get("row_index", i),
                    "field": field_name,
                    "old": old_val,
                    "new": new_val,
                })
                entry[field_name] = new_val
                entry["llm_corrected"] = True
        entries_out.append(entry)
    return changes, entries_out
|
||
|
||
|
||
# ─── Spell-Checker OCR Review (Rule-Based, no LLM) ────────────────────────────
|
||
|
||
# Review engine selection, read once from the environment.
REVIEW_ENGINE = os.environ.get("REVIEW_ENGINE", "spell")  # "spell" (default) | "llm"

# pyspellchecker is optional: without it _SPELL_AVAILABLE is False and the
# checker instances below are not defined.
try:
    from spellchecker import SpellChecker as _SpellChecker
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE), review engine: %s", REVIEW_ENGINE)
except ImportError:
    _SPELL_AVAILABLE = False
    logger.warning("pyspellchecker not installed — falling back to LLM review")

# Suspicious OCR character → candidate replacements, most likely first.
_SPELL_SUBS: Dict[str, List[str]] = {
    '0': ['O', 'o'],
    '1': ['l', 'I'],
    '5': ['S', 's'],
    '6': ['G', 'g'],
    '8': ['B', 'b'],
    '|': ['I', 'l', '1'],
}
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS)

# Tokenizer: group 1 is a run of letters and pipes (a word token), group 2 is
# the run of all other characters that follows it (the separator).
# NOTE(review): digits are not part of a word token, so the digit keys of
# _SPELL_SUBS cannot occur inside a token produced by this pattern — confirm
# that this is intended.
_SPELL_TOKEN_RE = _re.compile(r'([A-Za-zÄÖÜäöüß|]+)([^A-Za-zÄÖÜäöüß|]*)')
|
||
|
||
|
||
def _spell_dict_knows(word: str) -> bool:
    """Report whether the EN or DE spell-checker dictionary contains word.

    The word is lower-cased before the lookup. Always False when
    pyspellchecker is not available.
    """
    if not _SPELL_AVAILABLE:
        return False

    lowered = word.lower()
    # English is consulted first; German only if English does not know it.
    for checker in (_en_spell, _de_spell):
        if checker.known([lowered]):
            return True
    return False
|
||
|
||
|
||
def _spell_fix_token(token: str) -> Optional[str]:
    """Propose a corrected spelling for a single token.

    Rules, applied in order:
    1. No suspicious character in the token → no fix.
    2. The token is exactly ``"|"`` → ``"I"``.
    3. The token is already a dictionary word → no fix.
    4. Replace one suspicious character at a time (left to right, most
       likely replacement first) and return the first dictionary word.
    5. Structural fallback: a suspicious first character followed only by
       lowercase letters gets its most likely replacement,
       e.g. "8en"→"Ben", "8uch"→"Buch", "5ee"→"See", "6eld"→"Geld".

    Returns:
        The corrected token, or None if no fix is needed or possible.
    """
    if all(ch not in _SPELL_SUSPICIOUS for ch in token):
        return None

    # Standalone pipe → capital I
    if token == '|':
        return 'I'

    # Already a valid word → leave it alone
    if _spell_dict_knows(token):
        return None

    # Dictionary-backed single-character substitution
    for idx, ch in enumerate(token):
        for replacement in _SPELL_SUBS.get(ch, ()):
            attempt = token[:idx] + replacement + token[idx + 1:]
            if _spell_dict_knows(attempt):
                return attempt

    # Structural fallback for a suspicious leading character
    head, tail = token[0], token[1:]
    if head in _SPELL_SUBS and tail and tail.isalpha() and tail.islower():
        attempt = _SPELL_SUBS[head][0] + tail
        if not attempt[0].isdigit():
            return attempt

    return None
|
||
|
||
|
||
def _spell_fix_field(text: str) -> Tuple[str, bool]:
    """Apply OCR corrections to a text field.

    First a pipe that is not preceded by a word character and is directly
    followed by ``.`` or ``,`` is turned into ``1`` (numbered-list prefix,
    "|. " → "1. "). Then every word token found by ``_SPELL_TOKEN_RE`` is
    offered to ``_spell_fix_token``. All text outside the tokens — including
    anything before the first token, such as leading digits or an opening
    bracket — is copied through unchanged.

    Args:
        text: Field content; empty text or text without a suspicious
            character is returned as-is.

    Returns:
        ``(fixed_text, was_changed)``.
    """
    if not text or not any(ch in text for ch in _SPELL_SUSPICIOUS):
        return text, False

    # Pattern: | immediately before . or , → numbered list prefix ("|. " → "1. ")
    fixed = _re.sub(r'(?<!\w)\|(?=[.,])', '1', text)
    changed = fixed != text

    # Tokenize and fix word by word
    parts: List[str] = []
    pos = 0
    for m in _SPELL_TOKEN_RE.finditer(fixed):
        # The tokenizer only starts a match at a letter or pipe, so text in
        # front of the first token is not part of any match. Copy it over
        # explicitly; otherwise it would vanish from the result.
        parts.append(fixed[pos:m.start()])
        token, sep = m.group(1), m.group(2)
        correction = _spell_fix_token(token)
        if correction:
            parts.append(correction)
            changed = True
        else:
            parts.append(token)
        parts.append(sep)
        pos = m.end()

    # Remainder after the last match (the whole text if nothing matched)
    parts.append(fixed[pos:])
    return ''.join(parts), changed
|
||
|
||
|
||
def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Run the rule-based OCR correction over all entries.

    Every entry is shallow-copied. Entries accepted by
    ``_entry_needs_review`` get their ``"english"`` and ``"german"`` fields
    (stripped) passed through ``_spell_fix_field``; the example field is not
    touched. A modified copy is flagged with ``"llm_corrected": True``.

    NOTE(review): ``skipped_count`` is always reported as 0, even for
    entries that ``_entry_needs_review`` rejects — confirm this is intended.

    Args:
        entries: Vocabulary entries; the input list and dicts are not
            modified.

    Returns:
        Dict with ``entries_original``, ``entries_corrected``, ``changes``,
        ``skipped_count``, ``model_used`` and ``duration_ms``.
    """
    started = time.time()
    changes: List[Dict] = []
    all_corrected: List[Dict] = []

    for position, source_entry in enumerate(entries):
        working = dict(source_entry)
        if _entry_needs_review(working):
            for field_name in ("english", "german"):
                before = (working.get(field_name) or "").strip()
                if not before:
                    continue
                after, touched = _spell_fix_field(before)
                if not touched or after == before:
                    continue
                changes.append({
                    "row_index": working.get("row_index", position),
                    "field": field_name,
                    "old": before,
                    "new": after,
                })
                working[field_name] = after
                working["llm_corrected"] = True
        all_corrected.append(working)

    duration_ms = int((time.time() - started) * 1000)
    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": 0,
        "model_used": "spell-checker",
        "duration_ms": duration_ms,
    }
|
||
|
||
|
||
async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Yield the spell-checker review as a sequence of SSE-style events.

    Emits exactly three dict events: ``meta``, one ``batch`` covering all
    entries, and ``complete``. The whole review runs in a single call to
    ``spell_review_entries_sync`` between the ``meta`` and ``batch`` events.

    Args:
        entries: Vocabulary entries to review.
        batch_size: Only echoed in the ``meta`` event; the review itself is
            not batched.
    """
    engine_label = "spell-checker"
    total = len(entries)

    yield {
        "type": "meta",
        "total_entries": total,
        "to_review": total,
        "skipped": 0,
        "model": engine_label,
        "batch_size": batch_size,
    }

    outcome = spell_review_entries_sync(entries)
    changes = outcome["changes"]
    elapsed_ms = outcome["duration_ms"]
    # Fall back to the list position for entries without a row_index.
    reviewed_rows = [item.get("row_index", idx) for idx, item in enumerate(entries)]

    yield {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": reviewed_rows,
        "changes": changes,
        "duration_ms": elapsed_ms,
        "progress": {"current": total, "total": total},
    }

    yield {
        "type": "complete",
        "changes": changes,
        "model_used": engine_label,
        "duration_ms": elapsed_ms,
        "total_entries": total,
        "reviewed": total,
        "skipped": 0,
        "corrections_found": len(changes),
        "entries_corrected": outcome["entries_corrected"],
    }
|
||
|
||
# ─── End Spell-Checker ────────────────────────────────────────────────────────
|
||
|
||
|
||
async def llm_review_entries(
    entries: List[Dict],
    model: str = None,
) -> Dict:
    """Correct OCR errors with the spell-checker or with an LLM.

    With ``REVIEW_ENGINE == "spell"`` and pyspellchecker available, the
    result of ``spell_review_entries_sync`` is returned. Otherwise all
    entries accepted by ``_entry_needs_review`` are sent to the Ollama chat
    endpoint in a single request, and the answer is merged back through
    ``_diff_batch``.

    Args:
        entries: Vocabulary entries to review.
        model: Ollama model name; defaults to ``OLLAMA_REVIEW_MODEL``.

    Returns:
        Dict with ``entries_original``, ``entries_corrected``, ``changes``,
        ``skipped_count``, ``model_used`` and ``duration_ms``.

    Raises:
        httpx.HTTPStatusError: If the Ollama endpoint answers with an error
            status (via ``raise_for_status``).
    """
    if REVIEW_ENGINE == "spell":
        if _SPELL_AVAILABLE:
            return spell_review_entries_sync(entries)
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # Keep each reviewable entry together with its position in `entries`
    reviewable = [
        (position, candidate)
        for position, candidate in enumerate(entries)
        if _entry_needs_review(candidate)
    ]

    if not reviewable:
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    skipped_count = len(entries) - len(reviewable)
    review_entries = [candidate for _, candidate in reviewable]
    table_lines = []
    for candidate in review_entries:
        table_lines.append({
            "row": candidate.get("row_index", 0),
            "en": candidate.get("english", ""),
            "de": candidate.get("german", ""),
            "ex": candidate.get("example", ""),
        })

    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, skipped_count)
    logger.debug("LLM review input: %s", _json.dumps(table_lines[:3], ensure_ascii=False))

    request_body = {
        "model": model,
        "messages": [{"role": "user", "content": _build_llm_prompt(table_lines)}],
        "stream": False,
        "think": False,  # qwen3: disable chain-of-thought (Ollama >=0.6)
        "options": {"temperature": 0.1, "num_predict": 8192},
    }

    t0 = time.time()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(f"{_OLLAMA_URL}/api/chat", json=request_body)
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - t0) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))
    logger.debug("LLM review raw response (first 500): %.500s", content)

    corrected = _parse_llm_json_array(content)
    logger.info("LLM review: parsed %d corrected entries, applying diff...", len(corrected))
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Write the reviewed entries back to their original positions
    all_corrected = [dict(e) for e in entries]
    for (orig_idx, _), fixed_entry in zip(reviewable, corrected_entries):
        all_corrected[orig_idx] = fixed_entry

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": skipped_count,
        "model_used": model,
        "duration_ms": duration_ms,
    }
|
||
|
||
|
||
async def llm_review_entries_streaming(
    entries: List[Dict],
    model: str = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator yielding SSE-style events for the OCR review.

    With ``REVIEW_ENGINE == "spell"`` and pyspellchecker available, the
    events of ``spell_review_entries_streaming`` are forwarded. Otherwise
    the entries accepted by ``_entry_needs_review`` are sent to the Ollama
    chat endpoint in batches. Events: one ``meta``, one ``batch`` per
    processed batch, one ``complete``.

    Args:
        entries: Vocabulary entries to review.
        model: Ollama model name; defaults to ``OLLAMA_REVIEW_MODEL``.
        batch_size: Entries per LLM request. Values below 1 are replaced by
            1 on the LLM path.

    Raises:
        httpx.HTTPStatusError: If the Ollama endpoint answers with an error
            status (via ``raise_for_status``).
    """
    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
        return
    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # range() raises ValueError for a zero step, and a negative step would
    # review nothing while still reporting every entry as reviewed.
    if batch_size < 1:
        logger.warning("LLM review streaming: invalid batch_size=%r, using 1", batch_size)
        batch_size = 1

    # Separate reviewable from skipped entries
    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    # meta event
    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]
    total_duration_ms = 0
    reviewed_count = 0

    # Process in batches
    for batch_start in range(0, total_to_review, batch_size):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d — sending %d entries to %s",
                    batch_start // batch_size, len(batch_entries), model)

        t0 = time.time()
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,  # qwen3: disable chain-of-thought
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
        batch_ms = int((time.time() - t0) * 1000)
        total_duration_ms += batch_ms

        logger.info("LLM review streaming: response %dms, length=%d chars", batch_ms, len(content))
        logger.debug("LLM review streaming raw (first 500): %.500s", content)

        corrected = _parse_llm_json_array(content)
        logger.info("LLM review streaming: parsed %d entries, applying diff...", len(corrected))
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Merge back into the positions the entries had in `entries`
        for batch_idx, (orig_idx, _) in enumerate(batch_items):
            if batch_idx < len(batch_corrected):
                all_corrected[orig_idx] = batch_corrected[batch_idx]

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        # Yield batch result
        yield {
            "type": "batch",
            "batch_index": batch_start // batch_size,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    # Complete event
    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }
|
||
|
||
|
||
def _sanitize_for_json(text: str) -> str:
|
||
"""Remove or escape control characters that break JSON parsing.
|
||
|
||
Keeps tab (\\t), newline (\\n), carriage return (\\r) which are valid
|
||
JSON whitespace. Removes all other ASCII control characters (0x00-0x1f)
|
||
that are only valid inside JSON strings when properly escaped.
|
||
"""
|
||
# Replace literal control chars (except \\t \\n \\r) with a space
|
||
return _re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
|
||
|
||
|
||
def _parse_llm_json_array(text: str) -> List[Dict]:
    """Extract the JSON array from an LLM response.

    Removes ``<think>...</think>`` blocks (qwen3 emits them on some builds
    even with think=False) and markdown code fences, blanks out control
    characters, and parses the span from the first ``[`` to the last ``]``.

    Returns:
        The parsed array, or an empty list (with a logged warning) when no
        array is found or it is not valid JSON.
    """
    cleaned = _re.sub(r'<think>.*?</think>', '', text, flags=_re.DOTALL)

    # Drop markdown fences: the tagged opener first, then any bare fence
    for fence_pattern in (r'```json\s*', r'```\s*'):
        cleaned = _re.sub(fence_pattern, '', cleaned)

    # Sanitize control characters before JSON parsing
    cleaned = _sanitize_for_json(cleaned)

    # Greedy match: first "[" through last "]"
    found = _re.search(r'\[.*\]', cleaned, _re.DOTALL)
    if found is None:
        logger.warning("LLM review: no JSON array found in response (%.200s)", cleaned[:200])
        return []

    candidate = found.group()
    try:
        return _json.loads(candidate)
    except (ValueError, _json.JSONDecodeError) as e:
        logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, candidate[:200])
        return []
|