"""
CV Preprocessing Deskew — Rotation correction via Hough lines, word alignment, and iterative projection.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""

import logging
import math
from collections import defaultdict
from typing import Any, Dict, Tuple

import numpy as np

from cv_vocab_types import (
    CV2_AVAILABLE,
    TESSERACT_AVAILABLE,
)

logger = logging.getLogger(__name__)

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    import pytesseract
    from PIL import Image
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]


# =============================================================================
# Shared helpers
# =============================================================================

def _rotate_about_center(img: np.ndarray, angle_deg: float) -> np.ndarray:
    """Rotate *img* by *angle_deg* around its integer centre, keeping its size.

    Uses bilinear interpolation and replicates border pixels so that no
    constant-colour wedges are introduced at the corners.
    """
    h, w = img.shape[:2]
    matrix = cv2.getRotationMatrix2D((w // 2, h // 2), angle_deg, 1.0)
    return cv2.warpAffine(
        img, matrix, (w, h),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )


# =============================================================================
# Deskew via Hough Lines
# =============================================================================

def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
    """Correct rotation using Hough Line detection.

    Long, near-horizontal line segments are detected on the Otsu-binarised
    image; the median of their angles is taken as the skew, clamped to
    +/-5 degrees, and the image is rotated by that amount.

    Args:
        img: BGR image.

    Returns:
        Tuple of (corrected image, detected angle in degrees). The input
        image and 0.0 are returned unchanged when OpenCV is unavailable,
        fewer than three segments are found, no segment is within 15 degrees
        of horizontal, or the detected skew is below 0.1 degrees.
    """
    # Same availability guard the other entry points in this module use.
    if not CV2_AVAILABLE:
        return img, 0.0

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    lines = cv2.HoughLinesP(
        binary, 1, np.pi / 180,
        threshold=100,
        minLineLength=img.shape[1] // 4,
        maxLineGap=20,
    )
    if lines is None or len(lines) < 3:
        return img, 0.0

    # Each detected segment is (x1, y1, x2, y2); compute all angles at once.
    segments = lines.reshape(-1, 4)
    dx = segments[:, 2] - segments[:, 0]
    dy = segments[:, 3] - segments[:, 1]
    all_angles = np.degrees(np.arctan2(dy, dx))

    # Only near-horizontal segments are plausible text/ruling lines.
    angles = all_angles[np.abs(all_angles) < 15]
    if angles.size == 0:
        return img, 0.0

    # Clamp to +/-5 degrees and keep a plain Python float for the return value.
    median_angle = float(np.clip(np.median(angles), -5.0, 5.0))
    if abs(median_angle) < 0.1:
        return img, 0.0

    corrected = _rotate_about_center(img, median_angle)
    logger.info(f"Deskew: corrected {median_angle:.2f}\u00b0 rotation")
    return corrected, median_angle


# =============================================================================
# Deskew via Word Alignment
# =============================================================================

def deskew_image_by_word_alignment(
    image_data: bytes,
    lang: str = "eng+deu",
    downscale_factor: float = 0.5,
) -> Tuple[bytes, float]:
    """Correct rotation by fitting a line through left-most word starts per text line.

    More robust than Hough-based deskew for vocabulary worksheets where text
    lines have consistent left-alignment.

    Args:
        image_data: Raw image bytes (PNG/JPEG).
        lang: Tesseract language string for the quick pass.
        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).
            Must be positive.

    Returns:
        Tuple of (rotated image as PNG bytes, detected angle in degrees).
        The original bytes and 0.0 are returned whenever the dependencies are
        missing, the input cannot be decoded, Tesseract fails, too few
        aligned lines are found, or the detected angle is below 0.05 degrees.
    """
    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
        return image_data, 0.0

    if not downscale_factor > 0:
        # A zero/negative factor would give a zero-size resize and a division
        # by zero when scaling coordinates back up.
        logger.warning(
            "deskew_by_word_alignment: invalid downscale_factor %r, skipping",
            downscale_factor,
        )
        return image_data, 0.0

    # An empty buffer is treated like any other undecodable input.
    img = None
    if image_data:
        img_array = np.frombuffer(image_data, dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if img is None:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return image_data, 0.0

    orig_h, orig_w = img.shape[:2]

    # Quick OCR pass on a downscaled copy; never request a 0-pixel dimension.
    small_w = max(1, int(orig_w * downscale_factor))
    small_h = max(1, int(orig_h * downscale_factor))
    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)
    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(
            pil_small,
            lang=lang,
            config="--psm 6 --oem 3",
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
        return image_data, 0.0

    # Group the indices of confidently recognised words by text line.
    line_groups: Dict[tuple, list] = defaultdict(list)
    for i, raw_text in enumerate(data["text"]):
        text = (raw_text or "").strip()
        conf = int(data["conf"][i])
        if not text or conf < 20:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        line_groups[key].append(i)

    if len(line_groups) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
        return image_data, 0.0

    # One point per line: left edge and vertical centre of the left-most word,
    # mapped back to full-resolution coordinates.
    scale = 1.0 / downscale_factor
    points = []
    for indices in line_groups.values():
        best_idx = min(indices, key=lambda idx: data["left"][idx])
        lx = data["left"][best_idx] * scale
        top = data["top"][best_idx] * scale
        height = data["height"][best_idx] * scale
        points.append((lx, top + height / 2.0))

    xs = np.array([p[0] for p in points])
    ys = np.array([p[1] for p in points])

    # Keep only line starts within 3% of the image width of the median x,
    # i.e. the ones that share the dominant left margin.
    median_x = float(np.median(xs))
    tolerance = orig_w * 0.03
    mask = np.abs(xs - median_x) <= tolerance
    filtered_xs = xs[mask]
    filtered_ys = ys[mask]

    if len(filtered_xs) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
        return image_data, 0.0

    # Fit x = slope * y + b, so slope is dx/dy of the left margin.
    # NOTE(review): the other deskew passes in this module measure dy/dx of
    # text lines; confirm on a known-skewed sample that the sign handed to
    # the rotation below matches theirs.
    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
    slope = coeffs[0]
    angle_deg = float(np.degrees(np.arctan(slope)))
    angle_deg = max(-5.0, min(5.0, angle_deg))

    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}\u00b0 from {len(filtered_xs)} points "
                f"(total lines: {len(line_groups)})")

    if abs(angle_deg) < 0.05:
        return image_data, 0.0

    rotated = _rotate_about_center(img, angle_deg)

    success, png_buf = cv2.imencode(".png", rotated)
    if not success:
        logger.warning("deskew_by_word_alignment: PNG encoding failed")
        return image_data, 0.0

    return png_buf.tobytes(), angle_deg


# =============================================================================
# Projection Gradient Scoring
# =============================================================================

def _projection_gradient_score(profile: np.ndarray) -> float:
    """Score a projection profile by the squared L2 norm of its first difference.

    Returns the sum of squared differences between neighbouring entries;
    a sharper (more peaked) profile yields a larger score.
    """
    diff = np.diff(profile)
    return float(np.sum(diff * diff))


# =============================================================================
# Iterative Deskew (Vertical-Edge Projection)
# =============================================================================

def deskew_image_iterative(
    img: np.ndarray,
    coarse_range: float = 5.0,
    coarse_step: float = 0.1,
    fine_range: float = 0.15,
    fine_step: float = 0.02,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Iterative deskew using vertical-edge projection optimisation.

    The horizontal Sobel response of the central image region is rotated over
    a coarse and then a fine range of angles; the angle whose column-sum
    profile has the highest gradient score wins.

    Args:
        img: BGR image (full resolution).
        coarse_range: half-range in degrees for the coarse sweep.
        coarse_step: step size in degrees for the coarse sweep.
        fine_range: half-range around the coarse winner for the fine sweep.
        fine_step: step size in degrees for the fine sweep.

    Returns:
        (rotated_bgr, angle_degrees, debug_dict). The final angle is clamped
        to +/-5 degrees. The input image and 0.0 are returned when OpenCV is
        unavailable, the image is too small to crop, no edges are found, or
        the winning angle is below 0.05 degrees.
    """
    # Same availability guard the other entry points in this module use.
    if not CV2_AVAILABLE:
        return img, 0.0, {"error": "cv2 not available"}

    h, w = img.shape[:2]
    debug: Dict[str, Any] = {}

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Work on the central region only (15% trimmed top/bottom, 10% left/right).
    y_lo, y_hi = int(h * 0.15), int(h * 0.85)
    x_lo, x_hi = int(w * 0.10), int(w * 0.90)
    gray_crop = gray[y_lo:y_hi, x_lo:x_hi]
    if gray_crop.size == 0:
        # Tiny inputs leave nothing to analyse after cropping.
        return img, 0.0, {"error": "image too small"}

    sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
    edges = np.abs(sobel_x)
    edge_max = edges.max()
    if not edge_max > 0:
        return img, 0.0, {"error": "no edges detected"}
    # Normalise to 8-bit so the sweep rotates a compact image.
    edges = (edges / edge_max * 255).astype(np.uint8)

    crop_h, crop_w = edges.shape[:2]
    crop_center = (crop_w // 2, crop_h // 2)

    # Margin cut off after each rotation (at least 4 px, 3% of each dimension).
    trim_y = max(4, int(crop_h * 0.03))
    trim_x = max(4, int(crop_w * 0.03))

    def _sweep_edges(angles: np.ndarray) -> list:
        """Return (angle, score) for every candidate rotation angle."""
        results = []
        for angle in angles:
            if abs(angle) < 1e-6:
                rotated = edges
            else:
                M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
                rotated = cv2.warpAffine(
                    edges, M, (crop_w, crop_h),
                    flags=cv2.INTER_NEAREST,
                    borderMode=cv2.BORDER_REPLICATE,
                )
            trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
            v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
            score = _projection_gradient_score(v_profile)
            results.append((float(angle), score))
        return results

    # --- Coarse sweep (half a step is added so the upper bound is included) ---
    coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
    coarse_results = _sweep_edges(coarse_angles)
    best_coarse_angle, best_coarse_score = max(coarse_results, key=lambda x: x[1])

    debug["coarse_best_angle"] = round(best_coarse_angle, 2)
    debug["coarse_best_score"] = round(best_coarse_score, 1)
    debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]

    # --- Fine sweep around the coarse winner ---
    fine_lo = best_coarse_angle - fine_range
    fine_hi = best_coarse_angle + fine_range
    fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
    fine_results = _sweep_edges(fine_angles)
    best_fine_angle, best_fine_score = max(fine_results, key=lambda x: x[1])

    debug["fine_best_angle"] = round(best_fine_angle, 2)
    debug["fine_best_score"] = round(best_fine_score, 1)
    debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]

    final_angle = max(-5.0, min(5.0, best_fine_angle))

    logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}\u00b0 fine={best_fine_angle:.2f}\u00b0 -> {final_angle:.2f}\u00b0")

    if abs(final_angle) < 0.05:
        return img, 0.0, debug

    rotated = _rotate_about_center(img, final_angle)
    return rotated, final_angle, debug


# =============================================================================
# Text-Line Slope Measurement
# =============================================================================

def _measure_textline_slope(img: np.ndarray) -> float:
    """Measure residual text-line slope via Tesseract word-position regression.

    Word centres are grouped by text line; a least-squares line is fitted per
    text line (at least 3 words spanning at least 15% of the image width),
    and a trimmed mean of the per-line slope angles is returned.

    Returns:
        Slope angle in degrees, or 0.0 when the dependencies are missing or
        fewer than three usable lines are found.
    """
    if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
        return 0.0

    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(
        Image.fromarray(gray),
        output_type=pytesseract.Output.DICT,
        config="--psm 6",
    )

    # Collect word centre points per text line.
    lines: Dict[tuple, list] = {}
    for i, raw_text in enumerate(data["text"]):
        txt = (raw_text or "").strip()
        if len(txt) < 2 or int(data["conf"][i]) < 30:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        cx = data["left"][i] + data["width"][i] / 2.0
        cy = data["top"][i] + data["height"][i] / 2.0
        lines.setdefault(key, []).append((cx, cy))

    slopes: list = []
    for pts in lines.values():
        if len(pts) < 3:
            continue
        pts.sort(key=lambda p: p[0])
        xs = np.array([p[0] for p in pts], dtype=np.float64)
        ys = np.array([p[1] for p in pts], dtype=np.float64)
        # Short lines give unreliable slopes.
        if xs[-1] - xs[0] < w * 0.15:
            continue
        # Least-squares fit of y = slope * x + intercept.
        A = np.vstack([xs, np.ones_like(xs)]).T
        result = np.linalg.lstsq(A, ys, rcond=None)
        slope = result[0][0]
        slopes.append(math.degrees(math.atan(slope)))

    if len(slopes) < 3:
        return 0.0

    # Trimmed mean: drop ~10% (at least one value) from each end when possible.
    slopes.sort()
    trim = max(1, len(slopes) // 10)
    trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes
    if not trimmed:
        return 0.0

    return sum(trimmed) / len(trimmed)


# =============================================================================
# Two-Pass Deskew
# =============================================================================

def deskew_two_pass(
    img: np.ndarray,
    coarse_range: float = 5.0,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Multi-stage deskew: iterative projection followed by residual checks.

    Despite the name, three stages run in sequence:

    1. Iterative vertical-edge projection (``deskew_image_iterative``).
    2. Word-alignment residual check (``deskew_image_by_word_alignment``),
       applied only when its angle is at least 0.3 degrees.
    3. Text-line regression residual check (``_measure_textline_slope``),
       applied only when its angle is at least 0.3 degrees.

    Failures in stages 2 and 3 are logged and treated as a zero residual.

    Args:
        img: BGR image. It is copied before processing.
        coarse_range: half-range in degrees for the stage-1 coarse sweep.

    Returns:
        (corrected_bgr, total_angle_degrees, debug_dict), where the total
        angle is the sum of the angles applied by all three stages.
    """
    debug: Dict[str, Any] = {}

    # --- Pass 1: iterative projection ---
    corrected, angle1, dbg1 = deskew_image_iterative(
        img.copy(),
        coarse_range=coarse_range,
    )
    debug["pass1_angle"] = round(angle1, 3)
    debug["pass1_method"] = "iterative"
    debug["pass1_debug"] = dbg1

    # --- Pass 2: word-alignment residual check ---
    angle2 = 0.0
    try:
        # The word-alignment routine works on encoded bytes, so round-trip via PNG.
        ok, buf = cv2.imencode(".png", corrected)
        if ok:
            corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
            if abs(angle2) >= 0.3:
                arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
                corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
                if corrected2 is not None:
                    corrected = corrected2
                    logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 applied "
                                f"(total={angle1 + angle2:.2f}\u00b0)")
                else:
                    angle2 = 0.0
            else:
                logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 < 0.3\u00b0 -- skipped")
                angle2 = 0.0
    except Exception as e:
        logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
        angle2 = 0.0

    # --- Pass 3: Tesseract text-line regression residual check ---
    angle3 = 0.0
    try:
        residual = _measure_textline_slope(corrected)
        debug["pass3_raw"] = round(residual, 3)
        if abs(residual) >= 0.3:
            corrected = _rotate_about_center(corrected, residual)
            angle3 = residual
            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 applied", residual)
        else:
            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 < 0.3\u00b0 -- skipped", residual)
    except Exception as e:
        logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)

    total_angle = angle1 + angle2 + angle3
    debug["pass2_angle"] = round(angle2, 3)
    debug["pass2_method"] = "word_alignment"
    debug["pass3_angle"] = round(angle3, 3)
    debug["pass3_method"] = "textline_regression"
    debug["total_angle"] = round(total_angle, 3)

    logger.info(
        "deskew_two_pass: pass1=%.2f\u00b0 + pass2=%.2f\u00b0 + pass3=%.2f\u00b0 = %.2f\u00b0",
        angle1, angle2, angle3, total_angle,
    )

    return corrected, total_angle, debug