Fix: Sidebar scrollable + add Eltern-Portal nav link

overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 20:49:44 +02:00
parent d87645ffce
commit 45287b3541
48 changed files with 6 additions and 1 deletions
--- a/klausur-service/backend/ocr/preprocessing/deskew.py
+++ b/klausur-service/backend/ocr/preprocessing/deskew.py
@@ -0,0 +1,437 @@
+"""
+CV Preprocessing Deskew — Rotation correction via Hough lines, word alignment, and iterative projection.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from collections import defaultdict
+from typing import Any, Dict, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+    CV2_AVAILABLE,
+    TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    import pytesseract
+    from PIL import Image
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Deskew via Hough Lines
+# =============================================================================
+
+def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
+    """Correct rotation using Hough Line detection.
+
+    Args:
+        img: BGR image.
+
+    Returns:
+        Tuple of (corrected image, detected angle in degrees).
+    """
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+    lines = cv2.HoughLinesP(binary, 1, np.pi / 180, threshold=100,
+                            minLineLength=img.shape[1] // 4, maxLineGap=20)
+
+    if lines is None or len(lines) < 3:
+        return img, 0.0
+
+    angles = []
+    for line in lines:
+        x1, y1, x2, y2 = line[0]
+        angle = np.degrees(np.arctan2(y2 - y1, x2 - x1))
+        if abs(angle) < 15:
+            angles.append(angle)
+
+    if not angles:
+        return img, 0.0
+
+    median_angle = float(np.median(angles))
+
+    if abs(median_angle) > 5.0:
+        median_angle = 5.0 * np.sign(median_angle)
+
+    if abs(median_angle) < 0.1:
+        return img, 0.0
+
+    h, w = img.shape[:2]
+    center = (w // 2, h // 2)
+    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
+    corrected = cv2.warpAffine(img, M, (w, h),
+                               flags=cv2.INTER_LINEAR,
+                               borderMode=cv2.BORDER_REPLICATE)
+
+    logger.info(f"Deskew: corrected {median_angle:.2f}\u00b0 rotation")
+    return corrected, median_angle
+
+
+# =============================================================================
+# Deskew via Word Alignment
+# =============================================================================
+
+def deskew_image_by_word_alignment(
+    image_data: bytes,
+    lang: str = "eng+deu",
+    downscale_factor: float = 0.5,
+) -> Tuple[bytes, float]:
+    """Correct rotation by fitting a line through left-most word starts per text line.
+
+    More robust than Hough-based deskew for vocabulary worksheets where text lines
+    have consistent left-alignment.
+
+    Args:
+        image_data: Raw image bytes (PNG/JPEG).
+        lang: Tesseract language string for the quick pass.
+        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).
+
+    Returns:
+        Tuple of (rotated image as PNG bytes, detected angle in degrees).
+    """
+    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
+        return image_data, 0.0
+
+    img_array = np.frombuffer(image_data, dtype=np.uint8)
+    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+    if img is None:
+        logger.warning("deskew_by_word_alignment: could not decode image")
+        return image_data, 0.0
+
+    orig_h, orig_w = img.shape[:2]
+
+    small_w = int(orig_w * downscale_factor)
+    small_h = int(orig_h * downscale_factor)
+    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)
+
+    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
+    try:
+        data = pytesseract.image_to_data(
+            pil_small, lang=lang, config="--psm 6 --oem 3",
+            output_type=pytesseract.Output.DICT,
+        )
+    except Exception as e:
+        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
+        return image_data, 0.0
+
+    line_groups: Dict[tuple, list] = defaultdict(list)
+    for i in range(len(data["text"])):
+        text = (data["text"][i] or "").strip()
+        conf = int(data["conf"][i])
+        if not text or conf < 20:
+            continue
+        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
+        line_groups[key].append(i)
+
+    if len(line_groups) < 5:
+        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
+        return image_data, 0.0
+
+    scale = 1.0 / downscale_factor
+    points = []
+    for key, indices in line_groups.items():
+        best_idx = min(indices, key=lambda i: data["left"][i])
+        lx = data["left"][best_idx] * scale
+        top = data["top"][best_idx] * scale
+        h = data["height"][best_idx] * scale
+        cy = top + h / 2.0
+        points.append((lx, cy))
+
+    xs = np.array([p[0] for p in points])
+    ys = np.array([p[1] for p in points])
+    median_x = float(np.median(xs))
+    tolerance = orig_w * 0.03
+
+    mask = np.abs(xs - median_x) <= tolerance
+    filtered_xs = xs[mask]
+    filtered_ys = ys[mask]
+
+    if len(filtered_xs) < 5:
+        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
+        return image_data, 0.0
+
+    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
+    slope = coeffs[0]
+    angle_rad = np.arctan(slope)
+    angle_deg = float(np.degrees(angle_rad))
+
+    angle_deg = max(-5.0, min(5.0, angle_deg))
+
+    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}\u00b0 from {len(filtered_xs)} points "
+                f"(total lines: {len(line_groups)})")
+
+    if abs(angle_deg) < 0.05:
+        return image_data, 0.0
+
+    center = (orig_w // 2, orig_h // 2)
+    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
+    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
+                              flags=cv2.INTER_LINEAR,
+                              borderMode=cv2.BORDER_REPLICATE)
+
+    success, png_buf = cv2.imencode(".png", rotated)
+    if not success:
+        logger.warning("deskew_by_word_alignment: PNG encoding failed")
+        return image_data, 0.0
+
+    return png_buf.tobytes(), angle_deg
+
+
+# =============================================================================
+# Projection Gradient Scoring
+# =============================================================================
+
+def _projection_gradient_score(profile: np.ndarray) -> float:
+    """Score a projection profile by the L2-norm of its first derivative."""
+    diff = np.diff(profile)
+    return float(np.sum(diff * diff))
+
+
+# =============================================================================
+# Iterative Deskew (Vertical-Edge Projection)
+# =============================================================================
+
+def deskew_image_iterative(
+    img: np.ndarray,
+    coarse_range: float = 5.0,
+    coarse_step: float = 0.1,
+    fine_range: float = 0.15,
+    fine_step: float = 0.02,
+) -> Tuple[np.ndarray, float, Dict[str, Any]]:
+    """Iterative deskew using vertical-edge projection optimisation.
+
+    A horizontal Sobel edge map of the central image region is rotated over a
+    coarse grid of angles, then over a fine grid around the coarse winner.
+    Each candidate is scored by the gradient energy of its column-sum profile
+    (``_projection_gradient_score``); the highest score wins. The winning
+    angle is clamped to +/-5 degrees and applied to the full image.
+
+    Args:
+        img: BGR image (full resolution).
+        coarse_range: half-range in degrees for the coarse sweep.
+        coarse_step: step size in degrees for the coarse sweep.
+        fine_range: half-range around the coarse winner for the fine sweep.
+        fine_step: step size in degrees for the fine sweep.
+
+    Returns:
+        (rotated_bgr, angle_degrees, debug_dict). The input image and 0.0 are
+        returned when no edges are found (debug_dict then holds an ``error``
+        key) or when the final angle is below 0.05 degrees.
+    """
+    h, w = img.shape[:2]
+    debug: Dict[str, Any] = {}
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    # Analyse only the central region: 15% cut from top/bottom, 10% from left/right.
+    y_lo, y_hi = int(h * 0.15), int(h * 0.85)
+    x_lo, x_hi = int(w * 0.10), int(w * 0.90)
+    gray_crop = gray[y_lo:y_hi, x_lo:x_hi]
+
+    # Horizontal derivative -> responds to vertical edges; normalised to 0..255.
+    sobel_x = cv2.Sobel(gray_crop, cv2.CV_64F, 1, 0, ksize=3)
+    edges = np.abs(sobel_x)
+    edge_max = edges.max()
+    if edge_max > 0:
+        edges = (edges / edge_max * 255).astype(np.uint8)
+    else:
+        return img, 0.0, {"error": "no edges detected"}
+
+    crop_h, crop_w = edges.shape[:2]
+    crop_center = (crop_w // 2, crop_h // 2)
+
+    # Margin removed after each rotation so replicated border pixels do not
+    # enter the projection (at least 4 px, otherwise 3% of the crop size).
+    trim_y = max(4, int(crop_h * 0.03))
+    trim_x = max(4, int(crop_w * 0.03))
+
+    def _sweep_edges(angles: np.ndarray) -> list:
+        # Score every candidate angle; returns a list of (angle, score) pairs
+        # in the same order as `angles`.
+        results = []
+        for angle in angles:
+            if abs(angle) < 1e-6:
+                # Zero rotation: use the edge map as-is.
+                rotated = edges
+            else:
+                M = cv2.getRotationMatrix2D(crop_center, angle, 1.0)
+                rotated = cv2.warpAffine(edges, M, (crop_w, crop_h),
+                                         flags=cv2.INTER_NEAREST,
+                                         borderMode=cv2.BORDER_REPLICATE)
+            trimmed = rotated[trim_y:-trim_y, trim_x:-trim_x]
+            # Column sums: upright vertical edges give a sharply peaked profile.
+            v_profile = np.sum(trimmed, axis=0, dtype=np.float64)
+            score = _projection_gradient_score(v_profile)
+            results.append((float(angle), score))
+        return results
+
+    # Coarse sweep; the half-step added to the stop value makes the upper
+    # bound inclusive despite floating-point rounding in np.arange.
+    coarse_angles = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
+    coarse_results = _sweep_edges(coarse_angles)
+    best_coarse = max(coarse_results, key=lambda x: x[1])
+    best_coarse_angle, best_coarse_score = best_coarse
+
+    debug["coarse_best_angle"] = round(best_coarse_angle, 2)
+    debug["coarse_best_score"] = round(best_coarse_score, 1)
+    debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]
+
+    # Fine sweep centred on the coarse winner.
+    fine_lo = best_coarse_angle - fine_range
+    fine_hi = best_coarse_angle + fine_range
+    fine_angles = np.arange(fine_lo, fine_hi + fine_step * 0.5, fine_step)
+    fine_results = _sweep_edges(fine_angles)
+    best_fine = max(fine_results, key=lambda x: x[1])
+    best_fine_angle, best_fine_score = best_fine
+
+    debug["fine_best_angle"] = round(best_fine_angle, 2)
+    debug["fine_best_score"] = round(best_fine_score, 1)
+    debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]
+
+    # Hard safety clamp of +/-5 degrees, independent of coarse_range.
+    final_angle = best_fine_angle
+    final_angle = max(-5.0, min(5.0, final_angle))
+
+    logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}\u00b0 fine={best_fine_angle:.2f}\u00b0 -> {final_angle:.2f}\u00b0")
+
+    # Skip the resampling step for negligible angles.
+    if abs(final_angle) < 0.05:
+        return img, 0.0, debug
+
+    # Apply the winning angle to the full-resolution image.
+    center = (w // 2, h // 2)
+    M = cv2.getRotationMatrix2D(center, final_angle, 1.0)
+    rotated = cv2.warpAffine(img, M, (w, h),
+                              flags=cv2.INTER_LINEAR,
+                              borderMode=cv2.BORDER_REPLICATE)
+
+    return rotated, final_angle, debug
+
+
+# =============================================================================
+# Text-Line Slope Measurement
+# =============================================================================
+
+def _measure_textline_slope(img: np.ndarray) -> float:
+    """Measure residual text-line slope via Tesseract word-position regression."""
+    import math as _math
+
+    if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
+        return 0.0
+
+    h, w = img.shape[:2]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    data = pytesseract.image_to_data(
+        Image.fromarray(gray),
+        output_type=pytesseract.Output.DICT,
+        config="--psm 6",
+    )
+
+    lines: Dict[tuple, list] = {}
+    for i in range(len(data["text"])):
+        txt = (data["text"][i] or "").strip()
+        if len(txt) < 2 or int(data["conf"][i]) < 30:
+            continue
+        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
+        cx = data["left"][i] + data["width"][i] / 2.0
+        cy = data["top"][i] + data["height"][i] / 2.0
+        lines.setdefault(key, []).append((cx, cy))
+
+    slopes: list = []
+    for pts in lines.values():
+        if len(pts) < 3:
+            continue
+        pts.sort(key=lambda p: p[0])
+        xs = np.array([p[0] for p in pts], dtype=np.float64)
+        ys = np.array([p[1] for p in pts], dtype=np.float64)
+        if xs[-1] - xs[0] < w * 0.15:
+            continue
+        A = np.vstack([xs, np.ones_like(xs)]).T
+        result = np.linalg.lstsq(A, ys, rcond=None)
+        slope = result[0][0]
+        slopes.append(_math.degrees(_math.atan(slope)))
+
+    if len(slopes) < 3:
+        return 0.0
+
+    slopes.sort()
+    trim = max(1, len(slopes) // 10)
+    trimmed = slopes[trim:-trim] if len(slopes) > 2 * trim else slopes
+    if not trimmed:
+        return 0.0
+
+    return sum(trimmed) / len(trimmed)
+
+
+# =============================================================================
+# Two-Pass Deskew
+# =============================================================================
+
+def deskew_two_pass(
+    img: np.ndarray,
+    coarse_range: float = 5.0,
+) -> Tuple[np.ndarray, float, Dict[str, Any]]:
+    """Multi-stage deskew: iterative projection followed by two residual checks.
+
+    Despite the name, three stages run in sequence:
+
+    1. ``deskew_image_iterative`` on a copy of the input.
+    2. ``deskew_image_by_word_alignment`` on the stage-1 result; its output is
+       adopted only when the reported angle is at least 0.3 degrees.
+    3. ``_measure_textline_slope`` on the current result; a residual of at
+       least 0.3 degrees is removed with one more rotation.
+
+    Failures in stages 2 and 3 are logged and treated as "no correction".
+
+    Args:
+        img: BGR image.
+        coarse_range: half-range in degrees forwarded to the iterative stage.
+
+    Returns:
+        (corrected_bgr, total_angle_degrees, debug_dict), where the total is
+        the sum of the angles of all stages that were applied.
+    """
+    debug: Dict[str, Any] = {}
+
+    # --- Pass 1: iterative projection ---
+    corrected, angle1, dbg1 = deskew_image_iterative(
+        img.copy(), coarse_range=coarse_range,
+    )
+    debug["pass1_angle"] = round(angle1, 3)
+    debug["pass1_method"] = "iterative"
+    debug["pass1_debug"] = dbg1
+
+    # --- Pass 2: word-alignment residual check ---
+    angle2 = 0.0
+    try:
+        # The word-alignment helper works on encoded bytes, so round-trip via PNG.
+        ok, buf = cv2.imencode(".png", corrected)
+        if ok:
+            corrected_bytes, angle2 = deskew_image_by_word_alignment(buf.tobytes())
+            if abs(angle2) >= 0.3:
+                arr2 = np.frombuffer(corrected_bytes, dtype=np.uint8)
+                corrected2 = cv2.imdecode(arr2, cv2.IMREAD_COLOR)
+                if corrected2 is not None:
+                    corrected = corrected2
+                    logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 applied "
+                                f"(total={angle1 + angle2:.2f}\u00b0)")
+                else:
+                    # Decoding failed: keep the pass-1 image and count no angle.
+                    angle2 = 0.0
+            else:
+                # Residual too small: discard the rotated bytes and the angle.
+                logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 < 0.3\u00b0 -- skipped")
+                angle2 = 0.0
+    except Exception as e:
+        logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
+        angle2 = 0.0
+
+    # --- Pass 3: Tesseract text-line regression residual check ---
+    angle3 = 0.0
+    try:
+        residual = _measure_textline_slope(corrected)
+        debug["pass3_raw"] = round(residual, 3)
+        if abs(residual) >= 0.3:
+            h3, w3 = corrected.shape[:2]
+            center3 = (w3 // 2, h3 // 2)
+            M3 = cv2.getRotationMatrix2D(center3, residual, 1.0)
+            corrected = cv2.warpAffine(
+                corrected, M3, (w3, h3),
+                flags=cv2.INTER_LINEAR,
+                borderMode=cv2.BORDER_REPLICATE,
+            )
+            angle3 = residual
+            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 applied", residual)
+        else:
+            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 < 0.3\u00b0 -- skipped", residual)
+    except Exception as e:
+        # Best effort: keep whatever correction has been achieved so far.
+        logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)
+
+    # Only angles of stages that were actually applied contribute to the total.
+    total_angle = angle1 + angle2 + angle3
+    debug["pass2_angle"] = round(angle2, 3)
+    debug["pass2_method"] = "word_alignment"
+    debug["pass3_angle"] = round(angle3, 3)
+    debug["pass3_method"] = "textline_regression"
+    debug["total_angle"] = round(total_angle, 3)
+
+    logger.info(
+        "deskew_two_pass: pass1=%.2f\u00b0 + pass2=%.2f\u00b0 + pass3=%.2f\u00b0 = %.2f\u00b0",
+        angle1, angle2, angle3, total_angle,
+    )
+
+    return corrected, total_angle, debug
--- a/klausur-service/backend/ocr/preprocessing/dewarp.py
+++ b/klausur-service/backend/ocr/preprocessing/dewarp.py
@@ -0,0 +1,474 @@
+"""
+CV Preprocessing Dewarp — Vertical shear detection and correction.
+
+Provides four shear detection methods (vertical edge, projection variance,
+Hough lines, text-line drift), ensemble combination, quality gating,
+and the main dewarp_image() function.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import math
+import time
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+    CV2_AVAILABLE,
+    TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    import pytesseract
+    from PIL import Image
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+
+
+# =============================================================================
+# Shear Detection Methods
+# =============================================================================
+
+def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
+    """Detect vertical shear angle via strongest vertical edge tracking (Method A)."""
+    h, w = img.shape[:2]
+    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}
+
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
+    abs_sobel = np.abs(sobel_x).astype(np.uint8)
+
+    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+    num_strips = 20
+    strip_h = h // num_strips
+    edge_positions = []
+
+    for i in range(num_strips):
+        y_start = i * strip_h
+        y_end = min((i + 1) * strip_h, h)
+        strip = binary[y_start:y_end, :]
+
+        projection = np.sum(strip, axis=0).astype(np.float64)
+        if projection.max() == 0:
+            continue
+
+        search_w = int(w * 0.4)
+        left_proj = projection[:search_w]
+        if left_proj.max() == 0:
+            continue
+
+        kernel_size = max(3, w // 100)
+        if kernel_size % 2 == 0:
+            kernel_size += 1
+        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
+        x_pos = float(np.argmax(smoothed))
+        y_center = (y_start + y_end) / 2.0
+        edge_positions.append((y_center, x_pos))
+
+    if len(edge_positions) < 8:
+        return result
+
+    ys = np.array([p[0] for p in edge_positions])
+    xs = np.array([p[1] for p in edge_positions])
+
+    median_x = np.median(xs)
+    std_x = max(np.std(xs), 1.0)
+    mask = np.abs(xs - median_x) < 2 * std_x
+    ys = ys[mask]
+    xs = xs[mask]
+
+    if len(ys) < 6:
+        return result
+
+    straight_coeffs = np.polyfit(ys, xs, 1)
+    slope = straight_coeffs[0]
+    fitted = np.polyval(straight_coeffs, ys)
+    residuals = xs - fitted
+    rmse = float(np.sqrt(np.mean(residuals ** 2)))
+
+    shear_degrees = math.degrees(math.atan(slope))
+
+    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(float(confidence), 2)
+
+    return result
+
+
+def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
+    """Detect shear angle by maximising variance of horizontal text-line projections (Method B)."""
+    result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}
+
+    h, w = img.shape[:2]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+
+    small = cv2.resize(binary, (w // 2, h // 2), interpolation=cv2.INTER_AREA)
+    sh, sw = small.shape
+
+    def _sweep_variance(angles_list):
+        results = []
+        for angle_deg in angles_list:
+            if abs(angle_deg) < 0.001:
+                rotated = small
+            else:
+                shear_tan = math.tan(math.radians(angle_deg))
+                M = np.float32([[1, shear_tan, -sh / 2.0 * shear_tan], [0, 1, 0]])
+                rotated = cv2.warpAffine(small, M, (sw, sh),
+                                         flags=cv2.INTER_NEAREST,
+                                         borderMode=cv2.BORDER_CONSTANT)
+            profile = np.sum(rotated, axis=1).astype(float)
+            results.append((angle_deg, float(np.var(profile))))
+        return results
+
+    coarse_angles = [a * 0.5 for a in range(-6, 7)]
+    coarse_results = _sweep_variance(coarse_angles)
+    coarse_best = max(coarse_results, key=lambda x: x[1])
+
+    fine_center = coarse_best[0]
+    fine_angles = [fine_center + a * 0.05 for a in range(-10, 11)]
+    fine_results = _sweep_variance(fine_angles)
+    fine_best = max(fine_results, key=lambda x: x[1])
+
+    best_angle = fine_best[0]
+    best_variance = fine_best[1]
+    variances = coarse_results + fine_results
+
+    all_mean = sum(v for _, v in variances) / len(variances)
+    if all_mean > 0 and best_variance > all_mean:
+        confidence = min(1.0, (best_variance - all_mean) / (all_mean + 1.0) * 0.6)
+    else:
+        confidence = 0.0
+
+    result["shear_degrees"] = round(best_angle, 3)
+    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    return result
+
+
+def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
+    """Detect shear using Hough transform on printed table / ruled lines (Method C).
+
+    Canny edges are fed to a probabilistic Hough transform. Segments within
+    +/-5 degrees of horizontal are kept, and their length-weighted median
+    angle, negated, is reported as the shear angle.
+
+    NOTE(review): a pure horizontal shear x' = x + t*y leaves horizontal lines
+    horizontal, so this value presumably reflects residual rotation rather
+    than shear -- confirm the intended sign and semantics with the ensemble.
+
+    Returns:
+        Dict with keys ``method`` ("hough_lines"), ``shear_degrees`` and
+        ``confidence`` (0..0.85). Angle and confidence stay 0.0 when fewer
+        than three suitable segments are found.
+    """
+    result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}
+
+    h, w = img.shape[:2]
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
+
+    # Segment length and accumulator threshold scale with the image width;
+    # theta resolution is half a degree.
+    min_len = int(w * 0.15)
+    lines = cv2.HoughLinesP(
+        edges, rho=1, theta=np.pi / 360,
+        threshold=int(w * 0.08),
+        minLineLength=min_len,
+        maxLineGap=20,
+    )
+
+    if lines is None or len(lines) < 3:
+        return result
+
+    # (angle in degrees, segment length) for every near-horizontal segment.
+    horizontal_angles: List[Tuple[float, float]] = []
+    for line in lines:
+        x1, y1, x2, y2 = line[0]
+        if x1 == x2:
+            # Perfectly vertical segment: cannot be a horizontal rule.
+            continue
+        angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
+        if abs(angle) <= 5.0:
+            length = float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2))
+            horizontal_angles.append((angle, length))
+
+    if len(horizontal_angles) < 3:
+        return result
+
+    # Length-weighted median: sort by angle and pick the first angle at which
+    # the cumulative length reaches half of the total length.
+    angles_arr = np.array([a for a, _ in horizontal_angles])
+    weights_arr = np.array([l for _, l in horizontal_angles])
+    sorted_idx = np.argsort(angles_arr)
+    s_angles = angles_arr[sorted_idx]
+    s_weights = weights_arr[sorted_idx]
+    cum = np.cumsum(s_weights)
+    mid_idx = int(np.searchsorted(cum, cum[-1] / 2.0))
+    median_angle = float(s_angles[min(mid_idx, len(s_angles) - 1)])
+
+    # Confidence: share of segments within 1 degree of the median, capped at 0.85.
+    agree = sum(1 for a, _ in horizontal_angles if abs(a - median_angle) < 1.0)
+    confidence = min(1.0, agree / max(len(horizontal_angles), 1)) * 0.85
+
+    shear_degrees = -median_angle
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    return result
+
+
+def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
+    """Detect shear by measuring text-line straightness (Method D)."""
+    result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}
+
+    h, w = img.shape[:2]
+    scale = 0.5
+    small = cv2.resize(img, (int(w * scale), int(h * scale)),
+                       interpolation=cv2.INTER_AREA)
+    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
+    pil_img = Image.fromarray(gray)
+
+    try:
+        data = pytesseract.image_to_data(
+            pil_img, lang='eng+deu', config='--psm 11 --oem 3',
+            output_type=pytesseract.Output.DICT,
+        )
+    except Exception:
+        return result
+
+    words = []
+    for i in range(len(data['text'])):
+        text = data['text'][i].strip()
+        conf = int(data['conf'][i])
+        if not text or conf < 20 or len(text) < 2:
+            continue
+        left_x = float(data['left'][i])
+        cy = data['top'][i] + data['height'][i] / 2.0
+        word_w = float(data['width'][i])
+        words.append((left_x, cy, word_w))
+
+    if len(words) < 15:
+        return result
+
+    avg_w = sum(ww for _, _, ww in words) / len(words)
+    x_tol = max(avg_w * 0.4, 8)
+
+    words_by_x = sorted(words, key=lambda w: w[0])
+    columns: List[List[Tuple[float, float]]] = []
+    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
+    cur_x = words_by_x[0][0]
+
+    for lx, cy, _ in words_by_x[1:]:
+        if abs(lx - cur_x) <= x_tol:
+            cur_col.append((lx, cy))
+            cur_x = cur_x * 0.8 + lx * 0.2
+        else:
+            if len(cur_col) >= 5:
+                columns.append(cur_col)
+            cur_col = [(lx, cy)]
+            cur_x = lx
+    if len(cur_col) >= 5:
+        columns.append(cur_col)
+
+    if len(columns) < 2:
+        return result
+
+    drifts = []
+    for col in columns:
+        ys = np.array([p[1] for p in col])
+        xs = np.array([p[0] for p in col])
+        y_range = ys.max() - ys.min()
+        if y_range < h * scale * 0.3:
+            continue
+        coeffs = np.polyfit(ys, xs, 1)
+        drifts.append(coeffs[0])
+
+    if len(drifts) < 2:
+        return result
+
+    median_drift = float(np.median(drifts))
+    shear_degrees = math.degrees(math.atan(median_drift))
+
+    drift_std = float(np.std(drifts))
+    consistency = max(0.0, 1.0 - drift_std * 50)
+    count_factor = min(1.0, len(drifts) / 4.0)
+    confidence = count_factor * 0.5 + consistency * 0.5
+
+    result["shear_degrees"] = round(shear_degrees, 3)
+    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
+    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
+                "shear=%.3f\u00b0, conf=%.2f",
+                len(columns), len(drifts), median_drift,
+                shear_degrees, confidence)
+    return result
+
+
+# =============================================================================
+# Quality Check and Shear Application
+# =============================================================================
+
+def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
+    """Check whether the dewarp correction actually improved alignment."""
+    def _h_proj_variance(img: np.ndarray) -> float:
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        _, binary = cv2.threshold(gray, 0, 255,
+                                  cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+        small = cv2.resize(binary, (binary.shape[1] // 2, binary.shape[0] // 2),
+                           interpolation=cv2.INTER_AREA)
+        profile = np.sum(small, axis=1).astype(float)
+        return float(np.var(profile))
+
+    var_before = _h_proj_variance(original)
+    var_after = _h_proj_variance(corrected)
+
+    return var_after > var_before
+
+
+def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
+    """Apply a vertical shear correction to an image."""
+    h, w = img.shape[:2]
+    shear_tan = math.tan(math.radians(shear_degrees))
+
+    M = np.float32([
+        [1, shear_tan, -h / 2.0 * shear_tan],
+        [0, 1, 0],
+    ])
+
+    corrected = cv2.warpAffine(img, M, (w, h),
+                                flags=cv2.INTER_LINEAR,
+                                borderMode=cv2.BORDER_REPLICATE)
+    return corrected
+
+
+# =============================================================================
+# Ensemble Shear Combination
+# =============================================================================
+
+def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
+    """Combine multiple shear detections into a single weighted estimate (v2)."""
+    _MIN_CONF = 0.35
+    _METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
+
+    accepted = []
+    for d in detections:
+        if d["confidence"] < _MIN_CONF:
+            continue
+        boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
+        effective_conf = d["confidence"] * boost
+        accepted.append((d["shear_degrees"], effective_conf, d["method"]))
+
+    if not accepted:
+        return 0.0, 0.0, "none"
+
+    if len(accepted) == 1:
+        deg, conf, method = accepted[0]
+        return deg, min(conf, 1.0), method
+
+    total_w = sum(c for _, c, _ in accepted)
+    w_mean = sum(d * c for d, c, _ in accepted) / total_w
+
+    filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
+    if not filtered:
+        filtered = accepted
+
+    total_w2 = sum(c for _, c, _ in filtered)
+    final_deg = sum(d * c for d, c, _ in filtered) / total_w2
+
+    avg_conf = total_w2 / len(filtered)
+    spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
+    agreement_bonus = 0.15 if spread < 0.5 else 0.0
+    ensemble_conf = min(1.0, avg_conf + agreement_bonus)
+
+    methods_str = "+".join(m for _, _, m in filtered)
+    return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
+
+
+# =============================================================================
+# Main Dewarp Function
+# =============================================================================
+
+def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
+    """Correct vertical shear after deskew (v2 with quality gate).
+
+    Methods (all run in ~150ms total):
+        A. _detect_shear_angle()           -- vertical edge profile (~50ms)
+        B. _detect_shear_by_projection()   -- horizontal text-line variance (~30ms)
+        C. _detect_shear_by_hough()        -- Hough lines on table borders (~20ms)
+        D. _detect_shear_by_text_lines()   -- text-line straightness (~50ms)
+
+    The correction is skipped when the combined angle is below 0.08 degrees
+    or the combined confidence is below 0.4. Corrections of 0.5 degrees or
+    more must additionally pass ``_dewarp_quality_check``.
+
+    Args:
+        img: BGR image (already deskewed).
+        use_ensemble: If False, fall back to single-method behaviour (method A only).
+
+    Returns:
+        Tuple of (corrected_image, dewarp_info). ``dewarp_info`` has the keys
+        ``method``, ``shear_degrees``, ``confidence`` and ``detections`` (the
+        per-method results). When no correction is applied, the input image
+        is returned with method "none" and zero angle/confidence.
+    """
+    # Template for every "nothing done" return path; `detections` is filled
+    # in later once the detectors have run.
+    no_correction = {
+        "method": "none",
+        "shear_degrees": 0.0,
+        "confidence": 0.0,
+        "detections": [],
+    }
+
+    if not CV2_AVAILABLE:
+        return img, no_correction
+
+    t0 = time.time()
+
+    if use_ensemble:
+        det_a = _detect_shear_angle(img)
+        det_b = _detect_shear_by_projection(img)
+        det_c = _detect_shear_by_hough(img)
+        det_d = _detect_shear_by_text_lines(img)
+        detections = [det_a, det_b, det_c, det_d]
+        shear_deg, confidence, method = _ensemble_shear(detections)
+    else:
+        det_a = _detect_shear_angle(img)
+        detections = [det_a]
+        shear_deg = det_a["shear_degrees"]
+        confidence = det_a["confidence"]
+        method = det_a["method"]
+
+    duration = time.time() - t0
+
+    # One summary line for all detectors; slots of methods that did not run
+    # (single-method mode) are logged as 0.0. The message says "ensemble"
+    # in both modes.
+    logger.info(
+        "dewarp: ensemble shear=%.3f\u00b0 conf=%.2f method=%s (%.2fs) | "
+        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
+        shear_deg, confidence, method, duration,
+        detections[0]["shear_degrees"], detections[0]["confidence"],
+        detections[1]["shear_degrees"] if len(detections) > 1 else 0.0,
+        detections[1]["confidence"] if len(detections) > 1 else 0.0,
+        detections[2]["shear_degrees"] if len(detections) > 2 else 0.0,
+        detections[2]["confidence"] if len(detections) > 2 else 0.0,
+        detections[3]["shear_degrees"] if len(detections) > 3 else 0.0,
+        detections[3]["confidence"] if len(detections) > 3 else 0.0,
+    )
+
+    # Copy only the three public keys of every detection for the info dict.
+    _all_detections = [
+        {"method": d["method"], "shear_degrees": d["shear_degrees"],
+         "confidence": d["confidence"]}
+        for d in detections
+    ]
+
+    # Gate 1: ignore negligible angles and low-confidence estimates.
+    if abs(shear_deg) < 0.08 or confidence < 0.4:
+        no_correction["detections"] = _all_detections
+        return img, no_correction
+
+    # Undo the detected shear by applying the opposite angle.
+    corrected = _apply_shear(img, -shear_deg)
+
+    # Gate 2: corrections of 0.5 degrees or more must pass the quality check.
+    if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
+        logger.info("dewarp: quality gate REJECTED correction (%.3f\u00b0) -- "
+                     "projection variance did not improve", shear_deg)
+        no_correction["detections"] = _all_detections
+        return img, no_correction
+
+    info = {
+        "method": method,
+        "shear_degrees": shear_deg,
+        "confidence": confidence,
+        "detections": _all_detections,
+    }
+
+    return corrected, info
+
+
+def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
+    """Apply shear correction with a manual angle."""
+    if abs(shear_degrees) < 0.001:
+        return img
+    return _apply_shear(img, -shear_degrees)
--- a/klausur-service/backend/ocr/preprocessing/preprocessing.py
+++ b/klausur-service/backend/ocr/preprocessing/preprocessing.py
@@ -0,0 +1,157 @@
+"""
+Image I/O, orientation detection, deskew, and dewarp for the CV vocabulary pipeline.
+
+Re-export facade -- the deskew and dewarp logic lives in the sub-modules:
+
+  cv_preprocessing_deskew   Rotation correction (Hough, word-alignment, iterative, two-pass)
+  cv_preprocessing_dewarp   Vertical shear detection and correction (4 methods + ensemble)
+
+This file contains the image I/O and orientation detection functions.
+
+License: Apache 2.0 (commercial use permitted)
+PRIVACY: All processing is performed locally.
+"""
+
+import logging
+from typing import Tuple
+
+import numpy as np
+
+from cv_vocab_types import (
+    CV2_AVAILABLE,
+    TESSERACT_AVAILABLE,
+)
+
+logger = logging.getLogger(__name__)
+
+# Guarded imports
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+try:
+    import pytesseract
+    from PIL import Image
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+
+# Re-export all deskew functions
+from cv_preprocessing_deskew import (  # noqa: F401
+    deskew_image,
+    deskew_image_by_word_alignment,
+    deskew_image_iterative,
+    deskew_two_pass,
+    _projection_gradient_score,
+    _measure_textline_slope,
+)
+
+# Re-export all dewarp functions
+from cv_preprocessing_dewarp import (  # noqa: F401
+    _apply_shear,
+    _detect_shear_angle,
+    _detect_shear_by_hough,
+    _detect_shear_by_projection,
+    _detect_shear_by_text_lines,
+    _dewarp_quality_check,
+    _ensemble_shear,
+    dewarp_image,
+    dewarp_image_manual,
+)
+
+
+# =============================================================================
+# Image I/O
+# =============================================================================
+
+def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
+    """Render a PDF page to a high-resolution numpy array (BGR).
+
+    Args:
+        pdf_data: Raw PDF bytes.
+        page_number: 0-indexed page number.
+        zoom: Zoom factor (3.0 = 432 DPI).
+
+    Returns:
+        numpy array in BGR format.
+    """
+    import fitz  # PyMuPDF
+
+    pdf_doc = fitz.open(stream=pdf_data, filetype="pdf")
+    if page_number >= pdf_doc.page_count:
+        raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)")
+
+    page = pdf_doc[page_number]
+    mat = fitz.Matrix(zoom, zoom)
+    pix = page.get_pixmap(matrix=mat)
+
+    img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
+    if pix.n == 4:  # RGBA
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
+    elif pix.n == 3:  # RGB
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
+    else:  # Grayscale
+        img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
+
+    pdf_doc.close()
+    return img_bgr
+
+
+def render_image_high_res(image_data: bytes) -> np.ndarray:
+    """Load an image (PNG/JPEG) into a numpy array (BGR).
+
+    Args:
+        image_data: Raw image bytes.
+
+    Returns:
+        numpy array in BGR format.
+    """
+    img_array = np.frombuffer(image_data, dtype=np.uint8)
+    img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+    if img_bgr is None:
+        raise ValueError("Could not decode image data")
+    return img_bgr
+
+
+# =============================================================================
+# Orientation Detection (0/90/180/270)
+# =============================================================================
+
+def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
+    """Detect page orientation via Tesseract OSD and rotate if needed.
+
+    Returns:
+        (corrected_image, rotation_degrees) -- rotation is 0, 90, 180, or 270.
+    """
+    if pytesseract is None:
+        return img_bgr, 0
+
+    try:
+        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+        pil_img = Image.fromarray(gray)
+
+        osd = pytesseract.image_to_osd(pil_img, output_type=pytesseract.Output.DICT)
+        rotate = osd.get("rotate", 0)
+        confidence = osd.get("orientation_conf", 0.0)
+
+        logger.info(f"OSD: orientation={rotate}\u00b0 confidence={confidence:.1f}")
+
+        if rotate == 0 or confidence < 1.0:
+            return img_bgr, 0
+
+        if rotate == 180:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_180)
+        elif rotate == 90:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_CLOCKWISE)
+        elif rotate == 270:
+            corrected = cv2.rotate(img_bgr, cv2.ROTATE_90_COUNTERCLOCKWISE)
+        else:
+            return img_bgr, 0
+
+        logger.info(f"OSD: rotated {rotate}\u00b0 to fix orientation")
+        return corrected, rotate
+
+    except Exception as e:
+        logger.warning(f"OSD orientation detection failed: {e}")
+        return img_bgr, 0