fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
359
klausur-service/backend/services/handwriting_detection.py
Normal file
359
klausur-service/backend/services/handwriting_detection.py
Normal file
@@ -0,0 +1,359 @@
|
||||
"""
|
||||
Handwriting Detection Service for Worksheet Cleanup
|
||||
|
||||
Detects handwritten content in scanned worksheets and returns binary masks.
|
||||
Uses multiple detection methods:
|
||||
1. Color-based detection (blue/red ink)
|
||||
2. Stroke analysis (thin irregular strokes)
|
||||
3. Edge density variance
|
||||
|
||||
DATENSCHUTZ: All processing happens locally on Mac Mini.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import io
|
||||
import logging
|
||||
from typing import Tuple, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
# OpenCV is optional - only required for actual handwriting detection
|
||||
try:
|
||||
import cv2
|
||||
CV2_AVAILABLE = True
|
||||
except ImportError:
|
||||
cv2 = None
|
||||
CV2_AVAILABLE = False
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class DetectionResult:
    """Result of handwriting detection."""
    mask: np.ndarray  # Binary mask (255 = handwriting, 0 = background/printed)
    confidence: float  # Overall confidence score
    handwriting_ratio: float  # Ratio of handwriting pixels to total
    detection_method: str  # Which method was primarily used
|
||||
|
||||
|
||||
def detect_handwriting(image_bytes: bytes) -> DetectionResult:
    """
    Detect handwriting in an image.

    Args:
        image_bytes: Image as bytes (PNG, JPG, etc.)

    Returns:
        DetectionResult with binary mask where handwriting is white (255)

    Raises:
        ImportError: If OpenCV is not available
    """
    if not CV2_AVAILABLE:
        raise ImportError(
            "OpenCV (cv2) is required for handwriting detection. "
            "Install with: pip install opencv-python-headless"
        )

    # Decode the image and normalize it to 3-channel BGR (OpenCV's format).
    pixels = np.array(Image.open(io.BytesIO(image_bytes)))
    if pixels.ndim == 2:
        bgr = cv2.cvtColor(pixels, cv2.COLOR_GRAY2BGR)       # grayscale
    elif pixels.shape[2] == 4:
        bgr = cv2.cvtColor(pixels, cv2.COLOR_RGBA2BGR)       # RGBA
    elif pixels.shape[2] == 3:
        bgr = cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)        # RGB
    else:
        bgr = pixels

    # Run every detector; each yields (method name, mask, confidence).
    results = [
        ("color", *_detect_by_color(bgr)),
        ("stroke", *_detect_by_stroke_analysis(bgr)),
        ("variance", *_detect_by_variance(bgr)),
    ]
    total_weight = sum(conf for _, _, conf in results)

    if total_weight > 0:
        # Confidence-weighted average of the per-method masks,
        # thresholded back to a binary 0/255 mask.
        blended = sum(
            mask.astype(np.float32) * conf for _, mask, conf in results
        ) / total_weight
        combined_mask = (blended > 127).astype(np.uint8) * 255
    else:
        combined_mask = np.zeros(bgr.shape[:2], dtype=np.uint8)

    # Post-processing: drop small noise regions.
    combined_mask = _clean_mask(combined_mask)

    # Metrics for the caller.
    total_pixels = combined_mask.size
    handwriting_ratio = (
        np.sum(combined_mask > 0) / total_pixels if total_pixels > 0 else 0
    )

    # Primary method = the detector with the highest confidence
    # (ties resolved in list order: color, stroke, variance).
    primary_method = max(results, key=lambda entry: entry[2])[0]

    overall_confidence = total_weight / 3.0  # Average confidence

    logger.info(f"Handwriting detection: {handwriting_ratio:.2%} handwriting, "
                f"confidence={overall_confidence:.2f}, method={primary_method}")

    return DetectionResult(
        mask=combined_mask,
        confidence=overall_confidence,
        handwriting_ratio=handwriting_ratio,
        detection_method=primary_method,
    )
|
||||
|
||||
|
||||
def _detect_by_color(img_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
    """
    Detect handwriting by ink color (blue, red, black pen).

    Blue and red ink are common for corrections and handwriting.
    Black pen has different characteristics than printed black.
    """
    # Work in HSV, which separates hue from brightness for color tests.
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

    # HSV ranges for common ink colors. Red wraps around the hue axis,
    # so it needs two ranges.
    ink_ranges = [
        ((100, 50, 30), (130, 255, 200)),   # blue ink
        ((0, 50, 50), (10, 255, 255)),      # red ink, low hue end
        ((170, 50, 50), (180, 255, 255)),   # red ink, high hue end
        ((35, 50, 50), (85, 255, 200)),     # green ink (less common)
    ]

    color_mask = np.zeros(hsv.shape[:2], dtype=np.uint8)
    for lower, upper in ink_ranges:
        band = cv2.inRange(hsv, np.array(lower), np.array(upper))
        color_mask = cv2.bitwise_or(color_mask, band)

    # Dilate to connect nearby regions.
    color_mask = cv2.dilate(color_mask, np.ones((3, 3), np.uint8), iterations=1)

    # Confidence from how much colored ink was found.
    total_pixels = color_mask.size
    ratio = np.sum(color_mask > 0) / total_pixels if total_pixels > 0 else 0

    # High confidence if we found a plausible amount of colored ink.
    if 0.005 < ratio < 0.3:
        confidence = 0.9
    elif ratio > 0:
        confidence = 0.5    # some ink, but an unusual amount
    else:
        confidence = 0.1    # no colored ink at all

    return color_mask, confidence
|
||||
|
||||
|
||||
def _detect_by_stroke_analysis(img_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
    """
    Detect handwriting by analyzing stroke characteristics.

    Handwriting typically has:
    - Thinner, more variable stroke widths
    - More curved lines
    - Connected components

    Args:
        img_bgr: Image in BGR channel order (OpenCV convention).

    Returns:
        Tuple of (binary mask, confidence). In the mask, 255 marks
        thin-stroke ink considered likely handwriting.
    """
    # Grayscale + adaptive thresholding to extract ink; adaptive handles
    # uneven scan illumination better than a global threshold.
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, 11, 2
    )

    # Skeleton to analyze stroke width: thin strokes (handwriting) keep
    # most of their mass close to the skeleton.
    # (Removed dead code: a Canny edge map and a morphological gradient
    # were computed here but never used.)
    skeleton = _skeletonize(binary)

    # Ink within a small dilation of the skeleton counts as "thin";
    # handwriting tends to live in these thin regions.
    dilated_skeleton = cv2.dilate(skeleton, np.ones((5, 5), np.uint8), iterations=1)
    thin_regions = cv2.bitwise_and(binary, dilated_skeleton)
    handwriting_mask = thin_regions

    # Confidence grows with the share of ink that is thin-stroked,
    # capped because stroke width alone is not conclusive.
    total_ink = np.sum(binary > 0)
    thin_ink = np.sum(thin_regions > 0)

    if total_ink > 0:
        thin_ratio = thin_ink / total_ink
        confidence = min(thin_ratio * 1.5, 0.8)  # Cap at 0.8
    else:
        confidence = 0.1  # no ink at all — nothing to analyze

    return handwriting_mask, confidence
|
||||
|
||||
|
||||
def _detect_by_variance(img_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
    """
    Detect handwriting by local variance analysis.

    Handwriting has higher local variance in stroke direction and width
    compared to uniform printed text.
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # Local variance via the identity var = E[x^2] - E[x]^2, computed
    # with box filters over a 15x15 sliding window.
    window = (15, 15)
    gray_f = gray.astype(np.float32)
    local_mean = cv2.blur(gray_f, window)
    local_sqr_mean = cv2.blur(gray_f ** 2, window)
    variance = local_sqr_mean - local_mean ** 2

    # Stretch the variance map to 0..255 so a fixed threshold applies.
    variance = cv2.normalize(variance, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    # High variance also occurs at the edges of printed text, so restrict
    # the result to actual ink pixels found by adaptive thresholding.
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, 11, 2
    )
    high_variance_mask = cv2.threshold(variance, 100, 255, cv2.THRESH_BINARY)[1]
    handwriting_mask = cv2.bitwise_and(high_variance_mask, binary)

    # Confidence from the share of ink pixels showing high variance.
    text_pixels = np.sum(binary > 0)
    high_var_pixels = np.sum(handwriting_mask > 0)

    if text_pixels > 0:
        var_ratio = high_var_pixels / text_pixels
        # If 5-40% of text has high variance, likely handwriting present
        confidence = 0.7 if 0.05 < var_ratio < 0.5 else 0.3
    else:
        confidence = 0.1

    return handwriting_mask, confidence
|
||||
|
||||
|
||||
def _skeletonize(binary: np.ndarray) -> np.ndarray:
    """
    Morphological skeletonization.

    Repeatedly erodes the shape; at each step the pixels that opening
    would remove are accumulated into the skeleton.

    Args:
        binary: Single-channel 0/255 mask.

    Returns:
        Binary skeleton image of the same shape.
    """
    skeleton = np.zeros(binary.shape, np.uint8)
    element = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))

    img = binary.copy()
    # Safety cap: cv2.erode's default border value is +inf for erosion,
    # so a mask whose foreground touches no background (e.g. fully white)
    # never shrinks and the original unbounded loop would spin forever.
    # Any shrinking mask is fully eroded within ~max(h, w) iterations.
    max_iterations = max(binary.shape) if binary.size else 0
    for _ in range(max_iterations):
        eroded = cv2.erode(img, element)
        opened = cv2.dilate(eroded, element)
        ridge = cv2.subtract(img, opened)
        skeleton = cv2.bitwise_or(skeleton, ridge)
        img = eroded.copy()

        if cv2.countNonZero(img) == 0:
            break

    return skeleton
|
||||
|
||||
|
||||
def _clean_mask(mask: np.ndarray, min_area: int = 50) -> np.ndarray:
    """
    Clean up the mask by removing small noise regions.
    """
    # Label 8-connected components; stats holds per-component bounding
    # boxes and areas.
    count, labels, stats, _centroids = cv2.connectedComponentsWithStats(
        mask, connectivity=8
    )

    # Rebuild the mask from only those components whose area clears the
    # minimum. Label 0 is the background and is always dropped.
    filtered = np.zeros_like(mask)
    for label in range(1, count):
        if stats[label, cv2.CC_STAT_AREA] >= min_area:
            filtered[labels == label] = 255

    return filtered
|
||||
|
||||
|
||||
def mask_to_png(mask: np.ndarray) -> bytes:
    """
    Convert a mask to PNG bytes.
    """
    # Encode through an in-memory buffer; no temp files are written.
    out = io.BytesIO()
    Image.fromarray(mask).save(out, format='PNG')
    return out.getvalue()
|
||||
|
||||
|
||||
def detect_handwriting_regions(
    image_bytes: bytes,
    min_confidence: float = 0.3
) -> dict:
    """
    High-level function that returns structured detection results.

    Args:
        image_bytes: Input image
        min_confidence: Minimum confidence to report detection

    Returns:
        Dictionary with detection results
    """
    detection = detect_handwriting(image_bytes)

    # Require both a confident detection and a non-trivial amount of ink.
    confident_enough = detection.confidence >= min_confidence
    enough_ink = detection.handwriting_ratio > 0.005  # At least 0.5% handwriting

    return {
        "has_handwriting": confident_enough and enough_ink,
        "confidence": detection.confidence,
        "handwriting_ratio": detection.handwriting_ratio,
        "detection_method": detection.detection_method,
        "mask_shape": detection.mask.shape,
    }
|
||||
Reference in New Issue
Block a user