# breakpilot-pwa/backend/image_cleaner.py

"""
Image Cleaning Module - Stage 2 of Worksheet Cleaning System

Removes handwriting and markings from worksheet scans while preserving
printed text and diagrams using computer vision techniques.
"""

import cv2
import numpy as np
from pathlib import Path
from typing import Dict, List, Optional
import logging

logger = logging.getLogger(__name__)


class WorksheetCleaner:
    """
    Removes handwriting from worksheet scans while preserving printed content.

    Multi-strategy approach:
    1. Color-based filtering (blue ink detection)
    2. AI-guided region masking (using bounding boxes from analysis)
    3. Stroke thickness analysis (thin handwriting vs thick print)
    4. Diagram preservation (copy from original)
    """

    def __init__(self, debug_mode: bool = False):
        """
        Create a cleaner and set its tunable parameters to their defaults.

        Args:
            debug_mode: Stored on the instance as ``debug_mode``. Intended to
                enable saving of intermediate images for debugging.
        """
        # --- Inpainting / mask geometry ---
        # Radius was raised from 3 to 10 for better inpainting results.
        self.inpaint_radius = 10
        # Masks are grown by this kernel so handwriting is covered completely.
        self.mask_dilation_kernel_size = 5

        # --- Detection heuristics ---
        # HSV hue interval treated as blue ink.
        self.blue_hue_range = (90, 130)
        self.handwriting_area_threshold = 50
        self.min_stroke_thickness = 2

        # --- Post-processing ---
        self.sharpen_amount = 1.5

        self.debug_mode = debug_mode

    def clean_worksheet(
        self,
        image_path: Path,
        analysis_data: Dict,
        output_path: Path
    ) -> Path:
        """
        Run the full cleaning pipeline on one worksheet scan.

        Steps, in order: optional blue ink removal, masking of the
        handwriting regions listed in ``analysis_data``, thin stroke removal,
        sharpening, restoring diagram areas from the original scan, and
        writing the result to ``output_path``.

        Args:
            image_path: Path to the input worksheet scan.
            analysis_data: Stage 1 analysis dict. The keys read here are
                ``handwriting_regions`` and ``layout.diagram_elements``;
                missing or ``None`` values are treated as empty.
            output_path: Where to save the cleaned image.

        Returns:
            Path to the cleaned image.

        Raises:
            ValueError: If the image cannot be loaded.
            RuntimeError: If any cleaning step fails or the result cannot be
                written to ``output_path``.
        """
        # Accept plain strings as well; the code below relies on ``.name``.
        image_path = Path(image_path)
        output_path = Path(output_path)

        logger.info(f"Starting cleaning for {image_path.name}")

        # cv2.imread signals failure by returning None instead of raising.
        img = cv2.imread(str(image_path))
        if img is None:
            raise ValueError(f"Cannot load image: {image_path}")

        # ``img`` itself is never modified below, so it serves as the pristine
        # original; only the working image needs its own copy.
        original = img
        cleaned = img.copy()

        try:
            hw_regions = analysis_data.get('handwriting_regions') or []

            # Strategy 1: Color-based filtering
            if self._has_blue_ink_annotations(analysis_data):
                logger.info("Applying blue ink removal")
                cleaned = self._remove_blue_ink(cleaned)

            # Strategy 2: AI-guided region masking
            if hw_regions:
                logger.info(f"Masking {len(hw_regions)} handwriting regions")
                cleaned = self._mask_handwriting_regions(cleaned, hw_regions)

            # Strategy 3: Stroke thickness analysis
            logger.info("Removing thin strokes")
            cleaned = self._remove_thin_strokes(cleaned, original)

            # Post-processing: enhance printed text
            logger.info("Enhancing printed text")
            cleaned = self._enhance_printed_text(cleaned)

            # Preserve diagrams (tolerate a missing or null "layout" entry)
            layout = analysis_data.get('layout') or {}
            diagram_elements = layout.get('diagram_elements') or []
            if diagram_elements:
                logger.info(f"Preserving {len(diagram_elements)} diagram elements")
                cleaned = self._preserve_diagrams(cleaned, original, diagram_elements)

            # Save result. cv2.imwrite reports some failures only through its
            # boolean return value, so it must be checked explicitly.
            if not cv2.imwrite(str(output_path), cleaned):
                raise RuntimeError(f"Could not write cleaned image to {output_path}")
            logger.info(f"Cleaned image saved to {output_path.name}")

            return output_path

        except Exception as e:
            logger.error(f"Cleaning failed for {image_path.name}: {e}")
            raise RuntimeError(f"Cleaning failed: {e}") from e

    def _has_blue_ink_annotations(self, analysis_data: Dict) -> bool:
        """
        Tell whether the analysis reported any blue handwriting.

        Args:
            analysis_data: Analysis dict; only ``handwriting_regions`` is read.
                A missing key or a ``None`` value counts as "no regions".

        Returns:
            True if at least one region has ``color_hint == 'blue'``.
        """
        # "or []" also covers an explicit null in the analysis JSON, which
        # ``.get(key, [])`` would pass through and fail to iterate.
        regions = analysis_data.get('handwriting_regions') or []
        return any(region.get('color_hint') == 'blue' for region in regions)

    def _remove_blue_ink(self, img: np.ndarray) -> np.ndarray:
        """
        Inpaint every pixel whose HSV value falls in the blue ink range.

        The image is converted to HSV, pixels with a hue inside
        ``self.blue_hue_range`` and saturation/value of at least 50 are
        selected, the selection is tidied with a 3x3 close followed by a 3x3
        open, and the selected pixels are inpainted (Telea) with
        ``self.inpaint_radius``.

        Args:
            img: Input image (BGR)

        Returns:
            Image with blue ink removed
        """
        hue_low = self.blue_hue_range[0]
        hue_high = self.blue_hue_range[1]

        hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

        # Select blue pixels; the lower bound of 50 on S and V leaves out
        # washed-out and very dark pixels.
        ink_mask = cv2.inRange(
            hsv_img,
            np.array([hue_low, 50, 50]),
            np.array([hue_high, 255, 255]),
        )

        # Close first (fill small holes), then open (drop isolated specks).
        structuring = np.ones((3, 3), np.uint8)
        for operation in (cv2.MORPH_CLOSE, cv2.MORPH_OPEN):
            ink_mask = cv2.morphologyEx(ink_mask, operation, structuring)

        # Fill the selected pixels from their surroundings.
        return cv2.inpaint(
            img,
            ink_mask,
            inpaintRadius=self.inpaint_radius,
            flags=cv2.INPAINT_TELEA,
        )

    def _mask_handwriting_regions(
        self,
        img: np.ndarray,
        hw_regions: List[Dict]
    ) -> np.ndarray:
        """
        Inpaint the handwriting regions identified by the analysis stage.

        Every region of type ``student_answer``, ``correction``, ``note`` or
        ``drawing`` contributes its bounding box, grown by 10 pixels on each
        side and clipped to the image, to one global mask. The mask is then
        dilated (``self.mask_dilation_kernel_size``, two iterations) and
        inpainted with ``self.inpaint_radius``.

        Boxes with a non-positive size, a negative origin, or an origin
        outside the image are ignored.

        Args:
            img: Input image
            hw_regions: List of handwriting region dicts with ``type`` and
                ``bounding_box`` (``x``, ``y``, ``width``, ``height``)

        Returns:
            A new image with the handwriting regions cleaned; an unchanged
            copy of ``img`` when no region produced any mask pixels.
        """
        img_h, img_w = img.shape[:2]
        padding = 10  # pixels added on every side of each bounding box
        removable_types = ('student_answer', 'correction', 'note', 'drawing')

        # One global mask collecting all handwriting regions.
        mask = np.zeros((img_h, img_w), dtype=np.uint8)

        for region in hw_regions:
            if region.get('type') not in removable_types:
                continue

            bbox = region.get('bounding_box') or {}
            # Coordinates may arrive as floats; array slicing needs ints.
            x, y, w, h = (
                int(bbox.get(key, 0)) for key in ('x', 'y', 'width', 'height')
            )

            if w <= 0 or h <= 0 or x < 0 or y < 0:
                continue
            # A box that starts outside the image has nothing to mask. It must
            # not be shifted onto the last row/column.
            if x >= img_w or y >= img_h:
                continue

            # Pad around the box's real edges and clip afterwards, so the
            # padding stays symmetric even when the box touches a border.
            left = max(0, x - padding)
            top = max(0, y - padding)
            right = min(img_w, x + w + padding)
            bottom = min(img_h, y + h + padding)

            # Half-open slice: exactly the padded box, no extra pixel row/column.
            mask[top:bottom, left:right] = 255

        # Nothing to remove: skip dilation and inpainting entirely.
        if not mask.any():
            return img.copy()

        # Grow the mask so strokes crossing the box border are covered too.
        if self.mask_dilation_kernel_size > 0:
            kernel = np.ones(
                (self.mask_dilation_kernel_size, self.mask_dilation_kernel_size),
                np.uint8,
            )
            mask = cv2.dilate(mask, kernel, iterations=2)

        cleaned = cv2.inpaint(img, mask, inpaintRadius=self.inpaint_radius,
                              flags=cv2.INPAINT_TELEA)
        logger.info(f"Inpainted {np.sum(mask > 0)} pixels with radius {self.inpaint_radius}")

        return cleaned

    def _clean_text_region(self, region: np.ndarray) -> np.ndarray:
        """
        Clean one image region by inpainting its small connected components.

        The region is binarised with an inverted adaptive Gaussian threshold
        and split into 8-connected components. A component is removed when
        its area is below ``self.handwriting_area_threshold`` or when both
        its width and its height are below 10 pixels.

        Heuristic: small or thin components are treated as handwriting. This
        needs tuning based on real samples.

        Args:
            region: Image region to clean (BGR)

        Returns:
            Cleaned region
        """
        gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)

        # Adaptive threshold to separate foreground (ink becomes 255).
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV, 11, 2
        )

        _, labels, stats, _ = cv2.connectedComponentsWithStats(
            binary, connectivity=8
        )

        # Decide per label (one row of ``stats`` each) whether to remove it.
        # "max(width, height) < 10" is the same as both being below 10.
        too_small = stats[:, cv2.CC_STAT_AREA] < self.handwriting_area_threshold
        too_short = (
            (stats[:, cv2.CC_STAT_WIDTH] < 10)
            & (stats[:, cv2.CC_STAT_HEIGHT] < 10)
        )
        remove_label = too_small | too_short
        remove_label[0] = False  # label 0 is the background, never removed

        # Look up the decision for every pixel in one pass instead of
        # scanning the whole label image once per component.
        mask = np.zeros(binary.shape, dtype=np.uint8)
        mask[remove_label[labels]] = 255

        # Inpaint masked regions
        cleaned = cv2.inpaint(region, mask, inpaintRadius=self.inpaint_radius,
                              flags=cv2.INPAINT_TELEA)

        return cleaned

    def _remove_thin_strokes(self, cleaned: np.ndarray, original: np.ndarray) -> np.ndarray:
        """
        Inpaint pixels classified as thin pen strokes.

        Canny edges of the current image are eroded once and, separately,
        closed twice with the same 2x2 kernel. Pixels present in the eroded
        edges but not in the closed edges form the inpainting mask.

        Args:
            cleaned: Current cleaned image (BGR)
            original: Original image; not used by this method

        Returns:
            Image with the masked strokes inpainted (radius 2)
        """
        grayscale = cv2.cvtColor(cleaned, cv2.COLOR_BGR2GRAY)
        edge_map = cv2.Canny(grayscale, 50, 150)

        small_kernel = np.ones((2, 2), np.uint8)

        # Erosion keeps only edge pixels whose 2x2 neighbourhood is fully set.
        eroded_edges = cv2.erode(edge_map, small_kernel, iterations=1)

        # Closing bridges small gaps between nearby edges.
        closed_edges = cv2.morphologyEx(
            edge_map, cv2.MORPH_CLOSE, small_kernel, iterations=2
        )

        # NOTE(review): eroded edges minus closed edges is probably a very
        # sparse mask, so this step may change little - confirm on real scans.
        stroke_mask = cv2.subtract(eroded_edges, closed_edges)

        return cv2.inpaint(cleaned, stroke_mask, inpaintRadius=2, flags=cv2.INPAINT_TELEA)

    def _enhance_printed_text(self, img: np.ndarray) -> np.ndarray:
        """
        Sharpen the image with unsharp masking.

        The result is ``sharpen_amount * img + (1 - sharpen_amount) * blur``,
        where ``blur`` is a Gaussian blur of ``img`` with sigma 2.0. With the
        default ``sharpen_amount`` of 1.5 the blur weight is -0.5.

        Args:
            img: Input image

        Returns:
            Enhanced image
        """
        # Kernel size (0, 0) lets OpenCV derive it from sigma.
        blurred = cv2.GaussianBlur(img, (0, 0), 2.0)

        # The two weights must sum to 1, otherwise flat areas (where the blur
        # equals the image) get brighter or darker. Deriving the blur weight
        # from sharpen_amount keeps that true when the parameter is tuned.
        blur_weight = 1.0 - self.sharpen_amount
        sharpened = cv2.addWeighted(img, self.sharpen_amount, blurred, blur_weight, 0)

        return sharpened

    def _preserve_diagrams(
        self,
        cleaned: np.ndarray,
        original: np.ndarray,
        diagram_elements: List[Dict]
    ) -> np.ndarray:
        """
        Restore diagram/illustration areas by copying them from the original.

        For every element whose ``preserve`` flag is not false, the part of
        its bounding box that lies inside the image is copied from
        ``original`` into a copy of ``cleaned``. Boxes with a non-positive
        size, a negative origin, or an origin outside the image are skipped.

        Args:
            cleaned: Current cleaned image (not modified)
            original: Original scan
            diagram_elements: List of diagram dicts with ``bounding_box``
                (``x``, ``y``, ``width``, ``height``) and optional ``preserve``

        Returns:
            New image with the diagram areas taken from the original
        """
        result = cleaned.copy()
        img_h, img_w = original.shape[:2]

        for diagram in diagram_elements:
            if not diagram.get('preserve', True):
                continue

            bbox = diagram.get('bounding_box') or {}
            # Coordinates may arrive as floats; slice indices must be ints.
            x, y, w, h = (
                int(bbox.get(key, 0)) for key in ('x', 'y', 'width', 'height')
            )

            if w <= 0 or h <= 0 or x < 0 or y < 0:
                continue
            # A box that starts outside the image has nothing to restore. It
            # must not be shifted onto the last row/column, which would copy
            # uncleaned pixels back there.
            if x >= img_w or y >= img_h:
                continue

            # Clip the far edges to the image.
            x_end = min(x + w, img_w)
            y_end = min(y + h, img_h)

            # Copy diagram region from original
            result[y:y_end, x:x_end] = original[y:y_end, x:x_end]

        return result