This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/backend/image_cleaner.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

346 lines
12 KiB
Python

"""
Image Cleaning Module - Stage 2 of Worksheet Cleaning System
Removes handwriting and markings from worksheet scans while preserving
printed text and diagrams using computer vision techniques.
"""
import cv2
import numpy as np
from pathlib import Path
from typing import Dict, List, Optional
import logging
logger = logging.getLogger(__name__)
class WorksheetCleaner:
    """
    Removes handwriting from worksheet scans while preserving printed content.

    Multi-strategy approach:
    1. Color-based filtering (blue ink detection)
    2. AI-guided region masking (using bounding boxes from analysis)
    3. Stroke thickness analysis (thin handwriting vs thick print)
    4. Diagram preservation (copy from original)
    """

    def __init__(self, debug_mode: bool = False):
        """
        Initialize the worksheet cleaner.

        Args:
            debug_mode: If True, saves intermediate images for debugging
        """
        self.debug_mode = debug_mode
        # Tunable parameters (tuned for more thorough handwriting removal)
        self.blue_hue_range = (90, 130)  # HSV hue range for blue ink (OpenCV hue is 0-180)
        self.inpaint_radius = 10  # raised from 3 to 10 for better inpainting
        self.min_stroke_thickness = 2  # NOTE(review): currently unused by the pipeline — confirm before removing
        self.handwriting_area_threshold = 50  # connected components smaller than this are treated as handwriting
        self.sharpen_amount = 1.5  # unsharp-mask weight used by _enhance_printed_text
        self.mask_dilation_kernel_size = 5  # grows masks so handwriting is fully covered

    def clean_worksheet(
        self,
        image_path: Path,
        analysis_data: Dict,
        output_path: Path
    ) -> Path:
        """
        Main cleaning pipeline.

        Args:
            image_path: Path to input worksheet scan
            analysis_data: JSON from Stage 1 analysis (with layout/handwriting_regions)
            output_path: Where to save cleaned image

        Returns:
            Path to cleaned image

        Raises:
            ValueError: If image cannot be loaded
            RuntimeError: If cleaning fails (any internal error is wrapped)
        """
        logger.info(f"Starting cleaning for {image_path.name}")

        # Load image
        img = cv2.imread(str(image_path))
        if img is None:
            raise ValueError(f"Cannot load image: {image_path}")

        original = img.copy()
        cleaned = img.copy()

        try:
            # Strategy 1: Color-based filtering
            if self._has_blue_ink_annotations(analysis_data):
                logger.info("Applying blue ink removal")
                cleaned = self._remove_blue_ink(cleaned)

            # Strategy 2: AI-guided region masking
            hw_regions = analysis_data.get('handwriting_regions', [])
            if hw_regions:
                logger.info(f"Masking {len(hw_regions)} handwriting regions")
                cleaned = self._mask_handwriting_regions(cleaned, hw_regions)

            # Strategy 3: Stroke thickness analysis
            logger.info("Removing thin strokes")
            cleaned = self._remove_thin_strokes(cleaned, img)

            # Post-processing: enhance printed text
            logger.info("Enhancing printed text")
            cleaned = self._enhance_printed_text(cleaned)

            # Preserve diagrams by copying them back from the untouched original
            diagram_elements = analysis_data.get('layout', {}).get('diagram_elements', [])
            if diagram_elements:
                logger.info(f"Preserving {len(diagram_elements)} diagram elements")
                cleaned = self._preserve_diagrams(cleaned, original, diagram_elements)

            # Save result. cv2.imwrite reports failure via its return value
            # (e.g. missing directory, bad extension) instead of raising, so
            # check it explicitly rather than silently reporting success.
            if not cv2.imwrite(str(output_path), cleaned):
                raise RuntimeError(f"Failed to write cleaned image to {output_path}")
            logger.info(f"Cleaned image saved to {output_path.name}")
            return output_path

        except Exception as e:
            logger.error(f"Cleaning failed for {image_path.name}: {e}")
            raise RuntimeError(f"Cleaning failed: {e}") from e

    def _has_blue_ink_annotations(self, analysis_data: Dict) -> bool:
        """Check if analysis detected blue ink handwriting"""
        hw_regions = analysis_data.get('handwriting_regions', [])
        return any(r.get('color_hint') == 'blue' for r in hw_regions)

    def _remove_blue_ink(self, img: np.ndarray) -> np.ndarray:
        """
        Remove blue pen marks (common for student answers).

        Strategy: Blue ink has high Blue channel, lower Red/Green.
        Convert to HSV, isolate blue hue range, create mask, inpaint.

        Args:
            img: Input image (BGR)

        Returns:
            Image with blue ink removed
        """
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

        # Blue hue range: 90-130 in OpenCV HSV (Hue is 0-180); saturation and
        # value floors of 50 exclude near-gray and near-black pixels.
        lower_blue = np.array([self.blue_hue_range[0], 50, 50])
        upper_blue = np.array([self.blue_hue_range[1], 255, 255])

        # Create mask for blue pixels
        blue_mask = cv2.inRange(hsv, lower_blue, upper_blue)

        # Morphological operations to clean up mask: close small gaps in
        # strokes, then open to drop isolated noise pixels.
        kernel = np.ones((3, 3), np.uint8)
        blue_mask = cv2.morphologyEx(blue_mask, cv2.MORPH_CLOSE, kernel)
        blue_mask = cv2.morphologyEx(blue_mask, cv2.MORPH_OPEN, kernel)

        # Inpaint blue regions with surrounding colors
        cleaned = cv2.inpaint(img, blue_mask, inpaintRadius=self.inpaint_radius,
                              flags=cv2.INPAINT_TELEA)
        return cleaned

    def _mask_handwriting_regions(
        self,
        img: np.ndarray,
        hw_regions: List[Dict]
    ) -> np.ndarray:
        """
        Mask out handwriting regions identified by AI.

        Uses bounding boxes from analysis_data to selectively clean areas.
        OPTIMIZED: uses strong inpainting with enlarged masks.

        Args:
            img: Input image
            hw_regions: List of handwriting region dicts with bounding_box

        Returns:
            Image with handwriting regions cleaned
        """
        cleaned = img.copy()
        img_h, img_w = img.shape[:2]

        # Build one global mask covering every handwriting region
        mask = np.zeros((img_h, img_w), dtype=np.uint8)

        for region in hw_regions:
            if region.get('type') in ['student_answer', 'correction', 'note', 'drawing']:
                bbox = region.get('bounding_box', {})
                x = bbox.get('x', 0)
                y = bbox.get('y', 0)
                w = bbox.get('width', 0)
                h = bbox.get('height', 0)

                # Validate and clip bounding box
                if w > 0 and h > 0 and x >= 0 and y >= 0:
                    x = max(0, min(x, img_w - 1))
                    y = max(0, min(y, img_h - 1))
                    w = min(w, img_w - x)
                    h = min(h, img_h - y)

                    if w > 0 and h > 0:
                        # Grow the bounding box by 10 pixels in every direction
                        # so stroke edges just outside the box are also covered
                        padding = 10
                        x_pad = max(0, x - padding)
                        y_pad = max(0, y - padding)
                        w_pad = min(img_w - x_pad, w + 2 * padding)
                        h_pad = min(img_h - y_pad, h + 2 * padding)
                        # Draw a filled rectangle into the mask
                        cv2.rectangle(mask, (x_pad, y_pad), (x_pad + w_pad, y_pad + h_pad), 255, -1)

        # Enlarge the mask with morphological dilation
        if self.mask_dilation_kernel_size > 0:
            kernel = np.ones((self.mask_dilation_kernel_size, self.mask_dilation_kernel_size), np.uint8)
            mask = cv2.dilate(mask, kernel, iterations=2)

        # Inpaint with the larger radius only if anything was masked
        if np.any(mask > 0):
            cleaned = cv2.inpaint(cleaned, mask, inpaintRadius=self.inpaint_radius,
                                  flags=cv2.INPAINT_TELEA)
            logger.info(f"Inpainted {np.sum(mask > 0)} pixels with radius {self.inpaint_radius}")

        return cleaned

    def _clean_text_region(self, region: np.ndarray) -> np.ndarray:
        """
        Clean a specific text region, removing handwriting but keeping print.

        Heuristic: Printed text is usually darker and more uniform thickness.
        Handwriting varies in pressure, lighter or inconsistent.

        NOTE(review): not called by the current pipeline — kept for external
        or future use; confirm before deleting.

        Args:
            region: Image region to clean

        Returns:
            Cleaned region
        """
        gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)

        # Adaptive threshold to separate foreground (inverted: ink becomes white)
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV, 11, 2
        )

        # Find connected components
        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(
            binary, connectivity=8
        )

        # Create mask for components to remove (small/thin = handwriting)
        mask = np.zeros(binary.shape, dtype=np.uint8)
        for i in range(1, num_labels):  # Skip background (0)
            area = stats[i, cv2.CC_STAT_AREA]
            width = stats[i, cv2.CC_STAT_WIDTH]
            height = stats[i, cv2.CC_STAT_HEIGHT]

            # Heuristic: handwriting is often thinner, smaller area
            # This needs tuning based on real samples
            if area < self.handwriting_area_threshold or max(width, height) < 10:
                mask[labels == i] = 255

        # Inpaint masked regions
        cleaned = cv2.inpaint(region, mask, inpaintRadius=self.inpaint_radius,
                              flags=cv2.INPAINT_TELEA)
        return cleaned

    def _remove_thin_strokes(self, cleaned: np.ndarray, original: np.ndarray) -> np.ndarray:
        """
        Remove thin pen strokes (handwriting) while keeping thicker printed text.

        Uses morphological operations to identify stroke thickness.

        NOTE(review): `original` is currently unused here — the method works
        solely on `cleaned`; confirm whether a comparison against the original
        was intended.

        Args:
            cleaned: Current cleaned image
            original: Original image

        Returns:
            Image with thin strokes removed
        """
        gray_clean = cv2.cvtColor(cleaned, cv2.COLOR_BGR2GRAY)

        # Detect edges
        edges_clean = cv2.Canny(gray_clean, 50, 150)

        # Morphological closing to connect nearby edges
        kernel = np.ones((2, 2), np.uint8)
        thick_strokes = cv2.morphologyEx(edges_clean, cv2.MORPH_CLOSE, kernel, iterations=2)

        # Thin strokes detection (erosion reveals thin lines)
        thin_strokes = cv2.erode(edges_clean, kernel, iterations=1)
        thin_only = cv2.subtract(thin_strokes, thick_strokes)

        # Inpaint thin strokes with a small fixed radius
        result = cv2.inpaint(cleaned, thin_only, inpaintRadius=2, flags=cv2.INPAINT_TELEA)
        return result

    def _enhance_printed_text(self, img: np.ndarray) -> np.ndarray:
        """
        Sharpen and enhance printed text after cleaning.

        Uses unsharp masking technique.

        Args:
            img: Input image

        Returns:
            Enhanced image
        """
        # Unsharp masking: weights (sharpen_amount, -0.5) sum to 1.0 at the
        # default sharpen_amount of 1.5, preserving overall brightness.
        gaussian = cv2.GaussianBlur(img, (0, 0), 2.0)
        sharpened = cv2.addWeighted(img, self.sharpen_amount, gaussian, -0.5, 0)
        return sharpened

    def _preserve_diagrams(
        self,
        cleaned: np.ndarray,
        original: np.ndarray,
        diagram_elements: List[Dict]
    ) -> np.ndarray:
        """
        Preserve diagram/illustration areas by copying from original.

        Args:
            cleaned: Current cleaned image
            original: Original scan
            diagram_elements: List of diagram bounding boxes from AI analysis

        Returns:
            Image with diagrams preserved
        """
        result = cleaned.copy()
        # Hoisted out of the loop: the image dimensions are loop-invariant
        img_h, img_w = original.shape[:2]

        for diagram in diagram_elements:
            if diagram.get('preserve', True):
                bbox = diagram.get('bounding_box', {})
                x = bbox.get('x', 0)
                y = bbox.get('y', 0)
                w = bbox.get('width', 0)
                h = bbox.get('height', 0)

                # Validate and clip bounding box (x, y already known >= 0 here)
                if w > 0 and h > 0 and x >= 0 and y >= 0:
                    x = min(x, img_w - 1)
                    y = min(y, img_h - 1)
                    w = min(w, img_w - x)
                    h = min(h, img_h - y)

                    if w > 0 and h > 0:
                        # Copy diagram region from original
                        result[y:y+h, x:x+w] = original[y:y+h, x:x+w]

        return result