"""
Pseudonymization Service for Klausurkorrektur.

Implements privacy-by-design principles:
- QR code generation with random doc_tokens
- Header redaction to remove personal data before OCR
- No student identity data leaves the teacher's device

DSGVO Art. 4 Nr. 5 Compliance:
The doc_token is a random version-4 UUID (128 bits, 122 of them random)
that cannot be used to identify a student without the encrypted identity map.
"""
import uuid
import io
import logging
from typing import List, Tuple, Optional
from dataclasses import dataclass
from PIL import Image, ImageDraw, ImageFont

logger = logging.getLogger(__name__)

# Optional imports (graceful fallback if not installed)
try:
    import qrcode
    HAS_QRCODE = True
except ImportError:
    HAS_QRCODE = False
    logger.warning("qrcode not installed - QR generation disabled")

try:
    import cv2
    import numpy as np
    HAS_CV2 = True
except ImportError:
    HAS_CV2 = False
    logger.warning("opencv-python not installed - image processing disabled")

try:
    from pyzbar.pyzbar import decode as pyzbar_decode
    HAS_PYZBAR = True
except ImportError:
    HAS_PYZBAR = False
    logger.warning("pyzbar not installed - QR reading disabled")


@dataclass
class RedactionResult:
    """Result of header redaction."""
    # Encoded image bytes. When redaction_applied is False this is the
    # ORIGINAL, unredacted input - callers must check the flag before use.
    redacted_image: bytes
    # Height of the input image in pixels (0 when redaction failed).
    original_height: int
    # Height of the redacted band in pixels (0 when redaction failed).
    redacted_height: int
    # True only if the header band was actually painted over.
    redaction_applied: bool


@dataclass
class QRDetectionResult:
    """Result of QR code detection."""
    # Decoded token text, or None when no UUID-shaped QR code was found.
    doc_token: Optional[str]
    # 1.0 for a decoded token, 0.0 otherwise.
    confidence: float
    bbox: Optional[Tuple[int, int, int, int]]  # x, y, width, height


class PseudonymizationService:
    """
    Service for document pseudonymization.

    PRIVACY GUARANTEES:
    1. doc_tokens are random version-4 UUIDs
    2. No deterministic relationship between token and student
    3. Header redaction removes visible personal data
    4. Identity mapping is encrypted client-side
    """

    # Default header height to redact (in pixels, assuming 300 DPI scan)
    DEFAULT_HEADER_HEIGHT = 300  # ~1 inch / 2.5cm

    # Fraction of the page height that smart_redact_header treats as header.
    SMART_HEADER_RATIO = 0.12

    @staticmethod
    def generate_doc_token() -> str:
        """
        Generate a random document token.

        Uses UUID4, which carries 122 random bits, so tokens have no
        relationship to each other or to the student they are assigned to.

        Returns:
            The token in canonical hyphenated UUID string form.
        """
        return str(uuid.uuid4())

    @staticmethod
    def generate_batch_tokens(count: int) -> List[str]:
        """
        Generate multiple doc_tokens.

        Args:
            count: Number of tokens to create; zero or negative yields [].

        Returns:
            List of independently generated token strings.
        """
        return [PseudonymizationService.generate_doc_token() for _ in range(count)]

    def generate_qr_code(
        self,
        doc_token: str,
        size: int = 200,
        border: int = 2
    ) -> bytes:
        """
        Generate a QR code image for a doc_token.

        Args:
            doc_token: The pseudonymization token
            size: Edge length of the (square) output image in pixels
            border: Border size in QR modules

        Returns:
            PNG image as bytes

        Raises:
            RuntimeError: If the qrcode library is not installed.
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")

        qr = qrcode.QRCode(
            version=1,
            error_correction=qrcode.constants.ERROR_CORRECT_M,
            box_size=10,
            border=border,
        )
        qr.add_data(doc_token)
        # fit=True lets the library grow past version 1 if the data needs it.
        qr.make(fit=True)

        img = qr.make_image(fill_color="black", back_color="white")
        img = img.resize((size, size), Image.Resampling.LANCZOS)

        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

    def generate_qr_sheet(
        self,
        doc_tokens: List[str],
        page_size: Tuple[int, int] = (2480, 3508),  # A4 at 300 DPI
        qr_size: int = 200,
        margin: int = 100,
        labels: Optional[List[str]] = None
    ) -> bytes:
        """
        Generate a printable sheet of QR codes.

        Codes are laid out row by row in a grid. Tokens that do not fit on
        the single page are skipped with a logged warning.

        Args:
            doc_tokens: List of tokens to generate QR codes for
            page_size: Page dimensions (width, height) in pixels
            qr_size: Size of each QR code
            margin: Page margin
            labels: Optional labels (e.g., "Nr. 1", "Nr. 2") - NO student names!

        Returns:
            PNG image of the full sheet

        Raises:
            RuntimeError: If the qrcode library is not installed.
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")

        width, height = page_size
        img = Image.new('RGB', (width, height), 'white')
        draw = ImageDraw.Draw(img)

        # Calculate grid
        usable_width = width - 2 * margin
        usable_height = height - 2 * margin
        cell_width = qr_size + 50
        cell_height = qr_size + 80  # Extra space for label

        cols = usable_width // cell_width
        rows = usable_height // cell_height
        capacity = cols * rows

        # Try to load a font (fallback to default)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
        except (IOError, OSError):
            font = ImageFont.load_default()

        # Generate QR codes
        for i, token in enumerate(doc_tokens):
            # Checked before any division so a zero-column grid never divides.
            if i >= capacity:
                logger.warning("Sheet full, skipping %d tokens", len(doc_tokens) - i)
                break

            row, col = divmod(i, cols)
            x = margin + col * cell_width
            y = margin + row * cell_height

            # Generate QR code
            qr_bytes = self.generate_qr_code(token, qr_size)
            qr_img = Image.open(io.BytesIO(qr_bytes))
            img.paste(qr_img, (x, y))

            # Add label (number only, NO names)
            label = labels[i] if labels and i < len(labels) else f"Nr. {i + 1}"
            draw.text((x, y + qr_size + 5), label, fill="black", font=font)

            # Add truncated token for verification
            token_short = token[:8] + "..."
            draw.text((x, y + qr_size + 25), token_short, fill="gray", font=font)

        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

    def detect_qr_code(self, image_bytes: bytes) -> QRDetectionResult:
        """
        Detect and decode QR code from an image.

        The first QR symbol whose payload parses as a UUID wins. Symbols
        with other payloads (including non-UTF-8 data) are skipped.

        Args:
            image_bytes: Image data (PNG, JPEG, etc.)

        Returns:
            QRDetectionResult with doc_token if found. A result with
            doc_token=None is returned when pyzbar is missing, when nothing
            matches, or when decoding fails (the failure is logged).
        """
        if not HAS_PYZBAR:
            return QRDetectionResult(
                doc_token=None,
                confidence=0.0,
                bbox=None
            )

        try:
            img = Image.open(io.BytesIO(image_bytes))

            # Decode QR codes
            decoded = pyzbar_decode(img)

            for obj in decoded:
                if obj.type != 'QRCODE':
                    continue

                # Validate it looks like a UUID. The decode happens inside
                # the try because UnicodeDecodeError is a ValueError: one
                # unrelated binary QR code must not abort the whole scan.
                try:
                    token = obj.data.decode('utf-8')
                    uuid.UUID(token)
                except ValueError:
                    continue

                rect = obj.rect
                return QRDetectionResult(
                    doc_token=token,
                    confidence=1.0,
                    bbox=(rect.left, rect.top, rect.width, rect.height)
                )

            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)

        except Exception as e:
            logger.error("QR detection failed: %s", e)
            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)

    def redact_header(
        self,
        image_bytes: bytes,
        header_height: Optional[int] = None,
        fill_color: Tuple[int, int, int] = (255, 255, 255)
    ) -> RedactionResult:
        """
        Redact the header area of a scanned exam page.

        This removes the area where student name/class/date typically appears.
        The redaction is permanent - no original data is preserved in the
        returned image.

        Args:
            image_bytes: Original scanned image
            header_height: Height in pixels to redact. None (or 0) falls back
                to DEFAULT_HEADER_HEIGHT.
            fill_color: RGB color to fill redacted area (default: white)

        Returns:
            RedactionResult with the redacted image encoded as PNG. If
            anything fails, the error is logged and the result carries the
            ORIGINAL bytes with redaction_applied=False - callers must check
            that flag before passing the image on.
        """
        try:
            img = Image.open(io.BytesIO(image_bytes))
            width, height = img.size

            # Determine header height
            redact_height = header_height or self.DEFAULT_HEADER_HEIGHT

            # Work on a copy. An RGB tuple is not a valid ink for grayscale,
            # bilevel or similar modes, so those are converted to RGB first;
            # otherwise drawing would raise and the page would be returned
            # unredacted.
            if img.mode in ("RGB", "RGBA"):
                redacted = img.copy()
            else:
                redacted = img.convert("RGB")

            draw = ImageDraw.Draw(redacted)
            draw.rectangle([(0, 0), (width, redact_height)], fill=fill_color)

            # Save result
            buffer = io.BytesIO()
            redacted.save(buffer, format="PNG")

            return RedactionResult(
                redacted_image=buffer.getvalue(),
                original_height=height,
                redacted_height=redact_height,
                redaction_applied=True
            )

        except Exception as e:
            logger.error("Header redaction failed: %s", e)
            return RedactionResult(
                redacted_image=image_bytes,
                original_height=0,
                redacted_height=0,
                redaction_applied=False
            )

    def smart_redact_header(
        self,
        image_bytes: bytes,
        preserve_qr: bool = True
    ) -> RedactionResult:
        """
        Header redaction that scales with the page and can spare the QR code.

        Whites out the top SMART_HEADER_RATIO of the page height. If a token
        QR code is detected inside that band and preserve_qr is set, the QR
        bounding box is left untouched so the page stays identifiable.

        Falls back to redact_header() when OpenCV is unavailable or when the
        image cannot be processed.

        Args:
            image_bytes: Original scanned image
            preserve_qr: If True, don't redact QR code areas

        Returns:
            RedactionResult with the redacted image encoded as PNG (or the
            result of the redact_header() fallback).
        """
        if not HAS_CV2:
            # Fallback to simple redaction
            return self.redact_header(image_bytes)

        try:
            # Convert to OpenCV format
            nparr = np.frombuffer(image_bytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            if img is None:
                # imdecode signals failure by returning None, not by raising.
                raise ValueError("image could not be decoded")
            height, width = img.shape[:2]

            # Detect QR code position if present
            qr_result = self.detect_qr_code(image_bytes)

            # Calculate redaction area (top portion of page)
            header_height = int(height * self.SMART_HEADER_RATIO)

            qr_in_header = (
                preserve_qr
                and bool(qr_result.bbox)
                and qr_result.bbox[1] < header_height
            )

            if qr_in_header:
                qr_x, qr_y, qr_w, qr_h = qr_result.bbox

                # 255 = redact, 0 = keep
                mask = np.ones((header_height, width), dtype=np.uint8) * 255

                # Leave QR area unredacted (clamped to the header band)
                mask[max(0, qr_y):min(header_height, qr_y + qr_h),
                     max(0, qr_x):min(width, qr_x + qr_w)] = 0

                # Apply white fill where mask is 255. The slice is a view,
                # so the assignment writes through to img.
                img[:header_height][mask == 255] = [255, 255, 255]
            else:
                # No QR to preserve in the header - blank the whole band
                img[:header_height] = [255, 255, 255]

            # Encode result
            _, buffer = cv2.imencode('.png', img)

            return RedactionResult(
                redacted_image=buffer.tobytes(),
                original_height=height,
                redacted_height=header_height,
                redaction_applied=True
            )

        except Exception as e:
            logger.error("Smart redaction failed: %s", e)
            return self.redact_header(image_bytes)


# Singleton instance
_pseudonymizer: Optional[PseudonymizationService] = None


def get_pseudonymizer() -> PseudonymizationService:
    """Get or create the pseudonymization service singleton."""
    global _pseudonymizer
    if _pseudonymizer is None:
        _pseudonymizer = PseudonymizationService()
    return _pseudonymizer