# breakpilot-pwa/backend/klausur/services/pseudonymizer.py

"""
Pseudonymization Service for Klausurkorrektur.

Implements privacy-by-design principles:
- QR code generation with random doc_tokens
- Header redaction to remove personal data before OCR
- No student identity data leaves the teacher's device

DSGVO Art. 4 Nr. 5 Compliance:
The doc_token is a random version-4 UUID (128 bits, 122 of them random)
that cannot be used to identify a student without the encrypted identity map.
"""
import uuid
import io
import logging
from typing import List, Tuple, Optional
from dataclasses import dataclass
from PIL import Image, ImageDraw, ImageFont

logger = logging.getLogger(__name__)

# Optional imports (graceful fallback if not installed)
try:
    import qrcode
    HAS_QRCODE = True
except ImportError:
    HAS_QRCODE = False
    logger.warning("qrcode not installed - QR generation disabled")

try:
    import cv2
    import numpy as np
    HAS_CV2 = True
except ImportError:
    HAS_CV2 = False
    logger.warning("opencv-python not installed - image processing disabled")

try:
    from pyzbar.pyzbar import decode as pyzbar_decode
    HAS_PYZBAR = True
except ImportError:
    HAS_PYZBAR = False
    logger.warning("pyzbar not installed - QR reading disabled")


@dataclass
class RedactionResult:
    """
    Result of header redaction.

    Returned by PseudonymizationService.redact_header() and
    PseudonymizationService.smart_redact_header().
    """
    # PNG-encoded page after redaction; when redaction failed this holds the
    # unmodified input bytes instead (check redaction_applied)
    redacted_image: bytes
    # Height of the input image in pixels (0 when redaction failed)
    original_height: int
    # Height in pixels of the header strip that was redacted (0 on failure)
    redacted_height: int
    # True if the header was painted over; False if redaction failed and
    # redacted_image is still the ORIGINAL, unredacted input
    redaction_applied: bool


@dataclass
class QRDetectionResult:
    """
    Result of QR code detection.

    Returned by PseudonymizationService.detect_qr_code().
    """
    # Decoded QR payload that parsed as a UUID, or None if no such code was found
    doc_token: Optional[str]
    # 1.0 when a token was decoded, 0.0 otherwise (no intermediate values are produced)
    confidence: float
    # Location of the QR code as reported by the decoder, or None if not found
    bbox: Optional[Tuple[int, int, int, int]]  # x, y, width, height


class PseudonymizationService:
    """
    Service for document pseudonymization.

    PRIVACY GUARANTEES:
    1. doc_tokens are cryptographically random (UUID4)
    2. No deterministic relationship between token and student
    3. Header redaction removes visible personal data
    4. Identity mapping is encrypted client-side
    """

    # Default header height to redact (in pixels, assuming 300 DPI scan)
    DEFAULT_HEADER_HEIGHT = 300  # ~1 inch / 2.5cm

    @staticmethod
    def generate_doc_token() -> str:
        """
        Generate a cryptographically random document token.

        Uses UUID4 which provides 122 bits of randomness.
        This ensures no correlation between tokens is possible.
        """
        return str(uuid.uuid4())

    @staticmethod
    def generate_batch_tokens(count: int) -> List[str]:
        """Generate multiple unique doc_tokens."""
        return [PseudonymizationService.generate_doc_token() for _ in range(count)]

    def generate_qr_code(
        self,
        doc_token: str,
        size: int = 200,
        border: int = 2
    ) -> bytes:
        """
        Generate a QR code image for a doc_token.

        Args:
            doc_token: The pseudonymization token
            size: Size of the QR code in pixels
            border: Border size in QR modules

        Returns:
            PNG image as bytes
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")

        qr = qrcode.QRCode(
            version=1,
            error_correction=qrcode.constants.ERROR_CORRECT_M,
            box_size=10,
            border=border,
        )
        qr.add_data(doc_token)
        qr.make(fit=True)

        img = qr.make_image(fill_color="black", back_color="white")
        img = img.resize((size, size), Image.Resampling.LANCZOS)

        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

    def generate_qr_sheet(
        self,
        doc_tokens: List[str],
        page_size: Tuple[int, int] = (2480, 3508),  # A4 at 300 DPI
        qr_size: int = 200,
        margin: int = 100,
        labels: Optional[List[str]] = None
    ) -> bytes:
        """
        Generate a printable sheet of QR codes.

        Args:
            doc_tokens: List of tokens to generate QR codes for
            page_size: Page dimensions (width, height) in pixels
            qr_size: Size of each QR code
            margin: Page margin
            labels: Optional labels (e.g., "Nr. 1", "Nr. 2") - NO student names!

        Returns:
            PNG image of the full sheet
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")

        width, height = page_size
        img = Image.new('RGB', (width, height), 'white')
        draw = ImageDraw.Draw(img)

        # Calculate grid
        usable_width = width - 2 * margin
        usable_height = height - 2 * margin
        cell_width = qr_size + 50
        cell_height = qr_size + 80  # Extra space for label

        cols = usable_width // cell_width
        rows = usable_height // cell_height

        # Try to load a font (fallback to default)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
        except (IOError, OSError):
            font = ImageFont.load_default()

        # Generate QR codes
        for i, token in enumerate(doc_tokens):
            if i >= cols * rows:
                logger.warning(f"Sheet full, skipping {len(doc_tokens) - i} tokens")
                break

            row = i // cols
            col = i % cols

            x = margin + col * cell_width
            y = margin + row * cell_height

            # Generate QR code
            qr_bytes = self.generate_qr_code(token, qr_size)
            qr_img = Image.open(io.BytesIO(qr_bytes))
            img.paste(qr_img, (x, y))

            # Add label (number only, NO names)
            label = labels[i] if labels and i < len(labels) else f"Nr. {i + 1}"
            draw.text((x, y + qr_size + 5), label, fill="black", font=font)

            # Add truncated token for verification
            token_short = token[:8] + "..."
            draw.text((x, y + qr_size + 25), token_short, fill="gray", font=font)

        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

    def detect_qr_code(self, image_bytes: bytes) -> QRDetectionResult:
        """
        Detect and decode QR code from an image.

        Args:
            image_bytes: Image data (PNG, JPEG, etc.)

        Returns:
            QRDetectionResult with doc_token if found
        """
        if not HAS_PYZBAR:
            return QRDetectionResult(
                doc_token=None,
                confidence=0.0,
                bbox=None
            )

        try:
            img = Image.open(io.BytesIO(image_bytes))

            # Decode QR codes
            decoded = pyzbar_decode(img)

            for obj in decoded:
                if obj.type == 'QRCODE':
                    token = obj.data.decode('utf-8')
                    # Validate it looks like a UUID
                    try:
                        uuid.UUID(token)
                        rect = obj.rect
                        return QRDetectionResult(
                            doc_token=token,
                            confidence=1.0,
                            bbox=(rect.left, rect.top, rect.width, rect.height)
                        )
                    except ValueError:
                        continue

            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)

        except Exception as e:
            logger.error(f"QR detection failed: {e}")
            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)

    def redact_header(
        self,
        image_bytes: bytes,
        header_height: Optional[int] = None,
        fill_color: Tuple[int, int, int] = (255, 255, 255)
    ) -> RedactionResult:
        """
        Redact the header area of a scanned exam page.

        This removes the area where student name/class/date typically appears.
        The redaction is permanent - no original data is preserved.

        Args:
            image_bytes: Original scanned image
            header_height: Height in pixels to redact (None = auto-detect)
            fill_color: RGB color to fill redacted area (default: white)

        Returns:
            RedactionResult with redacted image
        """
        try:
            img = Image.open(io.BytesIO(image_bytes))
            width, height = img.size

            # Determine header height
            redact_height = header_height or self.DEFAULT_HEADER_HEIGHT

            # Create a copy and redact header
            redacted = img.copy()
            draw = ImageDraw.Draw(redacted)
            draw.rectangle([(0, 0), (width, redact_height)], fill=fill_color)

            # Save result
            buffer = io.BytesIO()
            redacted.save(buffer, format="PNG")

            return RedactionResult(
                redacted_image=buffer.getvalue(),
                original_height=height,
                redacted_height=redact_height,
                redaction_applied=True
            )

        except Exception as e:
            logger.error(f"Header redaction failed: {e}")
            return RedactionResult(
                redacted_image=image_bytes,
                original_height=0,
                redacted_height=0,
                redaction_applied=False
            )

    def smart_redact_header(
        self,
        image_bytes: bytes,
        preserve_qr: bool = True
    ) -> RedactionResult:
        """
        Smart header redaction that detects text regions.

        Uses OCR confidence to identify and redact only the header
        area containing personal data.

        Args:
            image_bytes: Original scanned image
            preserve_qr: If True, don't redact QR code areas

        Returns:
            RedactionResult with intelligently redacted image
        """
        if not HAS_CV2:
            # Fallback to simple redaction
            return self.redact_header(image_bytes)

        try:
            # Convert to OpenCV format
            nparr = np.frombuffer(image_bytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            height, width = img.shape[:2]

            # Detect QR code position if present
            qr_result = self.detect_qr_code(image_bytes)

            # Calculate redaction area (top portion of page)
            # Typically header is in top 10-15% of page
            header_height = int(height * 0.12)

            # If QR code is in header area, adjust redaction
            if preserve_qr and qr_result.bbox:
                qr_x, qr_y, qr_w, qr_h = qr_result.bbox
                if qr_y < header_height:
                    # QR is in header - redact around it
                    # Create mask
                    mask = np.ones((header_height, width), dtype=np.uint8) * 255

                    # Leave QR area unredacted
                    mask[max(0, qr_y):min(header_height, qr_y + qr_h),
                         max(0, qr_x):min(width, qr_x + qr_w)] = 0

                    # Apply white fill where mask is 255
                    img[:header_height][mask == 255] = [255, 255, 255]
                else:
                    # QR not in header - simple redaction
                    img[:header_height] = [255, 255, 255]
            else:
                # Simple header redaction
                img[:header_height] = [255, 255, 255]

            # Encode result
            _, buffer = cv2.imencode('.png', img)

            return RedactionResult(
                redacted_image=buffer.tobytes(),
                original_height=height,
                redacted_height=header_height,
                redaction_applied=True
            )

        except Exception as e:
            logger.error(f"Smart redaction failed: {e}")
            return self.redact_header(image_bytes)


# Module-wide shared instance; stays None until get_pseudonymizer() is first called
_pseudonymizer: Optional[PseudonymizationService] = None


def get_pseudonymizer() -> PseudonymizationService:
    """Return the shared PseudonymizationService, creating it on first use."""
    global _pseudonymizer
    if _pseudonymizer is not None:
        return _pseudonymizer
    _pseudonymizer = PseudonymizationService()
    return _pseudonymizer