This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/backend/klausur/services/pseudonymizer.py
Benjamin Admin bfdaf63ba9 fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

377 lines
12 KiB
Python

"""
Pseudonymization Service for Klausurkorrektur.
Implements privacy-by-design principles:
- QR code generation with random doc_tokens
- Header redaction to remove personal data before OCR
- No student identity data leaves the teacher's device
DSGVO Art. 4 Nr. 5 Compliance:
The doc_token is a random version-4 UUID (128 bits, 122 of them random)
that cannot be used to identify a student without the encrypted identity map.
"""
import uuid
import io
import logging
from typing import List, Tuple, Optional
from dataclasses import dataclass
from PIL import Image, ImageDraw, ImageFont
logger = logging.getLogger(__name__)

# Optional third-party dependencies.
# Each group is probed once at import time; the matching HAS_* flag records
# whether the feature can be used, and a warning is logged when it cannot.

# QR code generation
try:
    import qrcode
except ImportError:
    HAS_QRCODE = False
    logger.warning("qrcode not installed - QR generation disabled")
else:
    HAS_QRCODE = True

# OpenCV-based image processing (needs both cv2 and numpy)
try:
    import cv2
    import numpy as np
except ImportError:
    HAS_CV2 = False
    logger.warning("opencv-python not installed - image processing disabled")
else:
    HAS_CV2 = True

# QR code reading
try:
    from pyzbar.pyzbar import decode as pyzbar_decode
except ImportError:
    HAS_PYZBAR = False
    logger.warning("pyzbar not installed - QR reading disabled")
else:
    HAS_PYZBAR = True
@dataclass
class RedactionResult:
    """
    Outcome of a header-redaction attempt.

    Attributes:
        redacted_image: Image bytes returned to the caller.
        original_height: Height of the input image in pixels.
        redacted_height: Height of the blanked header area in pixels.
        redaction_applied: True when the header was blanked, False otherwise.
    """

    redacted_image: bytes
    original_height: int
    redacted_height: int
    redaction_applied: bool
@dataclass
class QRDetectionResult:
    """
    Outcome of scanning an image for a doc_token QR code.

    Attributes:
        doc_token: The decoded token, or None when no usable QR code was found.
        confidence: Detection confidence as a float.
        bbox: Location of the QR code as (x, y, width, height), or None.
    """

    doc_token: Optional[str]
    confidence: float
    bbox: Optional[Tuple[int, int, int, int]]  # x, y, width, height
class PseudonymizationService:
    """
    Service for document pseudonymization.

    PRIVACY GUARANTEES:
    1. doc_tokens are random version-4 UUIDs
    2. No deterministic relationship between token and student
    3. Header redaction removes visible personal data
    4. Identity mapping is encrypted client-side (not performed by this class)
    """

    # Default header height to redact (in pixels, assuming 300 DPI scan)
    DEFAULT_HEADER_HEIGHT = 300  # ~1 inch / 2.5cm

    # Fraction of the page height blanked by smart_redact_header()
    SMART_HEADER_RATIO = 0.12

    @staticmethod
    def generate_doc_token() -> str:
        """
        Generate a random document token.

        Returns:
            The canonical 36-character string form of a version-4 UUID
            (122 random bits).
        """
        return str(uuid.uuid4())

    @staticmethod
    def generate_batch_tokens(count: int) -> List[str]:
        """
        Generate multiple doc_tokens.

        Args:
            count: Number of tokens to create; values <= 0 yield an empty list.

        Returns:
            List of independently generated tokens.
        """
        return [PseudonymizationService.generate_doc_token() for _ in range(count)]

    def generate_qr_code(
        self,
        doc_token: str,
        size: int = 200,
        border: int = 2
    ) -> bytes:
        """
        Generate a QR code image for a doc_token.

        Args:
            doc_token: The pseudonymization token
            size: Edge length of the resulting square image in pixels
            border: Border size in QR modules

        Returns:
            PNG image as bytes

        Raises:
            RuntimeError: If the qrcode library is not installed.
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")

        # version=1 is only the starting symbol size; make(fit=True) lets the
        # library pick a larger version when the token does not fit.
        qr = qrcode.QRCode(
            version=1,
            error_correction=qrcode.constants.ERROR_CORRECT_M,
            box_size=10,
            border=border,
        )
        qr.add_data(doc_token)
        qr.make(fit=True)

        img = qr.make_image(fill_color="black", back_color="white")
        img = img.resize((size, size), Image.Resampling.LANCZOS)

        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

    def generate_qr_sheet(
        self,
        doc_tokens: List[str],
        page_size: Tuple[int, int] = (2480, 3508),  # A4 at 300 DPI
        qr_size: int = 200,
        margin: int = 100,
        labels: Optional[List[str]] = None
    ) -> bytes:
        """
        Generate a printable sheet of QR codes.

        Tokens are laid out row by row on a grid. Tokens that do not fit on
        the page are skipped and a warning is logged.

        Args:
            doc_tokens: List of tokens to generate QR codes for
            page_size: Page dimensions (width, height) in pixels
            qr_size: Size of each QR code
            margin: Page margin
            labels: Optional labels (e.g., "Nr. 1", "Nr. 2") - NO student names!
                Missing entries fall back to "Nr. <position>".

        Returns:
            PNG image of the full sheet

        Raises:
            RuntimeError: If the qrcode library is not installed.
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")

        width, height = page_size
        img = Image.new('RGB', (width, height), 'white')
        draw = ImageDraw.Draw(img)

        # Calculate grid
        usable_width = width - 2 * margin
        usable_height = height - 2 * margin
        cell_width = qr_size + 50
        cell_height = qr_size + 80  # Extra space for label
        cols = usable_width // cell_width
        rows = usable_height // cell_height
        # Clamp both factors: with an oversized margin cols and rows are both
        # negative and their product would wrongly look like free capacity.
        capacity = max(0, cols) * max(0, rows)

        # Try to load a font (fallback to default)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
        except (IOError, OSError):
            font = ImageFont.load_default()

        # Generate QR codes
        for i, token in enumerate(doc_tokens):
            if i >= capacity:
                logger.warning(f"Sheet full, skipping {len(doc_tokens) - i} tokens")
                break

            row, col = divmod(i, cols)
            x = margin + col * cell_width
            y = margin + row * cell_height

            # Generate QR code
            qr_bytes = self.generate_qr_code(token, qr_size)
            qr_img = Image.open(io.BytesIO(qr_bytes))
            img.paste(qr_img, (x, y))

            # Add label (number only, NO names)
            label = labels[i] if labels and i < len(labels) else f"Nr. {i + 1}"
            draw.text((x, y + qr_size + 5), label, fill="black", font=font)

            # Add truncated token for verification
            token_short = token[:8] + "..."
            draw.text((x, y + qr_size + 25), token_short, fill="gray", font=font)

        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

    def detect_qr_code(self, image_bytes: bytes) -> "QRDetectionResult":
        """
        Detect and decode QR code from an image.

        The first QR code whose payload parses as a UUID wins. Payloads that
        are not valid UTF-8 or not UUIDs are skipped. This method does not
        raise: any failure is logged and reported as "not found".

        Args:
            image_bytes: Image data (PNG, JPEG, etc.)

        Returns:
            QRDetectionResult with doc_token, confidence 1.0 and bbox if
            found; otherwise doc_token=None, confidence 0.0, bbox=None
            (also when pyzbar is not installed).
        """
        if not HAS_PYZBAR:
            return QRDetectionResult(
                doc_token=None,
                confidence=0.0,
                bbox=None
            )

        try:
            img = Image.open(io.BytesIO(image_bytes))

            # Decode QR codes
            for obj in pyzbar_decode(img):
                if obj.type != 'QRCODE':
                    continue

                # Decoding happens inside the try so that a non-UTF-8 payload
                # (UnicodeDecodeError is a ValueError) only skips this code
                # instead of aborting the scan of the remaining ones.
                try:
                    token = obj.data.decode('utf-8')
                    # Validate it looks like a UUID
                    uuid.UUID(token)
                except ValueError:
                    continue

                rect = obj.rect
                return QRDetectionResult(
                    doc_token=token,
                    confidence=1.0,
                    bbox=(rect.left, rect.top, rect.width, rect.height)
                )

            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)
        except Exception as e:
            logger.error(f"QR detection failed: {e}")
            return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)

    def redact_header(
        self,
        image_bytes: bytes,
        header_height: Optional[int] = None,
        fill_color: Tuple[int, int, int] = (255, 255, 255)
    ) -> "RedactionResult":
        """
        Redact the header area of a scanned exam page.

        This blanks the area where student name/class/date typically appears.
        The returned image contains no pixels of the original header.

        IMPORTANT: on failure the ORIGINAL, unredacted bytes are returned with
        redaction_applied=False. Callers must check that flag before passing
        the image on.

        Args:
            image_bytes: Original scanned image
            header_height: Height in pixels to redact. None, 0 or a negative
                value selects DEFAULT_HEADER_HEIGHT. Values larger than the
                image are limited to the image height.
            fill_color: RGB color to fill redacted area (default: white)

        Returns:
            RedactionResult with the redacted image encoded as PNG
        """
        try:
            img = Image.open(io.BytesIO(image_bytes))
            width, height = img.size

            # Determine header height; non-positive requests use the default
            # so that a bad argument never results in "nothing redacted".
            if header_height is None or header_height <= 0:
                redact_height = self.DEFAULT_HEADER_HEIGHT
            else:
                redact_height = header_height
            redact_height = min(redact_height, height)

            # fill_color is an RGB tuple. Single-band images (grayscale,
            # 1-bit) reject such a tuple, which previously sent every
            # grayscale scan into the "unredacted" error path, so those are
            # converted to RGB first.
            if img.mode in ("RGB", "RGBA"):
                redacted = img.copy()
            else:
                redacted = img.convert("RGB")

            draw = ImageDraw.Draw(redacted)
            draw.rectangle([(0, 0), (width, redact_height)], fill=fill_color)

            # Save result
            buffer = io.BytesIO()
            redacted.save(buffer, format="PNG")

            return RedactionResult(
                redacted_image=buffer.getvalue(),
                original_height=height,
                redacted_height=redact_height,
                redaction_applied=True
            )
        except Exception as e:
            logger.error(f"Header redaction failed: {e}")
            return RedactionResult(
                redacted_image=image_bytes,
                original_height=0,
                redacted_height=0,
                redaction_applied=False
            )

    def smart_redact_header(
        self,
        image_bytes: bytes,
        preserve_qr: bool = True
    ) -> "RedactionResult":
        """
        Header redaction that scales with the page and can spare the QR code.

        Blanks the top SMART_HEADER_RATIO of the page with white. No text or
        OCR analysis is performed. When preserve_qr is set and a doc_token QR
        code starts inside that area, the QR code's bounding box is left
        untouched.

        Falls back to redact_header() with its defaults when OpenCV is not
        installed or when any step fails.

        Args:
            image_bytes: Original scanned image
            preserve_qr: If True, don't redact QR code areas

        Returns:
            RedactionResult with the redacted image encoded as PNG
        """
        if not HAS_CV2:
            # Fallback to simple redaction
            return self.redact_header(image_bytes)

        try:
            # Convert to OpenCV format
            nparr = np.frombuffer(image_bytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
            if img is None:
                raise ValueError("image could not be decoded")
            height, width = img.shape[:2]

            # Calculate redaction area (top portion of page)
            header_height = int(height * self.SMART_HEADER_RATIO)
            header = img[:header_height]  # view: writes go into img

            # Detect QR code position only when it has to be preserved
            bbox = self.detect_qr_code(image_bytes).bbox if preserve_qr else None

            if bbox is not None and bbox[1] < header_height:
                # QR is in header - redact around it
                qr_x, qr_y, qr_w, qr_h = bbox
                # Clamp every bound to [0, limit]. A negative slice end would
                # wrap around and leave most of the header unredacted.
                top = max(0, qr_y)
                bottom = max(0, min(header_height, qr_y + qr_h))
                left = max(0, qr_x)
                right = max(0, min(width, qr_x + qr_w))

                redact_mask = np.ones((header_height, width), dtype=bool)
                redact_mask[top:bottom, left:right] = False
                header[redact_mask] = [255, 255, 255]
            else:
                # No QR to preserve in the header - simple redaction
                header[:] = [255, 255, 255]

            # Encode result
            encoded_ok, buffer = cv2.imencode('.png', img)
            if not encoded_ok:
                raise ValueError("PNG encoding failed")

            return RedactionResult(
                redacted_image=buffer.tobytes(),
                original_height=height,
                redacted_height=header_height,
                redaction_applied=True
            )
        except Exception as e:
            logger.error(f"Smart redaction failed: {e}")
            return self.redact_header(image_bytes)
# Module-wide shared instance, created on first use by get_pseudonymizer()
_pseudonymizer: Optional[PseudonymizationService] = None


def get_pseudonymizer() -> PseudonymizationService:
    """
    Return the shared PseudonymizationService.

    The instance is created on the first call and reused afterwards.
    """
    global _pseudonymizer
    if _pseudonymizer is not None:
        return _pseudonymizer
    _pseudonymizer = PseudonymizationService()
    return _pseudonymizer