Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
All services: admin-v2, studio-v2, website, ai-compliance-sdk, consent-service, klausur-service, voice-service, and infrastructure. Large PDFs and compiled binaries excluded via .gitignore.
377 lines
12 KiB
Python
377 lines
12 KiB
Python
"""
|
|
Pseudonymization Service for Klausurkorrektur.
|
|
|
|
Implements privacy-by-design principles:
|
|
- QR code generation with random doc_tokens
|
|
- Header redaction to remove personal data before OCR
|
|
- No student identity data leaves the teacher's device
|
|
|
|
DSGVO Art. 4 Nr. 5 Compliance:
|
|
The doc_token is a random 128-bit UUID4 (122 random bits) that cannot be used to
|
|
identify a student without the encrypted identity map.
|
|
"""
|
|
import uuid
|
|
import io
|
|
import logging
|
|
from typing import List, Tuple, Optional
|
|
from dataclasses import dataclass
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
logger = logging.getLogger(__name__)

# Optional third-party dependencies. Each guard sets a HAS_* flag that the
# service checks before touching the corresponding library, and logs a
# warning once at import time when the library is missing.
try:
    import qrcode
except ImportError:
    HAS_QRCODE = False
    logger.warning("qrcode not installed - QR generation disabled")
else:
    HAS_QRCODE = True

# cv2 and numpy are treated as one unit: if either import fails, the
# OpenCV-based code path is disabled.
try:
    import cv2
    import numpy as np
except ImportError:
    HAS_CV2 = False
    logger.warning("opencv-python not installed - image processing disabled")
else:
    HAS_CV2 = True

try:
    from pyzbar.pyzbar import decode as pyzbar_decode
except ImportError:
    HAS_PYZBAR = False
    logger.warning("pyzbar not installed - QR reading disabled")
else:
    HAS_PYZBAR = True
|
|
|
|
|
|
@dataclass
class RedactionResult:
    """Outcome of a header-redaction attempt.

    Attributes:
        redacted_image: Encoded image bytes returned to the caller.
        original_height: Pixel height of the input image (0 when the
            redaction attempt failed).
        redacted_height: Pixel height of the blanked header band (0 when
            the redaction attempt failed).
        redaction_applied: True only when the header was actually blanked.
    """

    redacted_image: bytes
    original_height: int
    redacted_height: int
    redaction_applied: bool
|
|
|
|
|
|
@dataclass
class QRDetectionResult:
    """Outcome of scanning an image for a doc_token QR code.

    Attributes:
        doc_token: The decoded token, or None when no valid QR was found.
        confidence: 1.0 for a successful decode, 0.0 otherwise.
        bbox: Location of the QR code as (x, y, width, height), or None.
    """

    doc_token: Optional[str]
    confidence: float
    bbox: Optional[Tuple[int, int, int, int]]  # x, y, width, height
|
|
|
|
|
|
class PseudonymizationService:
    """
    Service for document pseudonymization.

    PRIVACY GUARANTEES:
    1. doc_tokens are random version-4 UUIDs
    2. No deterministic relationship between token and student
    3. Header redaction removes visible personal data
    4. Identity mapping is encrypted client-side (handled outside this class)
    """

    # Default header height to redact (in pixels, assuming 300 DPI scan)
    DEFAULT_HEADER_HEIGHT = 300  # ~1 inch / 2.5cm

    # Fraction of the page height that smart_redact_header treats as header
    SMART_HEADER_FRACTION = 0.12

    @staticmethod
    def generate_doc_token() -> str:
        """
        Generate a random document token.

        Uses UUID4, which carries 122 random bits, so a token has no
        relationship to the student or to any other token.

        Returns:
            The token in canonical 36-character UUID string form.
        """
        return str(uuid.uuid4())

    @staticmethod
    def generate_batch_tokens(count: int) -> List[str]:
        """
        Generate ``count`` independent doc_tokens.

        Args:
            count: Number of tokens; zero or a negative value yields [].

        Returns:
            List of token strings.
        """
        return [PseudonymizationService.generate_doc_token() for _ in range(count)]

    @staticmethod
    def _grid_dimensions(
        page_size: Tuple[int, int],
        cell_size: Tuple[int, int],
        margin: int
    ) -> Tuple[int, int]:
        """Return (cols, rows) of cells fitting inside the page margins, never negative."""
        cell_width, cell_height = cell_size
        if cell_width <= 0 or cell_height <= 0:
            return 0, 0
        usable_width = page_size[0] - 2 * margin
        usable_height = page_size[1] - 2 * margin
        # Floor division of a negative usable size is negative; without the
        # clamp, two negative factors would multiply to a positive capacity.
        return max(0, usable_width // cell_width), max(0, usable_height // cell_height)

    @staticmethod
    def _no_qr_found() -> "QRDetectionResult":
        """Return a fresh 'nothing detected' result."""
        return QRDetectionResult(doc_token=None, confidence=0.0, bbox=None)

    def generate_qr_code(
        self,
        doc_token: str,
        size: int = 200,
        border: int = 2
    ) -> bytes:
        """
        Generate a QR code image for a doc_token.

        Args:
            doc_token: The pseudonymization token
            size: Edge length of the (square) output image in pixels
            border: Border size in QR modules

        Returns:
            PNG image as bytes

        Raises:
            RuntimeError: If the qrcode library is not installed
            ValueError: If size is not positive
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")
        if size <= 0:
            raise ValueError("size must be a positive number of pixels")

        qr = qrcode.QRCode(
            version=1,
            error_correction=qrcode.constants.ERROR_CORRECT_M,
            box_size=10,
            border=border,
        )
        qr.add_data(doc_token)
        # fit=True lets the library grow past version 1 when the data needs it
        qr.make(fit=True)

        img = qr.make_image(fill_color="black", back_color="white")
        img = img.resize((size, size), Image.Resampling.LANCZOS)

        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

    def generate_qr_sheet(
        self,
        doc_tokens: List[str],
        page_size: Tuple[int, int] = (2480, 3508),  # A4 at 300 DPI
        qr_size: int = 200,
        margin: int = 100,
        labels: Optional[List[str]] = None
    ) -> bytes:
        """
        Generate a printable sheet of QR codes.

        Tokens are laid out row by row. Tokens that do not fit on the page
        are skipped and a warning is logged.

        Args:
            doc_tokens: List of tokens to generate QR codes for
            page_size: Page dimensions (width, height) in pixels
            qr_size: Size of each QR code
            margin: Page margin
            labels: Optional labels (e.g., "Nr. 1", "Nr. 2") - NO student names!
                Missing entries default to "Nr. <index + 1>".

        Returns:
            PNG image of the full sheet

        Raises:
            RuntimeError: If the qrcode library is not installed
        """
        if not HAS_QRCODE:
            raise RuntimeError("qrcode library not installed")

        width, height = page_size
        sheet = Image.new('RGB', (width, height), 'white')
        draw = ImageDraw.Draw(sheet)

        # Each cell holds the QR code plus padding; the extra vertical
        # space is for the label and the truncated token.
        cell_width = qr_size + 50
        cell_height = qr_size + 80
        cols, rows = self._grid_dimensions(page_size, (cell_width, cell_height), margin)
        capacity = cols * rows

        # Try to load a font (fallback to default)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 16)
        except (IOError, OSError):
            font = ImageFont.load_default()

        if len(doc_tokens) > capacity:
            logger.warning("Sheet full, skipping %d tokens", len(doc_tokens) - capacity)

        for i, token in enumerate(doc_tokens[:capacity]):
            row, col = divmod(i, cols)
            x = margin + col * cell_width
            y = margin + row * cell_height

            qr_bytes = self.generate_qr_code(token, qr_size)
            with Image.open(io.BytesIO(qr_bytes)) as qr_img:
                sheet.paste(qr_img, (x, y))

            # Add label (number only, NO names)
            label = labels[i] if labels and i < len(labels) else f"Nr. {i + 1}"
            draw.text((x, y + qr_size + 5), label, fill="black", font=font)

            # Add truncated token for verification
            token_short = token[:8] + "..."
            draw.text((x, y + qr_size + 25), token_short, fill="gray", font=font)

        buffer = io.BytesIO()
        sheet.save(buffer, format="PNG")
        return buffer.getvalue()

    def detect_qr_code(self, image_bytes: bytes) -> "QRDetectionResult":
        """
        Detect and decode QR code from an image.

        Only QR codes whose payload parses as a UUID are accepted; the first
        such code wins. Any failure (pyzbar missing, unreadable image,
        decoder error) is logged where applicable and reported as
        "not found" rather than raised.

        Args:
            image_bytes: Image data (PNG, JPEG, etc.)

        Returns:
            QRDetectionResult with doc_token if found
        """
        if not HAS_PYZBAR:
            return self._no_qr_found()

        try:
            with Image.open(io.BytesIO(image_bytes)) as img:
                decoded = pyzbar_decode(img)

            for obj in decoded:
                if obj.type != 'QRCODE':
                    continue
                try:
                    # UnicodeDecodeError is a ValueError, so a QR code with
                    # a non-UTF-8 payload is skipped like any non-UUID one
                    # instead of aborting the scan of the remaining codes.
                    token = obj.data.decode('utf-8')
                    uuid.UUID(token)
                except ValueError:
                    continue

                rect = obj.rect
                return QRDetectionResult(
                    doc_token=token,
                    confidence=1.0,
                    bbox=(rect.left, rect.top, rect.width, rect.height)
                )

            return self._no_qr_found()

        except Exception as e:
            logger.error("QR detection failed: %s", e)
            return self._no_qr_found()

    def redact_header(
        self,
        image_bytes: bytes,
        header_height: Optional[int] = None,
        fill_color: Tuple[int, int, int] = (255, 255, 255)
    ) -> "RedactionResult":
        """
        Redact the header area of a scanned exam page.

        This removes the area where student name/class/date typically appears.
        The redacted output contains no pixel data from that area.

        WARNING: if redaction fails, the returned result carries the
        ORIGINAL, unredacted bytes with ``redaction_applied=False``.
        Callers must check that flag before passing the image on.

        Args:
            image_bytes: Original scanned image
            header_height: Height in pixels to redact. None, 0 or a negative
                value falls back to DEFAULT_HEADER_HEIGHT.
            fill_color: RGB color to fill redacted area (default: white)

        Returns:
            RedactionResult with the redacted image encoded as PNG
        """
        try:
            with Image.open(io.BytesIO(image_bytes)) as img:
                width, height = img.size

                # Pillow rejects an RGB tuple as fill on single-band images
                # (e.g. grayscale scans), which would previously fail and
                # hand back the unredacted page. Normalise to RGB first.
                if img.mode in ("RGB", "RGBA"):
                    redacted = img.copy()
                else:
                    redacted = img.convert("RGB")

            # Invalid (non-positive) heights use the default so that a bad
            # argument never results in less redaction than the default.
            if header_height is not None and header_height > 0:
                redact_height = header_height
            else:
                redact_height = self.DEFAULT_HEADER_HEIGHT

            draw = ImageDraw.Draw(redacted)
            draw.rectangle([(0, 0), (width, redact_height)], fill=fill_color)

            buffer = io.BytesIO()
            redacted.save(buffer, format="PNG")

            return RedactionResult(
                redacted_image=buffer.getvalue(),
                original_height=height,
                # Never report more redacted rows than the image has
                redacted_height=min(redact_height, height),
                redaction_applied=True
            )

        except Exception as e:
            logger.error("Header redaction failed: %s", e)
            return RedactionResult(
                redacted_image=image_bytes,
                original_height=0,
                redacted_height=0,
                redaction_applied=False
            )

    def smart_redact_header(
        self,
        image_bytes: bytes,
        preserve_qr: bool = True
    ) -> "RedactionResult":
        """
        Header redaction that can spare a QR code located in the header.

        The header is taken to be the top SMART_HEADER_FRACTION of the page
        and is filled with white. No text detection or OCR is performed.
        If OpenCV is unavailable or anything fails, this falls back to
        redact_header() (fixed-height redaction without QR preservation).

        Args:
            image_bytes: Original scanned image
            preserve_qr: If True, don't redact the part of a detected
                doc_token QR code that lies inside the header

        Returns:
            RedactionResult with the redacted image encoded as PNG
        """
        if not HAS_CV2:
            # Fallback to simple redaction
            return self.redact_header(image_bytes)

        try:
            pixels = cv2.imdecode(np.frombuffer(image_bytes, np.uint8), cv2.IMREAD_COLOR)
            if pixels is None:
                raise ValueError("image could not be decoded")
            height, width = pixels.shape[:2]

            header_height = int(height * self.SMART_HEADER_FRACTION)

            # Remember the QR pixels inside the header (if any) so they can
            # be put back after the header has been blanked.
            qr_patch = None
            if preserve_qr:
                bbox = self.detect_qr_code(image_bytes).bbox
                if bbox:
                    qr_x, qr_y, qr_w, qr_h = bbox
                    top = max(0, qr_y)
                    bottom = min(header_height, qr_y + qr_h)
                    left = max(0, qr_x)
                    right = min(width, qr_x + qr_w)
                    # Only a non-empty region is preserved. This also keeps
                    # negative bounds out of the slice, which NumPy would
                    # interpret as counting from the end of the axis.
                    if top < bottom and left < right:
                        qr_patch = (top, bottom, left, right,
                                    pixels[top:bottom, left:right].copy())

            pixels[:header_height] = 255

            if qr_patch is not None:
                top, bottom, left, right, patch = qr_patch
                pixels[top:bottom, left:right] = patch

            ok, encoded = cv2.imencode('.png', pixels)
            if not ok:
                raise RuntimeError("PNG encoding failed")

            return RedactionResult(
                redacted_image=encoded.tobytes(),
                original_height=height,
                redacted_height=header_height,
                redaction_applied=True
            )

        except Exception as e:
            logger.error("Smart redaction failed: %s", e)
            return self.redact_header(image_bytes)
|
|
|
|
|
|
# Module-level slot holding the lazily created shared service instance
_pseudonymizer: Optional[PseudonymizationService] = None


def get_pseudonymizer() -> PseudonymizationService:
    """Return the shared PseudonymizationService, creating it on first use."""
    global _pseudonymizer
    if _pseudonymizer is not None:
        return _pseudonymizer
    _pseudonymizer = PseudonymizationService()
    return _pseudonymizer
|