Compare commits

...

2 Commits

Author SHA1 Message Date
Benjamin Admin
0340204c1f feat: box-aware column detection — exclude box content from global columns
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
- Enrich column geometries with original full-page words (box-filtered)
  so _detect_sub_columns() finds narrow sub-columns across box boundaries
- Add inline marker guard: bullet points (1., 2., •) are not split into
  sub-columns (minimum gap check: 1.2× word height or 20px)
- Add box_rects parameter to build_grid_from_words() — words inside boxes
  are excluded from X-gap column clustering
- Pass box rects from zones to words_first grid builder
- Add 9 tests for box-aware column detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 18:42:46 +01:00
Benjamin Admin
729ebff63c feat: add border ghost filter + graphic detection tests + structure overlay
- Add _filter_border_ghost_words() to remove OCR artefacts from box borders
  (vertical + horizontal edge detection, column cleanup, re-indexing)
- Add 20 tests for border ghost filter (basic filtering + column cleanup)
- Add 24 tests for cv_graphic_detect (color detection, word overlap, boxes)
- Clean up cv_graphic_detect.py logging (per-candidate → DEBUG)
- Add structure overlay layer to StepReconstruction (boxes + graphics toggle)
- Show border_ghosts_removed badge in StepStructureDetection
- Update MkDocs with structure detection documentation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 18:28:53 +01:00
11 changed files with 1275 additions and 31 deletions

View File

@@ -219,7 +219,7 @@ export interface StructureGraphic {
w: number w: number
h: number h: number
area: number area: number
shape: string // arrow, circle, line, exclamation, dot, icon, illustration shape: string // image, illustration
color_name: string color_name: string
color_hex: string color_hex: string
confidence: number confidence: number
@@ -235,6 +235,7 @@ export interface StructureResult {
color_pixel_counts: Record<string, number> color_pixel_counts: Record<string, number>
has_words: boolean has_words: boolean
word_count: number word_count: number
border_ghosts_removed?: number
duration_seconds: number duration_seconds: number
} }

View File

@@ -2,7 +2,7 @@
import { useCallback, useEffect, useMemo, useRef, useState } from 'react' import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
import dynamic from 'next/dynamic' import dynamic from 'next/dynamic'
import type { GridResult, GridCell, ColumnResult, RowResult, PageZone, PageRegion, RowItem } from '@/app/(admin)/ai/ocr-pipeline/types' import type { GridResult, GridCell, ColumnResult, RowResult, PageZone, PageRegion, RowItem, StructureResult, StructureBox, StructureGraphic } from '@/app/(admin)/ai/ocr-pipeline/types'
import { usePixelWordPositions } from './usePixelWordPositions' import { usePixelWordPositions } from './usePixelWordPositions'
const KLAUSUR_API = '/klausur-api' const KLAUSUR_API = '/klausur-api'
@@ -60,6 +60,9 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
const [fontScale, setFontScale] = useState(0.7) const [fontScale, setFontScale] = useState(0.7)
const [globalBold, setGlobalBold] = useState(false) const [globalBold, setGlobalBold] = useState(false)
const [imageRotation, setImageRotation] = useState<0 | 180>(0) const [imageRotation, setImageRotation] = useState<0 | 180>(0)
const [structureBoxes, setStructureBoxes] = useState<StructureBox[]>([])
const [structureGraphics, setStructureGraphics] = useState<StructureGraphic[]>([])
const [showStructure, setShowStructure] = useState(true)
const reconRef = useRef<HTMLDivElement>(null) const reconRef = useRef<HTMLDivElement>(null)
const [reconWidth, setReconWidth] = useState(0) const [reconWidth, setReconWidth] = useState(0)
@@ -92,12 +95,15 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
// eslint-disable-next-line react-hooks/exhaustive-deps // eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId]) }, [sessionId])
// Track image natural height for font scaling // Track image natural dimensions for font scaling and structure layer
const handleImageLoad = useCallback(() => { const handleImageLoad = useCallback(() => {
if (imageRef.current) { if (imageRef.current) {
setImageNaturalH(imageRef.current.naturalHeight) setImageNaturalH(imageRef.current.naturalHeight)
if (!imageNaturalSize) {
setImageNaturalSize({ w: imageRef.current.naturalWidth, h: imageRef.current.naturalHeight })
}
} }
}, []) }, [imageNaturalSize])
const loadSessionData = async () => { const loadSessionData = async () => {
if (!sessionId) return if (!sessionId) return
@@ -132,6 +138,13 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
setUndoStack([]) setUndoStack([])
setRedoStack([]) setRedoStack([])
// Load structure result (boxes, graphics, colors)
const structureResult: StructureResult | undefined = data.structure_result
if (structureResult) {
setStructureBoxes(structureResult.boxes || [])
setStructureGraphics(structureResult.graphics || [])
}
// Check for parent with boxes (sub-sessions + zones) // Check for parent with boxes (sub-sessions + zones)
const columnResult: ColumnResult | undefined = data.column_result const columnResult: ColumnResult | undefined = data.column_result
const rowResult: RowResult | undefined = data.row_result const rowResult: RowResult | undefined = data.row_result
@@ -517,6 +530,65 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
return bboxPct return bboxPct
} }
// Structure layer: boxes and graphic elements as background
const renderStructureLayer = (imgW: number, imgH: number) => {
if (!showStructure) return null
const hasElements = structureBoxes.length > 0 || structureGraphics.length > 0
if (!hasElements) return null
return (
<>
{/* Structure boxes */}
{structureBoxes.map((box, i) => {
const bgColor = box.bg_color_hex || '#6b7280'
return (
<div
key={`sbox-${i}`}
className="absolute pointer-events-none"
style={{
left: `${(box.x / imgW) * 100}%`,
top: `${(box.y / imgH) * 100}%`,
width: `${(box.w / imgW) * 100}%`,
height: `${(box.h / imgH) * 100}%`,
border: `${Math.max(1, box.border_thickness)}px solid ${bgColor}40`,
backgroundColor: `${bgColor}0a`,
borderRadius: '2px',
}}
/>
)
})}
{/* Graphic elements */}
{structureGraphics.map((g, i) => (
<div
key={`sgfx-${i}`}
className="absolute pointer-events-none"
style={{
left: `${(g.x / imgW) * 100}%`,
top: `${(g.y / imgH) * 100}%`,
width: `${(g.w / imgW) * 100}%`,
height: `${(g.h / imgH) * 100}%`,
border: `1px dashed ${g.color_hex}60`,
backgroundColor: `${g.color_hex}08`,
borderRadius: '2px',
}}
>
<span
className="absolute text-[8px] leading-none opacity-50"
style={{
top: '1px',
left: '2px',
color: g.color_hex,
}}
>
{g.shape === 'illustration' ? 'Illust' : 'Bild'}
</span>
</div>
))}
</>
)
}
// Overlay rendering helper // Overlay rendering helper
const renderOverlayMode = () => { const renderOverlayMode = () => {
const imgW = imageNaturalSize?.w || 1 const imgW = imageNaturalSize?.w || 1
@@ -597,6 +669,9 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
) )
})} })}
{/* Structure elements (boxes, graphics) */}
{renderStructureLayer(imgW, imgH)}
{/* Pixel-positioned words / editable inputs */} {/* Pixel-positioned words / editable inputs */}
{cells.map((cell) => { {cells.map((cell) => {
const displayText = getDisplayText(cell) const displayText = getDisplayText(cell)
@@ -831,6 +906,19 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
> >
180° 180°
</button> </button>
{(structureBoxes.length > 0 || structureGraphics.length > 0) && (
<button
onClick={() => setShowStructure(v => !v)}
className={`px-2 py-1 text-xs border rounded transition-colors ${
showStructure
? 'border-violet-300 bg-violet-50 text-violet-600 dark:border-violet-700 dark:bg-violet-900/30 dark:text-violet-400'
: 'border-gray-300 dark:border-gray-600 hover:bg-gray-50 dark:hover:bg-gray-700'
}`}
title="Strukturelemente anzeigen"
>
Struktur
</button>
)}
<div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" /> <div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" />
</> </>
)} )}
@@ -851,6 +939,21 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
Leer Leer
</button> </button>
{/* Structure toggle */}
{(structureBoxes.length > 0 || structureGraphics.length > 0) && (
<button
onClick={() => setShowStructure(v => !v)}
className={`px-2 py-1 text-xs border rounded transition-colors ${
showStructure
? 'border-violet-300 bg-violet-50 text-violet-600 dark:border-violet-700 dark:bg-violet-900/30 dark:text-violet-400'
: 'border-gray-300 dark:border-gray-600 hover:bg-gray-50 dark:hover:bg-gray-700'
}`}
title="Strukturelemente anzeigen"
>
Struktur
</button>
)}
<div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" /> <div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" />
{/* Zoom controls */} {/* Zoom controls */}
@@ -915,6 +1018,9 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
onLoad={handleImageLoad} onLoad={handleImageLoad}
/> />
{/* Structure elements (boxes, graphics) */}
{imageNaturalSize && renderStructureLayer(imageNaturalSize.w, imageNaturalSize.h)}
{/* Empty field markers */} {/* Empty field markers */}
{showEmptyHighlight && cells {showEmptyHighlight && cells
.filter(c => emptyCellIds.has(c.cellId)) .filter(c => emptyCellIds.has(c.cellId))

View File

@@ -165,6 +165,11 @@ export function StepStructureDetection({ sessionId, onNext }: StepStructureDetec
{result.word_count} Woerter {result.word_count} Woerter
</span> </span>
)} )}
{(result.border_ghosts_removed ?? 0) > 0 && (
<span className="inline-flex items-center gap-1.5 px-3 py-1 rounded-full bg-red-50 dark:bg-red-900/20 text-red-700 dark:text-red-400 text-xs font-medium">
{result.border_ghosts_removed} Rahmenlinien entfernt
</span>
)}
<span className="text-gray-400 text-xs ml-auto"> <span className="text-gray-400 text-xs ml-auto">
{result.image_width}x{result.image_height}px | {result.duration_seconds}s {result.image_width}x{result.image_height}px | {result.duration_seconds}s
</span> </span>

View File

@@ -149,6 +149,8 @@ klausur-service/backend/
├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10) ├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10)
├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4) ├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4)
├── cv_box_detect.py # Box-Erkennung + Zonen-Aufteilung ├── cv_box_detect.py # Box-Erkennung + Zonen-Aufteilung
├── cv_graphic_detect.py # Grafik-/Bilderkennung (Region-basiert)
├── cv_color_detect.py # Farbtext-Erkennung (HSV-Analyse)
├── cv_words_first.py # Words-First Grid Builder (bottom-up) ├── cv_words_first.py # Words-First Grid Builder (bottom-up)
├── page_crop.py # Content-basierter Crop-Algorithmus ├── page_crop.py # Content-basierter Crop-Algorithmus
├── ocr_pipeline_session_store.py # PostgreSQL Persistence ├── ocr_pipeline_session_store.py # PostgreSQL Persistence
@@ -177,7 +179,8 @@ admin-lehrer/
├── StepColumnDetection.tsx # Schritt 5: Spaltenerkennung ├── StepColumnDetection.tsx # Schritt 5: Spaltenerkennung
├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung ├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung
├── StepWordRecognition.tsx # Schritt 7: Worterkennung ├── StepWordRecognition.tsx # Schritt 7: Worterkennung
├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream) ├── StepStructureDetection.tsx # Schritt 8: Strukturerkennung
├── StepLlmReview.tsx # Schritt 9: Korrektur (SSE-Stream)
├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas + Overlay) ├── StepReconstruction.tsx # Schritt 10: Rekonstruktion (Canvas + Overlay)
├── usePixelWordPositions.ts # Shared Hook: Pixel-basierte Wortpositionierung ├── usePixelWordPositions.ts # Shared Hook: Pixel-basierte Wortpositionierung
├── FabricReconstructionCanvas.tsx # Fabric.js Editor ├── FabricReconstructionCanvas.tsx # Fabric.js Editor
@@ -281,14 +284,21 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
| `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) | | `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) |
| `grid_method` | `v2` | Grid-Strategie: `v2` (top-down) oder `words_first` (bottom-up) | | `grid_method` | `v2` | Grid-Strategie: `v2` (top-down) oder `words_first` (bottom-up) |
### Schritt 8: Korrektur ### Schritt 8: Strukturerkennung
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/detect-structure` | Boxen, Zonen, Farben und Grafiken erkennen |
| `GET` | `/sessions/{id}/image/structure-overlay` | Overlay mit allen Strukturelementen |
### Schritt 9: Korrektur
| Methode | Pfad | Beschreibung | | Methode | Pfad | Beschreibung |
|---------|------|--------------| |---------|------|--------------|
| `POST` | `/sessions/{id}/llm-review?stream=true` | SSE-Stream Korrektur starten | | `POST` | `/sessions/{id}/llm-review?stream=true` | SSE-Stream Korrektur starten |
| `POST` | `/sessions/{id}/llm-review/apply` | Ausgewaehlte Korrekturen speichern | | `POST` | `/sessions/{id}/llm-review/apply` | Ausgewaehlte Korrekturen speichern |
### Schritt 9: Rekonstruktion ### Schritt 10: Rekonstruktion
| Methode | Pfad | Beschreibung | | Methode | Pfad | Beschreibung |
|---------|------|--------------| |---------|------|--------------|
@@ -853,6 +863,93 @@ Change-Format:
--- ---
## Schritt 8: Strukturerkennung (Detail)
Erkennt Boxen, Zonen, Farbregionen und grafische Elemente auf der Seite.
Laeuft **nach** der Worterkennung (Schritt 7), damit OCR-Wortpositionen
fuer die Unterscheidung von Text vs. Grafik zur Verfuegung stehen.
### Teilschritte
1. **Box-Erkennung** (`cv_box_detect.py`): Linien-Rahmen und farbige Hintergruende
2. **Zonen-Aufteilung** (`split_page_into_zones`): Seite in Box- und Content-Zonen aufteilen
3. **Farb-Analyse** (`cv_color_detect.py`): HSV-basierte Erkennung farbiger Textbereiche
4. **Grafik-Erkennung** (`cv_graphic_detect.py`): Nicht-Text-Grafiken identifizieren
### Grafik-Erkennung: Region-basierter Ansatz
Zwei Paesse trennen farbige Grafiken von farbigem Text und erkennen
schwarze Illustrationen:
**Pass 1 — Farbige Bildregionen:**
1. HSV-Saturation-Kanal extrahieren (Schwelle > 40)
- Schwarzer Text hat Saettigung ≈ 0 → unsichtbar auf diesem Kanal
2. Starke Dilation (25×25 Ellipse) verschmilzt nahe Farbpixel zu Regionen
3. Fuer jede Region: Wort-Ueberlappung pruefen
- \> 50 % Ueberlappung mit OCR-Woertern → farbiger Text → ueberspringen
- ≤ 50 % → farbige Grafik/Bild → behalten
4. Minimum 200 Farbpixel erforderlich (kein Rauschen)
5. Regionen > 50 % der Bildbreite oder -hoehe → Seitenumfassend → ueberspringen
**Pass 2 — Schwarze Illustrationen:**
1. Otsu-Binarisierung fuer Tinten-Maske
2. Ausschlusszonen: OCR-Woerter (5 px Padding) + erkannte Boxen (8 px Inset)
3. Farbige Pixel aus Pass 1 ebenfalls ausschliessen
4. Nur Konturen mit Flaeche > 5000 px und min(Breite, Hoehe) > 40 px
**Deduplizierung:** Ueberlappende Elemente (> 50 % IoU der kleineren
Bounding-Box) werden zusammengefasst. Ergebnis nach Flaeche absteigend
sortiert.
### Response-Format
```json
{
"boxes": [
{"x": 50, "y": 300, "w": 1100, "h": 200, "confidence": 0.85,
"border_thickness": 3, "bg_color_name": "blue", "bg_color_hex": "#2563eb"}
],
"zones": [
{"index": 0, "zone_type": "content", "x": 50, "y": 50, "w": 1100, "h": 250},
{"index": 1, "zone_type": "box", "x": 50, "y": 300, "w": 1100, "h": 200}
],
"graphics": [
{"x": 100, "y": 500, "w": 150, "h": 120, "area": 8500,
"shape": "image", "color_name": "red", "color_hex": "#dc2626",
"confidence": 0.72}
],
"color_pixel_counts": {"red": 1234, "blue": 5678},
"has_words": true,
"word_count": 96,
"border_ghosts_removed": 0,
"duration_seconds": 0.45
}
```
### Grafik-Shape-Typen
| Shape | Quelle | Beschreibung |
|-------|--------|--------------|
| `image` | Pass 1 | Farbige Grafik/Bild (Ballons, Pfeile, Icons) |
| `illustration` | Pass 2 | Grosse schwarze Zeichnung/Illustration |
### Erkannte Farben
`red`, `orange`, `yellow`, `green`, `blue`, `purple`, `black`
— basierend auf dem Median-Hue der saturierten Pixel in der Region.
### Frontend-Anzeige
`StepStructureDetection.tsx` zeigt:
- Boxen-Liste mit Position, Hintergrundfarbe und Confidence
- Zonen-Uebersicht (Content vs. Box)
- Farb-Zusammenfassung (Pixel-Counts)
- Grafik-Liste mit Shape, Abmessungen, Farbe und Confidence
---
## Schritt 9: Rekonstruktion (Detail) ## Schritt 10: Rekonstruktion (Detail)
Drei Modi verfuegbar: Drei Modi verfuegbar:
@@ -1263,6 +1360,7 @@ cd klausur-service/backend && pytest tests/test_paddle_kombi.py -v # 36 Tests
| Datum | Version | Aenderung | | Datum | Version | Aenderung |
|-------|---------|----------| |-------|---------|----------|
| 2026-03-16 | 4.6.0 | Strukturerkennung (Schritt 8): Region-basierte Grafikerkennung (`cv_graphic_detect.py`) mit Zwei-Pass-Verfahren (Farbregionen + schwarze Illustrationen), Wort-Ueberlappungs-Filter, Box/Zonen/Farb-Analyse. Schritt laeuft nach Worterkennung. |
| 2026-03-12 | 4.5.0 | Kombi-Modus (PaddleOCR + Tesseract): Beide Engines laufen parallel, Koordinaten werden IoU-basiert gematcht und confidence-gewichtet gemittelt. Ungematchte Tesseract-Woerter (Bullets, Symbole) werden hinzugefuegt. 3er-Toggle in OCR Overlay. | | 2026-03-12 | 4.5.0 | Kombi-Modus (PaddleOCR + Tesseract): Beide Engines laufen parallel, Koordinaten werden IoU-basiert gematcht und confidence-gewichtet gemittelt. Ungematchte Tesseract-Woerter (Bullets, Symbole) werden hinzugefuegt. 3er-Toggle in OCR Overlay. |
| 2026-03-12 | 4.4.0 | PaddleOCR Remote-Engine (`engine=paddle`): PP-OCRv5 Latin auf Hetzner x86_64. Neuer Microservice (`paddleocr-service/`), HTTP-Client (`paddleocr_remote.py`), Frontend-Dropdown-Option. Nutzt words_first Grid-Methode. | | 2026-03-12 | 4.4.0 | PaddleOCR Remote-Engine (`engine=paddle`): PP-OCRv5 Latin auf Hetzner x86_64. Neuer Microservice (`paddleocr-service/`), HTTP-Client (`paddleocr_remote.py`), Frontend-Dropdown-Option. Nutzt words_first Grid-Methode. |
| 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. | | 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. |

View File

@@ -121,10 +121,9 @@ def detect_graphic_elements(
return [] return []
h, w = img_bgr.shape[:2] h, w = img_bgr.shape[:2]
img_area = h * w
logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes", logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
w, h, len(word_boxes), len(detected_boxes or [])) w, h, len(word_boxes), len(detected_boxes or []))
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
candidates: List[GraphicElement] = [] candidates: List[GraphicElement] = []
@@ -161,7 +160,7 @@ def detect_graphic_elements(
contours_regions, _ = cv2.findContours( contours_regions, _ = cv2.findContours(
region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
) )
logger.info("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions)) logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
for cnt in contours_regions: for cnt in contours_regions:
bx, by, bw, bh = cv2.boundingRect(cnt) bx, by, bw, bh = cv2.boundingRect(cnt)
@@ -172,7 +171,7 @@ def detect_graphic_elements(
# Skip page-spanning regions # Skip page-spanning regions
if bw > w * 0.5 or bh > h * 0.5: if bw > w * 0.5 or bh > h * 0.5:
logger.info("GraphicDetect PASS1 SKIP page-spanning (%d,%d) %dx%d", bx, by, bw, bh) logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
continue continue
bbox_area = bw * bh bbox_area = bw * bh
@@ -188,8 +187,8 @@ def detect_graphic_elements(
# If most of the region is covered by word boxes → colored text, skip # If most of the region is covered by word boxes → colored text, skip
if word_overlap > 0.5: if word_overlap > 0.5:
logger.info("GraphicDetect PASS1 SKIP text region (%d,%d) %dx%d word_overlap=%.0f%%", logger.debug("GraphicDetect PASS1 skip text region (%d,%d) %dx%d overlap=%.0f%%",
bx, by, bw, bh, word_overlap * 100) bx, by, bw, bh, word_overlap * 100)
continue continue
# Need a minimum number of colored pixels (not just dilated area) # Need a minimum number of colored pixels (not just dilated area)
@@ -209,8 +208,7 @@ def detect_graphic_elements(
density = color_pixel_count / bbox_area if bbox_area > 0 else 0 density = color_pixel_count / bbox_area if bbox_area > 0 else 0
conf = min(0.95, 0.5 + density * 0.5) conf = min(0.95, 0.5 + density * 0.5)
logger.info("GraphicDetect PASS1 ACCEPT image at (%d,%d) %dx%d " logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d overlap=%.0f%% %s",
"color_px=%d word_overlap=%.0f%% color=%s",
bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name) bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name)
candidates.append(GraphicElement( candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh, x=bx, y=by, width=bw, height=bh,
@@ -256,7 +254,7 @@ def detect_graphic_elements(
contours_ink, _ = cv2.findContours( contours_ink, _ = cv2.findContours(
ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
) )
logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink)) logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink))
for cnt in contours_ink: for cnt in contours_ink:
area = cv2.contourArea(cnt) area = cv2.contourArea(cnt)
@@ -267,8 +265,8 @@ def detect_graphic_elements(
if bw > w * 0.8 or bh > h * 0.8: if bw > w * 0.8 or bh > h * 0.8:
continue continue
logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d", logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d",
bx, by, bw, bh, int(area)) bx, by, bw, bh, int(area))
candidates.append(GraphicElement( candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh, x=bx, y=by, width=bw, height=bh,
area=int(area), shape="illustration", area=int(area), shape="illustration",

View File

@@ -7,6 +7,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging import logging
import re import re
import statistics
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import numpy as np import numpy as np
@@ -737,6 +738,24 @@ def _detect_sub_columns(
result.append(geo) result.append(geo)
continue continue
# --- Guard against inline markers (bullet points, numbering) ---
# Bullet points like "1.", "2.", "•", "-" sit close to the main
# column text and are part of the cell, not a separate column.
# Only split if the horizontal gap between the rightmost sub-word
# and the main column start is large enough.
max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
gap_to_main = col_start_bin[2] - max_sub_right # px gap
median_heights = [w.get('height', 20) for w in confident]
med_h = statistics.median(median_heights) if median_heights else 20
min_gap = max(med_h * 1.2, 20) # at least 1.2× word height or 20px
if gap_to_main < min_gap:
logger.debug(
"SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
"(likely inline markers, not a sub-column)",
geo.index, gap_to_main, min_gap)
result.append(geo)
continue
# --- Build two sub-column geometries --- # --- Build two sub-column geometries ---
# Word 'left' values are relative to left_x; geo.x is absolute. # Word 'left' values are relative to left_x; geo.x is absolute.
# Convert the split position from relative to absolute coordinates. # Convert the split position from relative to absolute coordinates.
@@ -3221,6 +3240,46 @@ def detect_column_geometry_zoned(
g.y = abs_y g.y = abs_y
g.height = abs_y_end - abs_y g.height = abs_y_end - abs_y
# --- Enrich column geometries with box-filtered original words ---
# The combined-image Tesseract may miss words in small content strips
# (e.g. a single row above a box). Use the original full-page word_dicts
# filtered to exclude box interiors, so that _detect_sub_columns()
# downstream has ALL content-zone words for left-edge clustering.
# This ensures narrow sub-columns (page_ref, marker) are detectable
# even when only a few entries exist above/below a box.
if word_dicts:
content_words = []
for w in word_dicts:
# word positions are relative to left_x / top_y
w_abs_cx = w['left'] + left_x + w['width'] / 2
w_abs_cy = w['top'] + top_y + w['height'] / 2
inside_box = any(
box.x <= w_abs_cx <= box.x + box.width
and box.y <= w_abs_cy <= box.y + box.height
for box in boxes
)
if not inside_box:
content_words.append(w)
target_geoms = combined_geoms if combined_result is not None else geometries
for g in target_geoms:
# Word 'left' is relative to left_x; geometry 'x' is absolute
g_left_rel = g.x - left_x
g_right_rel = g_left_rel + g.width
g.words = [
w for w in content_words
if g_left_rel <= w['left'] + w['width'] / 2 < g_right_rel
]
g.word_count = len(g.words)
excluded_count = len(word_dicts) - len(content_words)
if excluded_count:
logger.info(
"ZonedColumns: enriched geometries with %d content words "
"(excluded %d box-interior words)",
len(content_words), excluded_count,
)
# Build zones_data for the response # Build zones_data for the response
zones_data: List[Dict] = [] zones_data: List[Dict] = []
for zone in zones: for zone in zones:

View File

@@ -17,7 +17,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging import logging
import re import re
import statistics import statistics
from typing import Any, Dict, List, Tuple from typing import Any, Dict, List, Optional, Tuple
from cv_ocr_engines import ( from cv_ocr_engines import (
_group_words_into_lines, _group_words_into_lines,
@@ -259,6 +259,7 @@ def build_grid_from_words(
img_w: int, img_w: int,
img_h: int, img_h: int,
min_confidence: int = 30, min_confidence: int = 30,
box_rects: Optional[List[Dict]] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""Build a cell grid bottom-up from Tesseract word boxes. """Build a cell grid bottom-up from Tesseract word boxes.
@@ -269,6 +270,9 @@ def build_grid_from_words(
img_w: Image width in pixels. img_w: Image width in pixels.
img_h: Image height in pixels. img_h: Image height in pixels.
min_confidence: Minimum OCR confidence to keep a word. min_confidence: Minimum OCR confidence to keep a word.
box_rects: Optional list of box dicts with keys x, y, width, height.
Words inside these boxes are excluded from column clustering
(box-internal columns are detected separately in sub-sessions).
Returns: Returns:
(cells, columns_meta) — same format as build_cell_grid_v2(). (cells, columns_meta) — same format as build_cell_grid_v2().
@@ -290,6 +294,28 @@ def build_grid_from_words(
logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts)) logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))
# Exclude words inside detected boxes — box columns are detected separately
if box_rects:
content_words = []
for w in words:
w_cx = w['left'] + w['width'] / 2
w_cy = w['top'] + w['height'] / 2
inside = any(
b['x'] <= w_cx <= b['x'] + b['width']
and b['y'] <= w_cy <= b['y'] + b['height']
for b in box_rects
)
if not inside:
content_words.append(w)
excluded = len(words) - len(content_words)
if excluded:
logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
excluded, len(box_rects))
words = content_words
if not words:
logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
return [], []
# Step 1: cluster columns # Step 1: cluster columns
columns = _cluster_columns(words, img_w) columns = _cluster_columns(words, img_w)
logger.info("build_grid_from_words: %d column(s) detected", len(columns)) logger.info("build_grid_from_words: %d column(s) detected", len(columns))

View File

@@ -1202,6 +1202,147 @@ async def detect_type(session_id: str):
return {"session_id": session_id, **result_dict} return {"session_id": session_id, **result_dict}
# ---------------------------------------------------------------------------
# Border-ghost word filter
# ---------------------------------------------------------------------------
# Characters that OCR produces when reading box-border lines.
_BORDER_GHOST_CHARS = set("|1lI![](){}iíì/\\-—_~.,;:'\"")
def _filter_border_ghost_words(
word_result: Dict,
boxes: List,
) -> int:
"""Remove OCR words that are actually box border lines.
A word is considered a border ghost when it sits on a known box edge
(left, right, top, or bottom) and looks like a line artefact (narrow
aspect ratio or text consists only of line-like characters).
After removing ghost cells, columns that have become empty are also
removed from ``columns_used`` so the grid no longer shows phantom
columns.
Modifies *word_result* in-place and returns the number of removed cells.
"""
if not boxes or not word_result:
return 0
cells = word_result.get("cells")
if not cells:
return 0
# Build border bands — vertical (X) and horizontal (Y)
x_bands = [] # list of (x_lo, x_hi)
y_bands = [] # list of (y_lo, y_hi)
for b in boxes:
bx = b.x if hasattr(b, "x") else b.get("x", 0)
by = b.y if hasattr(b, "y") else b.get("y", 0)
bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0))
bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0))
bt = b.border_thickness if hasattr(b, "border_thickness") else b.get("border_thickness", 3)
margin = max(bt * 2, 10) + 6 # generous margin
# Vertical edges (left / right)
x_bands.append((bx - margin, bx + margin))
x_bands.append((bx + bw - margin, bx + bw + margin))
# Horizontal edges (top / bottom)
y_bands.append((by - margin, by + margin))
y_bands.append((by + bh - margin, by + bh + margin))
img_w = word_result.get("image_width", 1)
img_h = word_result.get("image_height", 1)
def _is_ghost(cell: Dict) -> bool:
text = (cell.get("text") or "").strip()
if not text:
return False
# Compute absolute pixel position
if cell.get("bbox_px"):
px = cell["bbox_px"]
cx = px["x"] + px["w"] / 2
cy = px["y"] + px["h"] / 2
cw = px["w"]
ch = px["h"]
elif cell.get("bbox_pct"):
pct = cell["bbox_pct"]
cx = (pct["x"] / 100) * img_w + (pct["w"] / 100) * img_w / 2
cy = (pct["y"] / 100) * img_h + (pct["h"] / 100) * img_h / 2
cw = (pct["w"] / 100) * img_w
ch = (pct["h"] / 100) * img_h
else:
return False
# Check if center sits on a vertical or horizontal border
on_vertical = any(lo <= cx <= hi for lo, hi in x_bands)
on_horizontal = any(lo <= cy <= hi for lo, hi in y_bands)
if not on_vertical and not on_horizontal:
return False
# Very short text (1-2 chars) on a border → very likely ghost
if len(text) <= 2:
# Narrow vertically (line-like) or narrow horizontally (dash-like)?
if ch > 0 and cw / ch < 0.5:
return True
if cw > 0 and ch / cw < 0.5:
return True
# Text is only border-ghost characters?
if all(c in _BORDER_GHOST_CHARS for c in text):
return True
# Longer text but still only ghost chars and very narrow
if all(c in _BORDER_GHOST_CHARS for c in text):
if ch > 0 and cw / ch < 0.35:
return True
if cw > 0 and ch / cw < 0.35:
return True
return True # all ghost chars on a border → remove
return False
before = len(cells)
word_result["cells"] = [c for c in cells if not _is_ghost(c)]
removed = before - len(word_result["cells"])
# --- Remove empty columns from columns_used ---
columns_used = word_result.get("columns_used")
if removed and columns_used and len(columns_used) > 1:
remaining_cells = word_result["cells"]
occupied_cols = {c.get("col_index") for c in remaining_cells}
before_cols = len(columns_used)
columns_used = [col for col in columns_used if col.get("index") in occupied_cols]
# Re-index columns and remap cell col_index values
if len(columns_used) < before_cols:
old_to_new = {}
for new_i, col in enumerate(columns_used):
old_to_new[col["index"]] = new_i
col["index"] = new_i
for cell in remaining_cells:
old_ci = cell.get("col_index")
if old_ci in old_to_new:
cell["col_index"] = old_to_new[old_ci]
word_result["columns_used"] = columns_used
logger.info("border-ghost: removed %d empty column(s), %d remaining",
before_cols - len(columns_used), len(columns_used))
if removed:
# Update summary counts
summary = word_result.get("summary", {})
summary["total_cells"] = len(word_result["cells"])
summary["non_empty_cells"] = sum(1 for c in word_result["cells"] if c.get("text"))
word_result["summary"] = summary
gs = word_result.get("grid_shape", {})
gs["total_cells"] = len(word_result["cells"])
if columns_used is not None:
gs["cols"] = len(columns_used)
word_result["grid_shape"] = gs
return removed
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Structure Detection Endpoint # Structure Detection Endpoint
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -1236,10 +1377,6 @@ async def detect_structure(session_id: str):
for cell in word_result["cells"]: for cell in word_result["cells"]:
for wb in (cell.get("word_boxes") or []): for wb in (cell.get("word_boxes") or []):
words.append(wb) words.append(wb)
logger.info("detect-structure: word_result present=%s, cells=%d, word_boxes extracted=%d",
word_result is not None,
len(word_result.get("cells", [])) if word_result else 0,
len(words))
# If no words yet, use image dimensions with small margin # If no words yet, use image dimensions with small margin
if words: if words:
content_x = max(0, min(int(wb["left"]) for wb in words)) content_x = max(0, min(int(wb["left"]) for wb in words))
@@ -1319,6 +1456,15 @@ async def detect_structure(session_id: str):
detected_boxes=box_dicts, detected_boxes=box_dicts,
) )
# --- Filter border-ghost words from OCR result ---
ghost_count = 0
if boxes and word_result:
ghost_count = _filter_border_ghost_words(word_result, boxes)
if ghost_count:
logger.info("detect-structure: removed %d border-ghost words", ghost_count)
await update_session_db(session_id, word_result=word_result)
cached["word_result"] = word_result
duration = time.time() - t0 duration = time.time() - t0
result_dict = { result_dict = {
@@ -1361,6 +1507,7 @@ async def detect_structure(session_id: str):
"color_pixel_counts": color_summary, "color_pixel_counts": color_summary,
"has_words": len(words) > 0, "has_words": len(words) > 0,
"word_count": len(words), "word_count": len(words),
"border_ghosts_removed": ghost_count,
"duration_seconds": round(duration, 2), "duration_seconds": round(duration, 2),
} }
@@ -1806,12 +1953,7 @@ async def _get_structure_overlay(session_id: str) -> Response:
# --- Draw graphic elements --- # --- Draw graphic elements ---
graphics_data = structure.get("graphics", []) graphics_data = structure.get("graphics", [])
shape_icons = { shape_icons = {
"arrow": "ARROW", "image": "IMAGE",
"circle": "CIRCLE",
"line": "LINE",
"exclamation": "!",
"dot": "DOT",
"icon": "ICON",
"illustration": "ILLUST", "illustration": "ILLUST",
} }
for gfx in graphics_data: for gfx in graphics_data:
@@ -2401,7 +2543,15 @@ async def detect_words(
}) })
wf_word_dicts = abs_words wf_word_dicts = abs_words
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h) # Extract box rects for box-aware column clustering
box_rects = []
for zone in zones:
if zone.get("zone_type") == "box" and zone.get("box"):
box_rects.append(zone["box"])
cells, columns_meta = build_grid_from_words(
wf_word_dicts, img_w, img_h, box_rects=box_rects or None,
)
duration = time.time() - t0 duration = time.time() - t0
# Apply IPA phonetic fixes # Apply IPA phonetic fixes

View File

@@ -0,0 +1,307 @@
"""
Tests for _filter_border_ghost_words() — removes OCR artefacts from box borders.
When OCR reads a scanned document, box border lines (vertical/horizontal
strokes) are often misrecognised as characters like '|', '1', 'l', '-'.
These phantom words create spurious columns/rows in the grid. The filter
removes them by checking if a word sits on a known box border and looks
like a line artefact.
Lizenz: Apache 2.0
"""
import os
import sys
from typing import Optional

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from ocr_pipeline_api import _filter_border_ghost_words, _BORDER_GHOST_CHARS
from cv_vocab_types import DetectedBox
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_cell(text: str, x: int, y: int, w: int, h: int,
col_index: int = 0) -> dict:
"""Create a cell dict with bbox_px matching the word recognition output."""
return {
"cell_id": f"c_{x}_{y}",
"text": text,
"bbox_px": {"x": x, "y": y, "w": w, "h": h},
"bbox_pct": {
"x": x / 12, "y": y / 18,
"w": w / 12, "h": h / 18,
},
"confidence": 80,
"row_index": 0,
"col_index": col_index,
}
def _make_word_result(cells: list, img_w: int = 1200, img_h: int = 1800,
columns_used: list = None) -> dict:
return {
"cells": cells,
"image_width": img_w,
"image_height": img_h,
"columns_used": columns_used,
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
},
"grid_shape": {
"total_cells": len(cells),
"cols": len(columns_used) if columns_used else 1,
},
}
def _make_box(x: int, y: int, w: int, h: int, bt: int = 3) -> DetectedBox:
    """Wrap raw geometry in a DetectedBox with a fixed high confidence."""
    geometry = dict(x=x, y=y, width=w, height=h)
    return DetectedBox(confidence=0.9, border_thickness=bt, **geometry)
# ---------------------------------------------------------------------------
# Basic filtering tests
# ---------------------------------------------------------------------------
class TestBorderGhostFilter:
    """Tests for the _filter_border_ghost_words() function."""
    # Most tests share one box geometry: x=50, y=300, w=1100, h=200,
    # i.e. vertical borders at x=50 / x=1150 and horizontal at y=300 / y=500.
    def test_no_boxes_no_change(self):
        """Without boxes, nothing should be filtered."""
        cells = [_make_cell("hello", 100, 200, 80, 30)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [])
        assert removed == 0
        assert len(wr["cells"]) == 1
    def test_no_word_result_no_crash(self):
        """A None word_result must be tolerated and report zero removals."""
        removed = _filter_border_ghost_words(None, [_make_box(50, 300, 1100, 200)])
        assert removed == 0
    def test_empty_cells_no_crash(self):
        """An empty cell list must be tolerated and report zero removals."""
        wr = _make_word_result([])
        removed = _filter_border_ghost_words(wr, [_make_box(50, 300, 1100, 200)])
        assert removed == 0
    def test_pipe_on_left_border_removed(self):
        """A '|' character sitting on the left border of a box should be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [
            _make_cell("|", x=48, y=350, w=3, h=25),
            _make_cell("hello", x=200, y=350, w=80, h=25),
        ]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
        assert wr["cells"][0]["text"] == "hello"
    def test_pipe_on_right_border_removed(self):
        """A '|' character on the right border should be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [
            # x=1148 sits on the right border (box x + w = 1150).
            _make_cell("|", x=1148, y=350, w=4, h=25),
            _make_cell("world", x=600, y=350, w=80, h=25),
        ]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
        assert wr["cells"][0]["text"] == "world"
    def test_digit_1_on_border_narrow_removed(self):
        """A narrow '1' on a box border should be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [_make_cell("1", x=49, y=400, w=5, h=20)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
    def test_dash_on_horizontal_border_removed(self):
        """A '-' on the bottom horizontal border should be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        # Bottom border at y=500, dash at y=498
        cells = [_make_cell("-", x=600, y=498, w=20, h=4)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
    def test_real_word_on_border_not_removed(self):
        """A normal word near a border should NOT be removed."""
        # "Tip" is wide (60px) and not a ghost glyph → must survive.
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [_make_cell("Tip", x=52, y=350, w=60, h=25)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 0
    def test_word_far_from_border_not_removed(self):
        """Words far from any border should never be removed."""
        # A '|' glyph, but centered inside the box — position matters, not glyph.
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [_make_cell("|", x=600, y=400, w=3, h=25)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 0
    def test_multiple_ghosts_on_same_box(self):
        """Multiple ghost words on the same box should all be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [
            _make_cell("|", x=48, y=350, w=3, h=25),
            _make_cell("l", x=1149, y=350, w=4, h=25),
            _make_cell("text", x=400, y=350, w=80, h=25),
        ]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 2
        assert len(wr["cells"]) == 1
        assert wr["cells"][0]["text"] == "text"
    def test_summary_updated_after_removal(self):
        """summary and grid_shape counters must reflect the filtered cells."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [
            _make_cell("|", x=48, y=350, w=3, h=25),
            _make_cell("hello", x=200, y=350, w=80, h=25),
        ]
        wr = _make_word_result(cells)
        _filter_border_ghost_words(wr, [box])
        assert wr["summary"]["total_cells"] == 1
        assert wr["grid_shape"]["total_cells"] == 1
    def test_ghost_chars_covers_common_artefacts(self):
        """The ghost chars set should include common border-line OCR artefacts."""
        # NOTE(review): the empty string "" below looks like a mis-encoded
        # character (possibly a dash variant lost in transfer) — confirm
        # against the actual _BORDER_GHOST_CHARS definition.
        expected = {"|", "1", "l", "I", "!", "[", "]", "-", "", "_", "/", "\\"}
        assert expected.issubset(_BORDER_GHOST_CHARS)
    def test_multiple_boxes(self):
        """Ghosts sitting on borders of different boxes are all removed."""
        box1 = _make_box(x=50, y=300, w=500, h=200, bt=3)
        box2 = _make_box(x=600, y=300, w=500, h=200, bt=3)
        cells = [
            _make_cell("|", x=49, y=350, w=3, h=25),
            _make_cell("I", x=599, y=350, w=4, h=25),
            _make_cell("real", x=300, y=350, w=80, h=25),
        ]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box1, box2])
        assert removed == 2
    def test_uses_bbox_pct_fallback(self):
        """Should work with bbox_pct when bbox_px is not available."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        # Hand-built cell: percent coords relative to the default 1200x1800 page.
        cell = {
            "cell_id": "c_test",
            "text": "|",
            "bbox_pct": {"x": (48 / 1200) * 100, "y": (350 / 1800) * 100,
                         "w": (4 / 1200) * 100, "h": (25 / 1800) * 100},
            "confidence": 80,
            "col_index": 0,
        }
        wr = _make_word_result([cell])
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
    def test_generous_margin_catches_offset_ghosts(self):
        """Even if OCR word is slightly offset from border, it should be caught."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        # Word 15px away from right border (at x=1135 vs border at x=1150)
        cells = [_make_cell("|", x=1135, y=350, w=4, h=25)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
# ---------------------------------------------------------------------------
# Column cleanup tests
# ---------------------------------------------------------------------------
class TestColumnCleanup:
    """Tests for empty column removal after ghost filtering."""
    # The filter removes border-ghost cells first; the tests below then check
    # that columns left with no cells are dropped and survivors re-indexed.
    def test_empty_column_removed(self):
        """After filtering all cells of column 4, it should be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cols = [
            {"index": 0, "type": "column_en", "x": 60, "width": 250},
            {"index": 1, "type": "column_de", "x": 320, "width": 250},
            {"index": 2, "type": "column_3", "x": 580, "width": 250},
            {"index": 3, "type": "column_4", "x": 840, "width": 250},
            {"index": 4, "type": "column_5", "x": 1140, "width": 60},  # ghost column
        ]
        cells = [
            _make_cell("word", x=100, y=350, w=60, h=25, col_index=0),
            _make_cell("Wort", x=360, y=350, w=60, h=25, col_index=1),
            _make_cell("txt", x=620, y=350, w=50, h=25, col_index=2),
            _make_cell("abc", x=880, y=350, w=50, h=25, col_index=3),
            _make_cell("|", x=1148, y=350, w=4, h=25, col_index=4),  # ghost
            _make_cell("l", x=1149, y=400, w=3, h=25, col_index=4),  # ghost
        ]
        wr = _make_word_result(cells, columns_used=cols)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 2
        assert len(wr["columns_used"]) == 4  # column 5 removed
        assert wr["grid_shape"]["cols"] == 4
    def test_columns_reindexed_after_removal(self):
        """After removing a middle column, indices should be sequential."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cols = [
            {"index": 0, "type": "column_1", "x": 60, "width": 200},
            {"index": 1, "type": "column_2", "x": 280, "width": 30},  # border col
            {"index": 2, "type": "column_3", "x": 400, "width": 200},
        ]
        # Column 1 only has ghost cells
        cells = [
            _make_cell("hello", x=100, y=350, w=60, h=25, col_index=0),
            # This cell is NOT on a border so it won't be filtered by the ghost filter
            # For this test, put a ghost on the box border
            _make_cell("|", x=49, y=350, w=3, h=25, col_index=1),
            _make_cell("world", x=440, y=350, w=60, h=25, col_index=2),
        ]
        wr = _make_word_result(cells, columns_used=cols)
        _filter_border_ghost_words(wr, [box])
        # Column 1 should be removed, column 2 becomes column 1
        assert len(wr["columns_used"]) == 2
        assert wr["columns_used"][0]["index"] == 0
        assert wr["columns_used"][1]["index"] == 1
        # Remaining cells should have updated col_index
        assert wr["cells"][0]["col_index"] == 0
        assert wr["cells"][1]["col_index"] == 1
    def test_no_columns_used_no_crash(self):
        """If columns_used is None, column cleanup should be skipped."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [_make_cell("|", x=48, y=350, w=3, h=25)]
        wr = _make_word_result(cells, columns_used=None)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
    def test_occupied_columns_kept(self):
        """Columns that still have cells after filtering should be kept."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cols = [
            {"index": 0, "type": "column_en", "x": 60, "width": 250},
            {"index": 1, "type": "column_de", "x": 320, "width": 250},
        ]
        cells = [
            _make_cell("word", x=100, y=350, w=60, h=25, col_index=0),
            _make_cell("Wort", x=360, y=350, w=60, h=25, col_index=1),
        ]
        wr = _make_word_result(cells, columns_used=cols)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 0
        assert len(wr["columns_used"]) == 2
    def test_single_column_not_removed(self):
        """A single remaining column should never be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cols = [{"index": 0, "type": "column_text", "x": 60, "width": 1000}]
        cells = [_make_cell("|", x=49, y=350, w=3, h=25, col_index=0)]
        wr = _make_word_result(cells, columns_used=cols)
        # Even if the only cell is filtered, we don't remove the last column
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
        # columns_used should still have 1 entry (we skip cleanup for len <= 1)
        assert len(wr["columns_used"]) == 1

View File

@@ -0,0 +1,174 @@
"""
Tests for box-aware column detection.
Verifies that:
1. Words inside boxes are excluded from column clustering (words_first)
2. Column geometries are enriched with box-filtered original words (layout)
3. Inline markers (bullet points) are not split into sub-columns
Lizenz: Apache 2.0
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from cv_words_first import build_grid_from_words, _cluster_columns
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int, height: int,
conf: int = 90) -> dict:
return {
'text': text, 'left': left, 'top': top,
'width': width, 'height': height, 'conf': conf,
}
def _box(x: int, y: int, w: int, h: int) -> dict:
return {'x': x, 'y': y, 'width': w, 'height': h}
# ---------------------------------------------------------------------------
# Tests: box filtering in build_grid_from_words
# ---------------------------------------------------------------------------
class TestBoxAwareGridBuilding:
    """Words inside boxes should be excluded from column clustering."""
    # A word counts as "inside" a box when its geometry falls within the
    # box rect — see test_word_on_box_border_excluded for the edge case.
    def test_no_boxes_unchanged(self):
        """Without boxes, all words should be used."""
        words = [
            _word("hello", 50, 100, 80, 20),
            _word("world", 300, 100, 80, 20),
        ]
        cells, cols = build_grid_from_words(words, 600, 400)
        assert len(cells) >= 2
        texts = {c['text'] for c in cells}
        assert 'hello' in texts
        assert 'world' in texts
    def test_box_words_excluded(self):
        """Words inside a box should not appear in the grid."""
        words = [
            _word("outside1", 50, 50, 80, 20),
            _word("outside2", 300, 50, 80, 20),
            _word("inside_box", 150, 250, 100, 20),  # inside box
        ]
        box = _box(100, 200, 300, 150)  # box from x=100..400, y=200..350
        cells, cols = build_grid_from_words(words, 600, 500, box_rects=[box])
        texts = {c['text'] for c in cells}
        assert 'outside1' in texts
        assert 'outside2' in texts
        assert 'inside_box' not in texts
    def test_all_words_in_box_returns_empty(self):
        """If all words are inside the box, return empty grid."""
        words = [
            _word("a", 150, 250, 30, 20),
            _word("b", 200, 250, 30, 20),
        ]
        box = _box(100, 200, 300, 150)
        cells, cols = build_grid_from_words(words, 600, 500, box_rects=[box])
        assert cells == []
        assert cols == []
    def test_multiple_boxes(self):
        """Words in multiple boxes should all be excluded."""
        words = [
            _word("content", 50, 50, 80, 20),
            _word("box1_word", 120, 220, 80, 20),
            _word("box2_word", 420, 220, 80, 20),
        ]
        boxes = [
            _box(100, 200, 200, 100),  # box1
            _box(400, 200, 200, 100),  # box2
        ]
        cells, cols = build_grid_from_words(words, 700, 400, box_rects=boxes)
        texts = {c['text'] for c in cells}
        assert texts == {'content'}
    def test_word_on_box_border_excluded(self):
        """A word exactly on the box boundary should be excluded."""
        words = [
            _word("content", 50, 50, 80, 20),
            _word("edge", 100, 200, 40, 20),  # left edge = box.x, center inside
        ]
        box = _box(100, 200, 200, 100)
        cells, cols = build_grid_from_words(words, 600, 400, box_rects=[box])
        texts = {c['text'] for c in cells}
        assert 'edge' not in texts
    def test_columns_not_affected_by_box_words(self):
        """Box words should not create extra columns via X-gap analysis."""
        # Two columns of content words, plus a word in a box at a different X
        words = [
            _word("col1_a", 50, 50, 80, 20),
            _word("col1_b", 50, 100, 80, 20),
            _word("col2_a", 300, 50, 80, 20),
            _word("col2_b", 300, 100, 80, 20),
            # This box word is at X=500, would create a 3rd column if not filtered
            _word("box_far", 500, 250, 80, 20),
        ]
        box = _box(450, 200, 200, 150)
        cells, cols = build_grid_from_words(words, 700, 500, box_rects=[box])
        # Should only have 2 columns (not 3)
        assert len(cols) <= 2
# ---------------------------------------------------------------------------
# Tests: _cluster_columns with box-filtered words
# ---------------------------------------------------------------------------
class TestClusterColumnsFiltering:
    """Column clustering behaves correctly on pre-filtered (box-free) words."""

    def test_gap_detection_without_box_words(self):
        """Two well-separated X clusters must yield exactly two columns."""
        # Two words per cluster, stacked vertically at x=50 and x=300.
        layout = [("a", 50, 50), ("b", 50, 100), ("c", 300, 50), ("d", 300, 100)]
        sample = [_word(txt, x, y, 30, 20) for txt, x, y in layout]
        columns = _cluster_columns(sample, 600)
        assert len(columns) == 2

    def test_single_column_when_words_close(self):
        """Words with nearly identical X positions collapse into one column."""
        layout = [("a", 50, 50), ("b", 60, 100), ("c", 55, 150)]
        sample = [_word(txt, x, y, 80, 20) for txt, x, y in layout]
        columns = _cluster_columns(sample, 600)
        assert len(columns) == 1
# ---------------------------------------------------------------------------
# Tests: inline marker guard (bullet points)
# ---------------------------------------------------------------------------
class TestInlineMarkerGuard:
    """Bullet points / numbering should NOT be split into sub-columns."""

    def test_concept_bullet_vs_page_ref(self):
        """Demonstrate the gap difference between bullets and page refs.

        Bullet points have small gap to main text (~5-10px).
        Page references have large gap (~50+ px).
        """
        def gap(marker_left, marker_width, text_left):
            # Horizontal whitespace between marker's right edge and the text.
            return text_left - (marker_left + marker_width)

        # Bullet scenario: "1." at left=50 (20px wide), main text at left=65
        # → gap is -5 (overlapping or touching → no split).
        assert gap(50, 20, 65) < 20  # very small gap
        # Page ref scenario: "p.55" at left=20 (40px wide), main text at left=120
        # → gap is 60 (clear separation → split).
        assert gap(20, 40, 120) > 30  # clear gap

View File

@@ -0,0 +1,320 @@
"""
Tests for cv_graphic_detect.py — graphic element detection.
Lizenz: Apache 2.0
"""
import numpy as np
import pytest
import cv2
from cv_graphic_detect import detect_graphic_elements, GraphicElement, _dominant_color
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _white_image(width: int = 1200, height: int = 1800) -> np.ndarray:
"""Create a plain white BGR image."""
return np.ones((height, width, 3), dtype=np.uint8) * 255
def _draw_colored_circle(img: np.ndarray, cx: int, cy: int, radius: int,
                         color_bgr: tuple) -> np.ndarray:
    """Draw a filled colored circle (simulates a balloon / graphic)."""
    center = (cx, cy)
    filled = -1  # negative thickness → cv2 fills the shape
    cv2.circle(img, center, radius, color_bgr, filled)
    return img
def _draw_colored_region(img: np.ndarray, x: int, y: int, w: int, h: int,
                         color_bgr: tuple) -> np.ndarray:
    """Draw a filled colored rectangle (simulates an image region)."""
    top_left = (x, y)
    bottom_right = (x + w, y + h)
    cv2.rectangle(img, top_left, bottom_right, color_bgr, -1)
    return img
def _draw_black_illustration(img: np.ndarray, x: int, y: int, w: int, h: int) -> np.ndarray:
    """Draw a large black filled shape (simulates a black-ink illustration)."""
    black = (0, 0, 0)
    cv2.rectangle(img, (x, y), (x + w, y + h), black, thickness=-1)
    return img
def _word_box(left: int, top: int, width: int, height: int) -> dict:
"""Create a word box dict matching OCR output format."""
return {"left": left, "top": top, "width": width, "height": height}
# ---------------------------------------------------------------------------
# _dominant_color tests
# ---------------------------------------------------------------------------
class TestDominantColor:
    """Tests for the _dominant_color helper.

    Each test feeds a uniform block of HSV pixels and checks the
    (color_name, hex_value) pair the helper returns. Fix: the yellow,
    orange and purple tests bound ``hex_val`` without using it — the
    unused binding is replaced by ``_``.
    """
    def test_empty_array(self):
        """An empty pixel array must fall back to black / #000000."""
        hsv = np.array([], dtype=np.uint8).reshape(0, 3)
        name, hex_val = _dominant_color(hsv)
        assert name == "black"
        assert hex_val == "#000000"
    def test_low_saturation_returns_black(self):
        """Pixels with low saturation should be classified as black."""
        # HSV: H=90 (irrelevant), S=10 (low), V=200
        hsv = np.full((50, 50, 3), [90, 10, 200], dtype=np.uint8)
        name, _ = _dominant_color(hsv)
        assert name == "black"
    def test_red_hue(self):
        """Pixels with hue ~0-10 or ~170+ should be red."""
        hsv = np.full((50, 50, 3), [5, 200, 200], dtype=np.uint8)
        name, hex_val = _dominant_color(hsv)
        assert name == "red"
        assert hex_val == "#dc2626"
    def test_blue_hue(self):
        """Pixels with hue ~100 should be blue."""
        hsv = np.full((50, 50, 3), [110, 200, 200], dtype=np.uint8)
        name, hex_val = _dominant_color(hsv)
        assert name == "blue"
        assert hex_val == "#2563eb"
    def test_green_hue(self):
        """Pixels with hue ~60 should be green."""
        hsv = np.full((50, 50, 3), [60, 200, 200], dtype=np.uint8)
        name, hex_val = _dominant_color(hsv)
        assert name == "green"
        assert hex_val == "#16a34a"
    def test_yellow_hue(self):
        """Pixels with hue ~30 should be yellow."""
        hsv = np.full((50, 50, 3), [30, 200, 200], dtype=np.uint8)
        name, _ = _dominant_color(hsv)
        assert name == "yellow"
    def test_orange_hue(self):
        """Pixels with hue ~15 should be orange."""
        hsv = np.full((50, 50, 3), [15, 200, 200], dtype=np.uint8)
        name, _ = _dominant_color(hsv)
        assert name == "orange"
    def test_purple_hue(self):
        """Pixels with hue ~140 should be purple."""
        hsv = np.full((50, 50, 3), [140, 200, 200], dtype=np.uint8)
        name, _ = _dominant_color(hsv)
        assert name == "purple"
# ---------------------------------------------------------------------------
# detect_graphic_elements tests
# ---------------------------------------------------------------------------
class TestDetectGraphicElements:
    """Tests for the detect_graphic_elements() function."""
    # Color tuples below are BGR (OpenCV convention): (0, 0, 220) draws red,
    # (0, 200, 0) green, (220, 0, 0) blue — matched by the color_name asserts.
    def test_none_image_returns_empty(self):
        """None input should return empty list."""
        result = detect_graphic_elements(None, [])
        assert result == []
    def test_white_image_no_graphics(self):
        """A plain white image should produce no graphic elements."""
        img = _white_image()
        result = detect_graphic_elements(img, [])
        assert result == []
    def test_colored_region_detected_as_image(self):
        """A large colored rectangle should be detected as an image."""
        img = _white_image()
        # Draw a large red region (not text-like)
        _draw_colored_region(img, x=100, y=300, w=200, h=200, color_bgr=(0, 0, 220))
        result = detect_graphic_elements(img, word_boxes=[])
        assert len(result) >= 1
        graphic = result[0]
        assert isinstance(graphic, GraphicElement)
        assert graphic.shape == "image"
        assert graphic.color_name == "red"
        assert graphic.confidence > 0
    def test_colored_text_excluded_by_word_overlap(self):
        """Colored regions that overlap heavily with word boxes should be skipped."""
        img = _white_image()
        # Draw colored region
        _draw_colored_region(img, x=100, y=300, w=400, h=50, color_bgr=(0, 0, 220))
        # Word boxes covering >50% of the colored region
        words = [
            _word_box(100, 300, 200, 50),
            _word_box(300, 300, 200, 50),
        ]
        result = detect_graphic_elements(img, word_boxes=words)
        # Should be filtered out (word overlap > 50%)
        for g in result:
            # If anything is detected at that location, overlap check failed
            if g.x >= 90 and g.x <= 110 and g.y >= 290 and g.y <= 310:
                pytest.fail("Colored text region should be excluded by word overlap")
    def test_colored_graphic_with_low_word_overlap_kept(self):
        """A colored region with low word overlap should be kept."""
        img = _white_image()
        # Draw a large colored circle
        _draw_colored_circle(img, cx=300, cy=400, radius=80, color_bgr=(0, 200, 0))
        # One small word box overlapping only a tiny portion
        words = [_word_box(250, 390, 30, 20)]
        result = detect_graphic_elements(img, word_boxes=words)
        assert len(result) >= 1
        assert result[0].shape == "image"
        assert result[0].color_name == "green"
    def test_black_illustration_detected(self):
        """A large black filled area should be detected as illustration."""
        img = _white_image()
        # Draw a large black rectangle (simulating an illustration)
        _draw_black_illustration(img, x=200, y=400, w=300, h=300)
        result = detect_graphic_elements(img, word_boxes=[])
        assert len(result) >= 1
        illust = [g for g in result if g.shape == "illustration"]
        assert len(illust) >= 1
        assert illust[0].color_name == "black"
    def test_black_illustration_excluded_by_word_boxes(self):
        """Black ink in word regions should NOT be detected as illustration."""
        img = _white_image()
        # Draw black text-like region
        _draw_black_illustration(img, x=100, y=300, w=400, h=60)
        # Word boxes covering the same area
        words = [
            _word_box(100, 300, 200, 60),
            _word_box(300, 300, 200, 60),
        ]
        result = detect_graphic_elements(img, word_boxes=words)
        # Should be empty — the word exclusion mask covers the ink
        illust = [g for g in result if g.shape == "illustration"]
        assert len(illust) == 0
    def test_tiny_colored_region_filtered(self):
        """Very small colored regions (<200 colored pixels) should be filtered."""
        img = _white_image()
        # Draw a tiny colored dot (5x5 pixels)
        _draw_colored_region(img, x=500, y=500, w=5, h=5, color_bgr=(220, 0, 0))
        result = detect_graphic_elements(img, word_boxes=[])
        assert result == []
    def test_page_spanning_region_filtered(self):
        """Colored regions spanning >50% of width/height should be skipped."""
        img = _white_image(width=1200, height=1800)
        # Draw a region wider than 50% of the image (700 / 1200 ≈ 58%)
        _draw_colored_region(img, x=50, y=300, w=700, h=100, color_bgr=(0, 0, 220))
        result = detect_graphic_elements(img, word_boxes=[])
        # Should be filtered as page-spanning
        assert result == []
    def test_multiple_graphics_detected(self):
        """Multiple separate colored regions should all be detected."""
        img = _white_image()
        # Three separate colored circles
        _draw_colored_circle(img, cx=200, cy=300, radius=60, color_bgr=(0, 0, 220))
        _draw_colored_circle(img, cx=500, cy=300, radius=60, color_bgr=(0, 200, 0))
        _draw_colored_circle(img, cx=200, cy=600, radius=60, color_bgr=(220, 0, 0))
        result = detect_graphic_elements(img, word_boxes=[])
        # Should detect at least 2 (some may merge if dilation connects them)
        assert len(result) >= 2
    def test_results_sorted_by_area_descending(self):
        """Results should be sorted by area, largest first."""
        img = _white_image()
        # Small circle
        _draw_colored_circle(img, cx=200, cy=300, radius=30, color_bgr=(0, 0, 220))
        # Large circle
        _draw_colored_circle(img, cx=600, cy=800, radius=100, color_bgr=(0, 200, 0))
        result = detect_graphic_elements(img, word_boxes=[])
        if len(result) >= 2:
            assert result[0].area >= result[1].area
    def test_max_elements_limit(self):
        """Should respect max_elements parameter."""
        img = _white_image(width=2000, height=2000)
        # Draw many colored regions
        for i in range(10):
            _draw_colored_circle(img, cx=100 + i * 180, cy=300, radius=40,
                                 color_bgr=(0, 0, 220))
        result = detect_graphic_elements(img, word_boxes=[], max_elements=3)
        assert len(result) <= 3
    def test_detected_boxes_excluded_from_ink(self):
        """Detected box regions should be excluded from ink illustration detection."""
        img = _white_image()
        # Draw a black rectangle well inside the "box" area (8px inset is used)
        _draw_black_illustration(img, x=120, y=320, w=360, h=160)
        # Mark the outer box — the 8px inset still covers the drawn region
        detected_boxes = [{"x": 100, "y": 300, "w": 400, "h": 200}]
        result = detect_graphic_elements(img, word_boxes=[], detected_boxes=detected_boxes)
        illust = [g for g in result if g.shape == "illustration"]
        assert len(illust) == 0
    def test_deduplication_overlapping_regions(self):
        """Overlapping elements should be deduplicated."""
        img = _white_image()
        # Two overlapping colored regions
        _draw_colored_region(img, x=200, y=300, w=200, h=200, color_bgr=(0, 0, 220))
        _draw_colored_region(img, x=250, y=350, w=200, h=200, color_bgr=(0, 0, 220))
        result = detect_graphic_elements(img, word_boxes=[])
        # Should be merged/deduplicated into 1 element (heavy dilation merges them)
        assert len(result) <= 2
    def test_graphicelement_dataclass_fields(self):
        """GraphicElement should have all expected fields."""
        elem = GraphicElement(
            x=10, y=20, width=100, height=80,
            area=5000, shape="image",
            color_name="red", color_hex="#dc2626",
            confidence=0.85,
        )
        assert elem.x == 10
        assert elem.y == 20
        assert elem.width == 100
        assert elem.height == 80
        assert elem.area == 5000
        assert elem.shape == "image"
        assert elem.color_name == "red"
        assert elem.color_hex == "#dc2626"
        assert elem.confidence == 0.85
        assert elem.contour is None
    def test_small_ink_area_filtered(self):
        """Black ink areas smaller than 5000px should be filtered."""
        img = _white_image()
        # Small black mark (50x50 = 2500 area, below 5000 threshold)
        _draw_black_illustration(img, x=500, y=500, w=50, h=50)
        result = detect_graphic_elements(img, word_boxes=[])
        illust = [g for g in result if g.shape == "illustration"]
        assert len(illust) == 0