Compare commits

...

2 Commits

Author SHA1 Message Date
Benjamin Admin
0340204c1f feat: box-aware column detection — exclude box content from global columns
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s
- Enrich column geometries with original full-page words (box-filtered)
  so _detect_sub_columns() finds narrow sub-columns across box boundaries
- Add inline marker guard: bullet points (1., 2., •) are not split into
  sub-columns (minimum gap check: 1.2× word height or 20px)
- Add box_rects parameter to build_grid_from_words() — words inside boxes
  are excluded from X-gap column clustering
- Pass box rects from zones to words_first grid builder
- Add 9 tests for box-aware column detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 18:42:46 +01:00
Benjamin Admin
729ebff63c feat: add border ghost filter + graphic detection tests + structure overlay
- Add _filter_border_ghost_words() to remove OCR artefacts from box borders
  (vertical + horizontal edge detection, column cleanup, re-indexing)
- Add 20 tests for border ghost filter (basic filtering + column cleanup)
- Add 24 tests for cv_graphic_detect (color detection, word overlap, boxes)
- Clean up cv_graphic_detect.py logging (per-candidate → DEBUG)
- Add structure overlay layer to StepReconstruction (boxes + graphics toggle)
- Show border_ghosts_removed badge in StepStructureDetection
- Update MkDocs with structure detection documentation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 18:28:53 +01:00
11 changed files with 1275 additions and 31 deletions

View File

@@ -219,7 +219,7 @@ export interface StructureGraphic {
w: number w: number
h: number h: number
area: number area: number
shape: string // arrow, circle, line, exclamation, dot, icon, illustration shape: string // image, illustration
color_name: string color_name: string
color_hex: string color_hex: string
confidence: number confidence: number
@@ -235,6 +235,7 @@ export interface StructureResult {
color_pixel_counts: Record<string, number> color_pixel_counts: Record<string, number>
has_words: boolean has_words: boolean
word_count: number word_count: number
border_ghosts_removed?: number
duration_seconds: number duration_seconds: number
} }

View File

@@ -2,7 +2,7 @@
import { useCallback, useEffect, useMemo, useRef, useState } from 'react' import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
import dynamic from 'next/dynamic' import dynamic from 'next/dynamic'
import type { GridResult, GridCell, ColumnResult, RowResult, PageZone, PageRegion, RowItem } from '@/app/(admin)/ai/ocr-pipeline/types' import type { GridResult, GridCell, ColumnResult, RowResult, PageZone, PageRegion, RowItem, StructureResult, StructureBox, StructureGraphic } from '@/app/(admin)/ai/ocr-pipeline/types'
import { usePixelWordPositions } from './usePixelWordPositions' import { usePixelWordPositions } from './usePixelWordPositions'
const KLAUSUR_API = '/klausur-api' const KLAUSUR_API = '/klausur-api'
@@ -60,6 +60,9 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
const [fontScale, setFontScale] = useState(0.7) const [fontScale, setFontScale] = useState(0.7)
const [globalBold, setGlobalBold] = useState(false) const [globalBold, setGlobalBold] = useState(false)
const [imageRotation, setImageRotation] = useState<0 | 180>(0) const [imageRotation, setImageRotation] = useState<0 | 180>(0)
const [structureBoxes, setStructureBoxes] = useState<StructureBox[]>([])
const [structureGraphics, setStructureGraphics] = useState<StructureGraphic[]>([])
const [showStructure, setShowStructure] = useState(true)
const reconRef = useRef<HTMLDivElement>(null) const reconRef = useRef<HTMLDivElement>(null)
const [reconWidth, setReconWidth] = useState(0) const [reconWidth, setReconWidth] = useState(0)
@@ -92,12 +95,15 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
// eslint-disable-next-line react-hooks/exhaustive-deps // eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId]) }, [sessionId])
// Track image natural height for font scaling // Track image natural dimensions for font scaling and structure layer
const handleImageLoad = useCallback(() => { const handleImageLoad = useCallback(() => {
if (imageRef.current) { if (imageRef.current) {
setImageNaturalH(imageRef.current.naturalHeight) setImageNaturalH(imageRef.current.naturalHeight)
if (!imageNaturalSize) {
setImageNaturalSize({ w: imageRef.current.naturalWidth, h: imageRef.current.naturalHeight })
}
} }
}, []) }, [imageNaturalSize])
const loadSessionData = async () => { const loadSessionData = async () => {
if (!sessionId) return if (!sessionId) return
@@ -132,6 +138,13 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
setUndoStack([]) setUndoStack([])
setRedoStack([]) setRedoStack([])
// Load structure result (boxes, graphics, colors)
const structureResult: StructureResult | undefined = data.structure_result
if (structureResult) {
setStructureBoxes(structureResult.boxes || [])
setStructureGraphics(structureResult.graphics || [])
}
// Check for parent with boxes (sub-sessions + zones) // Check for parent with boxes (sub-sessions + zones)
const columnResult: ColumnResult | undefined = data.column_result const columnResult: ColumnResult | undefined = data.column_result
const rowResult: RowResult | undefined = data.row_result const rowResult: RowResult | undefined = data.row_result
@@ -517,6 +530,65 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
return bboxPct return bboxPct
} }
// Structure layer: boxes and graphic elements as background
const renderStructureLayer = (imgW: number, imgH: number) => {
if (!showStructure) return null
const hasElements = structureBoxes.length > 0 || structureGraphics.length > 0
if (!hasElements) return null
return (
<>
{/* Structure boxes */}
{structureBoxes.map((box, i) => {
const bgColor = box.bg_color_hex || '#6b7280'
return (
<div
key={`sbox-${i}`}
className="absolute pointer-events-none"
style={{
left: `${(box.x / imgW) * 100}%`,
top: `${(box.y / imgH) * 100}%`,
width: `${(box.w / imgW) * 100}%`,
height: `${(box.h / imgH) * 100}%`,
border: `${Math.max(1, box.border_thickness)}px solid ${bgColor}40`,
backgroundColor: `${bgColor}0a`,
borderRadius: '2px',
}}
/>
)
})}
{/* Graphic elements */}
{structureGraphics.map((g, i) => (
<div
key={`sgfx-${i}`}
className="absolute pointer-events-none"
style={{
left: `${(g.x / imgW) * 100}%`,
top: `${(g.y / imgH) * 100}%`,
width: `${(g.w / imgW) * 100}%`,
height: `${(g.h / imgH) * 100}%`,
border: `1px dashed ${g.color_hex}60`,
backgroundColor: `${g.color_hex}08`,
borderRadius: '2px',
}}
>
<span
className="absolute text-[8px] leading-none opacity-50"
style={{
top: '1px',
left: '2px',
color: g.color_hex,
}}
>
{g.shape === 'illustration' ? 'Illust' : 'Bild'}
</span>
</div>
))}
</>
)
}
// Overlay rendering helper // Overlay rendering helper
const renderOverlayMode = () => { const renderOverlayMode = () => {
const imgW = imageNaturalSize?.w || 1 const imgW = imageNaturalSize?.w || 1
@@ -597,6 +669,9 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
) )
})} })}
{/* Structure elements (boxes, graphics) */}
{renderStructureLayer(imgW, imgH)}
{/* Pixel-positioned words / editable inputs */} {/* Pixel-positioned words / editable inputs */}
{cells.map((cell) => { {cells.map((cell) => {
const displayText = getDisplayText(cell) const displayText = getDisplayText(cell)
@@ -831,6 +906,19 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
> >
180° 180°
</button> </button>
{(structureBoxes.length > 0 || structureGraphics.length > 0) && (
<button
onClick={() => setShowStructure(v => !v)}
className={`px-2 py-1 text-xs border rounded transition-colors ${
showStructure
? 'border-violet-300 bg-violet-50 text-violet-600 dark:border-violet-700 dark:bg-violet-900/30 dark:text-violet-400'
: 'border-gray-300 dark:border-gray-600 hover:bg-gray-50 dark:hover:bg-gray-700'
}`}
title="Strukturelemente anzeigen"
>
Struktur
</button>
)}
<div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" /> <div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" />
</> </>
)} )}
@@ -851,6 +939,21 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
Leer Leer
</button> </button>
{/* Structure toggle */}
{(structureBoxes.length > 0 || structureGraphics.length > 0) && (
<button
onClick={() => setShowStructure(v => !v)}
className={`px-2 py-1 text-xs border rounded transition-colors ${
showStructure
? 'border-violet-300 bg-violet-50 text-violet-600 dark:border-violet-700 dark:bg-violet-900/30 dark:text-violet-400'
: 'border-gray-300 dark:border-gray-600 hover:bg-gray-50 dark:hover:bg-gray-700'
}`}
title="Strukturelemente anzeigen"
>
Struktur
</button>
)}
<div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" /> <div className="w-px h-5 bg-gray-300 dark:bg-gray-600 mx-1" />
{/* Zoom controls */} {/* Zoom controls */}
@@ -915,6 +1018,9 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp
onLoad={handleImageLoad} onLoad={handleImageLoad}
/> />
{/* Structure elements (boxes, graphics) */}
{imageNaturalSize && renderStructureLayer(imageNaturalSize.w, imageNaturalSize.h)}
{/* Empty field markers */} {/* Empty field markers */}
{showEmptyHighlight && cells {showEmptyHighlight && cells
.filter(c => emptyCellIds.has(c.cellId)) .filter(c => emptyCellIds.has(c.cellId))

View File

@@ -165,6 +165,11 @@ export function StepStructureDetection({ sessionId, onNext }: StepStructureDetec
{result.word_count} Woerter {result.word_count} Woerter
</span> </span>
)} )}
{(result.border_ghosts_removed ?? 0) > 0 && (
<span className="inline-flex items-center gap-1.5 px-3 py-1 rounded-full bg-red-50 dark:bg-red-900/20 text-red-700 dark:text-red-400 text-xs font-medium">
{result.border_ghosts_removed} Rahmenlinien entfernt
</span>
)}
<span className="text-gray-400 text-xs ml-auto"> <span className="text-gray-400 text-xs ml-auto">
{result.image_width}x{result.image_height}px | {result.duration_seconds}s {result.image_width}x{result.image_height}px | {result.duration_seconds}s
</span> </span>

View File

@@ -149,6 +149,8 @@ klausur-service/backend/
├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10) ├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10)
├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4) ├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4)
├── cv_box_detect.py # Box-Erkennung + Zonen-Aufteilung ├── cv_box_detect.py # Box-Erkennung + Zonen-Aufteilung
├── cv_graphic_detect.py # Grafik-/Bilderkennung (Region-basiert)
├── cv_color_detect.py # Farbtext-Erkennung (HSV-Analyse)
├── cv_words_first.py # Words-First Grid Builder (bottom-up) ├── cv_words_first.py # Words-First Grid Builder (bottom-up)
├── page_crop.py # Content-basierter Crop-Algorithmus ├── page_crop.py # Content-basierter Crop-Algorithmus
├── ocr_pipeline_session_store.py # PostgreSQL Persistence ├── ocr_pipeline_session_store.py # PostgreSQL Persistence
@@ -177,7 +179,8 @@ admin-lehrer/
├── StepColumnDetection.tsx # Schritt 5: Spaltenerkennung ├── StepColumnDetection.tsx # Schritt 5: Spaltenerkennung
├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung ├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung
├── StepWordRecognition.tsx # Schritt 7: Worterkennung ├── StepWordRecognition.tsx # Schritt 7: Worterkennung
├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream) ├── StepStructureDetection.tsx # Schritt 8: Strukturerkennung
├── StepLlmReview.tsx # Schritt 9: Korrektur (SSE-Stream)
├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas + Overlay) ├── StepReconstruction.tsx # Schritt 10: Rekonstruktion (Canvas + Overlay)
├── usePixelWordPositions.ts # Shared Hook: Pixel-basierte Wortpositionierung ├── usePixelWordPositions.ts # Shared Hook: Pixel-basierte Wortpositionierung
├── FabricReconstructionCanvas.tsx # Fabric.js Editor ├── FabricReconstructionCanvas.tsx # Fabric.js Editor
@@ -281,14 +284,21 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
| `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) | | `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) |
| `grid_method` | `v2` | Grid-Strategie: `v2` (top-down) oder `words_first` (bottom-up) | | `grid_method` | `v2` | Grid-Strategie: `v2` (top-down) oder `words_first` (bottom-up) |
### Schritt 8: Korrektur ### Schritt 8: Strukturerkennung
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sessions/{id}/detect-structure` | Boxen, Zonen, Farben und Grafiken erkennen |
| `GET` | `/sessions/{id}/image/structure-overlay` | Overlay mit allen Strukturelementen |
### Schritt 9: Korrektur
| Methode | Pfad | Beschreibung | | Methode | Pfad | Beschreibung |
|---------|------|--------------| |---------|------|--------------|
| `POST` | `/sessions/{id}/llm-review?stream=true` | SSE-Stream Korrektur starten | | `POST` | `/sessions/{id}/llm-review?stream=true` | SSE-Stream Korrektur starten |
| `POST` | `/sessions/{id}/llm-review/apply` | Ausgewaehlte Korrekturen speichern | | `POST` | `/sessions/{id}/llm-review/apply` | Ausgewaehlte Korrekturen speichern |
### Schritt 9: Rekonstruktion ### Schritt 10: Rekonstruktion
| Methode | Pfad | Beschreibung | | Methode | Pfad | Beschreibung |
|---------|------|--------------| |---------|------|--------------|
@@ -853,6 +863,93 @@ Change-Format:
--- ---
## Schritt 8: Strukturerkennung (Detail)
Erkennt Boxen, Zonen, Farbregionen und grafische Elemente auf der Seite.
Laeuft **nach** der Worterkennung (Schritt 7), damit OCR-Wortpositionen
fuer die Unterscheidung von Text vs. Grafik zur Verfuegung stehen.
### Teilschritte
1. **Box-Erkennung** (`cv_box_detect.py`): Linien-Rahmen und farbige Hintergruende
2. **Zonen-Aufteilung** (`split_page_into_zones`): Seite in Box- und Content-Zonen aufteilen
3. **Farb-Analyse** (`cv_color_detect.py`): HSV-basierte Erkennung farbiger Textbereiche
4. **Grafik-Erkennung** (`cv_graphic_detect.py`): Nicht-Text-Grafiken identifizieren
### Grafik-Erkennung: Region-basierter Ansatz
Zwei Paesse trennen farbige Grafiken von farbigem Text und erkennen
schwarze Illustrationen:
**Pass 1 — Farbige Bildregionen:**
1. HSV-Saturation-Kanal extrahieren (Schwelle > 40)
- Schwarzer Text hat Saettigung ≈ 0 → unsichtbar auf diesem Kanal
2. Starke Dilation (25×25 Ellipse) verschmilzt nahe Farbpixel zu Regionen
3. Fuer jede Region: Wort-Ueberlappung pruefen
- \> 50 % Ueberlappung mit OCR-Woertern → farbiger Text → ueberspringen
- ≤ 50 % → farbige Grafik/Bild → behalten
4. Minimum 200 Farbpixel erforderlich (kein Rauschen)
5. Regionen > 50 % der Bildbreite oder -hoehe → Seitenumfassend → ueberspringen
**Pass 2 — Schwarze Illustrationen:**
1. Otsu-Binarisierung fuer Tinten-Maske
2. Ausschlusszonen: OCR-Woerter (5 px Padding) + erkannte Boxen (8 px Inset)
3. Farbige Pixel aus Pass 1 ebenfalls ausschliessen
4. Nur Konturen mit Flaeche > 5000 px und min(Breite, Hoehe) > 40 px
**Deduplizierung:** Ueberlappende Elemente (> 50 % IoU der kleineren
Bounding-Box) werden zusammengefasst. Ergebnis nach Flaeche absteigend
sortiert.
### Response-Format
```json
{
"boxes": [
{"x": 50, "y": 300, "w": 1100, "h": 200, "confidence": 0.85,
"border_thickness": 3, "bg_color_name": "blue", "bg_color_hex": "#2563eb"}
],
"zones": [
{"index": 0, "zone_type": "content", "x": 50, "y": 50, "w": 1100, "h": 250},
{"index": 1, "zone_type": "box", "x": 50, "y": 300, "w": 1100, "h": 200}
],
"graphics": [
{"x": 100, "y": 500, "w": 150, "h": 120, "area": 8500,
"shape": "image", "color_name": "red", "color_hex": "#dc2626",
"confidence": 0.72}
],
"color_pixel_counts": {"red": 1234, "blue": 5678},
"has_words": true,
"word_count": 96,
"border_ghosts_removed": 0,
"duration_seconds": 0.45
}
```
### Grafik-Shape-Typen
| Shape | Quelle | Beschreibung |
|-------|--------|--------------|
| `image` | Pass 1 | Farbige Grafik/Bild (Ballons, Pfeile, Icons) |
| `illustration` | Pass 2 | Grosse schwarze Zeichnung/Illustration |
### Erkannte Farben
`red`, `orange`, `yellow`, `green`, `blue`, `purple`, `black`
— basierend auf dem Median-Hue der saturierten Pixel in der Region.
### Frontend-Anzeige
`StepStructureDetection.tsx` zeigt:
- Boxen-Liste mit Position, Hintergrundfarbe und Confidence
- Zonen-Uebersicht (Content vs. Box)
- Farb-Zusammenfassung (Pixel-Counts)
- Grafik-Liste mit Shape, Abmessungen, Farbe und Confidence
---
## Schritt 9: Rekonstruktion (Detail) ## Schritt 10: Rekonstruktion (Detail)
Drei Modi verfuegbar: Drei Modi verfuegbar:
@@ -1263,6 +1360,7 @@ cd klausur-service/backend && pytest tests/test_paddle_kombi.py -v # 36 Tests
| Datum | Version | Aenderung | | Datum | Version | Aenderung |
|-------|---------|----------| |-------|---------|----------|
| 2026-03-16 | 4.6.0 | Strukturerkennung (Schritt 8): Region-basierte Grafikerkennung (`cv_graphic_detect.py`) mit Zwei-Pass-Verfahren (Farbregionen + schwarze Illustrationen), Wort-Ueberlappungs-Filter, Box/Zonen/Farb-Analyse. Schritt laeuft nach Worterkennung. |
| 2026-03-12 | 4.5.0 | Kombi-Modus (PaddleOCR + Tesseract): Beide Engines laufen parallel, Koordinaten werden IoU-basiert gematcht und confidence-gewichtet gemittelt. Ungematchte Tesseract-Woerter (Bullets, Symbole) werden hinzugefuegt. 3er-Toggle in OCR Overlay. | | 2026-03-12 | 4.5.0 | Kombi-Modus (PaddleOCR + Tesseract): Beide Engines laufen parallel, Koordinaten werden IoU-basiert gematcht und confidence-gewichtet gemittelt. Ungematchte Tesseract-Woerter (Bullets, Symbole) werden hinzugefuegt. 3er-Toggle in OCR Overlay. |
| 2026-03-12 | 4.4.0 | PaddleOCR Remote-Engine (`engine=paddle`): PP-OCRv5 Latin auf Hetzner x86_64. Neuer Microservice (`paddleocr-service/`), HTTP-Client (`paddleocr_remote.py`), Frontend-Dropdown-Option. Nutzt words_first Grid-Methode. | | 2026-03-12 | 4.4.0 | PaddleOCR Remote-Engine (`engine=paddle`): PP-OCRv5 Latin auf Hetzner x86_64. Neuer Microservice (`paddleocr-service/`), HTTP-Client (`paddleocr_remote.py`), Frontend-Dropdown-Option. Nutzt words_first Grid-Methode. |
| 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. | | 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. |

View File

@@ -121,10 +121,9 @@ def detect_graphic_elements(
return [] return []
h, w = img_bgr.shape[:2] h, w = img_bgr.shape[:2]
img_area = h * w
logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes", logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
w, h, len(word_boxes), len(detected_boxes or [])) w, h, len(word_boxes), len(detected_boxes or []))
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
candidates: List[GraphicElement] = [] candidates: List[GraphicElement] = []
@@ -161,7 +160,7 @@ def detect_graphic_elements(
contours_regions, _ = cv2.findContours( contours_regions, _ = cv2.findContours(
region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
) )
logger.info("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions)) logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
for cnt in contours_regions: for cnt in contours_regions:
bx, by, bw, bh = cv2.boundingRect(cnt) bx, by, bw, bh = cv2.boundingRect(cnt)
@@ -172,7 +171,7 @@ def detect_graphic_elements(
# Skip page-spanning regions # Skip page-spanning regions
if bw > w * 0.5 or bh > h * 0.5: if bw > w * 0.5 or bh > h * 0.5:
logger.info("GraphicDetect PASS1 SKIP page-spanning (%d,%d) %dx%d", bx, by, bw, bh) logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
continue continue
bbox_area = bw * bh bbox_area = bw * bh
@@ -188,8 +187,8 @@ def detect_graphic_elements(
# If most of the region is covered by word boxes → colored text, skip # If most of the region is covered by word boxes → colored text, skip
if word_overlap > 0.5: if word_overlap > 0.5:
logger.info("GraphicDetect PASS1 SKIP text region (%d,%d) %dx%d word_overlap=%.0f%%", logger.debug("GraphicDetect PASS1 skip text region (%d,%d) %dx%d overlap=%.0f%%",
bx, by, bw, bh, word_overlap * 100) bx, by, bw, bh, word_overlap * 100)
continue continue
# Need a minimum number of colored pixels (not just dilated area) # Need a minimum number of colored pixels (not just dilated area)
@@ -209,8 +208,7 @@ def detect_graphic_elements(
density = color_pixel_count / bbox_area if bbox_area > 0 else 0 density = color_pixel_count / bbox_area if bbox_area > 0 else 0
conf = min(0.95, 0.5 + density * 0.5) conf = min(0.95, 0.5 + density * 0.5)
logger.info("GraphicDetect PASS1 ACCEPT image at (%d,%d) %dx%d " logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d overlap=%.0f%% %s",
"color_px=%d word_overlap=%.0f%% color=%s",
bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name) bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name)
candidates.append(GraphicElement( candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh, x=bx, y=by, width=bw, height=bh,
@@ -256,7 +254,7 @@ def detect_graphic_elements(
contours_ink, _ = cv2.findContours( contours_ink, _ = cv2.findContours(
ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
) )
logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink)) logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink))
for cnt in contours_ink: for cnt in contours_ink:
area = cv2.contourArea(cnt) area = cv2.contourArea(cnt)
@@ -267,8 +265,8 @@ def detect_graphic_elements(
if bw > w * 0.8 or bh > h * 0.8: if bw > w * 0.8 or bh > h * 0.8:
continue continue
logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d", logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d",
bx, by, bw, bh, int(area)) bx, by, bw, bh, int(area))
candidates.append(GraphicElement( candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh, x=bx, y=by, width=bw, height=bh,
area=int(area), shape="illustration", area=int(area), shape="illustration",

View File

@@ -7,6 +7,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging import logging
import re import re
import statistics
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import numpy as np import numpy as np
@@ -737,6 +738,24 @@ def _detect_sub_columns(
result.append(geo) result.append(geo)
continue continue
# --- Guard against inline markers (bullet points, numbering) ---
# Bullet points like "1.", "2.", "•", "-" sit close to the main
# column text and are part of the cell, not a separate column.
# Only split if the horizontal gap between the rightmost sub-word
# and the main column start is large enough.
max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
gap_to_main = col_start_bin[2] - max_sub_right # px gap
median_heights = [w.get('height', 20) for w in confident]
med_h = statistics.median(median_heights) if median_heights else 20
min_gap = max(med_h * 1.2, 20) # at least 1.2× word height or 20px
if gap_to_main < min_gap:
logger.debug(
"SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
"(likely inline markers, not a sub-column)",
geo.index, gap_to_main, min_gap)
result.append(geo)
continue
# --- Build two sub-column geometries --- # --- Build two sub-column geometries ---
# Word 'left' values are relative to left_x; geo.x is absolute. # Word 'left' values are relative to left_x; geo.x is absolute.
# Convert the split position from relative to absolute coordinates. # Convert the split position from relative to absolute coordinates.
@@ -3221,6 +3240,46 @@ def detect_column_geometry_zoned(
g.y = abs_y g.y = abs_y
g.height = abs_y_end - abs_y g.height = abs_y_end - abs_y
# --- Enrich column geometries with box-filtered original words ---
# The combined-image Tesseract may miss words in small content strips
# (e.g. a single row above a box). Use the original full-page word_dicts
# filtered to exclude box interiors, so that _detect_sub_columns()
# downstream has ALL content-zone words for left-edge clustering.
# This ensures narrow sub-columns (page_ref, marker) are detectable
# even when only a few entries exist above/below a box.
if word_dicts:
content_words = []
for w in word_dicts:
# word positions are relative to left_x / top_y
w_abs_cx = w['left'] + left_x + w['width'] / 2
w_abs_cy = w['top'] + top_y + w['height'] / 2
inside_box = any(
box.x <= w_abs_cx <= box.x + box.width
and box.y <= w_abs_cy <= box.y + box.height
for box in boxes
)
if not inside_box:
content_words.append(w)
target_geoms = combined_geoms if combined_result is not None else geometries
for g in target_geoms:
# Word 'left' is relative to left_x; geometry 'x' is absolute
g_left_rel = g.x - left_x
g_right_rel = g_left_rel + g.width
g.words = [
w for w in content_words
if g_left_rel <= w['left'] + w['width'] / 2 < g_right_rel
]
g.word_count = len(g.words)
excluded_count = len(word_dicts) - len(content_words)
if excluded_count:
logger.info(
"ZonedColumns: enriched geometries with %d content words "
"(excluded %d box-interior words)",
len(content_words), excluded_count,
)
# Build zones_data for the response # Build zones_data for the response
zones_data: List[Dict] = [] zones_data: List[Dict] = []
for zone in zones: for zone in zones:

View File

@@ -17,7 +17,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging import logging
import re import re
import statistics import statistics
from typing import Any, Dict, List, Tuple from typing import Any, Dict, List, Optional, Tuple
from cv_ocr_engines import ( from cv_ocr_engines import (
_group_words_into_lines, _group_words_into_lines,
@@ -259,6 +259,7 @@ def build_grid_from_words(
img_w: int, img_w: int,
img_h: int, img_h: int,
min_confidence: int = 30, min_confidence: int = 30,
box_rects: Optional[List[Dict]] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""Build a cell grid bottom-up from Tesseract word boxes. """Build a cell grid bottom-up from Tesseract word boxes.
@@ -269,6 +270,9 @@ def build_grid_from_words(
img_w: Image width in pixels. img_w: Image width in pixels.
img_h: Image height in pixels. img_h: Image height in pixels.
min_confidence: Minimum OCR confidence to keep a word. min_confidence: Minimum OCR confidence to keep a word.
box_rects: Optional list of box dicts with keys x, y, width, height.
Words inside these boxes are excluded from column clustering
(box-internal columns are detected separately in sub-sessions).
Returns: Returns:
(cells, columns_meta) — same format as build_cell_grid_v2(). (cells, columns_meta) — same format as build_cell_grid_v2().
@@ -290,6 +294,28 @@ def build_grid_from_words(
logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts)) logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))
# Exclude words inside detected boxes — box columns are detected separately
if box_rects:
content_words = []
for w in words:
w_cx = w['left'] + w['width'] / 2
w_cy = w['top'] + w['height'] / 2
inside = any(
b['x'] <= w_cx <= b['x'] + b['width']
and b['y'] <= w_cy <= b['y'] + b['height']
for b in box_rects
)
if not inside:
content_words.append(w)
excluded = len(words) - len(content_words)
if excluded:
logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
excluded, len(box_rects))
words = content_words
if not words:
logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
return [], []
# Step 1: cluster columns # Step 1: cluster columns
columns = _cluster_columns(words, img_w) columns = _cluster_columns(words, img_w)
logger.info("build_grid_from_words: %d column(s) detected", len(columns)) logger.info("build_grid_from_words: %d column(s) detected", len(columns))

View File

@@ -1202,6 +1202,147 @@ async def detect_type(session_id: str):
return {"session_id": session_id, **result_dict} return {"session_id": session_id, **result_dict}
# ---------------------------------------------------------------------------
# Border-ghost word filter
# ---------------------------------------------------------------------------
# Characters that OCR produces when reading box-border lines.
_BORDER_GHOST_CHARS = set("|1lI![](){}iíì/\\-—_~.,;:'\"")
def _filter_border_ghost_words(
word_result: Dict,
boxes: List,
) -> int:
"""Remove OCR words that are actually box border lines.
A word is considered a border ghost when it sits on a known box edge
(left, right, top, or bottom) and looks like a line artefact (narrow
aspect ratio or text consists only of line-like characters).
After removing ghost cells, columns that have become empty are also
removed from ``columns_used`` so the grid no longer shows phantom
columns.
Modifies *word_result* in-place and returns the number of removed cells.
"""
if not boxes or not word_result:
return 0
cells = word_result.get("cells")
if not cells:
return 0
# Build border bands — vertical (X) and horizontal (Y)
x_bands = [] # list of (x_lo, x_hi)
y_bands = [] # list of (y_lo, y_hi)
for b in boxes:
bx = b.x if hasattr(b, "x") else b.get("x", 0)
by = b.y if hasattr(b, "y") else b.get("y", 0)
bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0))
bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0))
bt = b.border_thickness if hasattr(b, "border_thickness") else b.get("border_thickness", 3)
margin = max(bt * 2, 10) + 6 # generous margin
# Vertical edges (left / right)
x_bands.append((bx - margin, bx + margin))
x_bands.append((bx + bw - margin, bx + bw + margin))
# Horizontal edges (top / bottom)
y_bands.append((by - margin, by + margin))
y_bands.append((by + bh - margin, by + bh + margin))
img_w = word_result.get("image_width", 1)
img_h = word_result.get("image_height", 1)
def _is_ghost(cell: Dict) -> bool:
text = (cell.get("text") or "").strip()
if not text:
return False
# Compute absolute pixel position
if cell.get("bbox_px"):
px = cell["bbox_px"]
cx = px["x"] + px["w"] / 2
cy = px["y"] + px["h"] / 2
cw = px["w"]
ch = px["h"]
elif cell.get("bbox_pct"):
pct = cell["bbox_pct"]
cx = (pct["x"] / 100) * img_w + (pct["w"] / 100) * img_w / 2
cy = (pct["y"] / 100) * img_h + (pct["h"] / 100) * img_h / 2
cw = (pct["w"] / 100) * img_w
ch = (pct["h"] / 100) * img_h
else:
return False
# Check if center sits on a vertical or horizontal border
on_vertical = any(lo <= cx <= hi for lo, hi in x_bands)
on_horizontal = any(lo <= cy <= hi for lo, hi in y_bands)
if not on_vertical and not on_horizontal:
return False
# Very short text (1-2 chars) on a border → very likely ghost
if len(text) <= 2:
# Narrow vertically (line-like) or narrow horizontally (dash-like)?
if ch > 0 and cw / ch < 0.5:
return True
if cw > 0 and ch / cw < 0.5:
return True
# Text is only border-ghost characters?
if all(c in _BORDER_GHOST_CHARS for c in text):
return True
# Longer text but still only ghost chars and very narrow
if all(c in _BORDER_GHOST_CHARS for c in text):
if ch > 0 and cw / ch < 0.35:
return True
if cw > 0 and ch / cw < 0.35:
return True
return True # all ghost chars on a border → remove
return False
before = len(cells)
word_result["cells"] = [c for c in cells if not _is_ghost(c)]
removed = before - len(word_result["cells"])
# --- Remove empty columns from columns_used ---
columns_used = word_result.get("columns_used")
if removed and columns_used and len(columns_used) > 1:
remaining_cells = word_result["cells"]
occupied_cols = {c.get("col_index") for c in remaining_cells}
before_cols = len(columns_used)
columns_used = [col for col in columns_used if col.get("index") in occupied_cols]
# Re-index columns and remap cell col_index values
if len(columns_used) < before_cols:
old_to_new = {}
for new_i, col in enumerate(columns_used):
old_to_new[col["index"]] = new_i
col["index"] = new_i
for cell in remaining_cells:
old_ci = cell.get("col_index")
if old_ci in old_to_new:
cell["col_index"] = old_to_new[old_ci]
word_result["columns_used"] = columns_used
logger.info("border-ghost: removed %d empty column(s), %d remaining",
before_cols - len(columns_used), len(columns_used))
if removed:
# Update summary counts
summary = word_result.get("summary", {})
summary["total_cells"] = len(word_result["cells"])
summary["non_empty_cells"] = sum(1 for c in word_result["cells"] if c.get("text"))
word_result["summary"] = summary
gs = word_result.get("grid_shape", {})
gs["total_cells"] = len(word_result["cells"])
if columns_used is not None:
gs["cols"] = len(columns_used)
word_result["grid_shape"] = gs
return removed
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Structure Detection Endpoint # Structure Detection Endpoint
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -1236,10 +1377,6 @@ async def detect_structure(session_id: str):
for cell in word_result["cells"]: for cell in word_result["cells"]:
for wb in (cell.get("word_boxes") or []): for wb in (cell.get("word_boxes") or []):
words.append(wb) words.append(wb)
logger.info("detect-structure: word_result present=%s, cells=%d, word_boxes extracted=%d",
word_result is not None,
len(word_result.get("cells", [])) if word_result else 0,
len(words))
# If no words yet, use image dimensions with small margin # If no words yet, use image dimensions with small margin
if words: if words:
content_x = max(0, min(int(wb["left"]) for wb in words)) content_x = max(0, min(int(wb["left"]) for wb in words))
@@ -1319,6 +1456,15 @@ async def detect_structure(session_id: str):
detected_boxes=box_dicts, detected_boxes=box_dicts,
) )
# --- Filter border-ghost words from OCR result ---
ghost_count = 0
if boxes and word_result:
ghost_count = _filter_border_ghost_words(word_result, boxes)
if ghost_count:
logger.info("detect-structure: removed %d border-ghost words", ghost_count)
await update_session_db(session_id, word_result=word_result)
cached["word_result"] = word_result
duration = time.time() - t0 duration = time.time() - t0
result_dict = { result_dict = {
@@ -1361,6 +1507,7 @@ async def detect_structure(session_id: str):
"color_pixel_counts": color_summary, "color_pixel_counts": color_summary,
"has_words": len(words) > 0, "has_words": len(words) > 0,
"word_count": len(words), "word_count": len(words),
"border_ghosts_removed": ghost_count,
"duration_seconds": round(duration, 2), "duration_seconds": round(duration, 2),
} }
@@ -1806,12 +1953,7 @@ async def _get_structure_overlay(session_id: str) -> Response:
# --- Draw graphic elements --- # --- Draw graphic elements ---
graphics_data = structure.get("graphics", []) graphics_data = structure.get("graphics", [])
shape_icons = { shape_icons = {
"arrow": "ARROW", "image": "IMAGE",
"circle": "CIRCLE",
"line": "LINE",
"exclamation": "!",
"dot": "DOT",
"icon": "ICON",
"illustration": "ILLUST", "illustration": "ILLUST",
} }
for gfx in graphics_data: for gfx in graphics_data:
@@ -2401,7 +2543,15 @@ async def detect_words(
}) })
wf_word_dicts = abs_words wf_word_dicts = abs_words
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h) # Extract box rects for box-aware column clustering
box_rects = []
for zone in zones:
if zone.get("zone_type") == "box" and zone.get("box"):
box_rects.append(zone["box"])
cells, columns_meta = build_grid_from_words(
wf_word_dicts, img_w, img_h, box_rects=box_rects or None,
)
duration = time.time() - t0 duration = time.time() - t0
# Apply IPA phonetic fixes # Apply IPA phonetic fixes

View File

@@ -0,0 +1,307 @@
"""
Tests for _filter_border_ghost_words() — removes OCR artefacts from box borders.
When OCR reads a scanned document, box border lines (vertical/horizontal
strokes) are often misrecognised as characters like '|', '1', 'l', '-'.
These phantom words create spurious columns/rows in the grid. The filter
removes them by checking if a word sits on a known box border and looks
like a line artefact.
Lizenz: Apache 2.0
"""
import os
import sys
from typing import Optional

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from ocr_pipeline_api import _filter_border_ghost_words, _BORDER_GHOST_CHARS
from cv_vocab_types import DetectedBox
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_cell(text: str, x: int, y: int, w: int, h: int,
col_index: int = 0) -> dict:
"""Create a cell dict with bbox_px matching the word recognition output."""
return {
"cell_id": f"c_{x}_{y}",
"text": text,
"bbox_px": {"x": x, "y": y, "w": w, "h": h},
"bbox_pct": {
"x": x / 12, "y": y / 18,
"w": w / 12, "h": h / 18,
},
"confidence": 80,
"row_index": 0,
"col_index": col_index,
}
def _make_word_result(cells: list, img_w: int = 1200, img_h: int = 1800,
columns_used: list = None) -> dict:
return {
"cells": cells,
"image_width": img_w,
"image_height": img_h,
"columns_used": columns_used,
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
},
"grid_shape": {
"total_cells": len(cells),
"cols": len(columns_used) if columns_used else 1,
},
}
def _make_box(x: int, y: int, w: int, h: int, bt: int = 3) -> DetectedBox:
    """Wrap raw geometry in a DetectedBox with a fixed high confidence."""
    geometry = dict(x=x, y=y, width=w, height=h)
    return DetectedBox(confidence=0.9, border_thickness=bt, **geometry)
# ---------------------------------------------------------------------------
# Basic filtering tests
# ---------------------------------------------------------------------------
class TestBorderGhostFilter:
    """Tests for the _filter_border_ghost_words() function."""
    # Most tests share one box geometry: x=50, y=300, w=1100, h=200,
    # i.e. vertical borders at x=50 / x=1150 and horizontal at y=300 / y=500.
    def test_no_boxes_no_change(self):
        """Without boxes, nothing should be filtered."""
        cells = [_make_cell("hello", 100, 200, 80, 30)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [])
        assert removed == 0
        assert len(wr["cells"]) == 1
    def test_no_word_result_no_crash(self):
        """A None word_result must be tolerated and report zero removals."""
        removed = _filter_border_ghost_words(None, [_make_box(50, 300, 1100, 200)])
        assert removed == 0
    def test_empty_cells_no_crash(self):
        """An empty cell list must be tolerated and report zero removals."""
        wr = _make_word_result([])
        removed = _filter_border_ghost_words(wr, [_make_box(50, 300, 1100, 200)])
        assert removed == 0
    def test_pipe_on_left_border_removed(self):
        """A '|' character sitting on the left border of a box should be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [
            _make_cell("|", x=48, y=350, w=3, h=25),
            _make_cell("hello", x=200, y=350, w=80, h=25),
        ]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
        assert wr["cells"][0]["text"] == "hello"
    def test_pipe_on_right_border_removed(self):
        """A '|' character on the right border should be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [
            # x=1148 sits on the right border (box x + w = 1150).
            _make_cell("|", x=1148, y=350, w=4, h=25),
            _make_cell("world", x=600, y=350, w=80, h=25),
        ]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
        assert wr["cells"][0]["text"] == "world"
    def test_digit_1_on_border_narrow_removed(self):
        """A narrow '1' on a box border should be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [_make_cell("1", x=49, y=400, w=5, h=20)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
    def test_dash_on_horizontal_border_removed(self):
        """A '-' on the bottom horizontal border should be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        # Bottom border at y=500, dash at y=498
        cells = [_make_cell("-", x=600, y=498, w=20, h=4)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
    def test_real_word_on_border_not_removed(self):
        """A normal word near a border should NOT be removed."""
        # "Tip" is wide (60px) and not a ghost glyph → must survive.
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [_make_cell("Tip", x=52, y=350, w=60, h=25)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 0
    def test_word_far_from_border_not_removed(self):
        """Words far from any border should never be removed."""
        # A '|' glyph, but centered inside the box — position matters, not glyph.
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [_make_cell("|", x=600, y=400, w=3, h=25)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 0
    def test_multiple_ghosts_on_same_box(self):
        """Multiple ghost words on the same box should all be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [
            _make_cell("|", x=48, y=350, w=3, h=25),
            _make_cell("l", x=1149, y=350, w=4, h=25),
            _make_cell("text", x=400, y=350, w=80, h=25),
        ]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 2
        assert len(wr["cells"]) == 1
        assert wr["cells"][0]["text"] == "text"
    def test_summary_updated_after_removal(self):
        """summary and grid_shape counters must reflect the filtered cells."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [
            _make_cell("|", x=48, y=350, w=3, h=25),
            _make_cell("hello", x=200, y=350, w=80, h=25),
        ]
        wr = _make_word_result(cells)
        _filter_border_ghost_words(wr, [box])
        assert wr["summary"]["total_cells"] == 1
        assert wr["grid_shape"]["total_cells"] == 1
    def test_ghost_chars_covers_common_artefacts(self):
        """The ghost chars set should include common border-line OCR artefacts."""
        # NOTE(review): the empty string "" below looks like a mis-encoded
        # character (possibly a dash variant lost in transfer) — confirm
        # against the actual _BORDER_GHOST_CHARS definition.
        expected = {"|", "1", "l", "I", "!", "[", "]", "-", "", "_", "/", "\\"}
        assert expected.issubset(_BORDER_GHOST_CHARS)
    def test_multiple_boxes(self):
        """Ghosts sitting on borders of different boxes are all removed."""
        box1 = _make_box(x=50, y=300, w=500, h=200, bt=3)
        box2 = _make_box(x=600, y=300, w=500, h=200, bt=3)
        cells = [
            _make_cell("|", x=49, y=350, w=3, h=25),
            _make_cell("I", x=599, y=350, w=4, h=25),
            _make_cell("real", x=300, y=350, w=80, h=25),
        ]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box1, box2])
        assert removed == 2
    def test_uses_bbox_pct_fallback(self):
        """Should work with bbox_pct when bbox_px is not available."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        # Hand-built cell: percent coords relative to the default 1200x1800 page.
        cell = {
            "cell_id": "c_test",
            "text": "|",
            "bbox_pct": {"x": (48 / 1200) * 100, "y": (350 / 1800) * 100,
                         "w": (4 / 1200) * 100, "h": (25 / 1800) * 100},
            "confidence": 80,
            "col_index": 0,
        }
        wr = _make_word_result([cell])
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
    def test_generous_margin_catches_offset_ghosts(self):
        """Even if OCR word is slightly offset from border, it should be caught."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        # Word 15px away from right border (at x=1135 vs border at x=1150)
        cells = [_make_cell("|", x=1135, y=350, w=4, h=25)]
        wr = _make_word_result(cells)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
# ---------------------------------------------------------------------------
# Column cleanup tests
# ---------------------------------------------------------------------------
class TestColumnCleanup:
    """Tests for empty column removal after ghost filtering."""
    # The filter removes border-ghost cells first; the tests below then check
    # that columns left with no cells are dropped and survivors re-indexed.
    def test_empty_column_removed(self):
        """After filtering all cells of column 4, it should be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cols = [
            {"index": 0, "type": "column_en", "x": 60, "width": 250},
            {"index": 1, "type": "column_de", "x": 320, "width": 250},
            {"index": 2, "type": "column_3", "x": 580, "width": 250},
            {"index": 3, "type": "column_4", "x": 840, "width": 250},
            {"index": 4, "type": "column_5", "x": 1140, "width": 60},  # ghost column
        ]
        cells = [
            _make_cell("word", x=100, y=350, w=60, h=25, col_index=0),
            _make_cell("Wort", x=360, y=350, w=60, h=25, col_index=1),
            _make_cell("txt", x=620, y=350, w=50, h=25, col_index=2),
            _make_cell("abc", x=880, y=350, w=50, h=25, col_index=3),
            _make_cell("|", x=1148, y=350, w=4, h=25, col_index=4),  # ghost
            _make_cell("l", x=1149, y=400, w=3, h=25, col_index=4),  # ghost
        ]
        wr = _make_word_result(cells, columns_used=cols)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 2
        assert len(wr["columns_used"]) == 4  # column 5 removed
        assert wr["grid_shape"]["cols"] == 4
    def test_columns_reindexed_after_removal(self):
        """After removing a middle column, indices should be sequential."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cols = [
            {"index": 0, "type": "column_1", "x": 60, "width": 200},
            {"index": 1, "type": "column_2", "x": 280, "width": 30},  # border col
            {"index": 2, "type": "column_3", "x": 400, "width": 200},
        ]
        # Column 1 only has ghost cells
        cells = [
            _make_cell("hello", x=100, y=350, w=60, h=25, col_index=0),
            # This cell is NOT on a border so it won't be filtered by the ghost filter
            # For this test, put a ghost on the box border
            _make_cell("|", x=49, y=350, w=3, h=25, col_index=1),
            _make_cell("world", x=440, y=350, w=60, h=25, col_index=2),
        ]
        wr = _make_word_result(cells, columns_used=cols)
        _filter_border_ghost_words(wr, [box])
        # Column 1 should be removed, column 2 becomes column 1
        assert len(wr["columns_used"]) == 2
        assert wr["columns_used"][0]["index"] == 0
        assert wr["columns_used"][1]["index"] == 1
        # Remaining cells should have updated col_index
        assert wr["cells"][0]["col_index"] == 0
        assert wr["cells"][1]["col_index"] == 1
    def test_no_columns_used_no_crash(self):
        """If columns_used is None, column cleanup should be skipped."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cells = [_make_cell("|", x=48, y=350, w=3, h=25)]
        wr = _make_word_result(cells, columns_used=None)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
    def test_occupied_columns_kept(self):
        """Columns that still have cells after filtering should be kept."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cols = [
            {"index": 0, "type": "column_en", "x": 60, "width": 250},
            {"index": 1, "type": "column_de", "x": 320, "width": 250},
        ]
        cells = [
            _make_cell("word", x=100, y=350, w=60, h=25, col_index=0),
            _make_cell("Wort", x=360, y=350, w=60, h=25, col_index=1),
        ]
        wr = _make_word_result(cells, columns_used=cols)
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 0
        assert len(wr["columns_used"]) == 2
    def test_single_column_not_removed(self):
        """A single remaining column should never be removed."""
        box = _make_box(x=50, y=300, w=1100, h=200, bt=3)
        cols = [{"index": 0, "type": "column_text", "x": 60, "width": 1000}]
        cells = [_make_cell("|", x=49, y=350, w=3, h=25, col_index=0)]
        wr = _make_word_result(cells, columns_used=cols)
        # Even if the only cell is filtered, we don't remove the last column
        removed = _filter_border_ghost_words(wr, [box])
        assert removed == 1
        # columns_used should still have 1 entry (we skip cleanup for len <= 1)
        assert len(wr["columns_used"]) == 1

View File

@@ -0,0 +1,174 @@
"""
Tests for box-aware column detection.
Verifies that:
1. Words inside boxes are excluded from column clustering (words_first)
2. Column geometries are enriched with box-filtered original words (layout)
3. Inline markers (bullet points) are not split into sub-columns
Lizenz: Apache 2.0
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from cv_words_first import build_grid_from_words, _cluster_columns
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int, height: int,
conf: int = 90) -> dict:
return {
'text': text, 'left': left, 'top': top,
'width': width, 'height': height, 'conf': conf,
}
def _box(x: int, y: int, w: int, h: int) -> dict:
return {'x': x, 'y': y, 'width': w, 'height': h}
# ---------------------------------------------------------------------------
# Tests: box filtering in build_grid_from_words
# ---------------------------------------------------------------------------
class TestBoxAwareGridBuilding:
    """Words inside boxes should be excluded from column clustering."""
    # A word counts as "inside" a box when its geometry falls within the
    # box rect — see test_word_on_box_border_excluded for the edge case.
    def test_no_boxes_unchanged(self):
        """Without boxes, all words should be used."""
        words = [
            _word("hello", 50, 100, 80, 20),
            _word("world", 300, 100, 80, 20),
        ]
        cells, cols = build_grid_from_words(words, 600, 400)
        assert len(cells) >= 2
        texts = {c['text'] for c in cells}
        assert 'hello' in texts
        assert 'world' in texts
    def test_box_words_excluded(self):
        """Words inside a box should not appear in the grid."""
        words = [
            _word("outside1", 50, 50, 80, 20),
            _word("outside2", 300, 50, 80, 20),
            _word("inside_box", 150, 250, 100, 20),  # inside box
        ]
        box = _box(100, 200, 300, 150)  # box from x=100..400, y=200..350
        cells, cols = build_grid_from_words(words, 600, 500, box_rects=[box])
        texts = {c['text'] for c in cells}
        assert 'outside1' in texts
        assert 'outside2' in texts
        assert 'inside_box' not in texts
    def test_all_words_in_box_returns_empty(self):
        """If all words are inside the box, return empty grid."""
        words = [
            _word("a", 150, 250, 30, 20),
            _word("b", 200, 250, 30, 20),
        ]
        box = _box(100, 200, 300, 150)
        cells, cols = build_grid_from_words(words, 600, 500, box_rects=[box])
        assert cells == []
        assert cols == []
    def test_multiple_boxes(self):
        """Words in multiple boxes should all be excluded."""
        words = [
            _word("content", 50, 50, 80, 20),
            _word("box1_word", 120, 220, 80, 20),
            _word("box2_word", 420, 220, 80, 20),
        ]
        boxes = [
            _box(100, 200, 200, 100),  # box1
            _box(400, 200, 200, 100),  # box2
        ]
        cells, cols = build_grid_from_words(words, 700, 400, box_rects=boxes)
        texts = {c['text'] for c in cells}
        assert texts == {'content'}
    def test_word_on_box_border_excluded(self):
        """A word exactly on the box boundary should be excluded."""
        words = [
            _word("content", 50, 50, 80, 20),
            _word("edge", 100, 200, 40, 20),  # left edge = box.x, center inside
        ]
        box = _box(100, 200, 200, 100)
        cells, cols = build_grid_from_words(words, 600, 400, box_rects=[box])
        texts = {c['text'] for c in cells}
        assert 'edge' not in texts
    def test_columns_not_affected_by_box_words(self):
        """Box words should not create extra columns via X-gap analysis."""
        # Two columns of content words, plus a word in a box at a different X
        words = [
            _word("col1_a", 50, 50, 80, 20),
            _word("col1_b", 50, 100, 80, 20),
            _word("col2_a", 300, 50, 80, 20),
            _word("col2_b", 300, 100, 80, 20),
            # This box word is at X=500, would create a 3rd column if not filtered
            _word("box_far", 500, 250, 80, 20),
        ]
        box = _box(450, 200, 200, 150)
        cells, cols = build_grid_from_words(words, 700, 500, box_rects=[box])
        # Should only have 2 columns (not 3)
        assert len(cols) <= 2
# ---------------------------------------------------------------------------
# Tests: _cluster_columns with box-filtered words
# ---------------------------------------------------------------------------
class TestClusterColumnsFiltering:
    """Column clustering behaves correctly on pre-filtered (box-free) words."""

    def test_gap_detection_without_box_words(self):
        """Two well-separated X clusters must yield exactly two columns."""
        # Two words per cluster, stacked vertically at x=50 and x=300.
        layout = [("a", 50, 50), ("b", 50, 100), ("c", 300, 50), ("d", 300, 100)]
        sample = [_word(txt, x, y, 30, 20) for txt, x, y in layout]
        columns = _cluster_columns(sample, 600)
        assert len(columns) == 2

    def test_single_column_when_words_close(self):
        """Words with nearly identical X positions collapse into one column."""
        layout = [("a", 50, 50), ("b", 60, 100), ("c", 55, 150)]
        sample = [_word(txt, x, y, 80, 20) for txt, x, y in layout]
        columns = _cluster_columns(sample, 600)
        assert len(columns) == 1
# ---------------------------------------------------------------------------
# Tests: inline marker guard (bullet points)
# ---------------------------------------------------------------------------
class TestInlineMarkerGuard:
    """Bullet points / numbering should NOT be split into sub-columns."""

    def test_concept_bullet_vs_page_ref(self):
        """Demonstrate the gap difference between bullets and page refs.

        Bullet points have small gap to main text (~5-10px).
        Page references have large gap (~50+ px).
        """
        def gap(marker_left, marker_width, text_left):
            # Horizontal whitespace between marker's right edge and the text.
            return text_left - (marker_left + marker_width)

        # Bullet scenario: "1." at left=50 (20px wide), main text at left=65
        # → gap is -5 (overlapping or touching → no split).
        assert gap(50, 20, 65) < 20  # very small gap
        # Page ref scenario: "p.55" at left=20 (40px wide), main text at left=120
        # → gap is 60 (clear separation → split).
        assert gap(20, 40, 120) > 30  # clear gap

View File

@@ -0,0 +1,320 @@
"""
Tests for cv_graphic_detect.py — graphic element detection.
Lizenz: Apache 2.0
"""
import numpy as np
import pytest
import cv2
from cv_graphic_detect import detect_graphic_elements, GraphicElement, _dominant_color
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _white_image(width: int = 1200, height: int = 1800) -> np.ndarray:
"""Create a plain white BGR image."""
return np.ones((height, width, 3), dtype=np.uint8) * 255
def _draw_colored_circle(img: np.ndarray, cx: int, cy: int, radius: int,
                         color_bgr: tuple) -> np.ndarray:
    """Draw a filled colored circle (simulates a balloon / graphic)."""
    center = (cx, cy)
    filled = -1  # negative thickness → cv2 fills the shape
    cv2.circle(img, center, radius, color_bgr, filled)
    return img
def _draw_colored_region(img: np.ndarray, x: int, y: int, w: int, h: int,
                         color_bgr: tuple) -> np.ndarray:
    """Draw a filled colored rectangle (simulates an image region)."""
    top_left = (x, y)
    bottom_right = (x + w, y + h)
    cv2.rectangle(img, top_left, bottom_right, color_bgr, -1)
    return img
def _draw_black_illustration(img: np.ndarray, x: int, y: int, w: int, h: int) -> np.ndarray:
    """Draw a large black filled shape (simulates a black-ink illustration)."""
    black = (0, 0, 0)
    cv2.rectangle(img, (x, y), (x + w, y + h), black, thickness=-1)
    return img
def _word_box(left: int, top: int, width: int, height: int) -> dict:
"""Create a word box dict matching OCR output format."""
return {"left": left, "top": top, "width": width, "height": height}
# ---------------------------------------------------------------------------
# _dominant_color tests
# ---------------------------------------------------------------------------
class TestDominantColor:
    """Tests for the _dominant_color helper.

    Each test feeds a uniform block of HSV pixels and checks the
    (color_name, hex_value) pair the helper returns. Fix: the yellow,
    orange and purple tests bound ``hex_val`` without using it — the
    unused binding is replaced by ``_``.
    """
    def test_empty_array(self):
        """An empty pixel array must fall back to black / #000000."""
        hsv = np.array([], dtype=np.uint8).reshape(0, 3)
        name, hex_val = _dominant_color(hsv)
        assert name == "black"
        assert hex_val == "#000000"
    def test_low_saturation_returns_black(self):
        """Pixels with low saturation should be classified as black."""
        # HSV: H=90 (irrelevant), S=10 (low), V=200
        hsv = np.full((50, 50, 3), [90, 10, 200], dtype=np.uint8)
        name, _ = _dominant_color(hsv)
        assert name == "black"
    def test_red_hue(self):
        """Pixels with hue ~0-10 or ~170+ should be red."""
        hsv = np.full((50, 50, 3), [5, 200, 200], dtype=np.uint8)
        name, hex_val = _dominant_color(hsv)
        assert name == "red"
        assert hex_val == "#dc2626"
    def test_blue_hue(self):
        """Pixels with hue ~100 should be blue."""
        hsv = np.full((50, 50, 3), [110, 200, 200], dtype=np.uint8)
        name, hex_val = _dominant_color(hsv)
        assert name == "blue"
        assert hex_val == "#2563eb"
    def test_green_hue(self):
        """Pixels with hue ~60 should be green."""
        hsv = np.full((50, 50, 3), [60, 200, 200], dtype=np.uint8)
        name, hex_val = _dominant_color(hsv)
        assert name == "green"
        assert hex_val == "#16a34a"
    def test_yellow_hue(self):
        """Pixels with hue ~30 should be yellow."""
        hsv = np.full((50, 50, 3), [30, 200, 200], dtype=np.uint8)
        name, _ = _dominant_color(hsv)
        assert name == "yellow"
    def test_orange_hue(self):
        """Pixels with hue ~15 should be orange."""
        hsv = np.full((50, 50, 3), [15, 200, 200], dtype=np.uint8)
        name, _ = _dominant_color(hsv)
        assert name == "orange"
    def test_purple_hue(self):
        """Pixels with hue ~140 should be purple."""
        hsv = np.full((50, 50, 3), [140, 200, 200], dtype=np.uint8)
        name, _ = _dominant_color(hsv)
        assert name == "purple"
# ---------------------------------------------------------------------------
# detect_graphic_elements tests
# ---------------------------------------------------------------------------
class TestDetectGraphicElements:
    """Tests for the detect_graphic_elements() function."""
    # Color tuples below are BGR (OpenCV convention): (0, 0, 220) draws red,
    # (0, 200, 0) green, (220, 0, 0) blue — matched by the color_name asserts.
    def test_none_image_returns_empty(self):
        """None input should return empty list."""
        result = detect_graphic_elements(None, [])
        assert result == []
    def test_white_image_no_graphics(self):
        """A plain white image should produce no graphic elements."""
        img = _white_image()
        result = detect_graphic_elements(img, [])
        assert result == []
    def test_colored_region_detected_as_image(self):
        """A large colored rectangle should be detected as an image."""
        img = _white_image()
        # Draw a large red region (not text-like)
        _draw_colored_region(img, x=100, y=300, w=200, h=200, color_bgr=(0, 0, 220))
        result = detect_graphic_elements(img, word_boxes=[])
        assert len(result) >= 1
        graphic = result[0]
        assert isinstance(graphic, GraphicElement)
        assert graphic.shape == "image"
        assert graphic.color_name == "red"
        assert graphic.confidence > 0
    def test_colored_text_excluded_by_word_overlap(self):
        """Colored regions that overlap heavily with word boxes should be skipped."""
        img = _white_image()
        # Draw colored region
        _draw_colored_region(img, x=100, y=300, w=400, h=50, color_bgr=(0, 0, 220))
        # Word boxes covering >50% of the colored region
        words = [
            _word_box(100, 300, 200, 50),
            _word_box(300, 300, 200, 50),
        ]
        result = detect_graphic_elements(img, word_boxes=words)
        # Should be filtered out (word overlap > 50%)
        for g in result:
            # If anything is detected at that location, overlap check failed
            if g.x >= 90 and g.x <= 110 and g.y >= 290 and g.y <= 310:
                pytest.fail("Colored text region should be excluded by word overlap")
    def test_colored_graphic_with_low_word_overlap_kept(self):
        """A colored region with low word overlap should be kept."""
        img = _white_image()
        # Draw a large colored circle
        _draw_colored_circle(img, cx=300, cy=400, radius=80, color_bgr=(0, 200, 0))
        # One small word box overlapping only a tiny portion
        words = [_word_box(250, 390, 30, 20)]
        result = detect_graphic_elements(img, word_boxes=words)
        assert len(result) >= 1
        assert result[0].shape == "image"
        assert result[0].color_name == "green"
    def test_black_illustration_detected(self):
        """A large black filled area should be detected as illustration."""
        img = _white_image()
        # Draw a large black rectangle (simulating an illustration)
        _draw_black_illustration(img, x=200, y=400, w=300, h=300)
        result = detect_graphic_elements(img, word_boxes=[])
        assert len(result) >= 1
        illust = [g for g in result if g.shape == "illustration"]
        assert len(illust) >= 1
        assert illust[0].color_name == "black"
    def test_black_illustration_excluded_by_word_boxes(self):
        """Black ink in word regions should NOT be detected as illustration."""
        img = _white_image()
        # Draw black text-like region
        _draw_black_illustration(img, x=100, y=300, w=400, h=60)
        # Word boxes covering the same area
        words = [
            _word_box(100, 300, 200, 60),
            _word_box(300, 300, 200, 60),
        ]
        result = detect_graphic_elements(img, word_boxes=words)
        # Should be empty — the word exclusion mask covers the ink
        illust = [g for g in result if g.shape == "illustration"]
        assert len(illust) == 0
    def test_tiny_colored_region_filtered(self):
        """Very small colored regions (<200 colored pixels) should be filtered."""
        img = _white_image()
        # Draw a tiny colored dot (5x5 pixels)
        _draw_colored_region(img, x=500, y=500, w=5, h=5, color_bgr=(220, 0, 0))
        result = detect_graphic_elements(img, word_boxes=[])
        assert result == []
    def test_page_spanning_region_filtered(self):
        """Colored regions spanning >50% of width/height should be skipped."""
        img = _white_image(width=1200, height=1800)
        # Draw a region wider than 50% of the image (700 / 1200 ≈ 58%)
        _draw_colored_region(img, x=50, y=300, w=700, h=100, color_bgr=(0, 0, 220))
        result = detect_graphic_elements(img, word_boxes=[])
        # Should be filtered as page-spanning
        assert result == []
    def test_multiple_graphics_detected(self):
        """Multiple separate colored regions should all be detected."""
        img = _white_image()
        # Three separate colored circles
        _draw_colored_circle(img, cx=200, cy=300, radius=60, color_bgr=(0, 0, 220))
        _draw_colored_circle(img, cx=500, cy=300, radius=60, color_bgr=(0, 200, 0))
        _draw_colored_circle(img, cx=200, cy=600, radius=60, color_bgr=(220, 0, 0))
        result = detect_graphic_elements(img, word_boxes=[])
        # Should detect at least 2 (some may merge if dilation connects them)
        assert len(result) >= 2
    def test_results_sorted_by_area_descending(self):
        """Results should be sorted by area, largest first."""
        img = _white_image()
        # Small circle
        _draw_colored_circle(img, cx=200, cy=300, radius=30, color_bgr=(0, 0, 220))
        # Large circle
        _draw_colored_circle(img, cx=600, cy=800, radius=100, color_bgr=(0, 200, 0))
        result = detect_graphic_elements(img, word_boxes=[])
        if len(result) >= 2:
            assert result[0].area >= result[1].area
    def test_max_elements_limit(self):
        """Should respect max_elements parameter."""
        img = _white_image(width=2000, height=2000)
        # Draw many colored regions
        for i in range(10):
            _draw_colored_circle(img, cx=100 + i * 180, cy=300, radius=40,
                                 color_bgr=(0, 0, 220))
        result = detect_graphic_elements(img, word_boxes=[], max_elements=3)
        assert len(result) <= 3
    def test_detected_boxes_excluded_from_ink(self):
        """Detected box regions should be excluded from ink illustration detection."""
        img = _white_image()
        # Draw a black rectangle well inside the "box" area (8px inset is used)
        _draw_black_illustration(img, x=120, y=320, w=360, h=160)
        # Mark the outer box — the 8px inset still covers the drawn region
        detected_boxes = [{"x": 100, "y": 300, "w": 400, "h": 200}]
        result = detect_graphic_elements(img, word_boxes=[], detected_boxes=detected_boxes)
        illust = [g for g in result if g.shape == "illustration"]
        assert len(illust) == 0
    def test_deduplication_overlapping_regions(self):
        """Overlapping elements should be deduplicated."""
        img = _white_image()
        # Two overlapping colored regions
        _draw_colored_region(img, x=200, y=300, w=200, h=200, color_bgr=(0, 0, 220))
        _draw_colored_region(img, x=250, y=350, w=200, h=200, color_bgr=(0, 0, 220))
        result = detect_graphic_elements(img, word_boxes=[])
        # Should be merged/deduplicated into 1 element (heavy dilation merges them)
        assert len(result) <= 2
    def test_graphicelement_dataclass_fields(self):
        """GraphicElement should have all expected fields."""
        elem = GraphicElement(
            x=10, y=20, width=100, height=80,
            area=5000, shape="image",
            color_name="red", color_hex="#dc2626",
            confidence=0.85,
        )
        assert elem.x == 10
        assert elem.y == 20
        assert elem.width == 100
        assert elem.height == 80
        assert elem.area == 5000
        assert elem.shape == "image"
        assert elem.color_name == "red"
        assert elem.color_hex == "#dc2626"
        assert elem.confidence == 0.85
        assert elem.contour is None
    def test_small_ink_area_filtered(self):
        """Black ink areas smaller than 5000px should be filtered."""
        img = _white_image()
        # Small black mark (50x50 = 2500 area, below 5000 threshold)
        _draw_black_illustration(img, x=500, y=500, w=50, h=50)
        result = detect_graphic_elements(img, word_boxes=[])
        illust = [g for g in result if g.shape == "illustration"]
        assert len(illust) == 0