Compare commits
12 Commits
08a91ba2be
...
52b66ebe07
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
52b66ebe07 | ||
|
|
424e5c51d4 | ||
|
|
12b4c61bac | ||
|
|
d9b2aa82e9 | ||
|
|
364086b86e | ||
|
|
fe754398c0 | ||
|
|
be86a7d14d | ||
|
|
19a5f69272 | ||
|
|
ea09fc75df | ||
|
|
410d36f3de | ||
|
|
72ce4420cb | ||
|
|
63dfb4d06f |
@@ -136,6 +136,7 @@ export default function OcrOverlayPage() {
|
||||
if (uiStep < 4) uiStep = 4
|
||||
} else if (dbStep >= 2) {
|
||||
skipIds.push('orientation')
|
||||
if (uiStep < 1) uiStep = 1 // advance past skipped orientation to deskew
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -382,13 +383,13 @@ export default function OcrOverlayPage() {
|
||||
if (mode === 'paddle-direct' || mode === 'kombi') {
|
||||
switch (currentStep) {
|
||||
case 0:
|
||||
return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} />
|
||||
return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} />
|
||||
case 1:
|
||||
return <StepDeskew sessionId={sessionId} onNext={handleNext} />
|
||||
return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} />
|
||||
case 2:
|
||||
return <StepDewarp sessionId={sessionId} onNext={handleNext} />
|
||||
return <StepDewarp key={sessionId} sessionId={sessionId} onNext={handleNext} />
|
||||
case 3:
|
||||
return <StepCrop sessionId={sessionId} onNext={handleNext} />
|
||||
return <StepCrop key={sessionId} sessionId={sessionId} onNext={handleNext} />
|
||||
case 4:
|
||||
if (mode === 'kombi') {
|
||||
return (
|
||||
@@ -420,13 +421,13 @@ export default function OcrOverlayPage() {
|
||||
}
|
||||
switch (currentStep) {
|
||||
case 0:
|
||||
return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} />
|
||||
return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} />
|
||||
case 1:
|
||||
return <StepDeskew sessionId={sessionId} onNext={handleNext} />
|
||||
return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} />
|
||||
case 2:
|
||||
return <StepDewarp sessionId={sessionId} onNext={handleNext} />
|
||||
return <StepDewarp key={sessionId} sessionId={sessionId} onNext={handleNext} />
|
||||
case 3:
|
||||
return <StepCrop sessionId={sessionId} onNext={handleNext} />
|
||||
return <StepCrop key={sessionId} sessionId={sessionId} onNext={handleNext} />
|
||||
case 4:
|
||||
return <StepRowDetection sessionId={sessionId} onNext={handleNext} />
|
||||
case 5:
|
||||
|
||||
@@ -108,6 +108,7 @@ export default function OcrPipelinePage() {
|
||||
} else if (dbStep >= 2) {
|
||||
// Page-split sub-session: parent orientation applied, skip only orientation
|
||||
if (!skipSteps.includes('orientation')) skipSteps.push('orientation')
|
||||
if (uiStep < 1) uiStep = 1 // advance past skipped orientation to deskew
|
||||
}
|
||||
// dbStep === 1: page-split from original image, needs full pipeline
|
||||
}
|
||||
@@ -397,13 +398,13 @@ export default function OcrPipelinePage() {
|
||||
const renderStep = () => {
|
||||
switch (currentStep) {
|
||||
case 0:
|
||||
return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} />
|
||||
return <StepOrientation key={sessionId} sessionId={sessionId} onNext={handleOrientationComplete} onSubSessionsCreated={handleBoxSessionsCreated} />
|
||||
case 1:
|
||||
return <StepDeskew sessionId={sessionId} onNext={handleNext} />
|
||||
return <StepDeskew key={sessionId} sessionId={sessionId} onNext={handleNext} />
|
||||
case 2:
|
||||
return <StepDewarp sessionId={sessionId} onNext={handleNext} />
|
||||
return <StepDewarp key={sessionId} sessionId={sessionId} onNext={handleNext} />
|
||||
case 3:
|
||||
return <StepCrop sessionId={sessionId} onNext={handleCropNext} />
|
||||
return <StepCrop key={sessionId} sessionId={sessionId} onNext={handleCropNext} />
|
||||
case 4:
|
||||
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} onBoxSessionsCreated={handleBoxSessionsCreated} />
|
||||
case 5:
|
||||
|
||||
@@ -17,13 +17,6 @@ export function StepCrop({ sessionId, onNext }: StepCropProps) {
|
||||
const [error, setError] = useState<string | null>(null)
|
||||
const [hasRun, setHasRun] = useState(false)
|
||||
|
||||
// Reset state when sessionId changes (e.g. switching sub-sessions)
|
||||
useEffect(() => {
|
||||
setCropResult(null)
|
||||
setHasRun(false)
|
||||
setError(null)
|
||||
}, [sessionId])
|
||||
|
||||
// Auto-trigger crop on mount
|
||||
useEffect(() => {
|
||||
if (!sessionId || hasRun) return
|
||||
|
||||
@@ -22,14 +22,6 @@ export function StepDeskew({ sessionId, onNext }: StepDeskewProps) {
|
||||
const [error, setError] = useState<string | null>(null)
|
||||
const [hasAutoRun, setHasAutoRun] = useState(false)
|
||||
|
||||
// Reset state when sessionId changes (e.g. switching sub-sessions)
|
||||
useEffect(() => {
|
||||
setSession(null)
|
||||
setDeskewResult(null)
|
||||
setHasAutoRun(false)
|
||||
setError(null)
|
||||
}, [sessionId])
|
||||
|
||||
// Load session and auto-trigger deskew
|
||||
useEffect(() => {
|
||||
if (!sessionId || session) return
|
||||
|
||||
@@ -20,13 +20,6 @@ export function StepDewarp({ sessionId, onNext }: StepDewarpProps) {
|
||||
const [showGrid, setShowGrid] = useState(true)
|
||||
const [error, setError] = useState<string | null>(null)
|
||||
|
||||
// Reset state when sessionId changes (e.g. switching sub-sessions)
|
||||
useEffect(() => {
|
||||
setDewarpResult(null)
|
||||
setDeskewResult(null)
|
||||
setError(null)
|
||||
}, [sessionId])
|
||||
|
||||
// Load session info to get deskew_result (for fine-tuning init values)
|
||||
useEffect(() => {
|
||||
if (!sessionId) return
|
||||
|
||||
@@ -383,7 +383,7 @@ export function StepGridReview({ sessionId, onNext, saveRef }: StepGridReviewPro
|
||||
{group.map((zone) => (
|
||||
<div
|
||||
key={zone.zone_index}
|
||||
className={`${group.length > 1 ? 'flex-1 min-w-0' : ''} bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700 overflow-hidden`}
|
||||
className={`${group.length > 1 ? 'flex-1 min-w-0' : ''} bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700`}
|
||||
>
|
||||
<GridTable
|
||||
zone={zone}
|
||||
|
||||
@@ -30,14 +30,6 @@ export function StepOrientation({ sessionId: existingSessionId, onNext, onSubSes
|
||||
const [dragOver, setDragOver] = useState(false)
|
||||
const [sessionName, setSessionName] = useState('')
|
||||
|
||||
// Reset state when sessionId changes
|
||||
useEffect(() => {
|
||||
setSession(null)
|
||||
setOrientationResult(null)
|
||||
setPageSplitResult(null)
|
||||
setError(null)
|
||||
}, [existingSessionId])
|
||||
|
||||
// Reload session data when navigating back
|
||||
useEffect(() => {
|
||||
if (!existingSessionId || session) return
|
||||
|
||||
@@ -481,8 +481,9 @@ _CHAR_CONFUSION_RULES = [
|
||||
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
|
||||
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
|
||||
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
|
||||
# "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
|
||||
(re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
|
||||
# "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
|
||||
# and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
|
||||
(re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
|
||||
]
|
||||
|
||||
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
|
||||
|
||||
155
klausur-service/backend/cv_syllable_detect.py
Normal file
155
klausur-service/backend/cv_syllable_detect.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
CV-based syllable divider detection and insertion for dictionary pages.
|
||||
|
||||
Two-step approach:
|
||||
1. CV: morphological vertical line detection checks if a word_box image
|
||||
contains thin, isolated pipe-like vertical lines (syllable dividers).
|
||||
2. pyphen: inserts syllable breaks at linguistically correct positions
|
||||
for words where CV confirmed the presence of dividers.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _word_has_pipe_lines(img_gray: np.ndarray, wb: Dict) -> bool:
|
||||
"""CV check: does this word_box image show thin vertical pipe dividers?
|
||||
|
||||
Uses morphological opening with a tall thin kernel to isolate vertical
|
||||
structures, then filters for thin (≤4px), isolated contours that are
|
||||
NOT at the word edges (those would be l, I, 1 etc.).
|
||||
"""
|
||||
x = wb.get("left", 0)
|
||||
y = wb.get("top", 0)
|
||||
w = wb.get("width", 0)
|
||||
h = wb.get("height", 0)
|
||||
if w < 30 or h < 12:
|
||||
return False
|
||||
ih, iw = img_gray.shape[:2]
|
||||
y1, y2 = max(0, y), min(ih, y + h)
|
||||
x1, x2 = max(0, x), min(iw, x + w)
|
||||
roi = img_gray[y1:y2, x1:x2]
|
||||
if roi.size == 0:
|
||||
return False
|
||||
rh, rw = roi.shape
|
||||
|
||||
# Binarize (ink = white on black background)
|
||||
_, binary = cv2.threshold(
|
||||
roi, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
|
||||
)
|
||||
|
||||
# Morphological opening: keep only tall vertical structures (≥55% height)
|
||||
kern_h = max(int(rh * 0.55), 8)
|
||||
kernel = np.ones((kern_h, 1), np.uint8)
|
||||
vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
|
||||
|
||||
# Find surviving contours
|
||||
contours, _ = cv2.findContours(
|
||||
vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
||||
)
|
||||
|
||||
margin = max(int(rw * 0.08), 3)
|
||||
for cnt in contours:
|
||||
cx, cy, cw, ch = cv2.boundingRect(cnt)
|
||||
if cw > 4:
|
||||
continue # too wide for a pipe
|
||||
if cx < margin or cx + cw > rw - margin:
|
||||
continue # at word edge — likely l, I, 1
|
||||
# Check isolation: adjacent columns should be mostly empty (ink-free)
|
||||
left_zone = binary[cy:cy + ch, max(0, cx - 3):cx]
|
||||
right_zone = binary[cy:cy + ch, cx + cw:min(rw, cx + cw + 3)]
|
||||
left_ink = np.mean(left_zone) if left_zone.size else 255
|
||||
right_ink = np.mean(right_zone) if right_zone.size else 255
|
||||
if left_ink < 80 and right_ink < 80:
|
||||
return True # isolated thin vertical line = pipe divider
|
||||
return False
|
||||
|
||||
|
||||
# IPA/phonetic bracket pattern — don't hyphenate transcriptions
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')


def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
) -> int:
    """Insert pipe syllable dividers into dictionary cells where CV confirms them.

    For each cell on a dictionary page:
      1. Check if ANY word_box has CV-detected pipe lines
      2. If yes, apply pyphen to EACH word (>= 4 chars) in the cell
      3. Try DE hyphenation first, then EN

    Returns the number of cells modified.
    """
    try:
        import pyphen
    except ImportError:
        # pyphen is optional; without it this step is a no-op.
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

    hyphenator_de = pyphen.Pyphen(lang='de_DE')
    hyphenator_en = pyphen.Pyphen(lang='en_US')
    img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    modified_cells = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            col_type = cell.get("col_type", "")
            # Only dictionary content columns are eligible.
            if not col_type.startswith("column_"):
                continue
            text = cell.get("text", "")
            # Skip empty cells and cells that already carry a divider.
            if not text or "|" in text:
                continue
            # Phonetic transcriptions must not be hyphenated.
            if _IPA_RE.search(text):
                continue

            # CV gate: only touch cells where at least one word box shows pipes.
            word_boxes = cell.get("word_boxes") or []
            if not any(_word_has_pipe_lines(img_gray, wb) for wb in word_boxes):
                continue

            # Re-tokenize, hyphenating each significant word in the cell.
            rebuilt = []
            any_change = False
            for token in re.split(r'(\s+|[,;]+\s*)', text):
                # Pass separators (whitespace / punctuation runs) through untouched.
                if re.match(r'^[\s,;]+$', token):
                    rebuilt.append(token)
                    continue
                # Only hyphenate tokens with >= 4 alphabetic characters.
                stripped = re.sub(r'[().\-]', '', token)
                if len(stripped) < 4 or not re.search(r'[a-zA-ZäöüÄÖÜß]', stripped):
                    rebuilt.append(token)
                    continue
                # German hyphenation first; fall back to English.
                hyphenated = hyphenator_de.inserted(token, hyphen='|')
                if '|' not in hyphenated:
                    hyphenated = hyphenator_en.inserted(token, hyphen='|')
                if '|' in hyphenated and hyphenated != token:
                    rebuilt.append(hyphenated)
                    any_change = True
                else:
                    rebuilt.append(token)

            if any_change:
                cell["text"] = ''.join(rebuilt)
                modified_cells += 1

    if modified_cells:
        logger.info(
            "build-grid session %s: inserted syllable dividers in %d cells "
            "(CV-validated)",
            session_id, modified_cells,
        )
    return modified_cells
|
||||
File diff suppressed because it is too large
Load Diff
1390
klausur-service/backend/grid_editor_helpers.py
Normal file
1390
klausur-service/backend/grid_editor_helpers.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -258,9 +258,17 @@ async def mark_ground_truth(
|
||||
gt["build_grid_reference"] = reference
|
||||
await update_session_db(session_id, ground_truth=gt, current_step=11)
|
||||
|
||||
# Compare with auto-snapshot if available (shows what the user corrected)
|
||||
auto_snapshot = gt.get("auto_grid_snapshot")
|
||||
correction_diff = None
|
||||
if auto_snapshot:
|
||||
correction_diff = compare_grids(auto_snapshot, reference)
|
||||
|
||||
logger.info(
|
||||
"Ground truth marked for session %s: %d cells",
|
||||
session_id, len(reference["cells"]),
|
||||
"Ground truth marked for session %s: %d cells (corrections: %s)",
|
||||
session_id,
|
||||
len(reference["cells"]),
|
||||
correction_diff["summary"] if correction_diff else "no auto-snapshot",
|
||||
)
|
||||
|
||||
return {
|
||||
@@ -268,6 +276,7 @@ async def mark_ground_truth(
|
||||
"session_id": session_id,
|
||||
"cells_saved": len(reference["cells"]),
|
||||
"summary": reference["summary"],
|
||||
"correction_diff": correction_diff,
|
||||
}
|
||||
|
||||
|
||||
@@ -289,6 +298,68 @@ async def unmark_ground_truth(session_id: str):
|
||||
return {"status": "ok", "session_id": session_id}
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/correction-diff")
async def get_correction_diff(session_id: str):
    """Compare automatic OCR grid with manually corrected ground truth.

    Returns a diff showing exactly which cells the user corrected,
    broken down by col_type (english, german, ipa, etc.).

    Raises:
        HTTPException(404): if the session does not exist, if it has no
            auto_grid_snapshot, or if no ground-truth reference was marked.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    gt = session.get("ground_truth") or {}
    auto_snapshot = gt.get("auto_grid_snapshot")
    reference = gt.get("build_grid_reference")

    if not auto_snapshot:
        raise HTTPException(
            status_code=404,
            detail="No auto_grid_snapshot found. Re-run build-grid to create one.",
        )
    if not reference:
        raise HTTPException(
            status_code=404,
            detail="No ground truth reference found. Mark as ground truth first.",
        )

    diff = compare_grids(auto_snapshot, reference)

    # Enrich with per-col_type breakdown.
    # Index reference cells by cell_id once instead of scanning the whole
    # cell list for every diff entry (was accidentally O(diffs x cells)).
    ref_cells = reference.get("cells", [])
    cells_by_id = {c["cell_id"]: c for c in ref_cells}

    # Inner dict holds counts plus a float accuracy_pct, hence float values.
    col_type_stats: Dict[str, Dict[str, float]] = {}

    # Count corrected cells per col_type (only genuine text changes).
    for cell_diff in diff.get("cell_diffs", []):
        if cell_diff["type"] != "text_change":
            continue
        ref_cell = cells_by_id.get(cell_diff["cell_id"])
        ct = ref_cell.get("col_type", "unknown") if ref_cell else "unknown"
        stats = col_type_stats.setdefault(ct, {"total": 0, "corrected": 0})
        stats["corrected"] += 1

    # Count total cells per col_type from the reference grid.
    for cell in ref_cells:
        ct = cell.get("col_type", "unknown")
        stats = col_type_stats.setdefault(ct, {"total": 0, "corrected": 0})
        stats["total"] += 1

    # Derive per-col_type accuracy; a col_type absent from the reference
    # (total == 0) reports 100.0, matching the previous behavior.
    for stats in col_type_stats.values():
        total = stats["total"]
        corrected = stats["corrected"]
        stats["accuracy_pct"] = round((total - corrected) / total * 100, 1) if total > 0 else 100.0

    diff["col_type_breakdown"] = col_type_stats

    return diff
|
||||
|
||||
|
||||
@router.get("/ground-truth-sessions")
|
||||
async def list_ground_truth_sessions():
|
||||
"""List all sessions that have a ground-truth reference."""
|
||||
|
||||
@@ -38,6 +38,9 @@ eng-to-ipa
|
||||
# Spell-checker for rule-based OCR correction (MIT license)
|
||||
pyspellchecker>=0.8.1
|
||||
|
||||
# Syllable hyphenation for dictionary pipe-divider insertion (MIT license)
|
||||
pyphen>=0.16.0
|
||||
|
||||
# PostgreSQL (for metrics storage)
|
||||
psycopg2-binary>=2.9.0
|
||||
asyncpg>=0.29.0
|
||||
|
||||
Reference in New Issue
Block a user