feat(ocr): Word-based image deskew for Ground Truth pipeline
Begradigt schiefe Scans vor der OCR-Extraktion anhand der linksbuendigen
Wortanfaenge der Vokabelspalte. Tesseract liefert achsenparallele Boxen,
die bei ~2-3 Grad Schraege in Nachbarzeilen bluten — der Deskew behebt das.
- Neue Funktion deskew_image_by_word_alignment() in cv_vocab_pipeline.py
- Deskew-Integration im extract-with-boxes Endpoint (vor OCR)
- Neuer GET Endpoint /deskewed-image/{page} fuer begradigtes Seitenbild
- Frontend: GroundTruthPanel wechselt nach Extraktion auf deskewed Image
- ~1s Overhead durch schnellen Tesseract-Pass auf halbiertem Bild
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -111,6 +111,9 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
const [imageNatural, setImageNatural] = useState({ w: 0, h: 0 })
|
const [imageNatural, setImageNatural] = useState({ w: 0, h: 0 })
|
||||||
const [showSummary, setShowSummary] = useState(false)
|
const [showSummary, setShowSummary] = useState(false)
|
||||||
const [savedMessage, setSavedMessage] = useState<string | null>(null)
|
const [savedMessage, setSavedMessage] = useState<string | null>(null)
|
||||||
|
const [isFullscreen, setIsFullscreen] = useState(false)
|
||||||
|
const [imageUrl, setImageUrl] = useState(pageImageUrl)
|
||||||
|
const [deskewAngle, setDeskewAngle] = useState<number | null>(null)
|
||||||
|
|
||||||
// Editable fields for current entry
|
// Editable fields for current entry
|
||||||
const [editEn, setEditEn] = useState('')
|
const [editEn, setEditEn] = useState('')
|
||||||
@@ -120,13 +123,19 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
const panelRef = useRef<HTMLDivElement>(null)
|
const panelRef = useRef<HTMLDivElement>(null)
|
||||||
const enInputRef = useRef<HTMLInputElement>(null)
|
const enInputRef = useRef<HTMLInputElement>(null)
|
||||||
|
|
||||||
|
// Reset image URL when page changes
|
||||||
|
useEffect(() => {
|
||||||
|
setImageUrl(pageImageUrl)
|
||||||
|
setDeskewAngle(null)
|
||||||
|
}, [pageImageUrl])
|
||||||
|
|
||||||
// Load natural image dimensions
|
// Load natural image dimensions
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!pageImageUrl) return
|
if (!imageUrl) return
|
||||||
const img = new Image()
|
const img = new Image()
|
||||||
img.onload = () => setImageNatural({ w: img.naturalWidth, h: img.naturalHeight })
|
img.onload = () => setImageNatural({ w: img.naturalWidth, h: img.naturalHeight })
|
||||||
img.src = pageImageUrl
|
img.src = imageUrl
|
||||||
}, [pageImageUrl])
|
}, [imageUrl])
|
||||||
|
|
||||||
// Sync edit fields when current entry changes
|
// Sync edit fields when current entry changes
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
@@ -157,6 +166,12 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
const loaded: GTEntry[] = (data.entries || []).map((e: GTEntry) => ({ ...e, status: 'pending' as const }))
|
const loaded: GTEntry[] = (data.entries || []).map((e: GTEntry) => ({ ...e, status: 'pending' as const }))
|
||||||
setEntries(loaded)
|
setEntries(loaded)
|
||||||
setCurrentIndex(0)
|
setCurrentIndex(0)
|
||||||
|
|
||||||
|
// Switch to deskewed image if available
|
||||||
|
if (data.deskewed) {
|
||||||
|
setImageUrl(`${KLAUSUR_API}/api/v1/vocab/sessions/${sessionId}/deskewed-image/${selectedPage}`)
|
||||||
|
setDeskewAngle(data.deskew_angle)
|
||||||
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
setError(err instanceof Error ? err.message : 'Extraction failed')
|
setError(err instanceof Error ? err.message : 'Extraction failed')
|
||||||
} finally {
|
} finally {
|
||||||
@@ -225,9 +240,15 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
// ---------- Keyboard shortcuts ----------
|
// ---------- Keyboard shortcuts ----------
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
|
const handler = (e: KeyboardEvent) => {
|
||||||
|
if (e.key === 'Escape' && isFullscreen) {
|
||||||
|
e.preventDefault()
|
||||||
|
setIsFullscreen(false)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
if (entries.length === 0 || showSummary) return
|
if (entries.length === 0 || showSummary) return
|
||||||
|
|
||||||
const handler = (e: KeyboardEvent) => {
|
|
||||||
// Don't capture when typing in inputs
|
// Don't capture when typing in inputs
|
||||||
const tag = (e.target as HTMLElement)?.tagName
|
const tag = (e.target as HTMLElement)?.tagName
|
||||||
const isInput = tag === 'INPUT' || tag === 'TEXTAREA'
|
const isInput = tag === 'INPUT' || tag === 'TEXTAREA'
|
||||||
@@ -251,7 +272,7 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
|
|
||||||
window.addEventListener('keydown', handler)
|
window.addEventListener('keydown', handler)
|
||||||
return () => window.removeEventListener('keydown', handler)
|
return () => window.removeEventListener('keydown', handler)
|
||||||
}, [entries.length, showSummary, confirmEntry, skipEntry, goTo, currentIndex])
|
}, [entries.length, showSummary, isFullscreen, confirmEntry, skipEntry, goTo, currentIndex])
|
||||||
|
|
||||||
// ---------- Computed ----------
|
// ---------- Computed ----------
|
||||||
|
|
||||||
@@ -298,8 +319,27 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
|
|
||||||
if (showSummary) {
|
if (showSummary) {
|
||||||
return (
|
return (
|
||||||
<div className="bg-white rounded-xl border border-slate-200 p-6" ref={panelRef}>
|
<div className={`bg-white rounded-xl border border-slate-200 p-6 ${
|
||||||
<h3 className="text-lg font-semibold text-slate-900 mb-4">Zusammenfassung</h3>
|
isFullscreen ? 'fixed inset-0 z-50 overflow-auto m-0 rounded-none' : ''
|
||||||
|
}`} ref={panelRef}>
|
||||||
|
<div className="flex items-center justify-between mb-4">
|
||||||
|
<h3 className="text-lg font-semibold text-slate-900">Zusammenfassung</h3>
|
||||||
|
<button
|
||||||
|
onClick={() => setIsFullscreen(!isFullscreen)}
|
||||||
|
className="p-1.5 rounded-lg hover:bg-slate-100 text-slate-500 hover:text-slate-700 transition-colors"
|
||||||
|
title={isFullscreen ? 'Vollbild verlassen (Esc)' : 'Vollbild'}
|
||||||
|
>
|
||||||
|
{isFullscreen ? (
|
||||||
|
<svg className="w-4 h-4" fill="none" viewBox="0 0 24 24" strokeWidth={2} stroke="currentColor">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" d="M9 9V4.5M9 9H4.5M9 9 3.75 3.75M9 15v4.5M9 15H4.5M9 15l-5.25 5.25M15 9h4.5M15 9V4.5M15 9l5.25-5.25M15 15h4.5M15 15v4.5m0-4.5 5.25 5.25" />
|
||||||
|
</svg>
|
||||||
|
) : (
|
||||||
|
<svg className="w-4 h-4" fill="none" viewBox="0 0 24 24" strokeWidth={2} stroke="currentColor">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" d="M3.75 3.75v4.5m0-4.5h4.5m-4.5 0L9 9M3.75 20.25v-4.5m0 4.5h4.5m-4.5 0L9 15M20.25 3.75h-4.5m4.5 0v4.5m0-4.5L15 9m5.25 11.25h-4.5m4.5 0v-4.5m0 4.5L15 15" />
|
||||||
|
</svg>
|
||||||
|
)}
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
<div className="grid grid-cols-3 gap-4 mb-6">
|
<div className="grid grid-cols-3 gap-4 mb-6">
|
||||||
<div className="bg-green-50 border border-green-200 rounded-lg p-4 text-center">
|
<div className="bg-green-50 border border-green-200 rounded-lg p-4 text-center">
|
||||||
<div className="text-2xl font-bold text-green-700">{confirmedCount}</div>
|
<div className="text-2xl font-bold text-green-700">{confirmedCount}</div>
|
||||||
@@ -385,22 +425,47 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
// ---------- Render: Main Review UI ----------
|
// ---------- Render: Main Review UI ----------
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="bg-white rounded-xl border border-slate-200 overflow-hidden" ref={panelRef}>
|
<div className={`bg-white rounded-xl border border-slate-200 overflow-hidden ${
|
||||||
{/* Progress bar */}
|
isFullscreen ? 'fixed inset-0 z-50 overflow-auto m-0 rounded-none bg-white' : ''
|
||||||
<div className="h-1.5 bg-slate-100">
|
}`} ref={panelRef}>
|
||||||
|
{/* Header with progress + fullscreen toggle */}
|
||||||
|
<div className="flex items-center gap-2 px-4 pt-2">
|
||||||
|
<div className="flex-1 h-1.5 bg-slate-100 rounded-full">
|
||||||
<div
|
<div
|
||||||
className="h-full bg-teal-500 transition-all duration-300"
|
className="h-full bg-teal-500 transition-all duration-300 rounded-full"
|
||||||
style={{ width: `${progress}%` }}
|
style={{ width: `${progress}%` }}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
<span className="text-xs text-slate-400 whitespace-nowrap">{currentIndex + 1}/{entries.length}</span>
|
||||||
|
{deskewAngle !== null && (
|
||||||
|
<span className="text-xs text-teal-600 whitespace-nowrap" title="Bild wurde begradigt">
|
||||||
|
{deskewAngle.toFixed(1)}°
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
<button
|
||||||
|
onClick={() => setIsFullscreen(!isFullscreen)}
|
||||||
|
className="p-1.5 rounded-lg hover:bg-slate-100 text-slate-500 hover:text-slate-700 transition-colors"
|
||||||
|
title={isFullscreen ? 'Vollbild verlassen (Esc)' : 'Vollbild'}
|
||||||
|
>
|
||||||
|
{isFullscreen ? (
|
||||||
|
<svg className="w-4 h-4" fill="none" viewBox="0 0 24 24" strokeWidth={2} stroke="currentColor">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" d="M9 9V4.5M9 9H4.5M9 9 3.75 3.75M9 15v4.5M9 15H4.5M9 15l-5.25 5.25M15 9h4.5M15 9V4.5M15 9l5.25-5.25M15 15h4.5M15 15v4.5m0-4.5 5.25 5.25" />
|
||||||
|
</svg>
|
||||||
|
) : (
|
||||||
|
<svg className="w-4 h-4" fill="none" viewBox="0 0 24 24" strokeWidth={2} stroke="currentColor">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" d="M3.75 3.75v4.5m0-4.5h4.5m-4.5 0L9 9M3.75 20.25v-4.5m0 4.5h4.5m-4.5 0L9 15M20.25 3.75h-4.5m4.5 0v4.5m0-4.5L15 9m5.25 11.25h-4.5m4.5 0v-4.5m0 4.5L15 15" />
|
||||||
|
</svg>
|
||||||
|
)}
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div className="flex flex-col lg:flex-row">
|
<div className={`flex flex-col ${isFullscreen ? 'lg:flex-row h-[calc(100vh-3rem)]' : 'lg:flex-row'}`}>
|
||||||
{/* Left: Page image with SVG overlay (2/3) */}
|
{/* Left: Page image with SVG overlay (2/3) */}
|
||||||
<div className="lg:w-2/3 p-4">
|
<div className={`${isFullscreen ? 'lg:w-2/3 p-4 overflow-y-auto h-full' : 'lg:w-2/3 p-4'}`}>
|
||||||
<div className="relative bg-slate-50 rounded-lg overflow-hidden">
|
<div className="relative bg-slate-50 rounded-lg overflow-hidden">
|
||||||
{pageImageUrl && (
|
{imageUrl && (
|
||||||
<img
|
<img
|
||||||
src={pageImageUrl}
|
src={imageUrl}
|
||||||
alt={`Seite ${selectedPage + 1}`}
|
alt={`Seite ${selectedPage + 1}`}
|
||||||
className="w-full"
|
className="w-full"
|
||||||
draggable={false}
|
draggable={false}
|
||||||
@@ -451,13 +516,13 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* Right: Crops + Edit fields (1/3) */}
|
{/* Right: Crops + Edit fields (1/3) */}
|
||||||
<div className="lg:w-1/3 border-l border-slate-200 p-4 space-y-4">
|
<div className={`lg:w-1/3 border-l border-slate-200 p-4 space-y-4 ${isFullscreen ? 'overflow-y-auto h-full' : ''}`}>
|
||||||
{currentEntry && (
|
{currentEntry && (
|
||||||
<>
|
<>
|
||||||
{/* Row crop */}
|
{/* Row crop */}
|
||||||
{imageNatural.w > 0 && (
|
{imageNatural.w > 0 && (
|
||||||
<ImageCrop
|
<ImageCrop
|
||||||
imageUrl={pageImageUrl}
|
imageUrl={imageUrl}
|
||||||
bbox={currentEntry.bbox}
|
bbox={currentEntry.bbox}
|
||||||
naturalWidth={imageNatural.w}
|
naturalWidth={imageNatural.w}
|
||||||
naturalHeight={imageNatural.h}
|
naturalHeight={imageNatural.h}
|
||||||
@@ -470,7 +535,7 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
<div className="grid grid-cols-3 gap-2">
|
<div className="grid grid-cols-3 gap-2">
|
||||||
{currentEntry.bbox_en.w > 0 && (
|
{currentEntry.bbox_en.w > 0 && (
|
||||||
<ImageCrop
|
<ImageCrop
|
||||||
imageUrl={pageImageUrl}
|
imageUrl={imageUrl}
|
||||||
bbox={currentEntry.bbox_en}
|
bbox={currentEntry.bbox_en}
|
||||||
naturalWidth={imageNatural.w}
|
naturalWidth={imageNatural.w}
|
||||||
naturalHeight={imageNatural.h}
|
naturalHeight={imageNatural.h}
|
||||||
@@ -480,7 +545,7 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
)}
|
)}
|
||||||
{currentEntry.bbox_de.w > 0 && (
|
{currentEntry.bbox_de.w > 0 && (
|
||||||
<ImageCrop
|
<ImageCrop
|
||||||
imageUrl={pageImageUrl}
|
imageUrl={imageUrl}
|
||||||
bbox={currentEntry.bbox_de}
|
bbox={currentEntry.bbox_de}
|
||||||
naturalWidth={imageNatural.w}
|
naturalWidth={imageNatural.w}
|
||||||
naturalHeight={imageNatural.h}
|
naturalHeight={imageNatural.h}
|
||||||
@@ -490,7 +555,7 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
)}
|
)}
|
||||||
{currentEntry.bbox_ex.w > 0 && (
|
{currentEntry.bbox_ex.w > 0 && (
|
||||||
<ImageCrop
|
<ImageCrop
|
||||||
imageUrl={pageImageUrl}
|
imageUrl={imageUrl}
|
||||||
bbox={currentEntry.bbox_ex}
|
bbox={currentEntry.bbox_ex}
|
||||||
naturalWidth={imageNatural.w}
|
naturalWidth={imageNatural.w}
|
||||||
naturalHeight={imageNatural.h}
|
naturalHeight={imageNatural.h}
|
||||||
@@ -590,7 +655,7 @@ export function GroundTruthPanel({ sessionId, selectedPage, pageImageUrl }: Grou
|
|||||||
|
|
||||||
{/* Keyboard hints */}
|
{/* Keyboard hints */}
|
||||||
<div className="text-xs text-slate-400 text-center border-t border-slate-100 pt-2">
|
<div className="text-xs text-slate-400 text-center border-t border-slate-100 pt-2">
|
||||||
Enter = Bestaetigen · Tab = Ueberspringen · ←→ = Navigieren
|
Enter = Bestaetigen · Tab = Ueberspringen · ←→ = Navigieren{isFullscreen ? ' \u00B7 Esc = Vollbild verlassen' : ''}
|
||||||
</div>
|
</div>
|
||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
|
|||||||
@@ -193,6 +193,127 @@ def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
|
|||||||
return corrected, median_angle
|
return corrected, median_angle
|
||||||
|
|
||||||
|
|
||||||
|
def deskew_image_by_word_alignment(
    image_data: bytes,
    lang: str = "eng+deu",
    downscale_factor: float = 0.5,
) -> Tuple[bytes, float]:
    """Correct rotation by fitting a line through left-most word starts per text line.

    Runs a quick Tesseract pass on a downscaled copy to find word positions,
    takes the left-most word of every text line, keeps the points that lie
    close to the median left edge (the dominant left-aligned column), fits a
    straight line ``x = a*y + b`` through them and rotates the full-resolution
    image so that this line becomes vertical.

    Args:
        image_data: Raw encoded image bytes (anything ``cv2.imdecode`` accepts).
        lang: Tesseract language string for the quick pass.
        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).
            Must be positive and large enough to leave at least one pixel.

    Returns:
        Tuple of (image bytes, detected skew angle in degrees). The angle is
        ``arctan(dx/dy)`` of the fitted left edge, clamped to ±5°; it is
        positive when the left edge drifts to the right further down the page.
        When the image is rotated the bytes are PNG-encoded. In every skip or
        failure case (missing libraries, undecodable input, too few aligned
        lines, angle below 0.05°, encoding failure) the original ``image_data``
        is returned unchanged together with ``0.0``.
    """
    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
        return image_data, 0.0

    # cv2.imdecode rejects an empty buffer; treat it like any undecodable input.
    if not image_data:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return image_data, 0.0

    # 1. Decode image
    img_array = np.frombuffer(image_data, dtype=np.uint8)
    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if img is None:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return image_data, 0.0

    orig_h, orig_w = img.shape[:2]

    # 2. Downscale for fast Tesseract pass
    if downscale_factor <= 0:
        logger.warning(f"deskew_by_word_alignment: invalid downscale_factor {downscale_factor}, skipping")
        return image_data, 0.0
    small_w = int(orig_w * downscale_factor)
    small_h = int(orig_h * downscale_factor)
    if small_w < 1 or small_h < 1:
        logger.info("deskew_by_word_alignment: image too small after downscaling, skipping")
        return image_data, 0.0
    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)

    # 3. Quick Tesseract — word-level positions
    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
    try:
        data = pytesseract.image_to_data(
            pil_small, lang=lang, config="--psm 6 --oem 3",
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
        return image_data, 0.0

    # 4. Per text-line, find the left-most word start
    # Group word indices by (block_num, par_num, line_num)
    from collections import defaultdict
    line_groups = defaultdict(list)
    for i in range(len(data["text"])):
        text = (data["text"][i] or "").strip()
        if not text:
            continue
        # Depending on the Tesseract/pytesseract versions the confidence can
        # arrive as int, float or a float-formatted string ("96.000000");
        # int() alone would raise on the latter and abort the whole deskew.
        try:
            conf = int(float(data["conf"][i]))
        except (TypeError, ValueError):
            continue
        if conf < 20:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        line_groups[key].append(i)

    if len(line_groups) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
        return image_data, 0.0

    # For each line, pick the word with smallest 'left' → (left_x, center_y),
    # scaled back to original resolution.
    scale = 1.0 / downscale_factor
    points = []  # list of (x, y) in original-image coords
    for indices in line_groups.values():
        best_idx = min(indices, key=lambda idx: data["left"][idx])
        lx = data["left"][best_idx] * scale
        top = data["top"][best_idx] * scale
        h = data["height"][best_idx] * scale
        points.append((lx, top + h / 2.0))

    # 5. Find dominant left-edge column + compute angle
    xs = np.array([p[0] for p in points])
    ys = np.array([p[1] for p in points])
    median_x = float(np.median(xs))
    tolerance = orig_w * 0.03  # 3% of image width

    # Drop lines whose first word is not on the dominant left edge
    # (indented lines, headings, other columns).
    mask = np.abs(xs - median_x) <= tolerance
    filtered_xs = xs[mask]
    filtered_ys = ys[mask]

    if len(filtered_xs) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
        return image_data, 0.0

    # A line fit needs some vertical spread; identical y values are degenerate.
    if float(np.ptp(filtered_ys)) == 0.0:
        logger.info("deskew_by_word_alignment: aligned points have no vertical spread, skipping")
        return image_data, 0.0

    # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a)
    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
    slope = coeffs[0]  # dx/dy
    angle_rad = np.arctan(slope)
    angle_deg = float(np.degrees(angle_rad))

    # Clamp to ±5°
    angle_deg = max(-5.0, min(5.0, angle_deg))

    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}° from {len(filtered_xs)} points "
                f"(total lines: {len(line_groups)})")

    if abs(angle_deg) < 0.05:
        return image_data, 0.0

    # 6. Rotate full-res image
    # A positive dx/dy means the page content is turned counter-clockwise
    # (a vertical edge rotated counter-clockwise moves right as y grows).
    # cv2.getRotationMatrix2D also rotates counter-clockwise for positive
    # angles, so the correction has to use the negated detected angle.
    center = (orig_w // 2, orig_h // 2)
    M = cv2.getRotationMatrix2D(center, -angle_deg, 1.0)
    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    # Encode back to PNG
    success, png_buf = cv2.imencode(".png", rotated)
    if not success:
        logger.warning("deskew_by_word_alignment: PNG encoding failed")
        return image_data, 0.0

    return png_buf.tobytes(), angle_deg
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Stage 3: Dewarp (Book Curvature) — Pass-Through for now
|
# Stage 3: Dewarp (Book Curvature) — Pass-Through for now
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -2134,7 +2134,22 @@ async def extract_with_boxes(session_id: str, page_number: int):
|
|||||||
# Convert page to hires image
|
# Convert page to hires image
|
||||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||||
|
|
||||||
# Extract entries with boxes
|
# Deskew image before OCR
|
||||||
|
deskew_angle = 0.0
|
||||||
|
try:
|
||||||
|
from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
|
||||||
|
if CV2_AVAILABLE:
|
||||||
|
image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
|
||||||
|
logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Deskew failed for page {page_number}: {e}")
|
||||||
|
|
||||||
|
# Cache deskewed image in session for later serving
|
||||||
|
if "deskewed_images" not in session:
|
||||||
|
session["deskewed_images"] = {}
|
||||||
|
session["deskewed_images"][str(page_number)] = image_data
|
||||||
|
|
||||||
|
# Extract entries with boxes (now on deskewed image)
|
||||||
result = await extract_entries_with_boxes(image_data)
|
result = await extract_entries_with_boxes(image_data)
|
||||||
|
|
||||||
# Cache in session
|
# Cache in session
|
||||||
@@ -2148,9 +2163,35 @@ async def extract_with_boxes(session_id: str, page_number: int):
|
|||||||
"entry_count": len(result["entries"]),
|
"entry_count": len(result["entries"]),
|
||||||
"image_width": result["image_width"],
|
"image_width": result["image_width"],
|
||||||
"image_height": result["image_height"],
|
"image_height": result["image_height"],
|
||||||
|
"deskew_angle": round(deskew_angle, 2),
|
||||||
|
"deskewed": abs(deskew_angle) > 0.05,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Serve the cached deskewed image of a page, streamed as ``image/png``.

    Looks up the bytes stored under ``session["deskewed_images"][str(page_number)]``.
    If nothing is cached for that page, the page is rendered from the
    session's PDF via ``convert_pdf_page_to_image(..., thumbnail=False)`` and
    that image is served instead.

    Raises:
        HTTPException: 404 when the session id is unknown, 400 when there is
            no cached image and the session holds no PDF data.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = _sessions[session_id]
    cached_pages = session.get("deskewed_images", {})
    png_bytes = cached_pages.get(str(page_number))

    if not png_bytes:
        # Nothing cached for this page: fall back to the original hires render.
        pdf_data = session.get("pdf_data")
        if not pdf_data:
            raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
        png_bytes = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    return StreamingResponse(io.BytesIO(png_bytes), media_type="image/png")
|
||||||
|
|
||||||
|
|
||||||
@router.post("/sessions/{session_id}/ground-truth/{page_number}")
|
@router.post("/sessions/{session_id}/ground-truth/{page_number}")
|
||||||
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
|
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
|
||||||
"""Save ground truth labels for a page.
|
"""Save ground truth labels for a page.
|
||||||
|
|||||||
Reference in New Issue
Block a user