feat(ocr): Word-based image deskew for Ground Truth pipeline
Begradigt schiefe Scans vor der OCR-Extraktion anhand der linksbuendigen
Wortanfaenge der Vokabelspalte. Tesseract liefert achsenparallele Boxen,
die bei ~2-3 Grad Schraege in Nachbarzeilen bluten — der Deskew behebt das.
- Neue Funktion deskew_image_by_word_alignment() in cv_vocab_pipeline.py
- Deskew-Integration im extract-with-boxes Endpoint (vor OCR)
- Neuer GET Endpoint /deskewed-image/{page} fuer begradigtes Seitenbild
- Frontend: GroundTruthPanel wechselt nach Extraktion auf deskewed Image
- ~1s Overhead durch schnellen Tesseract-Pass auf halbiertem Bild
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -193,6 +193,127 @@ def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
|
||||
return corrected, median_angle
|
||||
|
||||
|
||||
def deskew_image_by_word_alignment(
    image_data: bytes,
    lang: str = "eng+deu",
    downscale_factor: float = 0.5,
) -> Tuple[bytes, float]:
    """Correct rotation by fitting a line through left-most word starts per text line.

    More robust than Hough-based deskew for vocabulary worksheets where text lines
    have consistent left-alignment. Runs a quick Tesseract pass on a downscaled
    copy to find word positions, computes the dominant left-edge column, fits a
    line through those points and rotates the full-resolution image.

    Args:
        image_data: Raw image bytes (PNG/JPEG).
        lang: Tesseract language string for the quick pass.
        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).

    Returns:
        Tuple of (rotated image as PNG bytes, detected angle in degrees).
        On any failure (missing deps, decode error, too few aligned lines,
        negligible angle) the original bytes are returned with angle 0.0.
    """
    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
        return image_data, 0.0

    # 1. Decode image
    img_array = np.frombuffer(image_data, dtype=np.uint8)
    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if img is None:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return image_data, 0.0

    orig_h, orig_w = img.shape[:2]

    # 2. Downscale for fast Tesseract pass
    small_w = int(orig_w * downscale_factor)
    small_h = int(orig_h * downscale_factor)
    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)

    # 3. Quick Tesseract — word-level positions
    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
    try:
        data = pytesseract.image_to_data(
            pil_small, lang=lang, config="--psm 6 --oem 3",
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning("deskew_by_word_alignment: Tesseract failed: %s", e)
        return image_data, 0.0

    # 4. Per text-line, find the left-most word start
    #    Group by (block_num, par_num, line_num)
    from collections import defaultdict
    line_groups: Dict[tuple, list] = defaultdict(list)
    for i in range(len(data["text"])):
        text = (data["text"][i] or "").strip()
        # NOTE: depending on the pytesseract version, confidences arrive as
        # int, float, or float-formatted string (e.g. '95.000000' / '-1.0').
        # int(...) alone raises ValueError on the string forms, so go via float.
        conf = int(float(data["conf"][i]))
        if not text or conf < 20:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        line_groups[key].append(i)

    if len(line_groups) < 5:
        logger.info("deskew_by_word_alignment: only %d lines, skipping", len(line_groups))
        return image_data, 0.0

    # For each line, pick the word with smallest 'left' → compute (left_x, center_y)
    # Scale back to original resolution
    scale = 1.0 / downscale_factor
    points = []  # list of (x, y) in original-image coords
    for key, indices in line_groups.items():
        best_idx = min(indices, key=lambda i: data["left"][i])
        lx = data["left"][best_idx] * scale
        top = data["top"][best_idx] * scale
        h = data["height"][best_idx] * scale
        cy = top + h / 2.0
        points.append((lx, cy))

    # 5. Find dominant left-edge column + compute angle
    xs = np.array([p[0] for p in points])
    ys = np.array([p[1] for p in points])
    median_x = float(np.median(xs))
    tolerance = orig_w * 0.03  # 3% of image width

    mask = np.abs(xs - median_x) <= tolerance
    filtered_xs = xs[mask]
    filtered_ys = ys[mask]

    if len(filtered_xs) < 5:
        logger.info(
            "deskew_by_word_alignment: only %d aligned points after filter, skipping",
            len(filtered_xs),
        )
        return image_data, 0.0

    # polyfit: x = a*y + b → a = dx/dy → angle = arctan(a)
    coeffs = np.polyfit(filtered_ys, filtered_xs, 1)
    slope = coeffs[0]  # dx/dy
    angle_rad = np.arctan(slope)
    angle_deg = float(np.degrees(angle_rad))

    # Clamp to ±5° — anything larger is almost certainly a misdetection,
    # not a real scan skew.
    angle_deg = max(-5.0, min(5.0, angle_deg))

    logger.info(
        "deskew_by_word_alignment: detected %.2f° from %d points (total lines: %d)",
        angle_deg, len(filtered_xs), len(line_groups),
    )

    if abs(angle_deg) < 0.05:
        return image_data, 0.0

    # 6. Rotate full-res image
    center = (orig_w // 2, orig_h // 2)
    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    # Encode back to PNG
    success, png_buf = cv2.imencode(".png", rotated)
    if not success:
        logger.warning("deskew_by_word_alignment: PNG encoding failed")
        return image_data, 0.0

    return png_buf.tobytes(), angle_deg
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 3: Dewarp (Book Curvature) — Pass-Through for now
|
||||
# =============================================================================
|
||||
|
||||
@@ -2134,7 +2134,22 @@ async def extract_with_boxes(session_id: str, page_number: int):
|
||||
# Convert page to hires image
|
||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||
|
||||
# Extract entries with boxes
|
||||
# Deskew image before OCR
|
||||
deskew_angle = 0.0
|
||||
try:
|
||||
from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
|
||||
if CV2_AVAILABLE:
|
||||
image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
|
||||
logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Deskew failed for page {page_number}: {e}")
|
||||
|
||||
# Cache deskewed image in session for later serving
|
||||
if "deskewed_images" not in session:
|
||||
session["deskewed_images"] = {}
|
||||
session["deskewed_images"][str(page_number)] = image_data
|
||||
|
||||
# Extract entries with boxes (now on deskewed image)
|
||||
result = await extract_entries_with_boxes(image_data)
|
||||
|
||||
# Cache in session
|
||||
@@ -2148,9 +2163,35 @@ async def extract_with_boxes(session_id: str, page_number: int):
|
||||
"entry_count": len(result["entries"]),
|
||||
"image_width": result["image_width"],
|
||||
"image_height": result["image_height"],
|
||||
"deskew_angle": round(deskew_angle, 2),
|
||||
"deskewed": abs(deskew_angle) > 0.05,
|
||||
}
|
||||
|
||||
|
||||
@router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Serve the deskewed page image as PNG.

    Falls back to the original hires image if no deskewed version is cached.
    """
    if session_id not in _sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = _sessions[session_id]

    # Prefer the deskewed bytes cached by the extract-with-boxes endpoint.
    cache = session.get("deskewed_images", {})
    cached_png = cache.get(str(page_number))
    if cached_png:
        return StreamingResponse(io.BytesIO(cached_png), media_type="image/png")

    # Fallback: render the original high-resolution page from the PDF.
    pdf_bytes = session.get("pdf_data")
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    rendered = await convert_pdf_page_to_image(pdf_bytes, page_number, thumbnail=False)
    return StreamingResponse(io.BytesIO(rendered), media_type="image/png")
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/ground-truth/{page_number}")
|
||||
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
|
||||
"""Save ground truth labels for a page.
|
||||
|
||||
Reference in New Issue
Block a user