feat: add Kombi-Modus (PaddleOCR + Tesseract) for OCR Overlay
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 35s
CI / test-go-edu-search (push) Successful in 33s
CI / test-python-klausur (push) Failing after 2m20s
CI / test-python-agent-core (push) Successful in 22s
CI / test-nodejs-website (push) Successful in 41s
Runs both OCR engines on the preprocessed image and merges the results: word boxes are matched by IoU, and coordinates are averaged weighted by confidence. Unmatched Tesseract words (bullets, symbols) are added for better coverage.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2599,6 +2599,189 @@ async def paddle_direct(session_id: str):
|
||||
return {"session_id": session_id, **word_result}
|
||||
|
||||
|
||||
def _box_iou(a: dict, b: dict) -> float:
|
||||
"""Compute IoU between two word boxes (each has left, top, width, height)."""
|
||||
ax1, ay1 = a["left"], a["top"]
|
||||
ax2, ay2 = ax1 + a["width"], ay1 + a["height"]
|
||||
bx1, by1 = b["left"], b["top"]
|
||||
bx2, by2 = bx1 + b["width"], by1 + b["height"]
|
||||
|
||||
ix1, iy1 = max(ax1, bx1), max(ay1, by1)
|
||||
ix2, iy2 = min(ax2, bx2), min(ay2, by2)
|
||||
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
|
||||
if inter == 0:
|
||||
return 0.0
|
||||
area_a = (ax2 - ax1) * (ay2 - ay1)
|
||||
area_b = (bx2 - bx1) * (by2 - by1)
|
||||
return inter / (area_a + area_b - inter) if (area_a + area_b - inter) > 0 else 0.0
|
||||
|
||||
|
||||
def _merge_paddle_tesseract(paddle_words: list, tess_words: list) -> list:
|
||||
"""Merge word boxes from PaddleOCR and Tesseract.
|
||||
|
||||
Matching: IoU > 0.3 between bounding boxes.
|
||||
Merging: Weighted average of coordinates by confidence.
|
||||
"""
|
||||
merged = []
|
||||
used_tess: set = set()
|
||||
|
||||
for pw in paddle_words:
|
||||
best_iou, best_ti = 0.0, -1
|
||||
for ti, tw in enumerate(tess_words):
|
||||
if ti in used_tess:
|
||||
continue
|
||||
iou = _box_iou(pw, tw)
|
||||
if iou > best_iou:
|
||||
best_iou, best_ti = iou, ti
|
||||
|
||||
if best_iou > 0.3 and best_ti >= 0:
|
||||
tw = tess_words[best_ti]
|
||||
used_tess.add(best_ti)
|
||||
pc = pw.get("conf", 80)
|
||||
tc = tw.get("conf", 50)
|
||||
total = pc + tc
|
||||
if total == 0:
|
||||
total = 1
|
||||
merged.append({
|
||||
"text": pw["text"], # Paddle text usually better
|
||||
"left": round((pw["left"] * pc + tw["left"] * tc) / total),
|
||||
"top": round((pw["top"] * pc + tw["top"] * tc) / total),
|
||||
"width": round((pw["width"] * pc + tw["width"] * tc) / total),
|
||||
"height": round((pw["height"] * pc + tw["height"] * tc) / total),
|
||||
"conf": max(pc, tc),
|
||||
})
|
||||
else:
|
||||
merged.append(pw)
|
||||
|
||||
# Add unmatched Tesseract words (bullet points, symbols, etc.)
|
||||
for ti, tw in enumerate(tess_words):
|
||||
if ti not in used_tess and tw.get("conf", 0) >= 40:
|
||||
merged.append(tw)
|
||||
|
||||
return merged
|
||||
|
||||
|
||||
@router.post("/sessions/{session_id}/paddle-kombi")
async def paddle_kombi(session_id: str):
    """Run PaddleOCR + Tesseract on the preprocessed image and merge results.

    Both engines run on the same preprocessed (cropped/dewarped) image.
    Word boxes are matched by IoU and coordinates are averaged weighted by
    confidence. Unmatched Tesseract words (bullets, symbols) are added.

    Raises:
        HTTPException: 404 if the session has no stored image at all;
            400 if the image bytes cannot be decoded or if both OCR
            engines return zero words.
    """
    # Prefer the most-processed variant available: cropped > dewarped > original.
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")

    # Decode PNG bytes into a BGR ndarray for OpenCV / the OCR engines.
    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode image")

    img_h, img_w = img_bgr.shape[:2]

    # Imported lazily — presumably to defer loading the Paddle model until
    # this endpoint is actually hit; TODO confirm.
    from cv_ocr_engines import ocr_region_paddle

    t0 = time.time()

    # --- PaddleOCR ---
    # region=None runs the engine over the whole image.
    paddle_words = await ocr_region_paddle(img_bgr, region=None)
    if not paddle_words:
        # Normalize a falsy result (e.g. None) to an empty list for the merge.
        paddle_words = []

    # --- Tesseract ---
    from PIL import Image
    import pytesseract

    # Tesseract/PIL expect RGB; OpenCV decodes to BGR.
    pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu",
        config="--psm 6 --oem 3",  # PSM 6: assume a single uniform block of text
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i in range(len(data["text"])):
        text = str(data["text"][i]).strip()
        # conf may arrive as a string; non-numeric values map to -1.
        conf_raw = str(data["conf"][i])
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        # Drop empty tokens and very-low-confidence noise (< 20).
        if not text or conf < 20:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i],
            "top": data["top"][i],
            "width": data["width"][i],
            "height": data["height"][i],
            "conf": conf,
        })

    # --- Merge ---
    if not paddle_words and not tess_words:
        raise HTTPException(status_code=400, detail="Both OCR engines returned no words")

    merged_words = _merge_paddle_tesseract(paddle_words, tess_words)

    # Arrange merged words into a row/column grid (project helper).
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
    duration = time.time() - t0

    # Tag every cell so downstream consumers can tell which engine produced it.
    for cell in cells:
        cell["ocr_engine"] = "kombi"

    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    # Vocabulary layout is assumed when EN/DE columns were detected.
    col_types = {c.get("type") for c in columns_meta}
    is_vocab = bool(col_types & {"column_en", "column_de"})

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": "kombi",
        "grid_method": "kombi",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            # Cells with a known confidence below 50 (0 reads as "unknown").
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            "paddle_words": len(paddle_words),
            "tesseract_words": len(tess_words),
            "merged_words": len(merged_words),
        },
    }

    # Persist the result; cropped_png stores whichever image variant was used.
    # NOTE(review): current_step=8 — presumably a pipeline step index; confirm
    # against the session state machine.
    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )

    logger.info(
        "paddle_kombi session %s: %d cells (%d rows, %d cols) in %.2fs "
        "[paddle=%d, tess=%d, merged=%d]",
        session_id, len(cells), n_rows, n_cols, duration,
        len(paddle_words), len(tess_words), len(merged_words),
    )

    await _append_pipeline_log(session_id, "paddle_kombi", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "paddle_words": len(paddle_words),
        "tesseract_words": len(tess_words),
        "merged_words": len(merged_words),
        "ocr_engine": "kombi",
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}
class WordGroundTruthRequest(BaseModel):
    """Request body for submitting ground-truth feedback on word-level OCR results."""

    # Whether the OCR result was correct as-is.
    is_correct: bool
    # Corrected entries supplied by the user — presumably used when
    # is_correct is False; confirm against the handler that consumes this.
    corrected_entries: Optional[List[Dict[str, Any]]] = None
|
||||
|
||||
Reference in New Issue
Block a user