Files
Benjamin Admin e2c2acdf86
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 31s
CI / test-python-klausur (push) Failing after 2m14s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 24s
fix: increase PaddleOCR remote timeout to 120s for large scans
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 13:41:39 +01:00

45 lines
1.3 KiB
Python

"""Remote PaddleOCR client — calls Hetzner service."""
import logging
import os
from typing import Dict, List, Tuple
import httpx
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Connection settings, read from the environment once at import time.
# Both fall back to an empty string when the variable is unset.
PADDLEOCR_REMOTE_URL = os.getenv("PADDLEOCR_REMOTE_URL", "")
PADDLEOCR_API_KEY = os.getenv("PADDLEOCR_API_KEY", "")

# Request timeout: full-page OCR can take 30-90s on large scans.
_TIMEOUT = 120.0
async def ocr_remote_paddle(
    image_bytes: bytes,
    filename: str = "scan.png",
    *,
    verify_tls: bool = False,
) -> Tuple[List[Dict], int, int]:
    """Send an image to the remote PaddleOCR service and return its result.

    The image is uploaded as a multipart form field named ``file`` to
    ``<PADDLEOCR_REMOTE_URL>/ocr``. When ``PADDLEOCR_API_KEY`` is non-empty
    it is sent in the ``X-API-Key`` request header.

    Args:
        image_bytes: Raw bytes of the image to upload.
        filename: File name reported in the multipart upload. The part is
            always sent with content type ``image/png``.
        verify_tls: Whether the server's TLS certificate is verified.
            Defaults to ``False`` to preserve the previous hard-coded
            behaviour; pass ``True`` whenever the service presents a
            certificate that can be validated.

    Returns:
        A ``(words, image_width, image_height)`` tuple built from the
        ``words``, ``image_width`` and ``image_height`` keys of the JSON
        response. A missing or null ``words`` entry yields an empty list.

    Raises:
        RuntimeError: If ``PADDLEOCR_REMOTE_URL`` is empty.
        httpx.HTTPStatusError: If the service responds with an error status.
        KeyError: If the response lacks ``image_width`` or ``image_height``.
    """
    if not PADDLEOCR_REMOTE_URL:
        raise RuntimeError("PADDLEOCR_REMOTE_URL not configured")

    headers = {"X-API-Key": PADDLEOCR_API_KEY} if PADDLEOCR_API_KEY else {}
    endpoint = f"{PADDLEOCR_REMOTE_URL.rstrip('/')}/ocr"

    # NOTE(review): with verify_tls=False the API key and the scan travel
    # over a connection whose peer is not authenticated. The default is kept
    # only for backward compatibility; callers should opt in to verification.
    async with httpx.AsyncClient(timeout=_TIMEOUT, verify=verify_tls) as client:
        resp = await client.post(
            endpoint,
            files={"file": (filename, image_bytes, "image/png")},
            headers=headers,
        )
        resp.raise_for_status()
        data = resp.json()

    # Treat an absent or JSON-null "words" value as "no words recognised"
    # instead of failing on len(None).
    words = data.get("words") or []

    # Read the dimensions once, before logging, so a response without them
    # fails with KeyError rather than first logging a misleading "0x0".
    img_w = data["image_width"]
    img_h = data["image_height"]

    logger.info(
        "PaddleOCR remote returned %d words (img %dx%d)",
        len(words),
        img_w,
        img_h,
    )
    return words, img_w, img_h