""" Hybrid Vocab OCR - PaddleOCR integration and result parsing. Handles: - PaddleOCR lazy loading and initialization - Running OCR on image bytes - Parsing PaddleOCR v3 dict and traditional list formats - Grouping regions by rows and detecting columns """ import io import logging from typing import List, Tuple from dataclasses import dataclass import numpy as np from PIL import Image # OpenCV is optional try: import cv2 CV2_AVAILABLE = True except ImportError: cv2 = None CV2_AVAILABLE = False logger = logging.getLogger(__name__) _paddle_ocr = None @dataclass class OCRRegion: """Ein erkannter Textbereich mit Position.""" text: str confidence: float x1: int y1: int x2: int y2: int @property def center_x(self) -> int: return (self.x1 + self.x2) // 2 @property def center_y(self) -> int: return (self.y1 + self.y2) // 2 def get_paddle_ocr(): """Lazy load PaddleOCR to avoid startup delay.""" global _paddle_ocr if _paddle_ocr is None: try: from paddleocr import PaddleOCR import logging as std_logging for logger_name in ['ppocr', 'paddle', 'paddleocr', 'root']: std_logging.getLogger(logger_name).setLevel(std_logging.WARNING) try: _paddle_ocr = PaddleOCR(lang="de") logger.info("PaddleOCR 3.x initialized (lang=de)") except Exception as e1: logger.warning(f"PaddleOCR lang=de failed: {e1}") try: _paddle_ocr = PaddleOCR(lang="en") logger.info("PaddleOCR 3.x initialized (lang=en)") except Exception as e2: logger.warning(f"PaddleOCR lang=en failed: {e2}") _paddle_ocr = PaddleOCR() logger.info("PaddleOCR 3.x initialized (defaults)") except Exception as e: logger.error(f"PaddleOCR initialization failed: {e}") _paddle_ocr = None return _paddle_ocr def preprocess_image(img: Image.Image) -> np.ndarray: """Bildvorverarbeitung fuer bessere OCR-Ergebnisse.""" if not CV2_AVAILABLE: raise ImportError( "OpenCV (cv2) is required for image preprocessing. " "Install with: pip install opencv-python-headless" ) img_array = np.array(img) if len(img_array.shape) == 2: img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB) elif img_array.shape[2] == 4: img_array = cv2.cvtColor(img_array, cv2.COLOR_RGBA2RGB) return img_array def run_paddle_ocr(image_bytes: bytes) -> Tuple[List[OCRRegion], str]: """Fuehrt PaddleOCR auf einem Bild aus.""" ocr = get_paddle_ocr() if ocr is None: logger.error("PaddleOCR not available") return [], "" try: img = Image.open(io.BytesIO(image_bytes)) img_array = preprocess_image(img) try: result = ocr.ocr(img_array) except TypeError: logger.warning("Trying alternative OCR call method") result = ocr.ocr(img_array) if not result: logger.warning("PaddleOCR returned empty result") return [], "" if isinstance(result, dict): logger.info("Processing PaddleOCR 3.x dict format") return _parse_paddleocr_v3_dict(result) elif isinstance(result, list) and len(result) > 0: first_item = result[0] if first_item is None: logger.warning("PaddleOCR returned None for first page") return [], "" if hasattr(first_item, 'get') or isinstance(first_item, dict): item_dict = dict(first_item) if hasattr(first_item, 'items') else first_item if 'rec_texts' in item_dict or 'texts' in item_dict: logger.info("Processing PaddleOCR 3.x OCRResult format") return _parse_paddleocr_v3_dict(item_dict) if isinstance(first_item, list): if len(first_item) > 0 and isinstance(first_item[0], (list, tuple)): logger.info("Processing PaddleOCR traditional list format") return _parse_paddleocr_list(first_item) logger.warning(f"Unknown result format. Type: {type(first_item)}") try: item_dict = dict(first_item) if 'rec_texts' in item_dict: return _parse_paddleocr_v3_dict(item_dict) except Exception as e: logger.warning(f"Could not convert to dict: {e}") return [], "" else: logger.warning(f"Unexpected PaddleOCR result type: {type(result)}") return [], "" except Exception as e: logger.error(f"PaddleOCR execution failed: {e}") import traceback logger.error(traceback.format_exc()) return [], "" def _parse_paddleocr_v3_dict(result: dict) -> Tuple[List[OCRRegion], str]: """Parse PaddleOCR 3.x dict format result.""" regions = [] all_text_lines = [] texts = result.get('rec_texts', result.get('texts', [])) scores = result.get('rec_scores', result.get('scores', [])) polys = result.get('dt_polys', result.get('boxes', [])) rec_boxes = result.get('rec_boxes', []) logger.info(f"PaddleOCR 3.x: {len(texts)} texts, {len(scores)} scores, {len(polys)} polys, {len(rec_boxes)} rec_boxes") for i, (text, score) in enumerate(zip(texts, scores)): if not text or not str(text).strip(): continue x1, y1, x2, y2 = 0, 0, 100, 50 if i < len(rec_boxes) and rec_boxes[i] is not None: box = rec_boxes[i] try: if hasattr(box, 'flatten'): box = box.flatten().tolist() if len(box) >= 4: x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3]) except Exception as e: logger.debug(f"Could not parse rec_box: {e}") elif i < len(polys) and polys[i] is not None: poly = polys[i] try: if hasattr(poly, 'tolist'): poly = poly.tolist() if len(poly) >= 4: x_coords = [p[0] for p in poly] y_coords = [p[1] for p in poly] x1, y1 = int(min(x_coords)), int(min(y_coords)) x2, y2 = int(max(x_coords)), int(max(y_coords)) except Exception as e: logger.debug(f"Could not parse polygon: {e}") region = OCRRegion( text=text.strip(), confidence=float(score) if score else 0.5, x1=x1, y1=y1, x2=x2, y2=y2 ) regions.append(region) all_text_lines.append(text.strip()) regions.sort(key=lambda r: r.y1) raw_text = "\n".join(all_text_lines) logger.info(f"PaddleOCR 3.x extracted {len(regions)} text regions") return regions, raw_text def _parse_paddleocr_list(page_result: list) -> Tuple[List[OCRRegion], str]: """Parse PaddleOCR traditional list format result.""" regions = [] all_text_lines = [] for line in page_result: if not line or len(line) < 2: continue bbox_points = line[0] text_info = line[1] if isinstance(text_info, tuple) and len(text_info) >= 2: text, confidence = text_info[0], text_info[1] elif isinstance(text_info, str): text, confidence = text_info, 0.5 else: continue if not text or not text.strip(): continue x_coords = [p[0] for p in bbox_points] y_coords = [p[1] for p in bbox_points] region = OCRRegion( text=text.strip(), confidence=float(confidence), x1=int(min(x_coords)), y1=int(min(y_coords)), x2=int(max(x_coords)), y2=int(max(y_coords)) ) regions.append(region) all_text_lines.append(text.strip()) regions.sort(key=lambda r: r.y1) raw_text = "\n".join(all_text_lines) logger.info(f"PaddleOCR extracted {len(regions)} text regions") return regions, raw_text def group_regions_by_rows(regions: List[OCRRegion], y_tolerance: int = 20) -> List[List[OCRRegion]]: """Gruppiert Textregionen in Zeilen basierend auf Y-Position.""" if not regions: return [] rows = [] current_row = [regions[0]] current_y = regions[0].center_y for region in regions[1:]: if abs(region.center_y - current_y) <= y_tolerance: current_row.append(region) else: current_row.sort(key=lambda r: r.x1) rows.append(current_row) current_row = [region] current_y = region.center_y if current_row: current_row.sort(key=lambda r: r.x1) rows.append(current_row) return rows def detect_columns(rows: List[List[OCRRegion]]) -> int: """Erkennt die Anzahl der Spalten basierend auf den Textpositionen.""" if not rows: return 2 items_per_row = [len(row) for row in rows if len(row) >= 2] if not items_per_row: return 2 avg_items = sum(items_per_row) / len(items_per_row) return 3 if avg_items >= 2.5 else 2 def format_ocr_for_llm(regions: List[OCRRegion]) -> str: """Formatiert OCR-Output fuer LLM-Verarbeitung.""" rows = group_regions_by_rows(regions) num_columns = detect_columns(rows) lines = [f"Erkannte Spalten: {num_columns}", "---"] for row in rows: if len(row) >= 2: lines.append("\t".join(r.text for r in row)) elif len(row) == 1: lines.append(row[0].text) return "\n".join(lines)