"""
CV Review Pipeline — Multi-pass OCR, line alignment, LLM post-correction, and orchestration.

Stages 6-8 of the CV vocabulary pipeline plus the main orchestrator.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""

import logging
import time
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import (
    CV_PIPELINE_AVAILABLE,
    PageRegion,
    PipelineResult,
    VocabRow,
)
from cv_preprocessing import (
    deskew_image,
    dewarp_image,
    render_image_high_res,
    render_pdf_high_res,
)
from cv_layout import (
    analyze_layout,
    create_layout_image,
    create_ocr_image,
)
from cv_ocr_engines import (
    _group_words_into_lines,
)

logger = logging.getLogger(__name__)

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    import pytesseract
    from PIL import Image
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]


# Words whose Tesseract confidence is below this value are dropped as noise.
_MIN_WORD_CONFIDENCE = 10


# =============================================================================
# Stage 6: Multi-Pass OCR
# =============================================================================

def _parse_confidence(raw: Any) -> int:
    """Convert one Tesseract confidence cell to an int.

    The value is routed through ``float`` so that decimal strings such as
    ``'95.5'`` are accepted instead of raising ``ValueError``.

    Returns:
        The truncated confidence, or -1 when the cell cannot be parsed
        (which makes the caller's threshold check reject the word).
    """
    try:
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return -1


def _words_from_tesseract_data(data: Dict[str, Any], region: PageRegion,
                               y_offset: int = 0) -> List[Dict[str, Any]]:
    """Turn a ``pytesseract.image_to_data`` dict into page-coordinate word dicts.

    Args:
        data: Result of ``image_to_data(..., output_type=Output.DICT)``.
        region: Region the OCR'd crop was taken from; its ``x``/``y`` are added
            to the crop-relative coordinates and its ``type`` is recorded.
        y_offset: Extra vertical offset of the OCR'd image inside the region
            crop (used when a single text line was cut out of the region).

    Returns:
        One dict per kept word with the keys ``text``, ``left``, ``top``,
        ``width``, ``height``, ``conf`` and ``region_type``. Empty text and
        words with confidence below ``_MIN_WORD_CONFIDENCE`` are skipped.
    """
    words: List[Dict[str, Any]] = []
    # ``.get`` so that a result dict without any rows yields no words
    # instead of a KeyError.
    for i, raw_text in enumerate(data.get('text', [])):
        text = raw_text.strip()
        conf = _parse_confidence(data['conf'][i])
        if not text or conf < _MIN_WORD_CONFIDENCE:
            continue
        words.append({
            'text': text,
            'left': data['left'][i] + region.x,
            'top': data['top'][i] + region.y + y_offset,
            'width': data['width'][i],
            'height': data['height'][i],
            'conf': conf,
            'region_type': region.type,
        })
    return words


def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
               psm: int, fallback_psm: Optional[int] = None,
               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
    """Run Tesseract OCR on a specific region with given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode.
        fallback_psm: If confidence too low, retry with this PSM per line.
        min_confidence: Minimum average confidence before fallback.

    Returns:
        List of word dicts with text, position, confidence. Positions are in
        full-page coordinates. An empty list is returned for an empty crop or
        when Tesseract raises.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]
    if crop.size == 0:
        return []

    pil_img = Image.fromarray(crop)
    config = f'--psm {psm} --oem 3'

    try:
        data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                         output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"Tesseract failed for region {region.type}: {e}")
        return []

    words = _words_from_tesseract_data(data, region)

    if words and fallback_psm is not None:
        avg_conf = sum(w['conf'] for w in words) / len(words)
        if avg_conf < min_confidence:
            logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
                        f"trying fallback PSM {fallback_psm}")
            fallback_words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)
            if fallback_words:
                words = fallback_words
            else:
                # An empty fallback must not discard the (low-confidence but
                # non-empty) first-pass result.
                logger.info(f"Region {region.type}: fallback PSM {fallback_psm} found no words, "
                            f"keeping first-pass result")

    return words


def _find_text_line_spans(crop: np.ndarray, min_height: int = 5) -> List[Tuple[int, int]]:
    """Locate horizontal text bands in a crop via a row-wise projection.

    The crop is inverted and summed per row; rows whose sum exceeds 5% of the
    largest row sum count as "text". Presumably this expects dark text on a
    light, single-channel background — confirm against ``create_ocr_image``.

    Args:
        crop: Image region to analyse.
        min_height: A band is kept only if it is taller than this many rows.

    Returns:
        ``(start_row, end_row)`` pairs, end exclusive, in top-to-bottom order.
    """
    inv = cv2.bitwise_not(crop)
    h_proj = np.sum(inv, axis=1)
    peak = np.max(h_proj)
    threshold = peak * 0.05 if peak > 0 else 0

    spans: List[Tuple[int, int]] = []
    in_text = False
    span_start = 0
    for y in range(len(h_proj)):
        if h_proj[y] > threshold and not in_text:
            span_start = y
            in_text = True
        elif h_proj[y] <= threshold and in_text:
            if y - span_start > min_height:
                spans.append((span_start, y))
            in_text = False

    # A band that runs to the bottom edge is never closed inside the loop.
    if in_text and len(h_proj) - span_start > min_height:
        spans.append((span_start, len(h_proj)))

    return spans


def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
                             lang: str, psm: int) -> List[Dict[str, Any]]:
    """OCR a region line by line (fallback for low-confidence regions).

    The region is split into horizontal text bands, each band is padded by a
    few pixels and sent to Tesseract separately with the given PSM.

    Args:
        ocr_img: Binarized full-page image.
        region: Region to crop and OCR.
        lang: Tesseract language string.
        psm: Page Segmentation Mode used for every individual line.

    Returns:
        Word dicts in full-page coordinates, same shape as ``ocr_region``.
        Lines on which Tesseract raises are logged and skipped.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]
    if crop.size == 0:
        return []

    config = f'--psm {psm} --oem 3'
    pad = 3  # vertical margin so ascenders/descenders are not clipped

    all_words: List[Dict[str, Any]] = []
    for line_y_start, line_y_end in _find_text_line_spans(crop):
        y1 = max(0, line_y_start - pad)
        y2 = min(crop.shape[0], line_y_end + pad)
        line_crop = crop[y1:y2, :]
        if line_crop.size == 0:
            continue

        pil_img = Image.fromarray(line_crop)
        try:
            data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                             output_type=pytesseract.Output.DICT)
        except Exception as e:
            # Best effort: one bad line must not abort the whole region,
            # but the failure should be visible in the logs.
            logger.warning(f"Tesseract failed for line {y1}-{y2} in region {region.type}: {e}")
            continue

        # Word tops are relative to the line crop; y1 maps them back into the region.
        all_words.extend(_words_from_tesseract_data(data, region, y_offset=y1))

    return all_words


def run_multi_pass_ocr(ocr_img: np.ndarray,
                       regions: List[PageRegion],
                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
    """Run OCR on each detected region with optimized settings.

    Header, footer and margin regions are skipped. English and German columns
    are OCR'd with a fixed single language and PSM 4; the example column uses
    ``lang`` with PSM 6 and a per-line PSM 7 fallback; every other region uses
    ``lang`` with PSM 6.

    Args:
        ocr_img: Binarized full-page image.
        regions: Regions produced by the layout analysis.
        lang: Tesseract language string for mixed-language regions.

    Returns:
        Mapping of region type to its word dicts. NOTE: the mapping is keyed
        by ``region.type`` only, so if several regions share a type the last
        one processed replaces the earlier ones.
    """
    results: Dict[str, List[Dict]] = {}

    _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    for region in regions:
        if region.type in _ocr_skip:
            continue

        if region.type == 'column_en':
            words = ocr_region(ocr_img, region, lang='eng', psm=4)
        elif region.type == 'column_de':
            words = ocr_region(ocr_img, region, lang='deu', psm=4)
        elif region.type == 'column_example':
            words = ocr_region(ocr_img, region, lang=lang, psm=6,
                               fallback_psm=7, min_confidence=40.0)
        else:
            words = ocr_region(ocr_img, region, lang=lang, psm=6)

        results[region.type] = words
        logger.info(f"OCR {region.type}: {len(words)} words")

    return results


# =============================================================================
# Stage 7: Line Alignment -> Vocabulary Entries
# =============================================================================

def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR results from different columns into vocabulary rows.

    Every English line becomes one row. The German and example line whose
    vertical centre is closest to the English line (and strictly within
    ``y_tolerance_px``) is attached to it. Example lines that were not matched
    this way are appended to the nearest row above them as wrapped text.

    Args:
        ocr_results: Mapping of region type to word dicts (see
            ``run_multi_pass_ocr``); the keys ``column_en``, ``column_de`` and
            ``column_example`` are read.
        regions: Detected page regions. Currently unused; kept for interface
            compatibility.
        y_tolerance_px: Maximum vertical distance for two lines to be
            considered the same row; also passed to the line grouping.

    Returns:
        Vocabulary rows sorted by vertical position. Empty when neither an
        English nor a German column is present.
    """
    if 'column_en' not in ocr_results and 'column_de' not in ocr_results:
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    def line_y_center(line: List[Dict]) -> float:
        return sum(w['top'] + w['height'] / 2 for w in line) / len(line)

    def line_text(line: List[Dict]) -> str:
        return ' '.join(w['text'] for w in line)

    def line_confidence(line: List[Dict]) -> float:
        return sum(w['conf'] for w in line) / len(line) if line else 0

    def nearest_line(centers: List[float], lines: List[List[Dict]],
                     target_y: float) -> Optional[List[Dict]]:
        """Return the line closest to ``target_y`` within tolerance, else None."""
        best_line = None
        best_dist = float('inf')
        for center, line in zip(centers, lines):
            dist = abs(center - target_y)
            # Strict comparisons: on a tie the first (topmost) candidate wins.
            if dist < y_tolerance_px and dist < best_dist:
                best_dist = dist
                best_line = line
        return best_line

    # Centres are computed once instead of once per English line.
    de_centers = [line_y_center(line) for line in de_lines]
    ex_centers = [line_y_center(line) for line in ex_lines]

    vocab_rows: List[VocabRow] = []

    for en_line in en_lines:
        en_y = line_y_center(en_line)
        en_text = line_text(en_line)
        en_conf = line_confidence(en_line)

        # Single characters are treated as OCR noise, not vocabulary.
        if len(en_text.strip()) < 2:
            continue

        de_text = ""
        de_conf = 0.0
        de_match = nearest_line(de_centers, de_lines, en_y)
        if de_match is not None:
            de_text = line_text(de_match)
            de_conf = line_confidence(de_match)

        ex_text = ""
        ex_conf = 0.0
        ex_match = nearest_line(ex_centers, ex_lines, en_y)
        if ex_match is not None:
            ex_text = line_text(ex_match)
            ex_conf = line_confidence(ex_match)

        # Average only over the columns that actually contributed.
        avg_conf = en_conf
        conf_count = 1
        if de_conf > 0:
            avg_conf += de_conf
            conf_count += 1
        if ex_conf > 0:
            avg_conf += ex_conf
            conf_count += 1

        vocab_rows.append(VocabRow(
            english=en_text.strip(),
            german=de_text.strip(),
            example=ex_text.strip(),
            confidence=avg_conf / conf_count,
            y_position=int(en_y),
        ))

    # Handle multi-line wrapping in example column.
    # NOTE(review): "already matched" is approximated by comparing against the
    # y-position of rows that have an example, not against the example line
    # that was actually attached; an extra example line inside the tolerance
    # band of such a row is therefore skipped.
    matched_ex_ys = {row.y_position for row in vocab_rows if row.example}

    for ex_y, ex_line in zip(ex_centers, ex_lines):
        if any(abs(ex_y - y) < y_tolerance_px for y in matched_ex_ys):
            continue

        # Attach to the closest row above, at most three tolerance bands away.
        best_row = None
        best_dist = float('inf')
        for row in vocab_rows:
            dist = ex_y - row.y_position
            if 0 < dist < y_tolerance_px * 3 and dist < best_dist:
                best_dist = dist
                best_row = row

        if best_row is not None:
            continuation = line_text(ex_line).strip()
            if continuation:
                best_row.example = (best_row.example + " " + continuation).strip()

    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows


# =============================================================================
# Stage 8: Optional LLM Post-Correction
# =============================================================================

async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
                           confidence_threshold: float = 50.0,
                           enabled: bool = False) -> List[VocabRow]:
    """Optionally send low-confidence regions to Qwen-VL for correction.

    Not implemented yet: the rows are returned unchanged in every case, and
    ``img`` and ``confidence_threshold`` are not used.

    Args:
        img: Page image the rows were extracted from.
        vocab_rows: Rows to (eventually) correct.
        confidence_threshold: Intended cut-off below which rows get corrected.
        enabled: When False the function returns immediately without logging.

    Returns:
        ``vocab_rows``, unmodified.
    """
    if not enabled:
        return vocab_rows

    logger.info("LLM post-correction skipped (not yet implemented)")
    return vocab_rows


# =============================================================================
# Orchestrator
# =============================================================================

async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Stages: render, deskew, optional dewarp, OCR/layout image preparation,
    layout analysis, multi-pass OCR, line alignment and optional LLM
    correction. The wall-clock time of each executed stage is stored in
    ``result.stages``.

    Args:
        pdf_data: PDF bytes; takes precedence over ``image_data`` when both
            are given.
        image_data: Image bytes, used when no PDF data is supplied.
        page_number: Page passed to the PDF renderer.
        zoom: Zoom factor passed to the PDF renderer.
        enable_dewarp: Run the dewarp stage.
        enable_llm_correction: Run the LLM post-correction stage.
        lang: Tesseract language string for mixed-language regions.

    Returns:
        A ``PipelineResult``. Failures never raise: a missing CV stack, missing
        input, or any exception inside a stage is reported via ``result.error``.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    total_start = time.time()

    try:
        # Stage 1: Render
        t = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = round(time.time() - t, 2)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew
        t = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = round(time.time() - t, 2)
        logger.info(f"Stage 2 (deskew): {angle:.2f}\u00b0 in {result.stages['deskew']}s")

        # Stage 3: Dewarp
        if enable_dewarp:
            t = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = round(time.time() - t, 2)

        # Stage 4: Dual image preparation
        t = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = round(time.time() - t, 2)

        # Stage 5: Layout analysis
        t = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = round(time.time() - t, 2)
        result.columns_detected = sum(1 for r in regions if r.type.startswith('column'))
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR
        t = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = round(time.time() - t, 2)
        total_words = sum(len(w) for w in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment
        t = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = round(time.time() - t, 2)

        # Stage 8: Optional LLM correction
        if enable_llm_correction:
            t = time.time()
            # NOTE: llm_post_correct's own ``enabled`` flag defaults to False
            # and is not passed here, so this call currently returns the rows
            # unchanged without logging.
            vocab_rows = await llm_post_correct(img, vocab_rows)
            result.stages['llm_correction'] = round(time.time() - t, 2)

        # Convert to output format; rows with neither English nor German text are dropped.
        result.vocabulary = [
            {
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            }
            for row in vocab_rows
            if row.english or row.german
        ]

        result.duration_seconds = round(time.time() - total_start, 2)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        # Top-level boundary: report the failure in the result instead of raising.
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = round(time.time() - total_start, 2)

    return result