""" OCR Pipeline Regression Helpers — DB persistence, snapshot building, comparison. Extracted from ocr_pipeline_regression.py for modularity. Lizenz: Apache 2.0 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import json import logging import os import uuid from datetime import datetime, timezone from typing import Any, Dict, List, Optional from ocr_pipeline_session_store import get_pool logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # DB persistence for regression runs # --------------------------------------------------------------------------- async def _init_regression_table(): """Ensure regression_runs table exists (idempotent).""" pool = await get_pool() async with pool.acquire() as conn: migration_path = os.path.join( os.path.dirname(__file__), "migrations/008_regression_runs.sql", ) if os.path.exists(migration_path): with open(migration_path, "r") as f: sql = f.read() await conn.execute(sql) async def _persist_regression_run( status: str, summary: dict, results: list, duration_ms: int, triggered_by: str = "manual", ) -> str: """Save a regression run to the database. Returns the run ID.""" try: await _init_regression_table() pool = await get_pool() run_id = str(uuid.uuid4()) async with pool.acquire() as conn: await conn.execute( """ INSERT INTO regression_runs (id, status, total, passed, failed, errors, duration_ms, results, triggered_by) VALUES ($1, $2, $3, $4, $5, $6, $7, $8::jsonb, $9) """, run_id, status, summary.get("total", 0), summary.get("passed", 0), summary.get("failed", 0), summary.get("errors", 0), duration_ms, json.dumps(results), triggered_by, ) logger.info("Regression run %s persisted: %s", run_id, status) return run_id except Exception as e: logger.warning("Failed to persist regression run: %s", e) return "" # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _extract_cells_for_comparison(grid_result: dict) -> List[Dict[str, Any]]: """Extract a flat list of cells from a grid_editor_result for comparison. Only keeps fields relevant for comparison: cell_id, row_index, col_index, col_type, text. Ignores confidence, bbox, word_boxes, duration, is_bold. """ cells = [] for zone in grid_result.get("zones", []): for cell in zone.get("cells", []): cells.append({ "cell_id": cell.get("cell_id", ""), "row_index": cell.get("row_index"), "col_index": cell.get("col_index"), "col_type": cell.get("col_type", ""), "text": cell.get("text", ""), }) return cells def _build_reference_snapshot( grid_result: dict, pipeline: Optional[str] = None, ) -> dict: """Build a ground-truth reference snapshot from a grid_editor_result.""" cells = _extract_cells_for_comparison(grid_result) total_zones = len(grid_result.get("zones", [])) total_columns = sum(len(z.get("columns", [])) for z in grid_result.get("zones", [])) total_rows = sum(len(z.get("rows", [])) for z in grid_result.get("zones", [])) snapshot = { "saved_at": datetime.now(timezone.utc).isoformat(), "version": 1, "pipeline": pipeline, "summary": { "total_zones": total_zones, "total_columns": total_columns, "total_rows": total_rows, "total_cells": len(cells), }, "cells": cells, } return snapshot def compare_grids(reference: dict, current: dict) -> dict: """Compare a reference grid snapshot with a newly computed one. Returns a diff report with: - status: "pass" or "fail" - structural_diffs: changes in zone/row/column counts - cell_diffs: list of individual cell changes """ ref_summary = reference.get("summary", {}) cur_summary = current.get("summary", {}) structural_diffs = [] for key in ("total_zones", "total_columns", "total_rows", "total_cells"): ref_val = ref_summary.get(key, 0) cur_val = cur_summary.get(key, 0) if ref_val != cur_val: structural_diffs.append({ "field": key, "reference": ref_val, "current": cur_val, }) # Build cell lookup by cell_id ref_cells = {c["cell_id"]: c for c in reference.get("cells", [])} cur_cells = {c["cell_id"]: c for c in current.get("cells", [])} cell_diffs: List[Dict[str, Any]] = [] # Check for missing cells (in reference but not in current) for cell_id in ref_cells: if cell_id not in cur_cells: cell_diffs.append({ "type": "cell_missing", "cell_id": cell_id, "reference_text": ref_cells[cell_id].get("text", ""), }) # Check for added cells (in current but not in reference) for cell_id in cur_cells: if cell_id not in ref_cells: cell_diffs.append({ "type": "cell_added", "cell_id": cell_id, "current_text": cur_cells[cell_id].get("text", ""), }) # Check for changes in shared cells for cell_id in ref_cells: if cell_id not in cur_cells: continue ref_cell = ref_cells[cell_id] cur_cell = cur_cells[cell_id] if ref_cell.get("text", "") != cur_cell.get("text", ""): cell_diffs.append({ "type": "text_change", "cell_id": cell_id, "reference": ref_cell.get("text", ""), "current": cur_cell.get("text", ""), }) if ref_cell.get("col_type", "") != cur_cell.get("col_type", ""): cell_diffs.append({ "type": "col_type_change", "cell_id": cell_id, "reference": ref_cell.get("col_type", ""), "current": cur_cell.get("col_type", ""), }) status = "pass" if not structural_diffs and not cell_diffs else "fail" return { "status": status, "structural_diffs": structural_diffs, "cell_diffs": cell_diffs, "summary": { "structural_changes": len(structural_diffs), "cells_missing": sum(1 for d in cell_diffs if d["type"] == "cell_missing"), "cells_added": sum(1 for d in cell_diffs if d["type"] == "cell_added"), "text_changes": sum(1 for d in cell_diffs if d["type"] == "text_change"), "col_type_changes": sum(1 for d in cell_diffs if d["type"] == "col_type_change"), }, }