# breakpilot-lehrer/klausur-service/backend/ocr_pipeline_regression_helpers.py

"""
OCR Pipeline Regression Helpers — DB persistence, snapshot building, comparison.

Extracted from ocr_pipeline_regression.py for modularity.

License: Apache 2.0
PRIVACY: All processing is performed locally.
"""

import json
import logging
import os
import uuid
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

from ocr_pipeline_session_store import get_pool

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# DB persistence for regression runs
# ---------------------------------------------------------------------------

async def _init_regression_table():
    """Ensure regression_runs table exists (idempotent)."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        migration_path = os.path.join(
            os.path.dirname(__file__),
            "migrations/008_regression_runs.sql",
        )
        if os.path.exists(migration_path):
            with open(migration_path, "r") as f:
                sql = f.read()
            await conn.execute(sql)


async def _persist_regression_run(
    status: str,
    summary: dict,
    results: list,
    duration_ms: int,
    triggered_by: str = "manual",
) -> str:
    """Save a regression run to the database and return its run ID.

    Persistence is best-effort: any failure (serialisation, table
    initialisation, pool access, INSERT) is logged as a warning together with
    its traceback and reported to the caller as an empty string instead of an
    exception.

    Args:
        status: Overall run status stored in the ``status`` column.
        summary: Counter dict; the keys ``total``, ``passed``, ``failed`` and
            ``errors`` are read, each defaulting to 0 when absent.
        results: JSON-serialisable per-case results, stored as ``jsonb``.
        duration_ms: Run duration in milliseconds.
        triggered_by: Label describing what started the run.

    Returns:
        The newly generated UUID string on success, ``""`` on failure.
    """
    try:
        # Serialise first so unserialisable results fail fast, before any
        # database work is attempted.
        results_json = json.dumps(results)

        await _init_regression_table()
        pool = await get_pool()
        run_id = str(uuid.uuid4())
        async with pool.acquire() as conn:
            await conn.execute(
                """
                INSERT INTO regression_runs
                    (id, status, total, passed, failed, errors, duration_ms, results, triggered_by)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8::jsonb, $9)
                """,
                run_id,
                status,
                summary.get("total", 0),
                summary.get("passed", 0),
                summary.get("failed", 0),
                summary.get("errors", 0),
                duration_ms,
                results_json,
                triggered_by,
            )
        logger.info("Regression run %s persisted: %s", run_id, status)
        return run_id
    except Exception as e:
        # Deliberate catch-all: a persistence failure must not break the
        # regression run itself.  exc_info keeps the traceback for diagnosis.
        logger.warning("Failed to persist regression run: %s", e, exc_info=True)
        return ""


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _extract_cells_for_comparison(grid_result: dict) -> List[Dict[str, Any]]:
    """Extract a flat list of cells from a grid_editor_result for comparison.

    Only keeps fields relevant for comparison: cell_id, row_index, col_index,
    col_type, text.  Ignores confidence, bbox, word_boxes, duration, is_bold.
    """
    cells = []
    for zone in grid_result.get("zones", []):
        for cell in zone.get("cells", []):
            cells.append({
                "cell_id": cell.get("cell_id", ""),
                "row_index": cell.get("row_index"),
                "col_index": cell.get("col_index"),
                "col_type": cell.get("col_type", ""),
                "text": cell.get("text", ""),
            })
    return cells


def _build_reference_snapshot(
    grid_result: dict,
    pipeline: Optional[str] = None,
) -> dict:
    """Create a reference snapshot dict from a grid_editor_result.

    The snapshot records the current UTC time (ISO 8601), a fixed format
    version of 1, the given ``pipeline`` label, the flattened comparison
    cells, and a summary holding the number of zones, the summed number of
    columns and rows across all zones, and the number of cells.
    """
    flat_cells = _extract_cells_for_comparison(grid_result)
    zones = grid_result.get("zones", [])

    # Accumulate per-zone column and row counts in one pass over the zones.
    column_count = 0
    row_count = 0
    for zone in zones:
        column_count += len(zone.get("columns", []))
        row_count += len(zone.get("rows", []))

    counts = {
        "total_zones": len(zones),
        "total_columns": column_count,
        "total_rows": row_count,
        "total_cells": len(flat_cells),
    }

    return {
        "saved_at": datetime.now(timezone.utc).isoformat(),
        "version": 1,
        "pipeline": pipeline,
        "summary": counts,
        "cells": flat_cells,
    }


def compare_grids(reference: dict, current: dict) -> dict:
    """Compare a reference grid snapshot with a newly computed one.

    Both arguments are snapshot dicts with an optional ``summary`` dict and
    an optional ``cells`` list.  A ``summary`` or ``cells`` value that is
    missing or ``None`` is treated as empty.

    Cells are matched by ``cell_id`` (each cell must carry that key); if a
    snapshot contains the same ``cell_id`` more than once, the last
    occurrence is the one compared.  Only ``text`` and ``col_type`` of
    matched cells are compared.

    Returns:
        A diff report dict with:
          - ``status``: ``"pass"`` when there are no differences, else ``"fail"``
          - ``structural_diffs``: differing ``total_zones`` / ``total_columns``
            / ``total_rows`` / ``total_cells`` summary values
          - ``cell_diffs``: individual cell changes, ordered as all
            ``cell_missing``, then all ``cell_added``, then per shared cell
            ``text_change`` followed by ``col_type_change``
          - ``summary``: counts per kind of difference

    Raises:
        KeyError: if a cell in either snapshot has no ``cell_id`` key.
    """
    # ``or`` (rather than a .get default) also covers keys holding None.
    ref_summary = reference.get("summary") or {}
    cur_summary = current.get("summary") or {}

    structural_diffs = []
    for key in ("total_zones", "total_columns", "total_rows", "total_cells"):
        ref_val = ref_summary.get(key, 0)
        cur_val = cur_summary.get(key, 0)
        if ref_val != cur_val:
            structural_diffs.append({
                "field": key,
                "reference": ref_val,
                "current": cur_val,
            })

    # Build cell lookup by cell_id
    ref_cells = {c["cell_id"]: c for c in reference.get("cells") or []}
    cur_cells = {c["cell_id"]: c for c in current.get("cells") or []}

    missing: List[Dict[str, Any]] = []
    changed: List[Dict[str, Any]] = []

    # Single pass over the reference: a cell is either missing from the
    # current grid or shared (and then checked for text / col_type changes).
    for cell_id, ref_cell in ref_cells.items():
        if cell_id not in cur_cells:
            missing.append({
                "type": "cell_missing",
                "cell_id": cell_id,
                "reference_text": ref_cell.get("text", ""),
            })
            continue

        cur_cell = cur_cells[cell_id]

        ref_text = ref_cell.get("text", "")
        cur_text = cur_cell.get("text", "")
        if ref_text != cur_text:
            changed.append({
                "type": "text_change",
                "cell_id": cell_id,
                "reference": ref_text,
                "current": cur_text,
            })

        ref_col_type = ref_cell.get("col_type", "")
        cur_col_type = cur_cell.get("col_type", "")
        if ref_col_type != cur_col_type:
            changed.append({
                "type": "col_type_change",
                "cell_id": cell_id,
                "reference": ref_col_type,
                "current": cur_col_type,
            })

    # Cells present in the current grid but not in the reference.
    added: List[Dict[str, Any]] = [
        {
            "type": "cell_added",
            "cell_id": cell_id,
            "current_text": cur_cell.get("text", ""),
        }
        for cell_id, cur_cell in cur_cells.items()
        if cell_id not in ref_cells
    ]

    # Keep the established report order: missing, added, then changes.
    cell_diffs: List[Dict[str, Any]] = missing + added + changed

    # Tally diff kinds in one pass instead of rescanning the list per kind.
    counts = dict.fromkeys(
        ("cell_missing", "cell_added", "text_change", "col_type_change"), 0
    )
    for diff in cell_diffs:
        counts[diff["type"]] += 1

    status = "pass" if not structural_diffs and not cell_diffs else "fail"

    return {
        "status": status,
        "structural_diffs": structural_diffs,
        "cell_diffs": cell_diffs,
        "summary": {
            "structural_changes": len(structural_diffs),
            "cells_missing": counts["cell_missing"],
            "cells_added": counts["cell_added"],
            "text_changes": counts["text_change"],
            "col_type_changes": counts["col_type_change"],
        },
    }