feat: OCR pipeline v2.1 – narrow column OCR, dewarp automation, Fabric.js editor
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 24s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m50s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 15s

Proposal B: Adaptive padding, crop upscaling, PSM selection, and row-strip re-OCR
for narrow columns (<15% of page width) – expected accuracy improvement from 60-70% to 85-90%.
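The Proposal B crop-preparation step can be sketched as a small decision function. This is an illustrative sketch only: the function name, thresholds, and returned keys are assumptions, not the repo's actual API; the real pipeline would feed the result into Tesseract via its `--psm` option.

```python
# Hypothetical sketch of Proposal B's crop preparation: pad narrow column
# crops, upscale them, and pick a Tesseract PSM based on crop shape.
# All names and thresholds here are illustrative assumptions.

def prepare_narrow_crop(crop_w: int, crop_h: int, page_w: int) -> dict:
    """Return padding, upscale factor, and PSM for one column crop."""
    width_frac = crop_w / page_w
    if width_frac < 0.15:
        # Narrow columns get extra horizontal padding and upscaling so
        # the OCR engine sees enough context around each character.
        pad = max(8, int(crop_w * 0.25))          # adaptive padding
        scale = min(4.0, 0.15 * page_w / crop_w)  # upscale toward 15% width
        # Single-block PSM for tall strips, single-line PSM for short ones.
        psm = 6 if crop_h > 2 * crop_w else 7
    else:
        pad, scale, psm = 4, 1.0, 6
    return {"pad_px": pad, "scale": round(scale, 2), "psm": psm}
```

A 100 px-wide crop on a 1000 px page (10% width) would be padded by 25 px, upscaled 1.5x, and OCR'd with PSM 6 if it is a tall strip.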

Proposal A: New text-line straightness detector (Method D), quality gate
(rejects counterproductive corrections), 2-pass projection refinement,
higher confidence thresholds – expected to reduce manual dewarping to <10% of pages.
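The straightness detector and quality gate from Proposal A can be sketched as follows. This is a minimal sketch under stated assumptions, not the commit's actual Method D: straightness is scored here as the mean squared residual of each text baseline from its least-squares line, and the gate (hypothetical `min_gain` threshold) only accepts a dewarp that improves that score by a relative margin.

```python
# Illustrative sketch of a text-line straightness score plus a quality
# gate in the spirit of Proposal A: a dewarp result is kept only if it
# measurably improves straightness. Names and thresholds are assumptions.
from statistics import fmean

def straightness_score(baselines: list[list[tuple[float, float]]]) -> float:
    """Mean squared residual of baseline points from their best-fit line.

    Lower is straighter; 0.0 means perfectly straight text lines.
    """
    residuals = []
    for pts in baselines:
        if len(pts) < 3:
            continue
        xs = [p[0] for p in pts]
        ys = [p[1] for p in pts]
        mx, my = fmean(xs), fmean(ys)
        sxx = sum((x - mx) ** 2 for x in xs)
        sxy = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
        slope = sxy / sxx if sxx else 0.0
        residuals.extend((y - (my + slope * (x - mx))) ** 2
                         for x, y in zip(xs, ys))
    return fmean(residuals) if residuals else 0.0

def quality_gate(score_before: float, score_after: float,
                 min_gain: float = 0.10) -> bool:
    """Accept a dewarp only if it cuts the residual by at least min_gain."""
    if score_before == 0.0:
        return False  # already straight: any "correction" is rejected
    return (score_before - score_after) / score_before >= min_gain
```

Rejecting corrections whose relative gain falls below the threshold is what makes the gate filter out the counterproductive dewarps the commit message mentions.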

Proposal C: Fabric.js canvas editor with drag/drop, inline editing, undo/redo,
opacity slider, zoom, PDF/DOCX export endpoints.
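The undo/redo behaviour from Proposal C boils down to a two-stack snapshot history. The sketch below shows the idea in Python for illustration; in the actual editor this state would live client-side as Fabric.js canvas JSON snapshots, and the class name is hypothetical.

```python
# Conceptual sketch of the editor's undo/redo from Proposal C, using the
# classic two-stack pattern over canvas-state snapshots. The real editor
# keeps this history client-side in JavaScript; names are illustrative.
class HistoryStack:
    def __init__(self, initial: dict):
        self._undo = [initial]  # snapshots up to and including current state
        self._redo = []

    def push(self, snapshot: dict) -> None:
        """Record a new edit; any pending redo branch is discarded."""
        self._undo.append(snapshot)
        self._redo.clear()

    def undo(self) -> dict:
        if len(self._undo) > 1:
            self._redo.append(self._undo.pop())
        return self._undo[-1]  # current state after stepping back

    def redo(self) -> dict:
        if self._redo:
            self._undo.append(self._redo.pop())
        return self._undo[-1]
```

Discarding the redo stack on every new edit is the standard choice: after undoing and then making a fresh edit, the abandoned branch is no longer reachable.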

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Benjamin Admin
Date: 2026-03-03 22:44:14 +01:00
Parent: 970ec1f548
Commit: ab3ecc7c08
7 changed files with 1105 additions and 128 deletions


@@ -350,6 +350,77 @@ def layout_to_fabric_json(layout_result: LayoutResult) -> str:
    return json.dumps(layout_result.fabric_json, ensure_ascii=False, indent=2)


def cells_to_fabric_json(
    cells: List[Dict[str, Any]],
    image_width: int,
    image_height: int,
) -> Dict[str, Any]:
    """Convert pipeline grid cells to Fabric.js-compatible JSON.

    Each cell becomes a Textbox object positioned at its bbox_pct coordinates
    (converted to pixels). Colour-coded by column type.

    Args:
        cells: List of cell dicts from GridResult (with bbox_pct, col_type, text).
        image_width: Source image width in pixels.
        image_height: Source image height in pixels.

    Returns:
        Dict with Fabric.js canvas JSON (version + objects array).
    """
    COL_TYPE_COLORS = {
        'column_en': '#3b82f6',
        'column_de': '#22c55e',
        'column_example': '#f97316',
        'column_text': '#a855f7',
        'page_ref': '#06b6d4',
        'column_marker': '#6b7280',
    }
    fabric_objects = []
    for cell in cells:
        bp = cell.get('bbox_pct', {})
        x = bp.get('x', 0) / 100 * image_width
        y = bp.get('y', 0) / 100 * image_height
        w = bp.get('w', 10) / 100 * image_width
        h = bp.get('h', 3) / 100 * image_height
        col_type = cell.get('col_type', '')
        color = COL_TYPE_COLORS.get(col_type, '#6b7280')
        font_size = max(8, min(18, h * 0.55))
        fabric_objects.append({
            "type": "textbox",
            "version": "6.0.0",
            "originX": "left",
            "originY": "top",
            "left": round(x, 1),
            "top": round(y, 1),
            "width": max(round(w, 1), 30),
            "height": round(h, 1),
            "fill": "#000000",
            "stroke": color,
            "strokeWidth": 1,
            "text": cell.get('text', ''),
            "fontSize": round(font_size, 1),
            "fontFamily": "monospace",
            "editable": True,
            "selectable": True,
            "backgroundColor": color + "22",
            "data": {
                "cellId": cell.get('cell_id', ''),
                "colType": col_type,
                "rowIndex": cell.get('row_index', 0),
                "colIndex": cell.get('col_index', 0),
                "originalText": cell.get('text', ''),
            },
        })
    return {
        "version": "6.0.0",
        "objects": fabric_objects,
    }


def reconstruct_and_clean(
    image_bytes: bytes,
    remove_handwriting: bool = True