[split-required] Split 500-1000 LOC files across all services
backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
393
klausur-service/backend/ocr_pipeline_words_detect.py
Normal file
393
klausur-service/backend/ocr_pipeline_words_detect.py
Normal file
@@ -0,0 +1,393 @@
|
||||
"""
|
||||
OCR Pipeline Words Detect — main word detection endpoint (Step 7).
|
||||
|
||||
Extracted from ocr_pipeline_words.py. Contains the ``detect_words``
|
||||
endpoint which handles both v2 and words_first grid methods.
|
||||
|
||||
Lizenz: Apache 2.0
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import numpy as np
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from cv_vocab_pipeline import (
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
_cells_to_vocab_entries,
|
||||
_fix_phonetic_brackets,
|
||||
fix_cell_phonetics,
|
||||
build_cell_grid_v2,
|
||||
create_ocr_image,
|
||||
detect_column_geometry,
|
||||
)
|
||||
from cv_words_first import build_grid_from_words
|
||||
from ocr_pipeline_session_store import (
|
||||
get_session_db,
|
||||
update_session_db,
|
||||
)
|
||||
from ocr_pipeline_common import (
|
||||
_cache,
|
||||
_load_session_to_cache,
|
||||
_get_cached,
|
||||
_append_pipeline_log,
|
||||
)
|
||||
from ocr_pipeline_words_stream import (
|
||||
_word_batch_stream_generator,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Word Detection Endpoint (Step 7)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/sessions/{session_id}/words")
|
||||
async def detect_words(
|
||||
session_id: str,
|
||||
request: Request,
|
||||
engine: str = "auto",
|
||||
pronunciation: str = "british",
|
||||
stream: bool = False,
|
||||
skip_heal_gaps: bool = False,
|
||||
grid_method: str = "v2",
|
||||
):
|
||||
"""Build word grid from columns x rows, OCR each cell.
|
||||
|
||||
Query params:
|
||||
engine: 'auto' (default), 'tesseract', 'rapid', or 'paddle'
|
||||
pronunciation: 'british' (default) or 'american'
|
||||
stream: false (default) for JSON response, true for SSE streaming
|
||||
skip_heal_gaps: false (default). When true, cells keep exact row geometry.
|
||||
grid_method: 'v2' (default) or 'words_first'
|
||||
"""
|
||||
# PaddleOCR is full-page remote OCR -> force words_first grid method
|
||||
if engine == "paddle" and grid_method != "words_first":
|
||||
logger.info("detect_words: engine=paddle requires words_first, overriding grid_method=%s", grid_method)
|
||||
grid_method = "words_first"
|
||||
|
||||
if session_id not in _cache:
|
||||
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
|
||||
await _load_session_to_cache(session_id)
|
||||
cached = _get_cached(session_id)
|
||||
|
||||
dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
|
||||
if dewarped_bgr is None:
|
||||
logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
|
||||
session_id, [k for k in cached.keys() if k.endswith('_bgr')])
|
||||
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")
|
||||
|
||||
session = await get_session_db(session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
|
||||
|
||||
column_result = session.get("column_result")
|
||||
row_result = session.get("row_result")
|
||||
if not column_result or not column_result.get("columns"):
|
||||
img_h_tmp, img_w_tmp = dewarped_bgr.shape[:2]
|
||||
column_result = {
|
||||
"columns": [{
|
||||
"type": "column_text",
|
||||
"x": 0, "y": 0,
|
||||
"width": img_w_tmp, "height": img_h_tmp,
|
||||
"classification_confidence": 1.0,
|
||||
"classification_method": "full_page_fallback",
|
||||
}],
|
||||
"zones": [],
|
||||
"duration_seconds": 0,
|
||||
}
|
||||
logger.info("detect_words: no column_result -- using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
|
||||
if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
|
||||
raise HTTPException(status_code=400, detail="Row detection must be completed first")
|
||||
|
||||
# Convert column dicts back to PageRegion objects
|
||||
col_regions = [
|
||||
PageRegion(
|
||||
type=c["type"],
|
||||
x=c["x"], y=c["y"],
|
||||
width=c["width"], height=c["height"],
|
||||
classification_confidence=c.get("classification_confidence", 1.0),
|
||||
classification_method=c.get("classification_method", ""),
|
||||
)
|
||||
for c in column_result["columns"]
|
||||
]
|
||||
|
||||
# Convert row dicts back to RowGeometry objects
|
||||
row_geoms = [
|
||||
RowGeometry(
|
||||
index=r["index"],
|
||||
x=r["x"], y=r["y"],
|
||||
width=r["width"], height=r["height"],
|
||||
word_count=r.get("word_count", 0),
|
||||
words=[],
|
||||
row_type=r.get("row_type", "content"),
|
||||
gap_before=r.get("gap_before", 0),
|
||||
)
|
||||
for r in row_result["rows"]
|
||||
]
|
||||
|
||||
# Populate word counts from cached words
|
||||
word_dicts = cached.get("_word_dicts")
|
||||
if word_dicts is None:
|
||||
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
||||
if geo_result is not None:
|
||||
_geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
|
||||
cached["_word_dicts"] = word_dicts
|
||||
cached["_inv"] = inv
|
||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||
|
||||
if word_dicts:
|
||||
content_bounds = cached.get("_content_bounds")
|
||||
if content_bounds:
|
||||
_lx, _rx, top_y, _by = content_bounds
|
||||
else:
|
||||
top_y = min(r.y for r in row_geoms) if row_geoms else 0
|
||||
|
||||
for row in row_geoms:
|
||||
row_y_rel = row.y - top_y
|
||||
row_bottom_rel = row_y_rel + row.height
|
||||
row.words = [
|
||||
w for w in word_dicts
|
||||
if row_y_rel <= w['top'] + w['height'] / 2 < row_bottom_rel
|
||||
]
|
||||
row.word_count = len(row.words)
|
||||
|
||||
# Exclude rows that fall within box zones
|
||||
zones = column_result.get("zones") or []
|
||||
box_ranges_inner = []
|
||||
for zone in zones:
|
||||
if zone.get("zone_type") == "box" and zone.get("box"):
|
||||
box = zone["box"]
|
||||
bt = max(box.get("border_thickness", 0), 5)
|
||||
box_ranges_inner.append((box["y"] + bt, box["y"] + box["height"] - bt))
|
||||
|
||||
if box_ranges_inner:
|
||||
def _row_in_box(r):
|
||||
center_y = r.y + r.height / 2
|
||||
return any(by_s <= center_y < by_e for by_s, by_e in box_ranges_inner)
|
||||
|
||||
before_count = len(row_geoms)
|
||||
row_geoms = [r for r in row_geoms if not _row_in_box(r)]
|
||||
excluded = before_count - len(row_geoms)
|
||||
if excluded:
|
||||
logger.info(f"detect_words: excluded {excluded} rows inside box zones")
|
||||
|
||||
# --- Words-First path ---
|
||||
if grid_method == "words_first":
|
||||
return await _words_first_path(
|
||||
session_id, cached, dewarped_bgr, engine, pronunciation, zones,
|
||||
)
|
||||
|
||||
if stream:
|
||||
return StreamingResponse(
|
||||
_word_batch_stream_generator(
|
||||
session_id, cached, col_regions, row_geoms,
|
||||
dewarped_bgr, engine, pronunciation, request,
|
||||
skip_heal_gaps=skip_heal_gaps,
|
||||
),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive",
|
||||
"X-Accel-Buffering": "no",
|
||||
},
|
||||
)
|
||||
|
||||
# --- Non-streaming path (grid_method=v2) ---
|
||||
return await _v2_path(
|
||||
session_id, cached, col_regions, row_geoms,
|
||||
dewarped_bgr, engine, pronunciation, skip_heal_gaps,
|
||||
)
|
||||
|
||||
|
||||
async def _words_first_path(
    session_id: str,
    cached: Dict[str, Any],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    zones: list,
) -> dict:
    """Words-first grid construction path.

    Builds the cell grid directly from detected word boxes (skipping the
    row/column grid), applies phonetic fixes, persists the result to the
    session DB and in-process cache, and returns the word_result payload.
    """
    started = time.time()
    img_h, img_w = dewarped_bgr.shape[:2]

    if engine == "paddle":
        # Full-page remote OCR: words come straight from PaddleOCR.
        from cv_ocr_engines import ocr_region_paddle
        words = await ocr_region_paddle(dewarped_bgr, region=None)
        cached["_paddle_word_dicts"] = words
    else:
        words = cached.get("_word_dicts")
        if words is None:
            # Lazily run local word-geometry detection and memoize results.
            geometry = detect_column_geometry(create_ocr_image(dewarped_bgr), dewarped_bgr)
            if geometry is not None:
                _geoms, left_x, right_x, top_y, bottom_y, words, inv = geometry
                cached["_word_dicts"] = words
                cached["_inv"] = inv
                cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    if not words:
        raise HTTPException(status_code=400, detail="No words detected -- cannot build words-first grid")

    # Local word coordinates are relative to the content bounds; shift them
    # to absolute page coordinates (paddle already returns absolute ones).
    if engine != "paddle":
        bounds = cached.get("_content_bounds")
        if bounds:
            lx, _rx, ty, _by = bounds
            words = [{**w, 'left': w['left'] + lx, 'top': w['top'] + ty} for w in words]

    box_rects = [z["box"] for z in zones if z.get("zone_type") == "box" and z.get("box")]

    cells, columns_meta = build_grid_from_words(
        words, img_w, img_h, box_rects=box_rects or None,
    )
    duration = time.time() - started

    fix_cell_phonetics(cells, pronunciation=pronunciation)
    for cell in cells:
        cell.setdefault("zone_index", 0)

    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    n_rows = len({c['row_index'] for c in cells}) if cells else 0
    n_cols = len(columns_meta)
    used_engine = "paddle" if engine == "paddle" else "words_first"

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "grid_method": "words_first",
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
        },
    }

    # Vocab layouts (and plain text columns) additionally get entry lists.
    if is_vocab or 'column_text' in col_types:
        entries = _fix_phonetic_brackets(
            _cells_to_vocab_entries(cells, columns_meta),
            pronunciation=pronunciation,
        )
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        summary = word_result["summary"]
        summary["total_entries"] = len(entries)
        summary["with_english"] = sum(1 for e in entries if e.get("english"))
        summary["with_german"] = sum(1 for e in entries if e.get("german"))

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words-first session {session_id}: "
                f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")

    await _append_pipeline_log(session_id, "words", {
        "grid_method": "words_first",
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}
|
||||
|
||||
|
||||
async def _v2_path(
    session_id: str,
    cached: Dict[str, Any],
    col_regions: List[PageRegion],
    row_geoms: List[RowGeometry],
    dewarped_bgr: np.ndarray,
    engine: str,
    pronunciation: str,
    skip_heal_gaps: bool,
) -> dict:
    """Cell-First OCR v2 non-streaming path.

    Builds the cell grid from column regions x row geometries, OCRs every
    cell, persists the result to the session DB and in-process cache, and
    returns the word_result payload.
    """
    started = time.time()
    ocr_img = create_ocr_image(dewarped_bgr)
    img_h, img_w = dewarped_bgr.shape[:2]

    cells, columns_meta = build_cell_grid_v2(
        ocr_img, col_regions, row_geoms, img_w, img_h,
        ocr_engine=engine, img_bgr=dewarped_bgr,
        skip_heal_gaps=skip_heal_gaps,
    )
    duration = time.time() - started

    for cell in cells:
        cell.setdefault("zone_index", 0)

    col_types = {c['type'] for c in columns_meta}
    is_vocab = bool(col_types & {'column_en', 'column_de'})
    content_rows = sum(1 for r in row_geoms if r.row_type == 'content')
    n_cols = len(columns_meta)
    # Report whichever engine actually produced the first cell.
    used_engine = cells[0].get("ocr_engine", "tesseract") if cells else engine

    fix_cell_phonetics(cells, pronunciation=pronunciation)

    summary = {
        "total_cells": len(cells),
        "non_empty_cells": sum(1 for c in cells if c.get("text")),
        "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
    }
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": content_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": used_engine,
        "summary": summary,
    }

    # Vocab layouts (and plain text columns) additionally get entry lists.
    if is_vocab or 'column_text' in col_types:
        entries = _fix_phonetic_brackets(
            _cells_to_vocab_entries(cells, columns_meta),
            pronunciation=pronunciation,
        )
        word_result["vocab_entries"] = entries
        word_result["entries"] = entries
        word_result["entry_count"] = len(entries)
        summary["total_entries"] = len(entries)
        summary["with_english"] = sum(1 for e in entries if e.get("english"))
        summary["with_german"] = sum(1 for e in entries if e.get("german"))

    await update_session_db(session_id, word_result=word_result, current_step=8)
    cached["word_result"] = word_result

    logger.info(f"OCR Pipeline: words session {session_id}: "
                f"layout={word_result['layout']}, "
                f"{len(cells)} cells ({duration:.2f}s), summary: {word_result['summary']}")

    await _append_pipeline_log(session_id, "words", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "low_confidence_count": word_result["summary"]["low_confidence"],
        "ocr_engine": used_engine,
        "layout": word_result["layout"],
        "entry_count": word_result.get("entry_count", 0),
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}
|
||||
Reference in New Issue
Block a user