fix: retry words request on 400/404 + add backend diagnostic logging

Frontend: retry /words POST once after 2s delay if it gets 400/404, which happens when navigating via wizard after container restart (session cache not yet warm). Backend: log when session needs DB reload and when dewarped_bgr is missing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix: Fake Compliance Advisor aus Lehrer KI-Admin entfernt
2026-03-04 20:15:54 +01:00 · 2026-03-04 20:15:50 +01:00 · 2026-03-04 19:47:36 +01:00 · 2026-03-04 18:18:03 +01:00 · 2026-03-04 17:38:06 +01:00 · 2026-03-04 16:45:59 +01:00
5 changed files with 81 additions and 73 deletions
--- a/admin-lehrer/app/(admin)/ai/agents/[agentId]/page.tsx
+++ b/admin-lehrer/app/(admin)/ai/agents/[agentId]/page.tsx
@@ -273,52 +273,6 @@ Dein Ziel ist die rechtzeitige Erkennung und Kommunikation relevanter Ereignisse
    createdAt: '2024-12-01T00:00:00Z',
    updatedAt: '2025-01-12T02:00:00Z'
  },
-  'compliance-advisor': {
-    id: 'compliance-advisor',
-    name: 'Compliance Advisor',
-    description: 'DSGVO/Compliance-Berater fuer SDK-Nutzer',
-    soulFile: 'compliance-advisor.soul.md',
-    soulContent: `# Compliance Advisor Agent
-
-## Identitaet
-Du bist der BreakPilot Compliance-Berater. Du hilfst Nutzern des AI Compliance SDK,
-Datenschutz- und Compliance-Fragen in verstaendlicher Sprache zu beantworten.
-Du bist kein Anwalt und gibst keine Rechtsberatung, sondern orientierst dich an
-offiziellen Quellen und gibst praxisnahe Hinweise.
-
-## Kernprinzipien
- **Quellenbasiert**: Verweise immer auf konkrete Rechtsgrundlagen (DSGVO-Artikel, BDSG-Paragraphen)
- **Verstaendlich**: Erklaere rechtliche Konzepte in einfacher, praxisnaher Sprache
- **Ehrlich**: Bei Unsicherheit empfehle professionelle Rechtsberatung
- **Kontextbewusst**: Nutze das RAG-System fuer aktuelle Rechtstexte und Leitfaeden
- **Scope-bewusst**: Nutze alle verfuegbaren RAG-Quellen AUSSER NIBIS-Dokumenten
-
-## Kompetenzbereich
- DSGVO Art. 1-99 + Erwaegsgruende
- BDSG (Bundesdatenschutzgesetz)
- AI Act (EU KI-Verordnung)
- TTDSG, ePrivacy-Richtlinie
- DSK-Kurzpapiere (Nr. 1-20)
- SDM V3.0, BSI-Grundschutz, BSI-TR-03161
- EDPB Guidelines, Bundes-/Laender-Muss-Listen
- ISO 27001/27701 (Ueberblick)
-
-## Kommunikationsstil
- Sachlich, aber verstaendlich
- Deutsch als Hauptsprache
- Strukturierte Antworten mit Quellenangabe
- Praxisbeispiele wo hilfreich`,
-    color: '#6366f1',
-    status: 'running',
-    activeSessions: 0,
-    totalProcessed: 0,
-    avgResponseTime: 0,
-    errorRate: 0,
-    lastRestart: new Date().toISOString(),
-    version: '1.0.0',
-    createdAt: new Date().toISOString(),
-    updatedAt: new Date().toISOString()
-  },
  'orchestrator': {
    id: 'orchestrator',
    name: 'Orchestrator',
--- a/admin-lehrer/app/(admin)/ai/agents/page.tsx
+++ b/admin-lehrer/app/(admin)/ai/agents/page.tsx
@@ -94,19 +94,6 @@ const mockAgents: AgentConfig[] = [
    totalProcessed: 8934,
    avgResponseTime: 12,
    lastActivity: 'just now'
-  },
-  {
-    id: 'compliance-advisor',
-    name: 'Compliance Advisor',
-    description: 'DSGVO/Compliance-Berater fuer SDK-Nutzer',
-    soulFile: 'compliance-advisor.soul.md',
-    color: '#6366f1',
-    icon: 'message',
-    status: 'running',
-    activeSessions: 0,
-    totalProcessed: 0,
-    avgResponseTime: 0,
-    lastActivity: new Date().toISOString()
  }
 ]

--- a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx
+++ b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx
@@ -105,12 +105,24 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
    setGridResult(null)

    try {
-      const res = await fetch(
-        `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=true&engine=${eng}&pronunciation=${pronunciation}`,
-        { method: 'POST' },
-      )
-      if (!res.ok) {
-        const err = await res.json().catch(() => ({ detail: res.statusText }))
+      // Retry once if the initial request returns 400/404 (e.g. after a container
+      // restart the session cache may not be warm yet when navigating via wizard)
+      let res: Response | null = null
+      for (let attempt = 0; attempt < 2; attempt++) {
+        res = await fetch(
+          `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=true&engine=${eng}&pronunciation=${pronunciation}`,
+          { method: 'POST' },
+        )
+        if (res.ok) break
+        if (attempt === 0 && (res.status === 400 || res.status === 404)) {
+          // Wait briefly for cache to warm up, then retry
+          await new Promise(r => setTimeout(r, 2000))
+          continue
+        }
+        break
+      }
+      if (!res || !res.ok) {
+        const err = await res?.json().catch(() => ({ detail: res?.statusText })) || { detail: 'Worterkennung fehlgeschlagen' }
        throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
      }

--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -3692,7 +3692,8 @@ def _get_rapid_engine():
            "Rec.ocr_version": _OCRVersion.PPOCRV5,
            # Tighter detection boxes to reduce word merging
            "Det.unclip_ratio": 1.3,
-            "Det.box_thresh": 0.6,
+            # Lower threshold to detect small chars (periods, ellipsis, phonetics)
+            "Det.box_thresh": 0.4,
            # Silence verbose logging
            "Global.log_level": "critical",
        })
@@ -4703,11 +4704,16 @@ def _ocr_cell_crop(
    disp_w = col.width
    disp_h = row.height

-    # Crop boundaries (clamped to image)
-    cx = max(0, disp_x)
-    cy = max(0, disp_y)
-    cw = min(disp_w, img_w - cx)
-    ch = min(disp_h, img_h - cy)
+    # Crop boundaries: pad the cell by 3px on each side to avoid clipping
+    # characters near column/row edges (e.g. parentheses, descenders). The crop
+    # is clamped to the image bounds but may extend slightly beyond the strict cell.
+    _PAD = 3
+    cx = max(0, disp_x - _PAD)
+    cy = max(0, disp_y - _PAD)
+    cx2 = min(img_w, disp_x + disp_w + _PAD)
+    cy2 = min(img_h, disp_y + disp_h + _PAD)
+    cw = cx2 - cx
+    ch = cy2 - cy

    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
@@ -4727,6 +4733,7 @@ def _ocr_cell_crop(
    }

    if cw <= 0 or ch <= 0:
+        logger.info("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
@@ -4735,6 +4742,8 @@ def _ocr_cell_crop(
        if crop.size > 0:
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
+                logger.info("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
+                            row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
@@ -4752,8 +4761,43 @@ def _ocr_cell_crop(
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
-        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
-        words = ocr_region_rapid(img_bgr, cell_region)
+        # Upscale small BGR crops for RapidOCR.
+        # Cell crops typically have height 35-55px but width >300px.
+        # _ensure_minimum_crop_size only scales when EITHER dim < min_dim,
+        # using uniform scale → a 365×54 crop becomes ~1014×150 (scale ~2.78).
+        # For very short heights (< 80px), force 3× upscale for better OCR
+        # of small characters like periods, ellipsis, and phonetic symbols.
+        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
+        if bgr_crop.size == 0:
+            words = []
+        else:
+            crop_h, crop_w = bgr_crop.shape[:2]
+            if crop_h < 80:
+                # Force 3× upscale for short rows — small chars need more pixels
+                scale = 3.0
+                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
+                                    interpolation=cv2.INTER_CUBIC)
+            else:
+                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
+            up_h, up_w = bgr_up.shape[:2]
+            scale_x = up_w / max(crop_w, 1)
+            scale_y = up_h / max(crop_h, 1)
+            was_scaled = (up_w != crop_w or up_h != crop_h)
+            logger.info("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
+                        row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
+            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
+            words = ocr_region_rapid(bgr_up, tmp_region)
+            # Remap positions back to original image coords
+            if words and was_scaled:
+                for w in words:
+                    w['left'] = int(w['left'] / scale_x) + cx
+                    w['top'] = int(w['top'] / scale_y) + cy
+                    w['width'] = int(w['width'] / scale_x)
+                    w['height'] = int(w['height'] / scale_y)
+            elif words:
+                for w in words:
+                    w['left'] += cx
+                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
@@ -4787,6 +4831,11 @@ def _ocr_cell_crop(
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
+        logger.info("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
+                    row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
+    else:
+        logger.info("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
+                    row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
@@ -4808,8 +4857,11 @@ def _ocr_cell_crop(

    # --- Noise filter ---
    if text.strip():
+        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
+            logger.info("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
+                        row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -1204,11 +1204,14 @@ async def detect_words(
        stream: false (default) for JSON response, true for SSE streaming
    """
    if session_id not in _cache:
+        logger.info("detect_words: session %s not in cache, loading from DB", session_id)
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    dewarped_bgr = cached.get("dewarped_bgr")
    if dewarped_bgr is None:
+        logger.warning("detect_words: dewarped_bgr is None for session %s (cache keys: %s)",
+                       session_id, [k for k in cached.keys() if k.endswith('_bgr')])
        raise HTTPException(status_code=400, detail="Dewarp must be completed before word detection")

    session = await get_session_db(session_id)
Author	SHA1	Message	Date
Benjamin Admin	dd16c88007	fix: retry words request on 400/404 + add backend diagnostic logging Some checks failed CI / go-lint (push) Has been skipped Details CI / python-lint (push) Has been skipped Details CI / nodejs-lint (push) Has been skipped Details CI / test-go-school (push) Successful in 26s Details CI / test-go-edu-search (push) Successful in 26s Details CI / test-python-klausur (push) Failing after 1m55s Details CI / test-python-agent-core (push) Successful in 17s Details CI / test-nodejs-website (push) Successful in 18s Details Frontend: retry /words POST once after 2s delay if it gets 400/404, which happens when navigating via wizard after container restart (session cache not yet warm). Backend: log when session needs DB reload and when dewarped_bgr is missing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-04 20:15:54 +01:00
Benjamin Admin	9cbf0fb278	fix: Fake Compliance Advisor aus Lehrer KI-Admin entfernt Der Compliance Advisor gehoert ins Compliance SDK (macmini:3007/sdk/agents), nicht ins Lehrer-Admin. Die verbleibenden 5 Agenten (TutorAgent, GraderAgent, QualityJudge, AlertAgent, Orchestrator) bleiben erhalten. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-04 20:15:50 +01:00
Benjamin Admin	90ecb46bed	fix: force 3x upscale for short RapidOCR crops + lower box_thresh - Short cell crops (<80px height) are always 3x upscaled for RapidOCR to improve recognition of periods, ellipsis, and phonetic symbols - Lowered Det.box_thresh from 0.6 to 0.4 to detect small characters that were being filtered out (dots, brackets, IPA symbols) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-04 19:47:36 +01:00
Benjamin Admin	bb0e23303c	debug: log RapidOCR upscale dimensions to verify scaling Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-04 18:18:03 +01:00
Benjamin Admin	604da26b24	fix: upscale RapidOCR crops to min 150px (was 64px), matching Tesseract Cell crops of 35-54px height were too small for RapidOCR to detect text reliably. Uses _ensure_minimum_crop_size(min_dim=150) for consistent upscaling across all OCR engines. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-04 17:38:06 +01:00
Benjamin Admin	113a1c10e5	fix: add 3px cell padding + upscale small RapidOCR crops + diagnostic logging - Add 3px padding around cell crops to avoid clipping edge characters (parentheses in "Tanz(veranstaltung)", descenders, etc.) - Upscale small BGR crops for RapidOCR, same as Tesseract path - Add info-level diagnostic logging to _ocr_cell_crop for debugging Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-04 16:45:59 +01:00
Benjamin Admin	e4bdb3cc24	debug: add diagnostic logging to _ocr_cell_crop for empty cell investigation Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-04 16:35:33 +01:00