From fae826e1f7612f874b5ac69c34bd222bf55d6a44 Mon Sep 17 00:00:00 2001
From: Benjamin Admin <benjaminadmin@MacBook-Pro.local>
Date: Tue, 16 Jun 2026 20:22:57 +0200
Subject: [PATCH] =?UTF-8?q?fix(cra):=2035B-Datenblatt-Extraktion=20?=
 =?UTF-8?q?=E2=80=94=20Thinking-Mode=20aus=20(think=3Dfalse)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

qwen3.5:35b-a3b ist ein Thinking-Modell: Es generierte zuerst Reasoning und riss
damit das 90s-Timeout → leere Extraktion. llm_cascade wurde additiv um den
think-Parameter erweitert (der Cache-Key berücksichtigt think); der
Datenblatt-Extraktor setzt think=False → sauberes JSON in ~1s. Der Default für
alle anderen Cascade-Nutzer bleibt unverändert.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../compliance/services/cra_datasheet_extractor.py |  2 +-
 .../compliance/services/llm_cascade.py             | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/backend-compliance/compliance/services/cra_datasheet_extractor.py b/backend-compliance/compliance/services/cra_datasheet_extractor.py
index 917c7524..a136d8ec 100644
--- a/backend-compliance/compliance/services/cra_datasheet_extractor.py
+++ b/backend-compliance/compliance/services/cra_datasheet_extractor.py
@@ -165,7 +165,7 @@ async def extract_grenzen(text: str, max_chars: int = 20000) -> dict:
             res = await call_with_cascade(
                 system=_system_prompt(),
                 user=f"Datenblatt-Text:\n\n{excerpt}",
-                min_confidence=0.5, max_tokens=4000, model=_DATASHEET_MODEL,
+                min_confidence=0.5, max_tokens=4000, model=_DATASHEET_MODEL, think=False,
             )
             parsed = parse_grenzen_json(res.get("text", "") if isinstance(res, dict) else "")
             for key, entry in parsed.items():
diff --git a/backend-compliance/compliance/services/llm_cascade.py b/backend-compliance/compliance/services/llm_cascade.py
index a5eba899..47ffcde5 100644
--- a/backend-compliance/compliance/services/llm_cascade.py
+++ b/backend-compliance/compliance/services/llm_cascade.py
@@ -105,7 +105,7 @@ def _heuristic_confidence(response_text: str, input_len: int) -> float:
 async def _call_ollama(system: str, user: str,
                         max_tokens: int = 6000,
                         timeout: float = 90.0,
-                        model: str = "") -> str:
+                        model: str = "", think=None) -> str:
     base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
     model = model or os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b")
     payload = {
@@ -114,6 +114,10 @@ async def _call_ollama(system: str, user: str,
                      {"role": "user", "content": user}],
         "options": {"temperature": 0.05, "num_predict": max_tokens},
     }
+    # Thinking models (qwen3/qwen3.5) otherwise emit long reasoning first and
+    # blow the timeout; think=False makes them answer JSON directly (~1s).
+    if think is not None:
+        payload["think"] = think
     try:
         async with httpx.AsyncClient(timeout=timeout) as c:
             r = await c.post(f"{base.rstrip('/')}/api/chat", json=payload)
@@ -190,10 +194,12 @@ async def call_with_cascade(
     min_confidence: float = 0.6,
     max_tokens: int = 6000,
     model: str = "",
+    think=None,
 ) -> dict:
     """Returns {'text': str, 'confidence': float, 'source': str,
-    'cached': bool}. `model` overrides the local Tier-1 (Ollama) model only."""
-    key = _cache_key(system, user, model)
+    'cached': bool}. `model` overrides the local Tier-1 (Ollama) model only;
+    `think` toggles thinking mode on the local model (False = direct answer)."""
+    key = _cache_key(system, user, f"{model}|think={think}")
     cached = _cache_get(key)
     if cached:
         cached["cached"] = True
@@ -213,7 +219,7 @@ async def call_with_cascade(
                 "or ANTHROPIC_API_KEY to enable fallbacks."
             )
     # Tier 1: Qwen lokal
-    text = await _call_ollama(system, user, max_tokens=max_tokens, model=model)
+    text = await _call_ollama(system, user, max_tokens=max_tokens, model=model, think=think)
     conf = _heuristic_confidence(text, input_len)
     if text and conf >= min_confidence:
         out = {"text": text, "confidence": conf,