fix(cra): 35B-Datenblatt-Extraktion — Thinking-Mode aus (think=false)

qwen3.5:35b-a3b ist ein Thinking-Modell: Es generierte zuerst Reasoning und
riss damit das 90s-Timeout, was zu einer leeren Extraktion fuehrte.
llm_cascade wurde additiv um den think-Parameter erweitert (der Cache-Key
beruecksichtigt think); der Datenblatt-Extraktor setzt think=False und erhaelt
so sauberes JSON in ~1s. Der Default fuer alle anderen Cascade-Nutzer bleibt
unveraendert.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-16 20:22:57 +02:00
parent b217429d39
commit fae826e1f7
2 changed files with 11 additions and 5 deletions
@@ -165,7 +165,7 @@ async def extract_grenzen(text: str, max_chars: int = 20000) -> dict:
res = await call_with_cascade( res = await call_with_cascade(
system=_system_prompt(), system=_system_prompt(),
user=f"Datenblatt-Text:\n\n{excerpt}", user=f"Datenblatt-Text:\n\n{excerpt}",
min_confidence=0.5, max_tokens=4000, model=_DATASHEET_MODEL, min_confidence=0.5, max_tokens=4000, model=_DATASHEET_MODEL, think=False,
) )
parsed = parse_grenzen_json(res.get("text", "") if isinstance(res, dict) else "") parsed = parse_grenzen_json(res.get("text", "") if isinstance(res, dict) else "")
for key, entry in parsed.items(): for key, entry in parsed.items():
@@ -105,7 +105,7 @@ def _heuristic_confidence(response_text: str, input_len: int) -> float:
async def _call_ollama(system: str, user: str, async def _call_ollama(system: str, user: str,
max_tokens: int = 6000, max_tokens: int = 6000,
timeout: float = 90.0, timeout: float = 90.0,
model: str = "") -> str: model: str = "", think=None) -> str:
base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
model = model or os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b") model = model or os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b")
payload = { payload = {
@@ -114,6 +114,10 @@ async def _call_ollama(system: str, user: str,
{"role": "user", "content": user}], {"role": "user", "content": user}],
"options": {"temperature": 0.05, "num_predict": max_tokens}, "options": {"temperature": 0.05, "num_predict": max_tokens},
} }
# Thinking models (qwen3/qwen3.5) otherwise emit long reasoning first and
# blow the timeout; think=False makes them answer JSON directly (~1s).
if think is not None:
payload["think"] = think
try: try:
async with httpx.AsyncClient(timeout=timeout) as c: async with httpx.AsyncClient(timeout=timeout) as c:
r = await c.post(f"{base.rstrip('/')}/api/chat", json=payload) r = await c.post(f"{base.rstrip('/')}/api/chat", json=payload)
@@ -190,10 +194,12 @@ async def call_with_cascade(
min_confidence: float = 0.6, min_confidence: float = 0.6,
max_tokens: int = 6000, max_tokens: int = 6000,
model: str = "", model: str = "",
think=None,
) -> dict: ) -> dict:
"""Returns {'text': str, 'confidence': float, 'source': str, """Returns {'text': str, 'confidence': float, 'source': str,
'cached': bool}. `model` overrides the local Tier-1 (Ollama) model only.""" 'cached': bool}. `model` overrides the local Tier-1 (Ollama) model only;
key = _cache_key(system, user, model) `think` toggles thinking mode on the local model (False = direct answer)."""
key = _cache_key(system, user, f"{model}|think={think}")
cached = _cache_get(key) cached = _cache_get(key)
if cached: if cached:
cached["cached"] = True cached["cached"] = True
@@ -213,7 +219,7 @@ async def call_with_cascade(
"or ANTHROPIC_API_KEY to enable fallbacks." "or ANTHROPIC_API_KEY to enable fallbacks."
) )
# Tier 1: Qwen lokal # Tier 1: Qwen lokal
text = await _call_ollama(system, user, max_tokens=max_tokens, model=model) text = await _call_ollama(system, user, max_tokens=max_tokens, model=model, think=think)
conf = _heuristic_confidence(text, input_len) conf = _heuristic_confidence(text, input_len)
if text and conf >= min_confidence: if text and conf >= min_confidence:
out = {"text": text, "confidence": conf, out = {"text": text, "confidence": conf,