diff --git a/backend-compliance/compliance/services/cra_datasheet_extractor.py b/backend-compliance/compliance/services/cra_datasheet_extractor.py index 917c7524..a136d8ec 100644 --- a/backend-compliance/compliance/services/cra_datasheet_extractor.py +++ b/backend-compliance/compliance/services/cra_datasheet_extractor.py @@ -165,7 +165,7 @@ async def extract_grenzen(text: str, max_chars: int = 20000) -> dict: res = await call_with_cascade( system=_system_prompt(), user=f"Datenblatt-Text:\n\n{excerpt}", - min_confidence=0.5, max_tokens=4000, model=_DATASHEET_MODEL, + min_confidence=0.5, max_tokens=4000, model=_DATASHEET_MODEL, think=False, ) parsed = parse_grenzen_json(res.get("text", "") if isinstance(res, dict) else "") for key, entry in parsed.items(): diff --git a/backend-compliance/compliance/services/llm_cascade.py b/backend-compliance/compliance/services/llm_cascade.py index a5eba899..47ffcde5 100644 --- a/backend-compliance/compliance/services/llm_cascade.py +++ b/backend-compliance/compliance/services/llm_cascade.py @@ -105,7 +105,7 @@ def _heuristic_confidence(response_text: str, input_len: int) -> float: async def _call_ollama(system: str, user: str, max_tokens: int = 6000, timeout: float = 90.0, - model: str = "") -> str: + model: str = "", think=None) -> str: base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") model = model or os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b") payload = { @@ -114,6 +114,10 @@ async def _call_ollama(system: str, user: str, {"role": "user", "content": user}], "options": {"temperature": 0.05, "num_predict": max_tokens}, } + # Thinking models (qwen3/qwen3.5) otherwise emit long reasoning first and + # blow the timeout; think=False makes them answer JSON directly (~1s). + if think is not None: + payload["think"] = think try: async with httpx.AsyncClient(timeout=timeout) as c: r = await c.post(f"{base.rstrip('/')}/api/chat", json=payload) @@ -190,10 +194,12 @@ async def call_with_cascade( min_confidence: float = 0.6, max_tokens: int = 6000, model: str = "", + think=None, ) -> dict: """Returns {'text': str, 'confidence': float, 'source': str, - 'cached': bool}. `model` overrides the local Tier-1 (Ollama) model only.""" - key = _cache_key(system, user, model) + 'cached': bool}. `model` overrides the local Tier-1 (Ollama) model only; + `think` toggles thinking mode on the local model (False = direct answer).""" + key = _cache_key(system, user, f"{model}|think={think}") cached = _cache_get(key) if cached: cached["cached"] = True @@ -213,7 +219,7 @@ async def call_with_cascade( "or ANTHROPIC_API_KEY to enable fallbacks." ) # Tier 1: Qwen lokal - text = await _call_ollama(system, user, max_tokens=max_tokens, model=model) + text = await _call_ollama(system, user, max_tokens=max_tokens, model=model, think=think) conf = _heuristic_confidence(text, input_len) if text and conf >= min_confidence: out = {"text": text, "confidence": conf,