fix(llm-dedup): increase timeout to 120s, add /no_think, limit output to 200 tokens

qwen3.5 uses extended thinking by default, which makes responses take 95s or more and exceed the previous 30s client timeout. Add /no_think to the system prompt and set num_predict=200 to keep responses short, and raise the HTTP client timeout from 30s to 120s.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-22 20:27:58 +02:00
parent 151bf3d322
commit 2188d6645e
1 changed file with 3 additions and 2 deletions
@@ -1475,14 +1475,15 @@ async def _run_llm_dedup(req: LLMDedupRequest, job_id: str):

                prompt = f"Control A ({row.candidate_control_id}):\n{candidate_ctx}\n\nControl B ({row.matched_control_id}):\n{matched_ctx}\n\nSind diese Controls Duplikate?"

-                async with httpx.AsyncClient(timeout=30.0) as client:
+                async with httpx.AsyncClient(timeout=120.0) as client:
                    resp = await client.post(
                        f"{OLLAMA_URL}/api/chat",
                        json={
                            "model": req.model,
                            "stream": False,
+                            "options": {"num_predict": 200},
                            "messages": [
-                                {"role": "system", "content": "Du bist ein Compliance-Experte. Vergleiche zwei Controls und entscheide: DUPLIKAT (gleiche Anforderung, nur anders formuliert) oder VERSCHIEDEN (unterschiedlicher Scope/Inhalt). Antworte NUR mit einem JSON: {\"verdict\": \"DUPLIKAT\" oder \"VERSCHIEDEN\", \"reason\": \"kurze Begruendung\"}"},
+                                {"role": "system", "content": "Du bist ein Compliance-Experte. Vergleiche zwei Controls und entscheide: DUPLIKAT (gleiche Anforderung, nur anders formuliert) oder VERSCHIEDEN (unterschiedlicher Scope/Inhalt). Antworte NUR mit einem JSON: {\"verdict\": \"DUPLIKAT\" oder \"VERSCHIEDEN\", \"reason\": \"kurze Begruendung\"} /no_think"},
                                {"role": "user", "content": prompt},
                            ],
                        },