diff --git a/backend-compliance/compliance/services/llm_cascade.py b/backend-compliance/compliance/services/llm_cascade.py
index 47ffcde5..c5d7a01f 100644
--- a/backend-compliance/compliance/services/llm_cascade.py
+++ b/backend-compliance/compliance/services/llm_cascade.py
@@ -142,19 +142,26 @@ async def _call_ovh(system: str, user: str, max_tokens: int = 6000) -> str:
     headers = {"Content-Type": "application/json"}
     if key:
         headers["Authorization"] = f"Bearer {key}"
+    # gpt-oss-120b is a REASONING model: it spends output tokens on
+    # chain-of-thought before emitting the answer. A low cap (e.g. deep_check's
+    # max_tokens=400) makes it hit the length limit mid-reasoning and return
+    # content=null — the whole tier then silently yields nothing. Floor the
+    # budget so the reasoning AND the JSON answer fit.
     payload = {
-        "model": model, "temperature": 0.05, "max_tokens": max_tokens,
+        "model": model, "temperature": 0.05, "max_tokens": max(max_tokens, 2000),
         "messages": [{"role": "system", "content": system},
                      {"role": "user", "content": user}],
         "response_format": {"type": "json_object"},
     }
     try:
-        async with httpx.AsyncClient(timeout=45.0) as c:
+        async with httpx.AsyncClient(timeout=90.0) as c:
             r = await c.post(f"{base.rstrip('/')}/v1/chat/completions",
                              json=payload, headers=headers)
             r.raise_for_status()
-            choice = (r.json().get("choices") or [{}])[0]
-            return (choice.get("message") or {}).get("content", "") or ""
+            msg = (r.json().get("choices") or [{}])[0].get("message") or {}
+            # Answer is normally in content; if the model was length-capped the
+            # JSON can land in reasoning_content instead — fall back to it.
+            return (msg.get("content") or "") or (msg.get("reasoning_content") or "")
     except Exception as e:
         logger.warning("ovh cascade tier 2 failed: %s", e)
         return ""
diff --git a/backend-compliance/tests/test_llm_cascade_ovh.py b/backend-compliance/tests/test_llm_cascade_ovh.py
new file mode 100644
index 00000000..a8170d73
--- /dev/null
+++ b/backend-compliance/tests/test_llm_cascade_ovh.py
@@ -0,0 +1,77 @@
+"""Regression tests for the OVH (gpt-oss-120b) tier of the LLM cascade.
+
+gpt-oss-120b is a reasoning model: it spends output tokens on chain-of-thought
+before the answer. Two bugs this pins:
+  1. A small max_tokens (deep_check passed 400) length-caps it mid-reasoning →
+     content=null → the tier silently returns nothing. _call_ovh must floor the
+     budget so reasoning + the JSON answer fit.
+  2. When length-capped, the JSON can land in reasoning_content, not content →
+     _call_ovh must fall back to reasoning_content.
+"""
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch
+
+from compliance.services import llm_cascade
+
+
+def _resp(data):  # fake response: json() -> data, raise_for_status() is a no-op
+    r = MagicMock()
+    r.raise_for_status = MagicMock()
+    r.json = MagicMock(return_value=data)
+    return r
+
+
+def _client(resp):  # fake AsyncClient: `async with` yields itself, post() -> resp
+    inst = AsyncMock()
+    inst.post.return_value = resp
+    inst.__aenter__ = AsyncMock(return_value=inst)
+    inst.__aexit__ = AsyncMock(return_value=False)  # falsy: exceptions propagate
+    return inst
+
+
+class TestCallOvhReasoning:
+    @pytest.mark.asyncio
+    async def test_reasoning_content_used_when_content_null(self, monkeypatch):
+        monkeypatch.setenv("OVH_LLM_URL", "https://llm.example.com")
+        monkeypatch.setenv("OVH_LLM_MODEL", "gpt-oss-120b")
+        monkeypatch.setenv("OVH_LLM_KEY", "k")
+        resp = _resp({"choices": [{"message": {
+            "content": None,
+            "reasoning_content": '{"erfuellt": true, "confidence": 0.9}'}}]})
+        with patch("httpx.AsyncClient", return_value=_client(resp)):
+            out = await llm_cascade._call_ovh("sys", "user", max_tokens=400)
+        assert '"erfuellt": true' in out
+
+    @pytest.mark.asyncio
+    async def test_small_budget_is_floored(self, monkeypatch):
+        monkeypatch.setenv("OVH_LLM_URL", "https://llm.example.com")
+        monkeypatch.setenv("OVH_LLM_MODEL", "gpt-oss-120b")
+        inst = _client(_resp({"choices": [{"message": {"content": "{}"}}]}))
+        with patch("httpx.AsyncClient", return_value=inst):
+            await llm_cascade._call_ovh("sys", "user", max_tokens=400)
+        assert inst.post.call_args.kwargs["json"]["max_tokens"] >= 2000
+
+    @pytest.mark.asyncio
+    async def test_large_budget_is_preserved(self, monkeypatch):
+        monkeypatch.setenv("OVH_LLM_URL", "https://llm.example.com")
+        monkeypatch.setenv("OVH_LLM_MODEL", "gpt-oss-120b")
+        inst = _client(_resp({"choices": [{"message": {"content": "{}"}}]}))
+        with patch("httpx.AsyncClient", return_value=inst):
+            await llm_cascade._call_ovh("sys", "user", max_tokens=6000)
+        assert inst.post.call_args.kwargs["json"]["max_tokens"] == 6000
+
+    @pytest.mark.asyncio
+    async def test_content_preferred_when_present(self, monkeypatch):
+        monkeypatch.setenv("OVH_LLM_URL", "https://llm.example.com")
+        monkeypatch.setenv("OVH_LLM_MODEL", "gpt-oss-120b")
+        resp = _resp({"choices": [{"message": {
+            "content": '{"erfuellt": false}', "reasoning_content": "noise"}}]})
+        with patch("httpx.AsyncClient", return_value=_client(resp)):
+            out = await llm_cascade._call_ovh("sys", "user")
+        assert out == '{"erfuellt": false}'
+
+    @pytest.mark.asyncio
+    async def test_unconfigured_returns_empty(self, monkeypatch):
+        monkeypatch.delenv("OVH_LLM_URL", raising=False)
+        monkeypatch.delenv("OVH_LLM_MODEL", raising=False)
+        assert await llm_cascade._call_ovh("sys", "user") == ""