# breakpilot-compliance/backend-compliance/tests/test_llm_cascade_ovh.py

"""Regression tests for the OVH (gpt-oss-120b) tier of the LLM cascade.

gpt-oss-120b is a reasoning model: it spends output tokens on chain-of-thought
before the answer. Two bugs this pins:
  1. A small max_tokens (deep_check passed 400) length-caps it mid-reasoning →
     content=null → the tier silently returns nothing. _call_ovh must floor the
     budget so reasoning + the JSON answer fit.
  2. When length-capped, the JSON can land in reasoning_content, not content →
     _call_ovh must fall back to reasoning_content.
"""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch

from compliance.services import llm_cascade


def _resp(data):
    r = MagicMock()
    r.raise_for_status = MagicMock()
    r.json = MagicMock(return_value=data)
    return r


def _client(resp):
    inst = AsyncMock()
    inst.post.return_value = resp
    inst.__aenter__ = AsyncMock(return_value=inst)
    inst.__aexit__ = AsyncMock(return_value=False)
    return inst


class TestCallOvhReasoning:
    """Pin how ``_call_ovh`` handles token budgets and reply fields.

    Every test replaces ``httpx.AsyncClient`` with a mock, except the
    unconfigured case, which only removes the OVH environment variables.
    """

    @pytest.mark.asyncio
    async def test_reasoning_content_used_when_content_null(self, monkeypatch):
        """With ``content`` null, the ``reasoning_content`` text is returned."""
        monkeypatch.setenv("OVH_LLM_URL", "https://llm.example.com")
        monkeypatch.setenv("OVH_LLM_MODEL", "gpt-oss-120b")
        monkeypatch.setenv("OVH_LLM_KEY", "k")
        message = {
            "content": None,
            "reasoning_content": '{"erfuellt": true, "confidence": 0.9}',
        }
        client = _client(_resp({"choices": [{"message": message}]}))
        with patch("httpx.AsyncClient", return_value=client):
            answer = await llm_cascade._call_ovh("sys", "user", max_tokens=400)
        assert '"erfuellt": true' in answer

    @pytest.mark.asyncio
    async def test_small_budget_is_floored(self, monkeypatch):
        """A 400-token request is posted with ``max_tokens`` of at least 2000."""
        monkeypatch.setenv("OVH_LLM_URL", "https://llm.example.com")
        monkeypatch.setenv("OVH_LLM_MODEL", "gpt-oss-120b")
        body = {"choices": [{"message": {"content": "{}"}}]}
        client = _client(_resp(body))
        with patch("httpx.AsyncClient", return_value=client):
            await llm_cascade._call_ovh("sys", "user", max_tokens=400)
        # Inspect the JSON payload of the (last) POST the client received.
        sent = client.post.call_args.kwargs["json"]
        assert sent["max_tokens"] >= 2000

    @pytest.mark.asyncio
    async def test_large_budget_is_preserved(self, monkeypatch):
        """A 6000-token request is posted with ``max_tokens`` unchanged."""
        monkeypatch.setenv("OVH_LLM_URL", "https://llm.example.com")
        monkeypatch.setenv("OVH_LLM_MODEL", "gpt-oss-120b")
        body = {"choices": [{"message": {"content": "{}"}}]}
        client = _client(_resp(body))
        with patch("httpx.AsyncClient", return_value=client):
            await llm_cascade._call_ovh("sys", "user", max_tokens=6000)
        sent = client.post.call_args.kwargs["json"]
        assert sent["max_tokens"] == 6000

    @pytest.mark.asyncio
    async def test_content_preferred_when_present(self, monkeypatch):
        """When ``content`` is set it is returned, not ``reasoning_content``."""
        monkeypatch.setenv("OVH_LLM_URL", "https://llm.example.com")
        monkeypatch.setenv("OVH_LLM_MODEL", "gpt-oss-120b")
        message = {
            "content": '{"erfuellt": false}',
            "reasoning_content": "noise",
        }
        client = _client(_resp({"choices": [{"message": message}]}))
        with patch("httpx.AsyncClient", return_value=client):
            answer = await llm_cascade._call_ovh("sys", "user")
        assert answer == '{"erfuellt": false}'

    @pytest.mark.asyncio
    async def test_unconfigured_returns_empty(self, monkeypatch):
        """Without the URL and model variables the call returns ``""``."""
        monkeypatch.delenv("OVH_LLM_URL", raising=False)
        monkeypatch.delenv("OVH_LLM_MODEL", raising=False)
        answer = await llm_cascade._call_ovh("sys", "user")
        assert answer == ""