fix(cascade): give OVH/gpt-oss reasoning headroom so Tier-2 isn't silently dead
CI / detect-changes (push) Successful in 8s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / loc-budget (push) Successful in 20s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 25s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 6s
CI / validate-canonical-controls (push) Successful in 5s
CI / detect-changes (push) Successful in 8s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / loc-budget (push) Successful in 20s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 25s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / build-sha-integrity (push) Successful in 6s
CI / validate-canonical-controls (push) Successful in 5s
gpt-oss-120b is a reasoning model: it spends output tokens on chain-of-thought before the answer. deep_check called _call_ovh with max_tokens=400, which length-capped the model mid-reasoning. The response then came back with content=null, the OVH tier returned nothing, and the cascade always skipped Tier-2. Fix: floor the OVH budget at 2000 tokens, fall back to reasoning_content when content is null, and raise the client timeout from 45s to 90s for the slower reasoning path.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -142,19 +142,26 @@ async def _call_ovh(system: str, user: str, max_tokens: int = 6000) -> str:
|
|||||||
headers = {"Content-Type": "application/json"}
|
headers = {"Content-Type": "application/json"}
|
||||||
if key:
|
if key:
|
||||||
headers["Authorization"] = f"Bearer {key}"
|
headers["Authorization"] = f"Bearer {key}"
|
||||||
|
# gpt-oss-120b is a REASONING model: it spends output tokens on
|
||||||
|
# chain-of-thought before emitting the answer. A low cap (e.g. deep_check's
|
||||||
|
# max_tokens=400) makes it hit the length limit mid-reasoning and return
|
||||||
|
# content=null — the whole tier then silently yields nothing. Floor the
|
||||||
|
# budget so the reasoning AND the JSON answer fit.
|
||||||
payload = {
|
payload = {
|
||||||
"model": model, "temperature": 0.05, "max_tokens": max_tokens,
|
"model": model, "temperature": 0.05, "max_tokens": max(max_tokens, 2000),
|
||||||
"messages": [{"role": "system", "content": system},
|
"messages": [{"role": "system", "content": system},
|
||||||
{"role": "user", "content": user}],
|
{"role": "user", "content": user}],
|
||||||
"response_format": {"type": "json_object"},
|
"response_format": {"type": "json_object"},
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=45.0) as c:
|
async with httpx.AsyncClient(timeout=90.0) as c:
|
||||||
r = await c.post(f"{base.rstrip('/')}/v1/chat/completions",
|
r = await c.post(f"{base.rstrip('/')}/v1/chat/completions",
|
||||||
json=payload, headers=headers)
|
json=payload, headers=headers)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
choice = (r.json().get("choices") or [{}])[0]
|
msg = (r.json().get("choices") or [{}])[0].get("message") or {}
|
||||||
return (choice.get("message") or {}).get("content", "") or ""
|
# Answer is normally in content; if the model was length-capped the
|
||||||
|
# JSON can land in reasoning_content instead — fall back to it.
|
||||||
|
return (msg.get("content") or "") or (msg.get("reasoning_content") or "")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("ovh cascade tier 2 failed: %s", e)
|
logger.warning("ovh cascade tier 2 failed: %s", e)
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
@@ -0,0 +1,77 @@
|
|||||||
|
"""Regression tests for the OVH (gpt-oss-120b) tier of the LLM cascade.
|
||||||
|
|
||||||
|
gpt-oss-120b is a reasoning model: it spends output tokens on chain-of-thought
|
||||||
|
before the answer. Two bugs this pins:
|
||||||
|
1. A small max_tokens (deep_check passed 400) length-caps it mid-reasoning →
|
||||||
|
content=null → the tier silently returns nothing. _call_ovh must floor the
|
||||||
|
budget so reasoning + the JSON answer fit.
|
||||||
|
2. When length-capped, the JSON can land in reasoning_content, not content →
|
||||||
|
_call_ovh must fall back to reasoning_content.
|
||||||
|
"""
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
|
from compliance.services import llm_cascade
|
||||||
|
|
||||||
|
|
||||||
|
def _resp(data):
|
||||||
|
r = MagicMock()
|
||||||
|
r.raise_for_status = MagicMock()
|
||||||
|
r.json = MagicMock(return_value=data)
|
||||||
|
return r
|
||||||
|
|
||||||
|
|
||||||
|
def _client(resp):
|
||||||
|
inst = AsyncMock()
|
||||||
|
inst.post.return_value = resp
|
||||||
|
inst.__aenter__ = AsyncMock(return_value=inst)
|
||||||
|
inst.__aexit__ = AsyncMock(return_value=False)
|
||||||
|
return inst
|
||||||
|
|
||||||
|
|
||||||
|
class TestCallOvhReasoning:
    """Pin how ``_call_ovh`` handles token budgets and reasoning-model replies."""

    _URL = "https://llm.example.com"
    _MODEL = "gpt-oss-120b"

    @classmethod
    def _configure(cls, monkeypatch):
        """Set the OVH endpoint and model environment variables for one test."""
        monkeypatch.setenv("OVH_LLM_URL", cls._URL)
        monkeypatch.setenv("OVH_LLM_MODEL", cls._MODEL)

    @staticmethod
    def _sent_max_tokens(client):
        """Return ``max_tokens`` from the JSON payload handed to ``client.post``."""
        return client.post.call_args.kwargs["json"]["max_tokens"]

    @pytest.mark.asyncio
    async def test_reasoning_content_used_when_content_null(self, monkeypatch):
        """A null ``content`` makes the call return ``reasoning_content``."""
        self._configure(monkeypatch)
        monkeypatch.setenv("OVH_LLM_KEY", "k")
        message = {
            "content": None,
            "reasoning_content": '{"erfuellt": true, "confidence": 0.9}',
        }
        fake = _client(_resp({"choices": [{"message": message}]}))
        with patch("httpx.AsyncClient", return_value=fake):
            out = await llm_cascade._call_ovh("sys", "user", max_tokens=400)
        assert '"erfuellt": true' in out

    @pytest.mark.asyncio
    async def test_small_budget_is_floored(self, monkeypatch):
        """A requested budget of 400 is sent as at least 2000 tokens."""
        self._configure(monkeypatch)
        fake = _client(_resp({"choices": [{"message": {"content": "{}"}}]}))
        with patch("httpx.AsyncClient", return_value=fake):
            await llm_cascade._call_ovh("sys", "user", max_tokens=400)
        assert self._sent_max_tokens(fake) >= 2000

    @pytest.mark.asyncio
    async def test_large_budget_is_preserved(self, monkeypatch):
        """A requested budget of 6000 is sent unchanged."""
        self._configure(monkeypatch)
        fake = _client(_resp({"choices": [{"message": {"content": "{}"}}]}))
        with patch("httpx.AsyncClient", return_value=fake):
            await llm_cascade._call_ovh("sys", "user", max_tokens=6000)
        assert self._sent_max_tokens(fake) == 6000

    @pytest.mark.asyncio
    async def test_content_preferred_when_present(self, monkeypatch):
        """A non-empty ``content`` wins over ``reasoning_content``."""
        self._configure(monkeypatch)
        message = {"content": '{"erfuellt": false}', "reasoning_content": "noise"}
        fake = _client(_resp({"choices": [{"message": message}]}))
        with patch("httpx.AsyncClient", return_value=fake):
            out = await llm_cascade._call_ovh("sys", "user")
        assert out == '{"erfuellt": false}'

    @pytest.mark.asyncio
    async def test_unconfigured_returns_empty(self, monkeypatch):
        """With URL and model unset, the call returns an empty string."""
        for var in ("OVH_LLM_URL", "OVH_LLM_MODEL"):
            monkeypatch.delenv(var, raising=False)
        result = await llm_cascade._call_ovh("sys", "user")
        assert result == ""
|
||||||
Reference in New Issue
Block a user