def _normalize_openai_max_tokens(body: dict[str, Any]) -> None:
    """Translate a legacy ``max_tokens`` entry into ``max_completion_tokens``.

    GPT-5 / o-series chat models reject ``max_tokens`` and require
    ``max_completion_tokens``; gpt-4o/4.1 accept the newer name as well, so
    the rename is a one-way compatibility shim for openai-compatible clients
    (opencode, older SDKs) that still send the legacy key.

    The request body is mutated in place and nothing is returned:

    * not a dict, or no ``max_tokens`` key -> left untouched;
    * ``max_tokens`` present -> the key is always removed;
    * its value is copied to ``max_completion_tokens`` only when it is not
      ``None`` and no non-null ``max_completion_tokens`` is already set, so
      an explicit ``max_completion_tokens`` always wins.
    """
    # Nothing to translate for non-dict payloads or bodies without the key.
    if not (isinstance(body, dict) and "max_tokens" in body):
        return

    # The rejected legacy key never reaches the upstream, whatever its value.
    requested_limit = body.pop("max_tokens")
    if requested_limit is None:
        return

    # Only fill in the new key when the caller has not already chosen a value.
    if body.get("max_completion_tokens") is None:
        body["max_completion_tokens"] = requested_limit
"""Tests for the OpenAI chat-path shim: ``max_tokens`` -> ``max_completion_tokens``.

GPT-5 / o-series chat models reject the legacy ``max_tokens`` and require
``max_completion_tokens`` ("Unsupported parameter: 'max_tokens' is not supported
with this model. Use 'max_completion_tokens' instead."). openai-compatible
clients (opencode, older SDKs) still send ``max_tokens``, so the proxy, which
already owns the outbound request body, translates it.
"""

from __future__ import annotations

from headroom.proxy.handlers.openai import _normalize_openai_max_tokens


def test_renames_legacy_max_tokens():
    """A lone legacy key is moved to the new name with its value intact."""
    request = {"model": "gpt-5.3-chat-latest", "max_tokens": 256, "messages": []}

    _normalize_openai_max_tokens(request)

    assert "max_tokens" not in request
    assert request["max_completion_tokens"] == 256


def test_preserves_existing_max_completion_tokens_and_drops_legacy():
    """When both keys are sent, the explicit new-style value wins."""
    request = {"max_tokens": 256, "max_completion_tokens": 100}

    _normalize_openai_max_tokens(request)

    assert "max_tokens" not in request
    assert request["max_completion_tokens"] == 100  # explicit value wins


def test_noop_when_only_max_completion_tokens():
    """A body that already uses the new key is left exactly as it was."""
    request = {"max_completion_tokens": 128}
    expected = {"max_completion_tokens": 128}

    _normalize_openai_max_tokens(request)

    assert request == expected


def test_noop_when_neither_present():
    """No token-limit key in, no token-limit key out."""
    request = {"model": "gpt-4o", "messages": []}

    _normalize_openai_max_tokens(request)

    assert "max_completion_tokens" not in request
    assert "max_tokens" not in request


def test_null_max_tokens_is_dropped_without_setting_completion():
    """A null legacy value is removed and never copied to the new key."""
    request = {"max_tokens": None}

    _normalize_openai_max_tokens(request)

    assert "max_tokens" not in request
    assert request.get("max_completion_tokens") is None


def test_non_dict_is_safe():
    """Non-dict payloads are tolerated: the call must simply not raise."""
    for bogus_payload in (None, "nope"):
        _normalize_openai_max_tokens(bogus_payload)  # type: ignore[arg-type]