From 2c9ba141ec79de50b24760d1d5eb3a193e0f6664 Mon Sep 17 00:00:00 2001
From: Tejas Chopra <chopratejas@gmail.com>
Date: Fri, 3 Jul 2026 15:18:08 -0700
Subject: [PATCH] fix(proxy/openai): translate max_tokens ->
 max_completion_tokens on chat path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GPT-5 / o-series chat models reject the legacy `max_tokens` ("Unsupported
parameter: 'max_tokens' is not supported with this model. Use
'max_completion_tokens' instead."); gpt-4o/4.1 accept `max_completion_tokens`
too. openai-compatible clients (opencode via @ai-sdk/openai-compatible, older
SDKs) still send `max_tokens`, so requests for GPT-5 models fail at the proxy's
OpenAI upstream — which is exactly what blocked a live opencode run.

The proxy already owns the outbound chat/completions body (it rewrites messages
to compress them), so translate the token param there: rename `max_tokens` ->
`max_completion_tokens` when the newer form isn't already set, then drop the
rejected legacy key. Safe one-way shim for current OpenAI models; no-op when the
client sends only max_completion_tokens. Responses path (max_output_tokens) is
unaffected.
---
 headroom/proxy/handlers/openai.py          | 27 ++++++++++++
 tests/test_openai_max_completion_tokens.py | 51 ++++++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 tests/test_openai_max_completion_tokens.py

diff --git a/headroom/proxy/handlers/openai.py b/headroom/proxy/handlers/openai.py
index 72593b4f3..c1d622e12 100644
--- a/headroom/proxy/handlers/openai.py
+++ b/headroom/proxy/handlers/openai.py
@@ -76,6 +76,24 @@
 _OPENAI_BASE_URL_HEADER = "x-headroom-base-url"
 
 
+def _normalize_openai_max_tokens(body: dict[str, Any]) -> None:
+    """Rename the legacy ``max_tokens`` to ``max_completion_tokens`` in-place.
+
+    GPT-5 / o-series chat models reject ``max_tokens`` and require
+    ``max_completion_tokens``; gpt-4o/4.1 accept the latter too. So translating
+    is a safe, one-way shim for current OpenAI models that lets openai-compatible
+    clients (opencode, older SDKs) which still send ``max_tokens`` work unchanged.
+    No-op when there is no ``max_tokens``; keeps an already-set
+    ``max_completion_tokens`` and just drops the rejected legacy key.
+    """
+    if not isinstance(body, dict) or "max_tokens" not in body:
+        return
+    legacy = body.get("max_tokens")
+    if legacy is not None and body.get("max_completion_tokens") is None:
+        body["max_completion_tokens"] = legacy
+    body.pop("max_tokens", None)
+
+
 def _header_get(headers: dict[str, str], name: str) -> str | None:
     """Case-insensitive header lookup for plain dicts."""
     lowered = name.lower()
@@ -2382,6 +2400,15 @@ async def handle_openai_chat(
         optimized_tokens = tokenizer.count_messages(body["messages"])
         tokens_saved = original_tokens - optimized_tokens
 
+        # Compatibility shim: GPT-5 / o-series chat models REJECT the legacy
+        # `max_tokens` ("Unsupported parameter … Use 'max_completion_tokens'
+        # instead"); gpt-4o/4.1 accept `max_completion_tokens` too. openai-
+        # compatible clients (opencode, older SDKs) still send `max_tokens`, so
+        # translate it here — the proxy already owns the outbound body — and
+        # those requests work unchanged. An already-set `max_completion_tokens`
+        # is kept as-is; only the legacy `max_tokens` key is dropped.
+        _normalize_openai_max_tokens(body)
+
         # Route through LiteLLM/any-llm backend if configured
         if self.anthropic_backend is not None:
             try:
diff --git a/tests/test_openai_max_completion_tokens.py b/tests/test_openai_max_completion_tokens.py
new file mode 100644
index 000000000..91c795712
--- /dev/null
+++ b/tests/test_openai_max_completion_tokens.py
@@ -0,0 +1,51 @@
+"""OpenAI chat-path compatibility shim: max_tokens -> max_completion_tokens.
+
+GPT-5 / o-series chat models reject the legacy ``max_tokens`` and require
+``max_completion_tokens`` ("Unsupported parameter: 'max_tokens' is not supported
+with this model. Use 'max_completion_tokens' instead."). openai-compatible
+clients (opencode, older SDKs) still send ``max_tokens``, so the proxy — which
+already owns the outbound request body — translates it.
+"""
+
+from __future__ import annotations
+
+from headroom.proxy.handlers.openai import _normalize_openai_max_tokens
+
+
+def test_renames_legacy_max_tokens():
+    body = {"model": "gpt-5.3-chat-latest", "max_tokens": 256, "messages": []}
+    _normalize_openai_max_tokens(body)
+    assert "max_tokens" not in body
+    assert body["max_completion_tokens"] == 256
+
+
+def test_preserves_existing_max_completion_tokens_and_drops_legacy():
+    body = {"max_tokens": 256, "max_completion_tokens": 100}
+    _normalize_openai_max_tokens(body)
+    assert "max_tokens" not in body
+    assert body["max_completion_tokens"] == 100  # explicit value wins
+
+
+def test_noop_when_only_max_completion_tokens():
+    body = {"max_completion_tokens": 128}
+    _normalize_openai_max_tokens(body)
+    assert body == {"max_completion_tokens": 128}
+
+
+def test_noop_when_neither_present():
+    body = {"model": "gpt-4o", "messages": []}
+    _normalize_openai_max_tokens(body)
+    assert "max_completion_tokens" not in body
+    assert "max_tokens" not in body
+
+
+def test_null_max_tokens_is_dropped_without_setting_completion():
+    body = {"max_tokens": None}
+    _normalize_openai_max_tokens(body)
+    assert "max_tokens" not in body
+    assert body.get("max_completion_tokens") is None
+
+
+def test_non_dict_is_safe():
+    _normalize_openai_max_tokens(None)  # type: ignore[arg-type]
+    _normalize_openai_max_tokens("nope")  # type: ignore[arg-type]