Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions headroom/proxy/handlers/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,24 @@
_OPENAI_BASE_URL_HEADER = "x-headroom-base-url"


def _normalize_openai_max_tokens(body: dict[str, Any]) -> None:
    """Fold a legacy ``max_tokens`` entry into ``max_completion_tokens``.

    Why: GPT-5 / o-series chat models reject ``max_tokens`` and require
    ``max_completion_tokens``, while gpt-4o/4.1 accept the newer name as well.
    Translating is therefore a safe one-way shim that lets openai-compatible
    clients (opencode, older SDKs) which still send ``max_tokens`` keep working.

    ``body`` is mutated in place; nothing is returned.

    * A non-dict ``body``, or a dict without a ``max_tokens`` key, is left
      untouched.
    * Otherwise the ``max_tokens`` key is always removed. Its value is copied
      to ``max_completion_tokens`` only when that value is not ``None`` and
      ``max_completion_tokens`` is absent or ``None``; an explicit
      ``max_completion_tokens`` is never overwritten.
    """
    if not isinstance(body, dict):
        return

    # Remove the legacy key up front; a missing key means nothing to translate.
    try:
        legacy_limit = body.pop("max_tokens")
    except KeyError:
        return

    # A null legacy value carries no limit, so it is simply discarded.
    if legacy_limit is None:
        return

    # Only fill in the new-style key when the caller did not set it explicitly.
    if body.get("max_completion_tokens") is None:
        body["max_completion_tokens"] = legacy_limit


def _header_get(headers: dict[str, str], name: str) -> str | None:
"""Case-insensitive header lookup for plain dicts."""
lowered = name.lower()
Expand Down Expand Up @@ -2382,6 +2400,15 @@ async def handle_openai_chat(
optimized_tokens = tokenizer.count_messages(body["messages"])
tokens_saved = original_tokens - optimized_tokens

# Compatibility shim: GPT-5 / o-series chat models REJECT the legacy
# `max_tokens` ("Unsupported parameter … Use 'max_completion_tokens'
# instead"); gpt-4o/4.1 accept `max_completion_tokens` too. openai-
# compatible clients (opencode, older SDKs) still send `max_tokens`, so
# translate it here — the proxy already owns the outbound body — and
# those requests work unchanged. An explicit `max_completion_tokens`
# from the caller is kept; the legacy key is dropped either way.
_normalize_openai_max_tokens(body)

# Route through LiteLLM/any-llm backend if configured
if self.anthropic_backend is not None:
try:
Expand Down
51 changes: 51 additions & 0 deletions tests/test_openai_max_completion_tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""OpenAI chat-path compatibility shim: max_tokens -> max_completion_tokens.

GPT-5 / o-series chat models reject the legacy ``max_tokens`` and require
``max_completion_tokens`` ("Unsupported parameter: 'max_tokens' is not supported
with this model. Use 'max_completion_tokens' instead."). openai-compatible
clients (opencode, older SDKs) still send ``max_tokens``, so the proxy — which
already owns the outbound request body — translates it.
"""

from __future__ import annotations

from headroom.proxy.handlers.openai import _normalize_openai_max_tokens


def test_renames_legacy_max_tokens():
    """A lone legacy ``max_tokens`` is moved to ``max_completion_tokens``."""
    payload = {"model": "gpt-5.3-chat-latest", "max_tokens": 256, "messages": []}

    _normalize_openai_max_tokens(payload)

    # The value travels to the new key and the legacy key disappears.
    assert payload["max_completion_tokens"] == 256
    assert "max_tokens" not in payload


def test_preserves_existing_max_completion_tokens_and_drops_legacy():
    """When both keys are sent, the explicit new-style value wins."""
    payload = {"max_tokens": 256, "max_completion_tokens": 100}

    _normalize_openai_max_tokens(payload)

    # The caller's explicit limit must not be overwritten by the legacy 256.
    assert payload["max_completion_tokens"] == 100
    assert "max_tokens" not in payload


def test_noop_when_only_max_completion_tokens():
    """A body that only carries the new-style key is left exactly as it was."""
    expected = {"max_completion_tokens": 128}
    payload = dict(expected)

    _normalize_openai_max_tokens(payload)

    assert payload == expected


def test_noop_when_neither_present():
    """Without any token-limit key, neither limit key is introduced."""
    payload = {"model": "gpt-4o", "messages": []}

    _normalize_openai_max_tokens(payload)

    for limit_key in ("max_completion_tokens", "max_tokens"):
        assert limit_key not in payload


def test_null_max_tokens_is_dropped_without_setting_completion():
    """A null legacy value is removed and yields no non-null new-style limit."""
    payload = {"max_tokens": None}

    _normalize_openai_max_tokens(payload)

    assert payload.get("max_completion_tokens") is None
    assert "max_tokens" not in payload


def test_non_dict_is_safe():
    """Non-dict inputs must be tolerated without raising."""
    for not_a_dict in (None, "nope"):
        _normalize_openai_max_tokens(not_a_dict)  # type: ignore[arg-type]
Loading