feat(proxy): opt-in compression for catch-all passthrough routes

tenderdeve · tenderdeve · commit 41bce603fe92 · 2026-07-02T17:49:41.000+05:30
Requests whose path doesn't match a built-in API route fall through to `handle_passthrough`, which forwarded the body verbatim — bypassing the whole compression pipeline. Wrapper-proxy setups that front Headroom on custom paths (e.g. `/api/codex-proxy/<key>/v1/responses`) therefore got zero compression on coding-agent traffic and hit context-limit 400s in long sessions (#1546). Add `--compress-passthrough` (env `HEADROOM_COMPRESS_PASSTHROUGH=1`), off by default. When enabled, POST passthrough requests whose path ends in `/responses` and carry an OpenAI Responses-shaped body are routed through the same `_compress_openai_responses_payload_in_executor` path the native `/v1/responses` handler uses, then forwarded to the unknown upstream. Stale Content-Length is dropped so httpx recomputes it. Fail-open by construction: non-JSON, non-Responses payloads, unmodified results, and any compressor error forward the original body unchanged, so opting in can never drop a catch-all request. Anthropic `/messages` and `/chat/completions` passthrough compression are left as follow-ups. Closes #1546
diff --git a/docs/content/docs/proxy.mdx b/docs/content/docs/proxy.mdx
@@ -92,11 +92,13 @@ headroom proxy --mode cache
 | `--no-learn` | `false` | Explicitly disable traffic learning |
 | `--min-evidence` | `5` | Minimum observations before a learned pattern is persisted |
 | `--codex-wire-debug` | `false` | Write local Codex wire snapshots and matching proxy log traces |
+| `--compress-passthrough` | `false` | Also compress POST requests on custom proxy paths that fall through to the catch-all handler (OpenAI Responses-shaped bodies, path ends in `/responses`). Has no effect when optimization is disabled. Also settable via `HEADROOM_COMPRESS_PASSTHROUGH=1` |
 
 ```bash
 headroom proxy --memory
 headroom proxy --learn --min-evidence 3
 headroom proxy --codex-wire-debug
+headroom proxy --compress-passthrough
 ```
 
 <Callout type="info" title="LLMLingua removed from the proxy CLI">
diff --git a/headroom/proxy/handlers/openai.py b/headroom/proxy/handlers/openai.py
@@ -6365,6 +6365,52 @@ async def handle_compress(self, request: Request) -> JSONResponse:
                 },
             )
 
+    async def _maybe_compress_passthrough_responses(self, body: bytes) -> bytes:
+        """Compress an OpenAI Responses-shaped passthrough body, fail-open.
+
+        Reuses the native `/v1/responses` compression path so custom
+        wrapper-proxy routes get the same ContentRouter/Kompress treatment.
+        Returns `body` unchanged when it is not JSON, has no `input` field, is
+        left unmodified, or anything fails, so opting in never drops a request.
+        """
+        try:
+            payload = json.loads(body)
+        except (ValueError, TypeError, RecursionError):
+            # ValueError covers JSONDecodeError and UnicodeDecodeError; deeply
+            # nested JSON raises RecursionError, which must fail open as well.
+            return body
+        if not isinstance(payload, dict) or "input" not in payload:
+            # Not a Responses payload (no `input` field) — leave it alone.
+            return body
+
+        # Placeholder id so the warning below can still log if id allocation fails.
+        request_id = "passthrough"
+        try:
+            request_id = await self._next_request_id()
+            (
+                compressed_payload,
+                modified,
+                *_rest,
+            ) = await self._compress_openai_responses_payload_in_executor(
+                payload,
+                model=str(payload.get("model") or "passthrough"),
+                request_id=request_id,
+            )
+            if not modified:
+                return body
+            return json.dumps(
+                compressed_payload,
+                separators=(",", ":"),
+                ensure_ascii=False,
+            ).encode("utf-8")
+        except Exception as exc:  # noqa: BLE001 — fail-open on any compressor error
+            logger.warning(
+                "[%s] passthrough Responses compression failed, forwarding verbatim: %s",
+                request_id,
+                exc,
+            )
+            return body
+
     async def handle_passthrough(
         self,
         request: Request,
@@ -6423,6 +6469,25 @@ async def handle_passthrough(
 
         body = await request.body()
 
+        # Opt-in: compress requests that fall through here because their path
+        # doesn't match a built-in API route (custom wrapper-proxy paths like
+        # `/api/codex-proxy/<key>/v1/responses`). Off by default; only touches
+        # OpenAI Responses-shaped bodies (path ends in `/responses`) so we reuse
+        # the exact same ContentRouter/Kompress path the native handler runs.
+        if (
+            getattr(self.config, "compress_passthrough", False)
+            and self.config.optimize
+            and request.method == "POST"
+            and path.rstrip("/").endswith("/responses")
+            and body
+        ):
+            compressed = await self._maybe_compress_passthrough_responses(body)
+            if compressed != body:
+                body = compressed
+                # Body size changed — let httpx recompute Content-Length.
+                for _hk in [k for k in headers if k.lower() == "content-length"]:
+                    headers.pop(_hk, None)
+
         headers = await apply_copilot_api_auth(headers, url=url)
         # Cloudflare bot-management challenges our HTTP/2 fingerprint on
         # ChatGPT's sensitive account endpoints (/backend-api/me,
diff --git a/headroom/proxy/models.py b/headroom/proxy/models.py
@@ -180,6 +180,16 @@ class ProxyConfig:
     # CLI: --force-kompress-all; env: HEADROOM_FORCE_KOMPRESS_ALL=1.
     force_kompress_all: bool = False
 
+    # Compress requests that fall through to the catch-all passthrough handler
+    # (custom proxy paths that don't match a built-in API route, e.g.
+    # `/api/codex-proxy/<key>/v1/responses` fronted by another proxy). Off by
+    # default because passthrough targets are unknown upstreams; opt-in for
+    # wrapper-proxy architectures that need coding-agent traffic compressed.
+    # Currently applies to OpenAI Responses-shaped bodies (paths ending in
+    # `/responses`). CLI: --compress-passthrough; env:
+    # HEADROOM_COMPRESS_PASSTHROUGH=1.
+    compress_passthrough: bool = False
+
     # Code graph live watcher (triggers incremental reindex on file changes)
     code_graph_watcher: bool = False
 
diff --git a/headroom/proxy/server.py b/headroom/proxy/server.py
@@ -4080,6 +4080,7 @@ def _proxy_config_from_env() -> ProxyConfig:
         disable_kompress_anthropic=_get_env_optional_bool("HEADROOM_DISABLE_KOMPRESS_ANTHROPIC"),
         disable_kompress_openai=_get_env_optional_bool("HEADROOM_DISABLE_KOMPRESS_OPENAI"),
         force_kompress_all=_get_env_bool("HEADROOM_FORCE_KOMPRESS_ALL", False),
+        compress_passthrough=_get_env_bool("HEADROOM_COMPRESS_PASSTHROUGH", False),
         max_connections=_get_env_int("HEADROOM_MAX_CONNECTIONS", 500),
         max_keepalive_connections=_get_env_int("HEADROOM_MAX_KEEPALIVE", 100),
         keepalive_expiry=_get_env_float("HEADROOM_KEEPALIVE_EXPIRY", 90.0),
@@ -4574,6 +4575,18 @@ def _parse_tool_profiles(cli_profiles: list[str]) -> dict[str, Any]:
             "Also settable via HEADROOM_FORCE_KOMPRESS_ALL=1."
         ),
     )
+    parser.add_argument(
+        "--compress-passthrough",
+        action="store_true",
+        help=(
+            "Also compress requests that fall through to the catch-all "
+            "passthrough handler (custom proxy paths not matched by a built-in "
+            "API route, e.g. `/api/codex-proxy/<key>/v1/responses` behind "
+            "another proxy). Applies to OpenAI Responses-shaped bodies (paths "
+            "ending in `/responses`). Off by default; also settable via "
+            "HEADROOM_COMPRESS_PASSTHROUGH=1."
+        ),
+    )
     parser.add_argument(
         "--exclude-tools",
         default=None,
@@ -4655,6 +4668,9 @@ def _parse_tool_profiles(cli_profiles: list[str]) -> dict[str, Any]:
     force_kompress_all = args.force_kompress_all or _get_env_bool(
         "HEADROOM_FORCE_KOMPRESS_ALL", False
     )
+    compress_passthrough = args.compress_passthrough or _get_env_bool(
+        "HEADROOM_COMPRESS_PASSTHROUGH", False
+    )
 
     # Set OpenRouter API key from CLI if provided
     if hasattr(args, "openrouter_api_key") and args.openrouter_api_key:
@@ -4710,6 +4726,7 @@ def _parse_tool_profiles(cli_profiles: list[str]) -> dict[str, Any]:
         disable_kompress_anthropic=disable_kompress_anthropic,
         disable_kompress_openai=disable_kompress_openai,
         force_kompress_all=force_kompress_all,
+        compress_passthrough=compress_passthrough,
         # Connection pool settings
         max_connections=_get_env_int("HEADROOM_MAX_CONNECTIONS", args.max_connections),
         max_keepalive_connections=_get_env_int("HEADROOM_MAX_KEEPALIVE", args.max_keepalive),
diff --git a/tests/test_compress_passthrough.py b/tests/test_compress_passthrough.py
@@ -0,0 +1,93 @@
+"""Tests for opt-in passthrough compression (issue #1546).
+
+Requests whose path doesn't match a built-in API route fall through to
+``handle_passthrough``, which historically forwarded the body verbatim — no
+compression. With ``compress_passthrough`` enabled, OpenAI Responses-shaped
+bodies (path ends in ``/responses``) are routed through the same
+ContentRouter/Kompress path the native ``/v1/responses`` handler uses.
+
+``_maybe_compress_passthrough_responses`` is the fail-open core: any parse or
+compressor failure returns the original body so a catch-all request is never
+dropped by opting into compression.
+"""
+
+from __future__ import annotations
+
+import json
+from types import SimpleNamespace
+
+from headroom.proxy.handlers.openai import OpenAIHandlerMixin
+
+
+def _make_handler(compress_impl):
+    """Build an uninitialized mixin carrying only the stubs the helper touches."""
+
+    async def _fixed_request_id():
+        return "req-test"
+
+    stub = OpenAIHandlerMixin.__new__(OpenAIHandlerMixin)
+    stub.config = SimpleNamespace(optimize=True, compress_passthrough=True)
+    stub._next_request_id = _fixed_request_id
+    stub._compress_openai_responses_payload_in_executor = compress_impl
+    return stub
+
+
+def _shrinking_compressor(marker: str = "[C]"):
+    async def _fake_compress(payload, *, model, request_id):
+        # Shallow copy with `input` replaced, so the caller sees a modification.
+        shrunk = {**payload, "input": marker}
+        return (shrunk, True, 5, ["kompress"], None, 100, 40, 5, {})
+
+    return _fake_compress
+
+
+async def test_compresses_responses_shaped_body() -> None:
+    """A Responses-shaped body comes back rewritten by the compressor."""
+    original = json.dumps({"model": "gpt-5.4", "input": [{"role": "user"}]}).encode()
+    proxy = _make_handler(_shrinking_compressor())
+
+    result = await proxy._maybe_compress_passthrough_responses(original)
+    assert result != original
+    assert json.loads(result)["input"] == "[C]"
+
+
+async def test_non_json_body_passes_through() -> None:
+    """Bytes that do not parse as JSON are returned untouched."""
+    proxy = _make_handler(_shrinking_compressor())
+    raw = b"not json at all"
+    assert await proxy._maybe_compress_passthrough_responses(raw) == raw
+
+
+async def test_non_responses_payload_passes_through() -> None:
+    """JSON without an `input` key is not Responses-shaped and stays as-is."""
+    chat_body = json.dumps({"model": "gpt-5.4", "messages": []}).encode()
+    proxy = _make_handler(_shrinking_compressor())
+    result = await proxy._maybe_compress_passthrough_responses(chat_body)
+    assert result == chat_body
+
+
+async def test_unmodified_result_returns_original_bytes() -> None:
+    async def _untouched(payload, *, model, request_id):
+        # Second tuple element is the "modified" flag; False means no change.
+        return (payload, False, 0, [], "no-op", 0, 0, 0, {})
+
+    original = json.dumps({"input": [{"role": "user"}]}).encode()
+    proxy = _make_handler(_untouched)
+    assert await proxy._maybe_compress_passthrough_responses(original) == original
+
+
+async def test_compressor_error_fails_open() -> None:
+    async def _raising(payload, *, model, request_id):
+        raise RuntimeError("kompress exploded")
+
+    original = json.dumps({"input": [{"role": "user"}]}).encode()
+    proxy = _make_handler(_raising)
+
+    # The helper must swallow the error and hand back the bytes it was given.
+    assert await proxy._maybe_compress_passthrough_responses(original) == original
+
+
+def test_config_defaults_off() -> None:
+    """The feature is opt-in, so a default-constructed config leaves it off."""
+    from headroom.proxy.models import ProxyConfig
+    assert ProxyConfig().compress_passthrough is False