From 49dc6c7927f5eec4378a98f1fa48e9873b308fb8 Mon Sep 17 00:00:00 2001 From: Matt Haitana Date: Fri, 26 Jun 2026 15:24:09 +1000 Subject: [PATCH 1/5] fix(bedrock): route ARNs via converse, named AWS profiles, and au. region prefix --- headroom/backends/litellm.py | 46 ++++++++++++++++++------ headroom/providers/registry.py | 6 +++- headroom/proxy/server.py | 1 + tests/test_bedrock_region.py | 36 +++++++++++++++++-- tests/test_provider_registry.py | 2 +- tests/test_provider_registry_extended.py | 2 +- tests/test_vertex_claude_compression.py | 2 +- 7 files changed, 79 insertions(+), 16 deletions(-) diff --git a/headroom/backends/litellm.py b/headroom/backends/litellm.py index ff7cb5e27..7ee211ed1 100644 --- a/headroom/backends/litellm.py +++ b/headroom/backends/litellm.py @@ -69,8 +69,10 @@ class ProviderConfig: # Region prefix used in cross-region Bedrock inference profile IDs. # EU regions use "eu.", AP regions use "apac.", US (and everything else) use "us.". +# ap-southeast-2 (Sydney/Australia) uses "au." — distinct from the rest of APAC. _BEDROCK_REGION_PREFIXES: dict[str, str] = { "eu": "eu", + "ap-southeast-2": "au", "ap": "apac", } @@ -135,7 +137,9 @@ def _build_bedrock_fallback_map(region: str) -> dict[str, str]: return {name: f"bedrock/{prefix}.{model_id}" for name, model_id in _CLAUDE_MODELS} -def _fetch_bedrock_inference_profiles(region: str | None) -> dict[str, str]: +def _fetch_bedrock_inference_profiles( + region: str | None, profile_name: str | None = None +) -> dict[str, str]: """Fetch available Bedrock inference profiles from AWS API. Uses boto3 list_inference_profiles() to get all available profiles @@ -147,15 +151,21 @@ def _fetch_bedrock_inference_profiles(region: str | None) -> dict[str, str]: Args: region: AWS region (e.g., "us-east-1", "eu-central-1") + profile_name: AWS named profile (e.g., "my-sso-profile"). When set, + a boto3.Session is created with this profile name so + the correct SSO or credential file is used. 
Falls back + to ambient credentials (AWS_PROFILE env var, instance + metadata, etc.) when not provided. Returns: Model map: anthropic_model_name -> bedrock inference profile ID """ region = region or "us-east-1" - # Check cache first - if region in _bedrock_profiles_cache: - return _bedrock_profiles_cache[region] + # Cache key includes profile_name so different profiles don't collide + cache_key = f"{region}:{profile_name or ''}" + if cache_key in _bedrock_profiles_cache: + return _bedrock_profiles_cache[cache_key] model_map: dict[str, str] = {} @@ -167,11 +177,12 @@ def _fetch_bedrock_inference_profiles(region: str | None) -> dict[str, str]: "Install boto3 for dynamic model discovery: pip install boto3" ) model_map = _build_bedrock_fallback_map(region) - _bedrock_profiles_cache[region] = model_map + _bedrock_profiles_cache[cache_key] = model_map return model_map try: - bedrock_client = boto3.client("bedrock", region_name=region) + session = boto3.Session(profile_name=profile_name) if profile_name else boto3.Session() + bedrock_client = session.client("bedrock", region_name=region) response = bedrock_client.list_inference_profiles(typeEquals="SYSTEM_DEFINED") for profile in response.get("inferenceProfileSummaries", []): @@ -209,7 +220,7 @@ def _fetch_bedrock_inference_profiles(region: str | None) -> dict[str, str]: model_map = _build_bedrock_fallback_map(region) # Cache the result - _bedrock_profiles_cache[region] = model_map + _bedrock_profiles_cache[cache_key] = model_map return model_map @@ -220,18 +231,23 @@ def _normalize_bedrock_profile_id(profile_id: str) -> str | None: profile_id: e.g., "us.anthropic.claude-sonnet-4-20250514-v1:0" or "anthropic.claude-sonnet-4-20250514-v1:0" or "claude-sonnet-4-20250514" + or "arn:aws:bedrock:...:application-inference-profile/..." 
Returns: Normalized name like "claude-sonnet-4-20250514", or None if not parseable """ import re + # ARNs are opaque identifiers — cannot be normalized to a standard model name + if profile_id.startswith("arn:aws:"): + return None + # Strip "bedrock/" prefix if present if profile_id.startswith("bedrock/"): profile_id = profile_id[8:] - # Strip region prefix (us., eu., apac.) - for prefix in ["us.", "eu.", "apac."]: + # Strip region prefix (us., eu., apac., au.) + for prefix in ["us.", "eu.", "apac.", "au."]: if profile_id.startswith(prefix): profile_id = profile_id[len(prefix) :] break @@ -400,6 +416,7 @@ def __init__( self, provider: str = "bedrock", region: str | None = None, + profile_name: str | None = None, **kwargs: Any, ): """Initialize LiteLLM backend. @@ -407,6 +424,9 @@ def __init__( Args: provider: LiteLLM provider prefix (bedrock, vertex_ai, openrouter, etc.) region: Cloud region (provider-specific) + profile_name: AWS named profile for credential resolution (bedrock only). + When set, boto3 uses this profile (e.g. an SSO profile) instead + of the ambient credentials. Ignored for non-bedrock providers. **kwargs: Additional provider-specific config """ if not LITELLM_AVAILABLE: @@ -423,7 +443,7 @@ def __init__( # For Bedrock, fetch model map dynamically from AWS API if provider == "bedrock": - self._model_map = _fetch_bedrock_inference_profiles(region) + self._model_map = _fetch_bedrock_inference_profiles(region, profile_name=profile_name) litellm.set_verbose = False # Reduce noise else: self._model_map = self._config.model_map @@ -442,6 +462,7 @@ def map_model_id(self, anthropic_model: str) -> str: - "anthropic.claude-sonnet-4-20250514-v1:0" (Bedrock without region) - "us.anthropic.claude-sonnet-4-20250514-v1:0" (Bedrock with region) - "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0" (LiteLLM format) + - "arn:aws:bedrock:...:application-inference-profile/..." 
(application inference profile) """ # Check direct mapping first if anthropic_model in self._model_map: @@ -449,6 +470,11 @@ def map_model_id(self, anthropic_model: str) -> str: # For Bedrock, try to normalize various input formats if self.provider == "bedrock": + # Application inference profile ARNs must use the converse route — + # the invoke route rejects ARNs with HTTP 400. + if anthropic_model.startswith("arn:aws:"): + return f"bedrock/converse/{anthropic_model}" + normalized = _normalize_bedrock_profile_id(anthropic_model) if normalized and normalized in self._model_map: return self._model_map[normalized] diff --git a/headroom/providers/registry.py b/headroom/providers/registry.py index 1e57f0014..7da214646 100644 --- a/headroom/providers/registry.py +++ b/headroom/providers/registry.py @@ -148,6 +148,7 @@ def create_proxy_backend( backend: str, anyllm_provider: str, bedrock_region: str | None, + bedrock_profile: str | None = None, logger: logging.Logger, openai_api_url: str | None = None, anyllm_backend_cls: Any | None = None, @@ -181,7 +182,10 @@ def create_proxy_backend( provider = "vertex_ai" try: backend_cls = litellm_backend_cls or _load_litellm_backend() - instance = cast("Backend", backend_cls(provider=provider, region=bedrock_region)) + instance = cast( + "Backend", + backend_cls(provider=provider, region=bedrock_region, profile_name=bedrock_profile), + ) logger.info("LiteLLM backend enabled (provider=%s, region=%s)", provider, bedrock_region) return instance except ImportError as exc: diff --git a/headroom/proxy/server.py b/headroom/proxy/server.py index 97d9a3211..028dd962b 100644 --- a/headroom/proxy/server.py +++ b/headroom/proxy/server.py @@ -875,6 +875,7 @@ def _router_config_for(kompress_disabled: bool) -> ContentRouterConfig: backend=config.backend, anyllm_provider=config.anyllm_provider, bedrock_region=config.bedrock_region, + bedrock_profile=config.bedrock_profile, logger=logger, openai_api_url=config.openai_api_url, 
anyllm_backend_cls=AnyLLMBackend, diff --git a/tests/test_bedrock_region.py b/tests/test_bedrock_region.py index 8a52d5009..67d80db71 100644 --- a/tests/test_bedrock_region.py +++ b/tests/test_bedrock_region.py @@ -212,13 +212,25 @@ def patched_import(name, *args, **kwargs): ) def test_caching_prevents_repeated_api_calls(self): - """Second call for same region should return cached result.""" + """Second call for same region+profile should return cached result.""" _bedrock_profiles_cache.clear() - _bedrock_profiles_cache["us-east-1"] = {"test": "bedrock/test-model"} + _bedrock_profiles_cache["us-east-1:"] = {"test": "bedrock/test-model"} result = _fetch_bedrock_inference_profiles("us-east-1") assert result == {"test": "bedrock/test-model"} + def test_profile_cache_isolation(self): + """Different profiles for the same region must not share a cache entry.""" + _bedrock_profiles_cache.clear() + _bedrock_profiles_cache["us-east-1:profileA"] = {"model": "bedrock/profile-a-model"} + _bedrock_profiles_cache["us-east-1:profileB"] = {"model": "bedrock/profile-b-model"} + + result_a = _fetch_bedrock_inference_profiles("us-east-1", profile_name="profileA") + result_b = _fetch_bedrock_inference_profiles("us-east-1", profile_name="profileB") + assert result_a["model"] == "bedrock/profile-a-model" + assert result_b["model"] == "bedrock/profile-b-model" + assert result_a != result_b + # ============================================================================= # LiteLLMBackend.map_model_id with EU Regions @@ -310,6 +322,26 @@ def test_region_prefixed_format_normalized(self): result = backend.map_model_id("eu.anthropic.claude-sonnet-4-20250514-v1:0") assert result == "bedrock/eu.anthropic.claude-sonnet-4-20250514-v1:0" + def test_arn_passthrough(self): + """Application inference profile ARNs must use the converse route.""" + with patch( + "headroom.backends.litellm._fetch_bedrock_inference_profiles", + return_value={}, + ): + backend = LiteLLMBackend(provider="bedrock", 
region="ap-southeast-2") + arn = "arn:aws:bedrock:ap-southeast-2:123456789012:application-inference-profile/abc123" + result = backend.map_model_id(arn) + assert result == f"bedrock/converse/{arn}" + + def test_ap_southeast_2_uses_au_prefix(self): + """ap-southeast-2 (Sydney/Australia) should use 'au.' prefix, not 'apac.'.""" + with patch( + "headroom.backends.litellm._fetch_bedrock_inference_profiles", + return_value={}, + ): + backend = LiteLLMBackend(provider="bedrock", region="ap-southeast-2") + result = backend.map_model_id("claude-sonnet-4-5-20250929") + assert result == "bedrock/au.anthropic.claude-sonnet-4-5-20250929-v1:0" # ============================================================================= # Normalize Bedrock Profile ID (edge cases) diff --git a/tests/test_provider_registry.py b/tests/test_provider_registry.py index 8c95499a2..498d21f0c 100644 --- a/tests/test_provider_registry.py +++ b/tests/test_provider_registry.py @@ -119,7 +119,7 @@ def test_create_proxy_backend_handles_missing_litellm_backend(caplog) -> None: anyllm_provider="ignored", bedrock_region="us-east-1", logger=logger, - litellm_backend_cls=lambda provider, region: (_ for _ in ()).throw( + litellm_backend_cls=lambda provider, region, profile_name=None: (_ for _ in ()).throw( ImportError("missing") ), ) diff --git a/tests/test_provider_registry_extended.py b/tests/test_provider_registry_extended.py index 9c8050adf..c3ef4559a 100644 --- a/tests/test_provider_registry_extended.py +++ b/tests/test_provider_registry_extended.py @@ -113,7 +113,7 @@ def test_create_proxy_backend_uses_injected_backend_types() -> None: anyllm_provider="ignored", bedrock_region="us-east-1", logger=logger, - litellm_backend_cls=lambda provider, region: { + litellm_backend_cls=lambda provider, region, profile_name=None: { "kind": "litellm", "provider": provider, "region": region, diff --git a/tests/test_vertex_claude_compression.py b/tests/test_vertex_claude_compression.py index 0396f78aa..0689829a8 100644 
--- a/tests/test_vertex_claude_compression.py +++ b/tests/test_vertex_claude_compression.py @@ -78,7 +78,7 @@ def _capture_provider(backend: str) -> dict[str, Any]: captured: dict[str, Any] = {} class FakeLiteLLM: - def __init__(self, provider: str, region: str | None = None) -> None: + def __init__(self, provider: str, region: str | None = None, profile_name: str | None = None) -> None: captured["provider"] = provider captured["region"] = region From b17840648196c5ccd9529a0ad3b103b8d9439163 Mon Sep 17 00:00:00 2001 From: Matt Haitana Date: Fri, 26 Jun 2026 16:10:00 +1000 Subject: [PATCH 2/5] docs(bedrock): add Claude Code + Bedrock setup guide --- docs/claude-code-bedrock-headroom.md | 151 +++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 docs/claude-code-bedrock-headroom.md diff --git a/docs/claude-code-bedrock-headroom.md b/docs/claude-code-bedrock-headroom.md new file mode 100644 index 000000000..c12559b7e --- /dev/null +++ b/docs/claude-code-bedrock-headroom.md @@ -0,0 +1,151 @@ +# Claude Code + AWS Bedrock, with Headroom compression + +*Validated end-to-end on 2026-06-26 (Claude Code 2.1, Headroom 0.27.0, ap-southeast-2).* + +This is the **working, tested** way to run **Claude Code** against **Claude models on +AWS Bedrock** with **Headroom compressing the context** in the middle. + +## TL;DR + +Run Claude Code in **normal Anthropic mode** (NOT Bedrock mode) pointed at a local +Headroom proxy, and let **Headroom** be the thing that talks to Bedrock: + +``` +Claude Code ──ANTHROPIC_BASE_URL──▶ Headroom proxy ──LiteLLM (bedrock)──▶ AWS Bedrock + (normal mode) (plain http) (compresses) (your AWS creds) (Claude) +``` + +Two non-obvious requirements make the difference between "works" and "silently bypasses +the proxy": + +1. 
**`CLAUDE_CODE_USE_BEDROCK=0`** — Without this, Claude Code detects ARN model IDs or + the `CLAUDE_CODE_USE_BEDROCK=1` flag and calls Bedrock directly via the AWS SDK, + completely bypassing `ANTHROPIC_BASE_URL` and the proxy. +2. **Standard Anthropic model names** in `ANTHROPIC_DEFAULT_*_MODEL` (e.g. + `claude-sonnet-4-6`, not ARNs) — Claude Code validates model names client-side against + a built-in list. ARN values are detected and cause Claude Code to bypass the proxy. + +## Why not "just set CLAUDE_CODE_USE_BEDROCK=1"? + +That approach **does not work** with Headroom. When `CLAUDE_CODE_USE_BEDROCK=1` is set, +Claude Code calls Bedrock directly using the AWS SDK — `ANTHROPIC_BASE_URL` is ignored +entirely and the proxy never receives a byte. The same bypass happens when +`ANTHROPIC_DEFAULT_*_MODEL` contains ARN values. Use the Anthropic-mode path below. + +## Prerequisites + +- **AWS credentials** configured for your environment (env vars, `~/.aws/credentials`, + instance profile, or SSO via `aws sso login`). Confirm direct access works before + involving Headroom: + ```bash + aws bedrock-runtime invoke-model \ + --region us-east-1 \ + --model-id anthropic.claude-3-haiku-20240307-v1:0 \ + --body '{"anthropic_version":"bedrock-2023-05-31","max_tokens":20,"messages":[{"role":"user","content":"hi"}]}' \ + /tmp/out.json + ``` +- **boto3** in the proxy's Python environment (for dynamic inference profile discovery): + ```bash + pip install boto3 + ``` +- **IAM permissions** for the models you intend to use — at minimum + `bedrock:InvokeModel` and `bedrock:InvokeModelWithResponseStream`. 
For application + inference profiles, scope to the specific profile ARN: + ```json + { + "Effect": "Allow", + "Action": ["bedrock:InvokeModel", "bedrock:InvokeModelWithResponseStream"], + "Resource": ["arn:aws:bedrock:<region>:<account-id>:application-inference-profile/<profile-id>"] + } + ``` + +## Terminal 1 — start the Headroom proxy (Bedrock backend) + +```bash +headroom proxy --port 8787 \ + --backend bedrock \ + --region us-east-1 +``` + +With a named AWS SSO profile: + +```bash +headroom proxy --port 8787 \ + --backend bedrock \ + --region us-east-1 \ + --bedrock-profile my-sso-profile +``` + +On startup the proxy calls `list_inference_profiles` to build a model map. Confirm it +is routing correctly by checking the LiteLLM log lines — when the requested model is an application inference profile ARN you should see: + +``` +LiteLLM completion() model= converse/arn:aws:... provider = bedrock +``` + +## Terminal 2 — run Claude Code (normal Anthropic mode) against the proxy + +```bash +export CLAUDE_CODE_USE_BEDROCK=0 # REQUIRED — prevents Claude Code bypassing the proxy +export ANTHROPIC_BASE_URL=http://127.0.0.1:8787 +export ANTHROPIC_API_KEY=headroom # Claude Code needs *a* key to start; value is ignored +export ANTHROPIC_MODEL=claude-opus-4-6 +export ANTHROPIC_DEFAULT_SONNET_MODEL=claude-sonnet-4-6 +export ANTHROPIC_DEFAULT_OPUS_MODEL=claude-opus-4-6 +export ANTHROPIC_DEFAULT_HAIKU_MODEL=claude-haiku-4-5-20251001 + +claude +``` + +Or via `~/.claude/settings.json`: + +```json +{ + "env": { + "CLAUDE_CODE_USE_BEDROCK": "0", + "ANTHROPIC_BASE_URL": "http://127.0.0.1:8787", + "ANTHROPIC_API_KEY": "headroom", + "ANTHROPIC_MODEL": "claude-opus-4-6", + "ANTHROPIC_DEFAULT_SONNET_MODEL": "claude-sonnet-4-6", + "ANTHROPIC_DEFAULT_OPUS_MODEL": "claude-opus-4-6", + "ANTHROPIC_DEFAULT_HAIKU_MODEL": "claude-haiku-4-5-20251001" + } +} +``` + +Claude Code now talks plain Anthropic `/v1/messages` to Headroom; Headroom compresses +and forwards to Bedrock via LiteLLM, then translates the answer back. 
+ +## Application inference profiles (account-specific ARNs) + +If your IAM policy only permits **application inference profiles** (account-specific +ARNs) rather than system-defined cross-region profiles, pass the ARN directly as the +model value in `ANTHROPIC_DEFAULT_*_MODEL`. The proxy detects `arn:aws:` prefixed model +IDs and routes them via `bedrock/converse/` automatically — no extra configuration +required. + +## Region prefix notes + +| AWS region | Cross-region inference prefix | +|---|---| +| `us-*` | `us.` | +| `eu-*` | `eu.` | +| `ap-*` (except `ap-southeast-2`) | `apac.` | +| `ap-southeast-2` (Sydney) | `au.` | + +The proxy uses the correct prefix automatically when constructing fallback model IDs. + +## Verify compression is happening + +- Dashboard: — "tokens saved" climbs as you work. +- `curl -s localhost:8787/stats` → `tokens.saved` and `request_logs[].transforms_applied`. + +## Troubleshooting + +| Symptom | Cause | Fix | +|---|---|---| +| Proxy receives no requests | Claude Code is in Bedrock mode, bypassing proxy | Set `CLAUDE_CODE_USE_BEDROCK=0` | +| `400 The provided model identifier is invalid` | ARN value in `ANTHROPIC_DEFAULT_*_MODEL` detected by Claude Code | Use standard model names (`claude-sonnet-4-6`, not ARNs) | +| `403 AccessDeniedException` on system-defined profiles | IAM policy only permits application profiles | Use `--bedrock-profile` with an authorized profile and pass application inference profile ARNs as model values | +| `400 … Try calling via converse route` | Old proxy version routing ARNs to invoke path | Upgrade to headroom ≥ 0.27.1 | +| Model map empty at startup | boto3 not installed or wrong AWS profile | `pip install boto3`; check `--bedrock-profile` / `AWS_PROFILE` | From 660640e19912d13645cd8fcc0855d72a3c6286b2 Mon Sep 17 00:00:00 2001 From: Matt Haitana Date: Sat, 27 Jun 2026 00:17:44 +1000 Subject: [PATCH 3/5] fix(bedrock): wire profile_name to acompletion() and fix guide contradiction --- 
docs/claude-code-bedrock-headroom.md | 14 ++-- headroom/backends/litellm.py | 13 ++++ tests/test_bedrock_region.py | 95 ++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 9 deletions(-) diff --git a/docs/claude-code-bedrock-headroom.md b/docs/claude-code-bedrock-headroom.md index c12559b7e..1d0228016 100644 --- a/docs/claude-code-bedrock-headroom.md +++ b/docs/claude-code-bedrock-headroom.md @@ -15,22 +15,18 @@ Claude Code ──ANTHROPIC_BASE_URL──▶ Headroom proxy ──LiteLLM (b (normal mode) (plain http) (compresses) (your AWS creds) (Claude) ``` -Two non-obvious requirements make the difference between "works" and "silently bypasses +One non-obvious requirement makes the difference between "works" and "silently bypasses the proxy": -1. **`CLAUDE_CODE_USE_BEDROCK=0`** — Without this, Claude Code detects ARN model IDs or - the `CLAUDE_CODE_USE_BEDROCK=1` flag and calls Bedrock directly via the AWS SDK, +1. **`CLAUDE_CODE_USE_BEDROCK=0`** — Without this, Claude Code sees the + `CLAUDE_CODE_USE_BEDROCK=1` flag and calls Bedrock directly via the AWS SDK, completely bypassing `ANTHROPIC_BASE_URL` and the proxy. -2. **Standard Anthropic model names** in `ANTHROPIC_DEFAULT_*_MODEL` (e.g. - `claude-sonnet-4-6`, not ARNs) — Claude Code validates model names client-side against - a built-in list. ARN values are detected and cause Claude Code to bypass the proxy. ## Why not "just set CLAUDE_CODE_USE_BEDROCK=1"? That approach **does not work** with Headroom. When `CLAUDE_CODE_USE_BEDROCK=1` is set, Claude Code calls Bedrock directly using the AWS SDK — `ANTHROPIC_BASE_URL` is ignored -entirely and the proxy never receives a byte. The same bypass happens when -`ANTHROPIC_DEFAULT_*_MODEL` contains ARN values. Use the Anthropic-mode path below. +entirely and the proxy never receives a byte. Use the Anthropic-mode path below. 
## Prerequisites @@ -145,7 +141,7 @@ The proxy uses the correct prefix automatically when constructing fallback model | Symptom | Cause | Fix | |---|---|---| | Proxy receives no requests | Claude Code is in Bedrock mode, bypassing proxy | Set `CLAUDE_CODE_USE_BEDROCK=0` | -| `400 The provided model identifier is invalid` | ARN value in `ANTHROPIC_DEFAULT_*_MODEL` detected by Claude Code | Use standard model names (`claude-sonnet-4-6`, not ARNs) | +| `400 The provided model identifier is invalid` | Bedrock rejected the model name format | Use standard cross-region profile names (`claude-sonnet-4-6`) or a valid application inference profile ARN | | `403 AccessDeniedException` on system-defined profiles | IAM policy only permits application profiles | Use `--bedrock-profile` with an authorized profile and pass application inference profile ARNs as model values | | `400 … Try calling via converse route` | Old proxy version routing ARNs to invoke path | Upgrade to headroom ≥ 0.27.1 | | Model map empty at startup | boto3 not installed or wrong AWS profile | `pip install boto3`; check `--bedrock-profile` / `AWS_PROFILE` | diff --git a/headroom/backends/litellm.py b/headroom/backends/litellm.py index 7ee211ed1..c0c1e8dc9 100644 --- a/headroom/backends/litellm.py +++ b/headroom/backends/litellm.py @@ -436,6 +436,7 @@ def __init__( self.provider = provider self.region = region + self.profile_name = profile_name self.kwargs = kwargs # Get provider config from registry @@ -707,6 +708,9 @@ async def send_message( elif self.provider in ("vertex_ai", "vertex_ai_beta"): kwargs["vertex_location"] = self.region + if self.provider == "bedrock" and self.profile_name: + kwargs["aws_profile_name"] = self.profile_name + # Forward API key from request headers if present. # Skip for Bedrock/Vertex: they use env-based auth (AWS SigV4 / Google ADC). # Forwarding x-api-key (e.g. sk-ant-dummy) would override their credentials. 
@@ -811,6 +815,9 @@ async def stream_message( elif self.provider in ("vertex_ai", "vertex_ai_beta"): kwargs["vertex_location"] = self.region + if self.provider == "bedrock" and self.profile_name: + kwargs["aws_profile_name"] = self.profile_name + # Forward API key from request headers if present. # Skip for Bedrock/Vertex: they use env-based auth (AWS SigV4 / Google ADC). # Forwarding x-api-key (e.g. sk-ant-dummy) would override their credentials. @@ -1035,6 +1042,9 @@ async def send_openai_message( elif self.provider in ("vertex_ai", "vertex_ai_beta"): kwargs["vertex_location"] = self.region + if self.provider == "bedrock" and self.profile_name: + kwargs["aws_profile_name"] = self.profile_name + # Forward API key from request headers if present. # Skip for Bedrock/Vertex: they use env-based auth (AWS SigV4 / Google ADC). # Forwarding x-api-key (e.g. sk-ant-dummy) would override their credentials. @@ -1210,6 +1220,9 @@ async def stream_openai_message( elif self.provider in ("vertex_ai", "vertex_ai_beta"): kwargs["vertex_location"] = self.region + if self.provider == "bedrock" and self.profile_name: + kwargs["aws_profile_name"] = self.profile_name + # Forward API key from request headers if present. # Skip for Bedrock/Vertex: they use env-based auth (AWS SigV4 / Google ADC). # Forwarding x-api-key (e.g. sk-ant-dummy) would override their credentials. 
diff --git a/tests/test_bedrock_region.py b/tests/test_bedrock_region.py index 67d80db71..21e9c37ce 100644 --- a/tests/test_bedrock_region.py +++ b/tests/test_bedrock_region.py @@ -384,3 +384,98 @@ def test_already_normalized(self): assert _normalize_bedrock_profile_id("claude-sonnet-4-20250514") == ( "claude-sonnet-4-20250514" ) + + +# ============================================================================= +# Named profile forwarded to acompletion kwargs +# ============================================================================= + +_MODEL_MAP_US = {"claude-sonnet-4-20250514": "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0"} +_BODY = { + "model": "claude-sonnet-4-20250514", + "messages": [{"role": "user", "content": "hi"}], + "max_tokens": 10, +} + + +def _make_fake_completion_resp(): + mock_resp = MagicMock() + mock_resp.choices = [MagicMock()] + mock_resp.choices[0].message.content = "hello" + mock_resp.choices[0].message.tool_calls = None + mock_resp.choices[0].finish_reason = "stop" + mock_resp.usage.prompt_tokens = 10 + mock_resp.usage.completion_tokens = 5 + return mock_resp + + +class TestBedrockProfileForwardedToCompletion: + """Regression: --bedrock-profile must be passed to acompletion(), not just to + _fetch_bedrock_inference_profiles() at startup. 
Without self.profile_name the + actual Bedrock call still uses ambient/default credentials even when the user + explicitly supplied a named SSO profile.""" + + def setup_method(self): + _bedrock_profiles_cache.clear() + + async def test_send_message_passes_aws_profile_name(self): + """send_message() must include aws_profile_name in the acompletion() kwargs.""" + captured_kwargs: dict = {} + + async def fake_acompletion(**kwargs): + captured_kwargs.update(kwargs) + return _make_fake_completion_resp() + + with patch( + "headroom.backends.litellm._fetch_bedrock_inference_profiles", + return_value=_MODEL_MAP_US, + ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion): + backend = LiteLLMBackend( + provider="bedrock", region="us-east-1", profile_name="my-sso-profile" + ) + await backend.send_message(body=_BODY, headers={}) + + assert captured_kwargs.get("aws_profile_name") == "my-sso-profile" + + async def test_stream_message_passes_aws_profile_name(self): + """stream_message() must include aws_profile_name in the acompletion() kwargs.""" + captured_kwargs: dict = {} + + async def fake_acompletion(**kwargs): + captured_kwargs.update(kwargs) + + async def _empty(): + return + yield # pragma: no cover — makes this an async generator + + return _empty() + + with patch( + "headroom.backends.litellm._fetch_bedrock_inference_profiles", + return_value=_MODEL_MAP_US, + ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion): + backend = LiteLLMBackend( + provider="bedrock", region="us-east-1", profile_name="my-sso-profile" + ) + async for _ in backend.stream_message(body=_BODY, headers={}): + pass + + assert captured_kwargs.get("aws_profile_name") == "my-sso-profile" + + async def test_no_profile_does_not_set_aws_profile_name(self): + """When no profile is configured, aws_profile_name must not appear in kwargs + (LiteLLM falls back to ambient credentials correctly).""" + captured_kwargs: dict = {} + + async def 
fake_acompletion(**kwargs): + captured_kwargs.update(kwargs) + return _make_fake_completion_resp() + + with patch( + "headroom.backends.litellm._fetch_bedrock_inference_profiles", + return_value=_MODEL_MAP_US, + ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion): + backend = LiteLLMBackend(provider="bedrock", region="us-east-1") + await backend.send_message(body=_BODY, headers={}) + + assert "aws_profile_name" not in captured_kwargs From 2cdddfccbfe93ff3db2373c9914bdb00bd6715d4 Mon Sep 17 00:00:00 2001 From: JerrettDavis Date: Wed, 1 Jul 2026 19:02:40 -0500 Subject: [PATCH 4/5] style(bedrock): format profile tests --- tests/test_bedrock_region.py | 34 ++++++++++++++++--------- tests/test_vertex_claude_compression.py | 4 ++- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/tests/test_bedrock_region.py b/tests/test_bedrock_region.py index 21e9c37ce..1c5c25bf7 100644 --- a/tests/test_bedrock_region.py +++ b/tests/test_bedrock_region.py @@ -343,6 +343,7 @@ def test_ap_southeast_2_uses_au_prefix(self): result = backend.map_model_id("claude-sonnet-4-5-20250929") assert result == "bedrock/au.anthropic.claude-sonnet-4-5-20250929-v1:0" + # ============================================================================= # Normalize Bedrock Profile ID (edge cases) # ============================================================================= @@ -426,10 +427,13 @@ async def fake_acompletion(**kwargs): captured_kwargs.update(kwargs) return _make_fake_completion_resp() - with patch( - "headroom.backends.litellm._fetch_bedrock_inference_profiles", - return_value=_MODEL_MAP_US, - ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion): + with ( + patch( + "headroom.backends.litellm._fetch_bedrock_inference_profiles", + return_value=_MODEL_MAP_US, + ), + patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion), + ): backend = LiteLLMBackend( provider="bedrock", region="us-east-1", 
profile_name="my-sso-profile" ) @@ -450,10 +454,13 @@ async def _empty(): return _empty() - with patch( - "headroom.backends.litellm._fetch_bedrock_inference_profiles", - return_value=_MODEL_MAP_US, - ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion): + with ( + patch( + "headroom.backends.litellm._fetch_bedrock_inference_profiles", + return_value=_MODEL_MAP_US, + ), + patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion), + ): backend = LiteLLMBackend( provider="bedrock", region="us-east-1", profile_name="my-sso-profile" ) @@ -471,10 +478,13 @@ async def fake_acompletion(**kwargs): captured_kwargs.update(kwargs) return _make_fake_completion_resp() - with patch( - "headroom.backends.litellm._fetch_bedrock_inference_profiles", - return_value=_MODEL_MAP_US, - ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion): + with ( + patch( + "headroom.backends.litellm._fetch_bedrock_inference_profiles", + return_value=_MODEL_MAP_US, + ), + patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion), + ): backend = LiteLLMBackend(provider="bedrock", region="us-east-1") await backend.send_message(body=_BODY, headers={}) diff --git a/tests/test_vertex_claude_compression.py b/tests/test_vertex_claude_compression.py index 0689829a8..e4753668b 100644 --- a/tests/test_vertex_claude_compression.py +++ b/tests/test_vertex_claude_compression.py @@ -78,7 +78,9 @@ def _capture_provider(backend: str) -> dict[str, Any]: captured: dict[str, Any] = {} class FakeLiteLLM: - def __init__(self, provider: str, region: str | None = None, profile_name: str | None = None) -> None: + def __init__( + self, provider: str, region: str | None = None, profile_name: str | None = None + ) -> None: captured["provider"] = provider captured["region"] = region From fcbe53ee91cccf823a02803f5226e019a244c000 Mon Sep 17 00:00:00 2001 From: JerrettDavis Date: Wed, 1 Jul 2026 21:14:56 -0500 Subject: [PATCH 5/5] 
test(bedrock): mock boto3 sessions in inference profile tests --- tests/test_bedrock_region.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_bedrock_region.py b/tests/test_bedrock_region.py index 1c5c25bf7..46df34b2d 100644 --- a/tests/test_bedrock_region.py +++ b/tests/test_bedrock_region.py @@ -150,7 +150,9 @@ def test_fallback_when_api_call_fails(self): mock_client.list_inference_profiles.side_effect = Exception( "AccessDeniedException: not authorized" ) - mock_boto3.client.return_value = mock_client + mock_session = MagicMock() + mock_session.client.return_value = mock_client + mock_boto3.Session.return_value = mock_session with patch("headroom.backends.litellm.boto3", mock_boto3, create=True): # Patch the import inside the function @@ -188,7 +190,9 @@ def test_successful_fetch_uses_api_results(self): {"inferenceProfileId": "eu.meta.llama-3-70b-v1:0"}, # non-Anthropic, should skip ] } - mock_boto3.client.return_value = mock_client + mock_session = MagicMock() + mock_session.client.return_value = mock_client + mock_boto3.Session.return_value = mock_session import builtins