From 49dc6c7927f5eec4378a98f1fa48e9873b308fb8 Mon Sep 17 00:00:00 2001
From: Matt Haitana <mhaitana@costar.com>
Date: Fri, 26 Jun 2026 15:24:09 +1000
Subject: [PATCH 1/5] fix(bedrock): route ARNs via converse, named AWS
 profiles, and au. region prefix

---
 headroom/backends/litellm.py             | 46 ++++++++++++++++++------
 headroom/providers/registry.py           |  6 +++-
 headroom/proxy/server.py                 |  1 +
 tests/test_bedrock_region.py             | 36 +++++++++++++++++--
 tests/test_provider_registry.py          |  2 +-
 tests/test_provider_registry_extended.py |  2 +-
 tests/test_vertex_claude_compression.py  |  2 +-
 7 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/headroom/backends/litellm.py b/headroom/backends/litellm.py
index ff7cb5e27..7ee211ed1 100644
--- a/headroom/backends/litellm.py
+++ b/headroom/backends/litellm.py
@@ -69,8 +69,10 @@ class ProviderConfig:
 
 # Region prefix used in cross-region Bedrock inference profile IDs.
 # EU regions use "eu.", AP regions use "apac.", US (and everything else) use "us.".
+# ap-southeast-2 (Sydney/Australia) uses "au." — distinct from the rest of APAC.
 _BEDROCK_REGION_PREFIXES: dict[str, str] = {
     "eu": "eu",
+    "ap-southeast-2": "au",
     "ap": "apac",
 }
 
@@ -135,7 +137,9 @@ def _build_bedrock_fallback_map(region: str) -> dict[str, str]:
     return {name: f"bedrock/{prefix}.{model_id}" for name, model_id in _CLAUDE_MODELS}
 
 
-def _fetch_bedrock_inference_profiles(region: str | None) -> dict[str, str]:
+def _fetch_bedrock_inference_profiles(
+    region: str | None, profile_name: str | None = None
+) -> dict[str, str]:
     """Fetch available Bedrock inference profiles from AWS API.
 
     Uses boto3 list_inference_profiles() to get all available profiles
@@ -147,15 +151,21 @@ def _fetch_bedrock_inference_profiles(region: str | None) -> dict[str, str]:
 
     Args:
         region: AWS region (e.g., "us-east-1", "eu-central-1")
+        profile_name: AWS named profile (e.g., "my-sso-profile"). When set,
+                      a boto3.Session is created with this profile name so
+                      the correct SSO or credential file is used. Falls back
+                      to ambient credentials (AWS_PROFILE env var, instance
+                      metadata, etc.) when not provided.
 
     Returns:
         Model map: anthropic_model_name -> bedrock inference profile ID
     """
     region = region or "us-east-1"
 
-    # Check cache first
-    if region in _bedrock_profiles_cache:
-        return _bedrock_profiles_cache[region]
+    # Cache key includes profile_name so different profiles don't collide
+    cache_key = f"{region}:{profile_name or ''}"
+    if cache_key in _bedrock_profiles_cache:
+        return _bedrock_profiles_cache[cache_key]
 
     model_map: dict[str, str] = {}
 
@@ -167,11 +177,12 @@ def _fetch_bedrock_inference_profiles(region: str | None) -> dict[str, str]:
             "Install boto3 for dynamic model discovery: pip install boto3"
         )
         model_map = _build_bedrock_fallback_map(region)
-        _bedrock_profiles_cache[region] = model_map
+        _bedrock_profiles_cache[cache_key] = model_map
         return model_map
 
     try:
-        bedrock_client = boto3.client("bedrock", region_name=region)
+        session = boto3.Session(profile_name=profile_name) if profile_name else boto3.Session()
+        bedrock_client = session.client("bedrock", region_name=region)
         response = bedrock_client.list_inference_profiles(typeEquals="SYSTEM_DEFINED")
 
         for profile in response.get("inferenceProfileSummaries", []):
@@ -209,7 +220,7 @@ def _fetch_bedrock_inference_profiles(region: str | None) -> dict[str, str]:
         model_map = _build_bedrock_fallback_map(region)
 
     # Cache the result
-    _bedrock_profiles_cache[region] = model_map
+    _bedrock_profiles_cache[cache_key] = model_map
     return model_map
 
 
@@ -220,18 +231,23 @@ def _normalize_bedrock_profile_id(profile_id: str) -> str | None:
         profile_id: e.g., "us.anthropic.claude-sonnet-4-20250514-v1:0"
                     or "anthropic.claude-sonnet-4-20250514-v1:0"
                     or "claude-sonnet-4-20250514"
+                    or "arn:aws:bedrock:...:application-inference-profile/..."
 
     Returns:
         Normalized name like "claude-sonnet-4-20250514", or None if not parseable
     """
     import re
 
+    # ARNs are opaque identifiers — cannot be normalized to a standard model name
+    if profile_id.startswith("arn:aws:"):
+        return None
+
     # Strip "bedrock/" prefix if present
     if profile_id.startswith("bedrock/"):
         profile_id = profile_id[8:]
 
-    # Strip region prefix (us., eu., apac.)
-    for prefix in ["us.", "eu.", "apac."]:
+    # Strip region prefix (us., eu., apac., au.)
+    for prefix in ["us.", "eu.", "apac.", "au."]:
         if profile_id.startswith(prefix):
             profile_id = profile_id[len(prefix) :]
             break
@@ -400,6 +416,7 @@ def __init__(
         self,
         provider: str = "bedrock",
         region: str | None = None,
+        profile_name: str | None = None,
         **kwargs: Any,
     ):
         """Initialize LiteLLM backend.
@@ -407,6 +424,9 @@ def __init__(
         Args:
             provider: LiteLLM provider prefix (bedrock, vertex_ai, openrouter, etc.)
             region: Cloud region (provider-specific)
+            profile_name: AWS named profile for credential resolution (bedrock only).
+                          When set, boto3 uses this profile (e.g. an SSO profile) instead
+                          of the ambient credentials. Ignored for non-bedrock providers.
             **kwargs: Additional provider-specific config
         """
         if not LITELLM_AVAILABLE:
@@ -423,7 +443,7 @@ def __init__(
 
         # For Bedrock, fetch model map dynamically from AWS API
         if provider == "bedrock":
-            self._model_map = _fetch_bedrock_inference_profiles(region)
+            self._model_map = _fetch_bedrock_inference_profiles(region, profile_name=profile_name)
             litellm.set_verbose = False  # Reduce noise
         else:
             self._model_map = self._config.model_map
@@ -442,6 +462,7 @@ def map_model_id(self, anthropic_model: str) -> str:
         - "anthropic.claude-sonnet-4-20250514-v1:0" (Bedrock without region)
         - "us.anthropic.claude-sonnet-4-20250514-v1:0" (Bedrock with region)
         - "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0" (LiteLLM format)
+        - "arn:aws:bedrock:...:application-inference-profile/..." (application inference profile)
         """
         # Check direct mapping first
         if anthropic_model in self._model_map:
@@ -449,6 +470,11 @@ def map_model_id(self, anthropic_model: str) -> str:
 
         # For Bedrock, try to normalize various input formats
         if self.provider == "bedrock":
+            # Application inference profile ARNs must use the converse route —
+            # the invoke route rejects ARNs with HTTP 400.
+            if anthropic_model.startswith("arn:aws:"):
+                return f"bedrock/converse/{anthropic_model}"
+
             normalized = _normalize_bedrock_profile_id(anthropic_model)
             if normalized and normalized in self._model_map:
                 return self._model_map[normalized]
diff --git a/headroom/providers/registry.py b/headroom/providers/registry.py
index 1e57f0014..7da214646 100644
--- a/headroom/providers/registry.py
+++ b/headroom/providers/registry.py
@@ -148,6 +148,7 @@ def create_proxy_backend(
     backend: str,
     anyllm_provider: str,
     bedrock_region: str | None,
+    bedrock_profile: str | None = None,
     logger: logging.Logger,
     openai_api_url: str | None = None,
     anyllm_backend_cls: Any | None = None,
@@ -181,7 +182,10 @@ def create_proxy_backend(
         provider = "vertex_ai"
     try:
         backend_cls = litellm_backend_cls or _load_litellm_backend()
-        instance = cast("Backend", backend_cls(provider=provider, region=bedrock_region))
+        instance = cast(
+            "Backend",
+            backend_cls(provider=provider, region=bedrock_region, profile_name=bedrock_profile),
+        )
         logger.info("LiteLLM backend enabled (provider=%s, region=%s)", provider, bedrock_region)
         return instance
     except ImportError as exc:
diff --git a/headroom/proxy/server.py b/headroom/proxy/server.py
index 97d9a3211..028dd962b 100644
--- a/headroom/proxy/server.py
+++ b/headroom/proxy/server.py
@@ -875,6 +875,7 @@ def _router_config_for(kompress_disabled: bool) -> ContentRouterConfig:
             backend=config.backend,
             anyllm_provider=config.anyllm_provider,
             bedrock_region=config.bedrock_region,
+            bedrock_profile=config.bedrock_profile,
             logger=logger,
             openai_api_url=config.openai_api_url,
             anyllm_backend_cls=AnyLLMBackend,
diff --git a/tests/test_bedrock_region.py b/tests/test_bedrock_region.py
index 8a52d5009..67d80db71 100644
--- a/tests/test_bedrock_region.py
+++ b/tests/test_bedrock_region.py
@@ -212,13 +212,25 @@ def patched_import(name, *args, **kwargs):
         )
 
     def test_caching_prevents_repeated_api_calls(self):
-        """Second call for same region should return cached result."""
+        """Second call for same region+profile should return cached result."""
         _bedrock_profiles_cache.clear()
-        _bedrock_profiles_cache["us-east-1"] = {"test": "bedrock/test-model"}
+        _bedrock_profiles_cache["us-east-1:"] = {"test": "bedrock/test-model"}
 
         result = _fetch_bedrock_inference_profiles("us-east-1")
         assert result == {"test": "bedrock/test-model"}
 
+    def test_profile_cache_isolation(self):
+        """Different profiles for the same region must not share a cache entry."""
+        _bedrock_profiles_cache.clear()
+        _bedrock_profiles_cache["us-east-1:profileA"] = {"model": "bedrock/profile-a-model"}
+        _bedrock_profiles_cache["us-east-1:profileB"] = {"model": "bedrock/profile-b-model"}
+
+        result_a = _fetch_bedrock_inference_profiles("us-east-1", profile_name="profileA")
+        result_b = _fetch_bedrock_inference_profiles("us-east-1", profile_name="profileB")
+        assert result_a["model"] == "bedrock/profile-a-model"
+        assert result_b["model"] == "bedrock/profile-b-model"
+        assert result_a != result_b
+
 
 # =============================================================================
 # LiteLLMBackend.map_model_id with EU Regions
@@ -310,6 +322,26 @@ def test_region_prefixed_format_normalized(self):
             result = backend.map_model_id("eu.anthropic.claude-sonnet-4-20250514-v1:0")
             assert result == "bedrock/eu.anthropic.claude-sonnet-4-20250514-v1:0"
 
+    def test_arn_passthrough(self):
+        """Application inference profile ARNs must use the converse route."""
+        with patch(
+            "headroom.backends.litellm._fetch_bedrock_inference_profiles",
+            return_value={},
+        ):
+            backend = LiteLLMBackend(provider="bedrock", region="ap-southeast-2")
+            arn = "arn:aws:bedrock:ap-southeast-2:123456789012:application-inference-profile/abc123"
+            result = backend.map_model_id(arn)
+            assert result == f"bedrock/converse/{arn}"
+
+    def test_ap_southeast_2_uses_au_prefix(self):
+        """ap-southeast-2 (Sydney/Australia) should use 'au.' prefix, not 'apac.'."""
+        with patch(
+            "headroom.backends.litellm._fetch_bedrock_inference_profiles",
+            return_value={},
+        ):
+            backend = LiteLLMBackend(provider="bedrock", region="ap-southeast-2")
+            result = backend.map_model_id("claude-sonnet-4-5-20250929")
+            assert result == "bedrock/au.anthropic.claude-sonnet-4-5-20250929-v1:0"
 
 # =============================================================================
 # Normalize Bedrock Profile ID (edge cases)
diff --git a/tests/test_provider_registry.py b/tests/test_provider_registry.py
index 8c95499a2..498d21f0c 100644
--- a/tests/test_provider_registry.py
+++ b/tests/test_provider_registry.py
@@ -119,7 +119,7 @@ def test_create_proxy_backend_handles_missing_litellm_backend(caplog) -> None:
             anyllm_provider="ignored",
             bedrock_region="us-east-1",
             logger=logger,
-            litellm_backend_cls=lambda provider, region: (_ for _ in ()).throw(
+            litellm_backend_cls=lambda provider, region, profile_name=None: (_ for _ in ()).throw(
                 ImportError("missing")
             ),
         )
diff --git a/tests/test_provider_registry_extended.py b/tests/test_provider_registry_extended.py
index 9c8050adf..c3ef4559a 100644
--- a/tests/test_provider_registry_extended.py
+++ b/tests/test_provider_registry_extended.py
@@ -113,7 +113,7 @@ def test_create_proxy_backend_uses_injected_backend_types() -> None:
         anyllm_provider="ignored",
         bedrock_region="us-east-1",
         logger=logger,
-        litellm_backend_cls=lambda provider, region: {
+        litellm_backend_cls=lambda provider, region, profile_name=None: {
             "kind": "litellm",
             "provider": provider,
             "region": region,
diff --git a/tests/test_vertex_claude_compression.py b/tests/test_vertex_claude_compression.py
index 0396f78aa..0689829a8 100644
--- a/tests/test_vertex_claude_compression.py
+++ b/tests/test_vertex_claude_compression.py
@@ -78,7 +78,7 @@ def _capture_provider(backend: str) -> dict[str, Any]:
     captured: dict[str, Any] = {}
 
     class FakeLiteLLM:
-        def __init__(self, provider: str, region: str | None = None) -> None:
+        def __init__(self, provider: str, region: str | None = None, profile_name: str | None = None) -> None:
             captured["provider"] = provider
             captured["region"] = region
 

From b17840648196c5ccd9529a0ad3b103b8d9439163 Mon Sep 17 00:00:00 2001
From: Matt Haitana <mhaitana@costar.com>
Date: Fri, 26 Jun 2026 16:10:00 +1000
Subject: [PATCH 2/5] docs(bedrock): add Claude Code + Bedrock setup guide

---
 docs/claude-code-bedrock-headroom.md | 151 +++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 docs/claude-code-bedrock-headroom.md

diff --git a/docs/claude-code-bedrock-headroom.md b/docs/claude-code-bedrock-headroom.md
new file mode 100644
index 000000000..c12559b7e
--- /dev/null
+++ b/docs/claude-code-bedrock-headroom.md
@@ -0,0 +1,151 @@
+# Claude Code + AWS Bedrock, with Headroom compression
+
+*Validated end-to-end on 2026-06-26 (Claude Code 2.1, Headroom 0.27.0, ap-southeast-2).*
+
+This is the **working, tested** way to run **Claude Code** against **Claude models on
+AWS Bedrock** with **Headroom compressing the context** in the middle.
+
+## TL;DR
+
+Run Claude Code in **normal Anthropic mode** (NOT Bedrock mode) pointed at a local
+Headroom proxy, and let **Headroom** be the thing that talks to Bedrock:
+
+```
+Claude Code  ──ANTHROPIC_BASE_URL──▶  Headroom proxy  ──LiteLLM (bedrock)──▶  AWS Bedrock
+ (normal mode)     (plain http)        (compresses)         (your AWS creds)      (Claude)
+```
+
+Two non-obvious requirements make the difference between "works" and "silently bypasses
+the proxy":
+
+1. **`CLAUDE_CODE_USE_BEDROCK=0`** — Without this, Claude Code detects ARN model IDs or
+   the `CLAUDE_CODE_USE_BEDROCK=1` flag and calls Bedrock directly via the AWS SDK,
+   completely bypassing `ANTHROPIC_BASE_URL` and the proxy.
+2. **Standard Anthropic model names** in `ANTHROPIC_DEFAULT_*_MODEL` (e.g.
+   `claude-sonnet-4-6`, not ARNs) — Claude Code validates model names client-side against
+   a built-in list. ARN values are detected and cause Claude Code to bypass the proxy.
+
+## Why not "just set CLAUDE_CODE_USE_BEDROCK=1"?
+
+That approach **does not work** with Headroom. When `CLAUDE_CODE_USE_BEDROCK=1` is set,
+Claude Code calls Bedrock directly using the AWS SDK — `ANTHROPIC_BASE_URL` is ignored
+entirely and the proxy never receives a byte. The same bypass happens when
+`ANTHROPIC_DEFAULT_*_MODEL` contains ARN values. Use the Anthropic-mode path below.
+
+## Prerequisites
+
+- **AWS credentials** configured for your environment (env vars, `~/.aws/credentials`,
+  instance profile, or SSO via `aws sso login`). Confirm direct access works before
+  involving Headroom:
+  ```bash
+  aws bedrock-runtime invoke-model \
+    --region us-east-1 \
+    --model-id anthropic.claude-3-haiku-20240307-v1:0 \
+    --body '{"anthropic_version":"bedrock-2023-05-31","max_tokens":20,"messages":[{"role":"user","content":"hi"}]}' \
+    /tmp/out.json
+  ```
+- **boto3** in the proxy's Python environment (for dynamic inference profile discovery):
+  ```bash
+  pip install boto3
+  ```
+- **IAM permissions** for the models you intend to use — at minimum
+  `bedrock:InvokeModel` and `bedrock:InvokeModelWithResponseStream`. For application
+  inference profiles, scope to the specific profile ARN:
+  ```json
+  {
+    "Effect": "Allow",
+    "Action": ["bedrock:InvokeModel", "bedrock:InvokeModelWithResponseStream"],
+    "Resource": ["arn:aws:bedrock:<region>:<account>:application-inference-profile/<id>"]
+  }
+  ```
+
+## Terminal 1 — start the Headroom proxy (Bedrock backend)
+
+```bash
+headroom proxy --port 8787 \
+  --backend bedrock \
+  --region us-east-1
+```
+
+With a named AWS SSO profile:
+
+```bash
+headroom proxy --port 8787 \
+  --backend bedrock \
+  --region us-east-1 \
+  --bedrock-profile my-sso-profile
+```
+
+On startup the proxy calls `list_inference_profiles` to build a model map. To confirm
+routing, check the LiteLLM log lines — for a request whose model is an ARN you should see:
+
+```
+LiteLLM completion() model= converse/arn:aws:... provider = bedrock
+```
+
+## Terminal 2 — run Claude Code (normal Anthropic mode) against the proxy
+
+```bash
+export CLAUDE_CODE_USE_BEDROCK=0               # REQUIRED — prevents Claude Code bypassing the proxy
+export ANTHROPIC_BASE_URL=http://127.0.0.1:8787
+export ANTHROPIC_API_KEY=headroom              # Claude Code needs *a* key to start; value is ignored
+export ANTHROPIC_MODEL=claude-opus-4-6
+export ANTHROPIC_DEFAULT_SONNET_MODEL=claude-sonnet-4-6
+export ANTHROPIC_DEFAULT_OPUS_MODEL=claude-opus-4-6
+export ANTHROPIC_DEFAULT_HAIKU_MODEL=claude-haiku-4-5-20251001
+
+claude
+```
+
+Or via `~/.claude/settings.json`:
+
+```json
+{
+  "env": {
+    "CLAUDE_CODE_USE_BEDROCK": "0",
+    "ANTHROPIC_BASE_URL": "http://127.0.0.1:8787",
+    "ANTHROPIC_API_KEY": "headroom",
+    "ANTHROPIC_MODEL": "claude-opus-4-6",
+    "ANTHROPIC_DEFAULT_SONNET_MODEL": "claude-sonnet-4-6",
+    "ANTHROPIC_DEFAULT_OPUS_MODEL": "claude-opus-4-6",
+    "ANTHROPIC_DEFAULT_HAIKU_MODEL": "claude-haiku-4-5-20251001"
+  }
+}
+```
+
+Claude Code now talks plain Anthropic `/v1/messages` to Headroom; Headroom compresses
+and forwards to Bedrock via LiteLLM, then translates the answer back.
+
+## Application inference profiles (account-specific ARNs)
+
+If your IAM policy only permits **application inference profiles** (account-specific
+ARNs) rather than system-defined cross-region profiles, pass the ARN directly as the
+model value in `ANTHROPIC_DEFAULT_*_MODEL`. The proxy detects `arn:aws:` prefixed model
+IDs and routes them via `bedrock/converse/<arn>` automatically — no extra configuration
+required.
+
+## Region prefix notes
+
+| AWS region | Cross-region inference prefix |
+|---|---|
+| `us-*` | `us.` |
+| `eu-*` | `eu.` |
+| `ap-*` (except `ap-southeast-2`) | `apac.` |
+| `ap-southeast-2` (Sydney) | `au.` |
+
+The proxy uses the correct prefix automatically when constructing fallback model IDs.
+
+## Verify compression is happening
+
+- Dashboard: <http://localhost:8787/dashboard> — "tokens saved" climbs as you work.
+- `curl -s localhost:8787/stats` → `tokens.saved` and `request_logs[].transforms_applied`.
+
+## Troubleshooting
+
+| Symptom | Cause | Fix |
+|---|---|---|
+| Proxy receives no requests | Claude Code is in Bedrock mode, bypassing proxy | Set `CLAUDE_CODE_USE_BEDROCK=0` |
+| `400 The provided model identifier is invalid` | ARN value in `ANTHROPIC_DEFAULT_*_MODEL` detected by Claude Code | Use standard model names (`claude-sonnet-4-6`, not ARNs) |
+| `403 AccessDeniedException` on system-defined profiles | IAM policy only permits application profiles | Use `--bedrock-profile` with an authorized profile and pass application inference profile ARNs as model values |
+| `400 … Try calling via converse route` | Old proxy version routing ARNs to invoke path | Upgrade to headroom ≥ 0.27.1 |
+| Model map empty at startup | boto3 not installed or wrong AWS profile | `pip install boto3`; check `--bedrock-profile` / `AWS_PROFILE` |

From 660640e19912d13645cd8fcc0855d72a3c6286b2 Mon Sep 17 00:00:00 2001
From: Matt Haitana <mhaitana@costar.com>
Date: Sat, 27 Jun 2026 00:17:44 +1000
Subject: [PATCH 3/5] fix(bedrock): wire profile_name to acompletion() and fix
 guide contradiction

---
 docs/claude-code-bedrock-headroom.md | 14 ++--
 headroom/backends/litellm.py         | 13 ++++
 tests/test_bedrock_region.py         | 95 ++++++++++++++++++++++++++++
 3 files changed, 113 insertions(+), 9 deletions(-)

diff --git a/docs/claude-code-bedrock-headroom.md b/docs/claude-code-bedrock-headroom.md
index c12559b7e..1d0228016 100644
--- a/docs/claude-code-bedrock-headroom.md
+++ b/docs/claude-code-bedrock-headroom.md
@@ -15,22 +15,18 @@ Claude Code  ──ANTHROPIC_BASE_URL──▶  Headroom proxy  ──LiteLLM (b
  (normal mode)     (plain http)        (compresses)         (your AWS creds)      (Claude)
 ```
 
-Two non-obvious requirements make the difference between "works" and "silently bypasses
+One non-obvious requirement makes the difference between "works" and "silently bypasses
 the proxy":
 
-1. **`CLAUDE_CODE_USE_BEDROCK=0`** — Without this, Claude Code detects ARN model IDs or
-   the `CLAUDE_CODE_USE_BEDROCK=1` flag and calls Bedrock directly via the AWS SDK,
+1. **`CLAUDE_CODE_USE_BEDROCK=0`** — If `CLAUDE_CODE_USE_BEDROCK=1` is still set anywhere in
+   your environment, Claude Code calls Bedrock directly via the AWS SDK,
    completely bypassing `ANTHROPIC_BASE_URL` and the proxy.
-2. **Standard Anthropic model names** in `ANTHROPIC_DEFAULT_*_MODEL` (e.g.
-   `claude-sonnet-4-6`, not ARNs) — Claude Code validates model names client-side against
-   a built-in list. ARN values are detected and cause Claude Code to bypass the proxy.
 
 ## Why not "just set CLAUDE_CODE_USE_BEDROCK=1"?
 
 That approach **does not work** with Headroom. When `CLAUDE_CODE_USE_BEDROCK=1` is set,
 Claude Code calls Bedrock directly using the AWS SDK — `ANTHROPIC_BASE_URL` is ignored
-entirely and the proxy never receives a byte. The same bypass happens when
-`ANTHROPIC_DEFAULT_*_MODEL` contains ARN values. Use the Anthropic-mode path below.
+entirely and the proxy never receives a byte. Use the Anthropic-mode path below.
 
 ## Prerequisites
 
@@ -145,7 +141,7 @@ The proxy uses the correct prefix automatically when constructing fallback model
 | Symptom | Cause | Fix |
 |---|---|---|
 | Proxy receives no requests | Claude Code is in Bedrock mode, bypassing proxy | Set `CLAUDE_CODE_USE_BEDROCK=0` |
-| `400 The provided model identifier is invalid` | ARN value in `ANTHROPIC_DEFAULT_*_MODEL` detected by Claude Code | Use standard model names (`claude-sonnet-4-6`, not ARNs) |
+| `400 The provided model identifier is invalid` | Bedrock rejected the model name format | Use standard Anthropic model names (`claude-sonnet-4-6`) or a valid application inference profile ARN |
 | `403 AccessDeniedException` on system-defined profiles | IAM policy only permits application profiles | Use `--bedrock-profile` with an authorized profile and pass application inference profile ARNs as model values |
 | `400 … Try calling via converse route` | Old proxy version routing ARNs to invoke path | Upgrade to headroom ≥ 0.27.1 |
 | Model map empty at startup | boto3 not installed or wrong AWS profile | `pip install boto3`; check `--bedrock-profile` / `AWS_PROFILE` |
diff --git a/headroom/backends/litellm.py b/headroom/backends/litellm.py
index 7ee211ed1..c0c1e8dc9 100644
--- a/headroom/backends/litellm.py
+++ b/headroom/backends/litellm.py
@@ -436,6 +436,7 @@ def __init__(
 
         self.provider = provider
         self.region = region
+        self.profile_name = profile_name
         self.kwargs = kwargs
 
         # Get provider config from registry
@@ -707,6 +708,9 @@ async def send_message(
                 elif self.provider in ("vertex_ai", "vertex_ai_beta"):
                     kwargs["vertex_location"] = self.region
 
+            if self.provider == "bedrock" and self.profile_name:
+                kwargs["aws_profile_name"] = self.profile_name
+
             # Forward API key from request headers if present.
             # Skip for Bedrock/Vertex: they use env-based auth (AWS SigV4 / Google ADC).
             # Forwarding x-api-key (e.g. sk-ant-dummy) would override their credentials.
@@ -811,6 +815,9 @@ async def stream_message(
                 elif self.provider in ("vertex_ai", "vertex_ai_beta"):
                     kwargs["vertex_location"] = self.region
 
+            if self.provider == "bedrock" and self.profile_name:
+                kwargs["aws_profile_name"] = self.profile_name
+
             # Forward API key from request headers if present.
             # Skip for Bedrock/Vertex: they use env-based auth (AWS SigV4 / Google ADC).
             # Forwarding x-api-key (e.g. sk-ant-dummy) would override their credentials.
@@ -1035,6 +1042,9 @@ async def send_openai_message(
                 elif self.provider in ("vertex_ai", "vertex_ai_beta"):
                     kwargs["vertex_location"] = self.region
 
+            if self.provider == "bedrock" and self.profile_name:
+                kwargs["aws_profile_name"] = self.profile_name
+
             # Forward API key from request headers if present.
             # Skip for Bedrock/Vertex: they use env-based auth (AWS SigV4 / Google ADC).
             # Forwarding x-api-key (e.g. sk-ant-dummy) would override their credentials.
@@ -1210,6 +1220,9 @@ async def stream_openai_message(
                 elif self.provider in ("vertex_ai", "vertex_ai_beta"):
                     kwargs["vertex_location"] = self.region
 
+            if self.provider == "bedrock" and self.profile_name:
+                kwargs["aws_profile_name"] = self.profile_name
+
             # Forward API key from request headers if present.
             # Skip for Bedrock/Vertex: they use env-based auth (AWS SigV4 / Google ADC).
             # Forwarding x-api-key (e.g. sk-ant-dummy) would override their credentials.
diff --git a/tests/test_bedrock_region.py b/tests/test_bedrock_region.py
index 67d80db71..21e9c37ce 100644
--- a/tests/test_bedrock_region.py
+++ b/tests/test_bedrock_region.py
@@ -384,3 +384,98 @@ def test_already_normalized(self):
         assert _normalize_bedrock_profile_id("claude-sonnet-4-20250514") == (
             "claude-sonnet-4-20250514"
         )
+
+
+# =============================================================================
+# Named profile forwarded to acompletion kwargs
+# =============================================================================
+
+_MODEL_MAP_US = {"claude-sonnet-4-20250514": "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0"}
+_BODY = {
+    "model": "claude-sonnet-4-20250514",
+    "messages": [{"role": "user", "content": "hi"}],
+    "max_tokens": 10,
+}
+
+
+def _make_fake_completion_resp():
+    mock_resp = MagicMock()
+    mock_resp.choices = [MagicMock()]
+    mock_resp.choices[0].message.content = "hello"
+    mock_resp.choices[0].message.tool_calls = None
+    mock_resp.choices[0].finish_reason = "stop"
+    mock_resp.usage.prompt_tokens = 10
+    mock_resp.usage.completion_tokens = 5
+    return mock_resp
+
+
+class TestBedrockProfileForwardedToCompletion:
+    """Regression: --bedrock-profile must be passed to acompletion(), not just to
+    _fetch_bedrock_inference_profiles() at startup. Without self.profile_name the
+    actual Bedrock call still uses ambient/default credentials even when the user
+    explicitly supplied a named SSO profile."""
+
+    def setup_method(self):
+        _bedrock_profiles_cache.clear()
+
+    async def test_send_message_passes_aws_profile_name(self):
+        """send_message() must include aws_profile_name in the acompletion() kwargs."""
+        captured_kwargs: dict = {}
+
+        async def fake_acompletion(**kwargs):
+            captured_kwargs.update(kwargs)
+            return _make_fake_completion_resp()
+
+        with patch(
+            "headroom.backends.litellm._fetch_bedrock_inference_profiles",
+            return_value=_MODEL_MAP_US,
+        ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion):
+            backend = LiteLLMBackend(
+                provider="bedrock", region="us-east-1", profile_name="my-sso-profile"
+            )
+            await backend.send_message(body=_BODY, headers={})
+
+        assert captured_kwargs.get("aws_profile_name") == "my-sso-profile"
+
+    async def test_stream_message_passes_aws_profile_name(self):
+        """stream_message() must include aws_profile_name in the acompletion() kwargs."""
+        captured_kwargs: dict = {}
+
+        async def fake_acompletion(**kwargs):
+            captured_kwargs.update(kwargs)
+
+            async def _empty():
+                return
+                yield  # pragma: no cover — makes this an async generator
+
+            return _empty()
+
+        with patch(
+            "headroom.backends.litellm._fetch_bedrock_inference_profiles",
+            return_value=_MODEL_MAP_US,
+        ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion):
+            backend = LiteLLMBackend(
+                provider="bedrock", region="us-east-1", profile_name="my-sso-profile"
+            )
+            async for _ in backend.stream_message(body=_BODY, headers={}):
+                pass
+
+        assert captured_kwargs.get("aws_profile_name") == "my-sso-profile"
+
+    async def test_no_profile_does_not_set_aws_profile_name(self):
+        """When no profile is configured, aws_profile_name must not appear in kwargs
+        (LiteLLM falls back to ambient credentials correctly)."""
+        captured_kwargs: dict = {}
+
+        async def fake_acompletion(**kwargs):
+            captured_kwargs.update(kwargs)
+            return _make_fake_completion_resp()
+
+        with patch(
+            "headroom.backends.litellm._fetch_bedrock_inference_profiles",
+            return_value=_MODEL_MAP_US,
+        ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion):
+            backend = LiteLLMBackend(provider="bedrock", region="us-east-1")
+            await backend.send_message(body=_BODY, headers={})
+
+        assert "aws_profile_name" not in captured_kwargs

From 2cdddfccbfe93ff3db2373c9914bdb00bd6715d4 Mon Sep 17 00:00:00 2001
From: JerrettDavis <mxjerrett@gmail.com>
Date: Wed, 1 Jul 2026 19:02:40 -0500
Subject: [PATCH 4/5] style(bedrock): format profile tests

---
 tests/test_bedrock_region.py            | 34 ++++++++++++++++---------
 tests/test_vertex_claude_compression.py |  4 ++-
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/tests/test_bedrock_region.py b/tests/test_bedrock_region.py
index 21e9c37ce..1c5c25bf7 100644
--- a/tests/test_bedrock_region.py
+++ b/tests/test_bedrock_region.py
@@ -343,6 +343,7 @@ def test_ap_southeast_2_uses_au_prefix(self):
             result = backend.map_model_id("claude-sonnet-4-5-20250929")
             assert result == "bedrock/au.anthropic.claude-sonnet-4-5-20250929-v1:0"
 
+
 # =============================================================================
 # Normalize Bedrock Profile ID (edge cases)
 # =============================================================================
@@ -426,10 +427,13 @@ async def fake_acompletion(**kwargs):
             captured_kwargs.update(kwargs)
             return _make_fake_completion_resp()
 
-        with patch(
-            "headroom.backends.litellm._fetch_bedrock_inference_profiles",
-            return_value=_MODEL_MAP_US,
-        ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion):
+        with (
+            patch(
+                "headroom.backends.litellm._fetch_bedrock_inference_profiles",
+                return_value=_MODEL_MAP_US,
+            ),
+            patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion),
+        ):
             backend = LiteLLMBackend(
                 provider="bedrock", region="us-east-1", profile_name="my-sso-profile"
             )
@@ -450,10 +454,13 @@ async def _empty():
 
             return _empty()
 
-        with patch(
-            "headroom.backends.litellm._fetch_bedrock_inference_profiles",
-            return_value=_MODEL_MAP_US,
-        ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion):
+        with (
+            patch(
+                "headroom.backends.litellm._fetch_bedrock_inference_profiles",
+                return_value=_MODEL_MAP_US,
+            ),
+            patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion),
+        ):
             backend = LiteLLMBackend(
                 provider="bedrock", region="us-east-1", profile_name="my-sso-profile"
             )
@@ -471,10 +478,13 @@ async def fake_acompletion(**kwargs):
             captured_kwargs.update(kwargs)
             return _make_fake_completion_resp()
 
-        with patch(
-            "headroom.backends.litellm._fetch_bedrock_inference_profiles",
-            return_value=_MODEL_MAP_US,
-        ), patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion):
+        with (
+            patch(
+                "headroom.backends.litellm._fetch_bedrock_inference_profiles",
+                return_value=_MODEL_MAP_US,
+            ),
+            patch("headroom.backends.litellm.acompletion", side_effect=fake_acompletion),
+        ):
             backend = LiteLLMBackend(provider="bedrock", region="us-east-1")
             await backend.send_message(body=_BODY, headers={})
 
diff --git a/tests/test_vertex_claude_compression.py b/tests/test_vertex_claude_compression.py
index 0689829a8..e4753668b 100644
--- a/tests/test_vertex_claude_compression.py
+++ b/tests/test_vertex_claude_compression.py
@@ -78,7 +78,9 @@ def _capture_provider(backend: str) -> dict[str, Any]:
     captured: dict[str, Any] = {}
 
     class FakeLiteLLM:
-        def __init__(self, provider: str, region: str | None = None, profile_name: str | None = None) -> None:
+        def __init__(
+            self, provider: str, region: str | None = None, profile_name: str | None = None
+        ) -> None:
             captured["provider"] = provider
             captured["region"] = region
 

From fcbe53ee91cccf823a02803f5226e019a244c000 Mon Sep 17 00:00:00 2001
From: JerrettDavis <mxjerrett@gmail.com>
Date: Wed, 1 Jul 2026 21:14:56 -0500
Subject: [PATCH 5/5] test(bedrock): mock boto3 sessions in inference profile
 tests

---
 tests/test_bedrock_region.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/test_bedrock_region.py b/tests/test_bedrock_region.py
index 1c5c25bf7..46df34b2d 100644
--- a/tests/test_bedrock_region.py
+++ b/tests/test_bedrock_region.py
@@ -150,7 +150,9 @@ def test_fallback_when_api_call_fails(self):
         mock_client.list_inference_profiles.side_effect = Exception(
             "AccessDeniedException: not authorized"
         )
-        mock_boto3.client.return_value = mock_client
+        mock_session = MagicMock()
+        mock_session.client.return_value = mock_client
+        mock_boto3.Session.return_value = mock_session
 
         with patch("headroom.backends.litellm.boto3", mock_boto3, create=True):
             # Patch the import inside the function
@@ -188,7 +190,9 @@ def test_successful_fetch_uses_api_results(self):
                 {"inferenceProfileId": "eu.meta.llama-3-70b-v1:0"},  # non-Anthropic, should skip
             ]
         }
-        mock_boto3.client.return_value = mock_client
+        mock_session = MagicMock()
+        mock_session.client.return_value = mock_client
+        mock_boto3.Session.return_value = mock_session
 
         import builtins