diff --git a/headroom/proxy/handlers/anthropic.py b/headroom/proxy/handlers/anthropic.py index 41233c817..00f7a26cc 100644 --- a/headroom/proxy/handlers/anthropic.py +++ b/headroom/proxy/handlers/anthropic.py @@ -2379,6 +2379,10 @@ async def api_call_fn( if _auth_header.startswith("Bearer ") and not _auth_header.startswith( "Bearer sk-ant-api" ): + from headroom.proxy.savings_tracker import ( + _estimate_cache_savings_usd, + _estimate_compression_savings_usd, + ) from headroom.subscription.tracker import ( get_subscription_tracker as _get_sub_tracker, ) @@ -2389,6 +2393,10 @@ async def api_call_fn( tokens_submitted=optimized_tokens, tokens_saved_compression=tokens_saved, tokens_saved_cache_reads=cr_tokens, + compression_savings_usd=_estimate_compression_savings_usd( + model, tokens_saved + ), + cache_savings_usd=_estimate_cache_savings_usd(model, cr_tokens), ) # The pre-refactor PERF emit (above) read raw usage diff --git a/headroom/proxy/savings_tracker.py b/headroom/proxy/savings_tracker.py index cbeab8fad..3ef6130c4 100644 --- a/headroom/proxy/savings_tracker.py +++ b/headroom/proxy/savings_tracker.py @@ -202,6 +202,28 @@ def _estimate_compression_savings_usd(model: str, tokens_saved: int) -> float: return 0.0 +def _estimate_cache_savings_usd(model: str, cache_read_tokens: int) -> float: + """Estimate cache-read savings in USD from discounted input tokens.""" + + litellm = _get_litellm_module() + if cache_read_tokens <= 0 or litellm is None: + return 0.0 + + try: + resolved = _resolve_litellm_model(model) + info = litellm.model_cost.get(resolved, {}) + input_cost_per_token = info.get("input_cost_per_token") + if not input_cost_per_token: + return 0.0 + cache_read_cost_per_token = info.get("cache_read_input_token_cost", input_cost_per_token) + savings_per_token = float(input_cost_per_token) - float(cache_read_cost_per_token) + if savings_per_token <= 0: + return 0.0 + return float(cache_read_tokens) * savings_per_token + except Exception: + return 0.0 + + def _estimate_input_cost_usd( model: str, input_tokens: int, diff --git a/tests/test_proxy_cache_savings_usd.py b/tests/test_proxy_cache_savings_usd.py new file mode 100644 index 000000000..8aa06e121 --- /dev/null +++ b/tests/test_proxy_cache_savings_usd.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from headroom.proxy import savings_tracker as savings_tracker_module + + +def test_estimate_cache_savings_usd_uses_discounted_cache_read_price(monkeypatch) -> None: + fake_litellm = SimpleNamespace( + model_cost={ + "gpt-4o": { + "input_cost_per_token": 0.002, + "cache_read_input_token_cost": 0.001, + } + } + ) + monkeypatch.setattr(savings_tracker_module, "LITELLM_AVAILABLE", True) + monkeypatch.setattr(savings_tracker_module, "litellm", fake_litellm) + + assert savings_tracker_module._estimate_cache_savings_usd("gpt-4o", 100) == pytest.approx( + 0.1 + ) + + +def test_estimate_cache_savings_usd_handles_missing_pricing(monkeypatch) -> None: + fake_litellm = SimpleNamespace(model_cost={}) + monkeypatch.setattr(savings_tracker_module, "LITELLM_AVAILABLE", True) + monkeypatch.setattr(savings_tracker_module, "litellm", fake_litellm) + + assert savings_tracker_module._estimate_cache_savings_usd("gpt-4o", 100) == 0.0 + + monkeypatch.setattr(savings_tracker_module, "LITELLM_AVAILABLE", False) + assert savings_tracker_module._estimate_cache_savings_usd("gpt-4o", 100) == 0.0