Add a supports_tool_passthrough property in llama_cpp.py that returns
_supports_tools directly, and make the two client-tool passthrough checks in
routes/inference.py read it via getattr(), falling back to supports_tools.

NOTE(review): this patch was recovered from a copy whose newlines had been
collapsed. Hunk line counts match the @@ headers, but the leading indentation
of every line is reconstructed from the Python structure -- confirm it against
the target files before applying.

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 3ccfc5cdfe..e31ca07ce0 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1686,6 +1686,13 @@ def supports_tools(self) -> bool:
             return False
         return self._supports_tools
 
+    @property
+    def supports_tool_passthrough(self) -> bool:
+        # supports_tools is forced off for DiffusionGemma (its agentic loop drops the
+        # per-step canvas frames); client tool loops skip that loop, so the passthrough
+        # follows the real _supports_tools instead of the forced-off value.
+        return self._supports_tools
+
     @property
     def cache_type_kv(self) -> Optional[str]:
         return self._cache_type_kv
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 17be222d93..7831dfec42 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -5705,9 +5705,9 @@ def _reject_unsupported_n(path_label: str) -> "HTTPException":
     # free-form sampling. Guided decoding does not require ``supports_tools`` --
     # the grammar machinery is independent of tool-call parsing.
     _has_response_format = _extract_response_format(payload) is not None
-    _tools_passthrough = llama_backend.supports_tools and (
-        (payload.tools and len(payload.tools) > 0) or _has_tool_messages
-    )
+    _tools_passthrough = getattr(
+        llama_backend, "supports_tool_passthrough", llama_backend.supports_tools
+    ) and ((payload.tools and len(payload.tools) > 0) or _has_tool_messages)
     if (
         using_gguf
         and not _effective_enable_tools(payload)
@@ -9564,7 +9564,9 @@ async def anthropic_messages(
         and not _has_image
     )
     client_tools = (
-        not server_tools and len(openai_client_tools) > 0 and llama_backend.supports_tools
+        not server_tools
+        and len(openai_client_tools) > 0
+        and getattr(llama_backend, "supports_tool_passthrough", llama_backend.supports_tools)
     )
 
     # Anthropic tool_choice.disable_parallel_tool_use caps the response to a