Fix Studio custom folders on Linux external drives (#6799) #9584

Workflow file for this run

.github/workflows/studio-inference-smoke.yml at 01f7e14

	# SPDX-License-Identifier: AGPL-3.0-only
	# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.

	# Three end-to-end smoke jobs that boot a freshly-installed Studio and
	# exercise the surfaces real users hit through the OpenAI / Anthropic
	# SDKs and curl. Each job picks the smallest model that exercises the
	# behaviour under test, primes its model cache via actions/cache
	# (HF_HOME for jobs 1 and 3, a flat gguf-cache directory for job 2),
	# and shares the install.sh --local --no-torch bootstrap.
	#
	# 1. OpenAI, Anthropic API tests
	# gemma-3-270m-it UD-Q4_K_XL (~254 MiB).
	# Password rotation via /api/auth/change-password (old fails,
	# new works), then OpenAI + Anthropic Python SDKs against /v1/*
	# with temperature=0 and a fixed seed. Runs the four-turn
	# conversation twice: empty replies and missing grounding fail the
	# job, while run-to-run drift is only reported as a warning.
	#
	# 2. Tool calling Tests
	# Qwen3.5-2B UD-Q4_K_XL (~1.28 GiB). OpenAI function calling,
	# server-side tools (python, terminal, web_search) via
	# enable_tools / enabled_tools, and enable_thinking on/off.
	#
	# 3. JSON, images
	# Qwen3-VL-2B-Instruct UD-Q4_K_XL (~1.1 GiB) + mmproj-F16 (~780 MiB).
	# response_format JSON-schema decoding and OpenAI image_url
	# (data URI) plus Anthropic source/base64 image inputs.
	#
	# All three jobs run in parallel. Total wall time is dominated by job 3
	# on a cold cache; warm cache cuts that to ~3 min.

	# Display name of the workflow in the Actions UI.
	name: Studio GGUF CI

	# Triggers: pull requests that touch the listed paths, pushes to main
	# and pip, and manual dispatch.
	on:
	pull_request:
	paths:
	- 'studio/**'
	- 'unsloth/**'
	- 'unsloth_cli/**'
	- 'install.sh'
	- 'pyproject.toml'
	- '.github/workflows/studio-inference-smoke.yml'
	push:
	branches: [main, pip]
	# Manual trigger for pre-warming HF_HOME caches on main, or re-running
	# against an arbitrary branch without pushing a no-op commit.
	workflow_dispatch:

	# One concurrency group per workflow + ref: a newer run on the same ref
	# cancels the one still in progress.
	concurrency:
	group: ${{ github.workflow }}-${{ github.ref }}
	cancel-in-progress: true

	# The workflow token only gets read access to repository contents.
	permissions:
	contents: read

	jobs:
	# ─────────────────────────────────────────────────────────────────────
	# Job 1: OpenAI, Anthropic API tests
	# ─────────────────────────────────────────────────────────────────────
	openai-anthropic:
	name: OpenAI, Anthropic API tests
	runs-on: ubuntu-latest
	timeout-minutes: 25
	# Job-wide settings: model coordinates, the Studio port, and an HF_HOME
	# that points at the hf-cache directory inside the workspace.
	env:
	GGUF_REPO: unsloth/gemma-3-270m-it-GGUF
	GGUF_VARIANT: UD-Q4_K_XL
	GGUF_FILE: gemma-3-270m-it-UD-Q4_K_XL.gguf
	STUDIO_PORT: '18888'
	HF_HOME: ${{ github.workspace }}/hf-cache
	steps:
	# Actions are pinned by commit SHA; the trailing comment records the tag.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	persist-credentials: false

	# jq is used by later steps to parse Studio's JSON responses.
	- name: Linux deps for llama.cpp prebuilt
	run: \|
	sudo apt-get update
	sudo apt-get install -y --no-install-recommends \
	libcurl4-openssl-dev libssl-dev jq

	- uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
	with:
	node-version: '22'

	# Versions are quoted so YAML keeps them as strings.
	- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
	with:
	python-version: '3.12'
	cache: 'pip'

	# Best-effort restore: continue-on-error lets a failed restore fall
	# through to the download step below instead of failing the job.
	- name: Restore HF_HOME for ${{ env.GGUF_REPO }}
	id: cache-hf
	uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
	continue-on-error: true
	with:
	path: hf-cache
	key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v2

	# Runs when the restore missed the cache or did not succeed at all.
	- name: Prime HF_HOME with the GGUF
	id: prime-hf
	if: steps.cache-hf.outputs.cache-hit != 'true' \|\| steps.cache-hf.outcome != 'success'
	env:
	# Withheld on PR: this step runs checked-out PR code; public GGUF still downloads.
	HF_TOKEN: ${{ github.event_name != 'pull_request' && secrets.HF_TOKEN \|\| '' }}
	run: \|
	python -m pip install --upgrade huggingface_hub
	mkdir -p hf-cache
	bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
	bash .github/scripts/hf-download-with-retry.sh ggml-org/models tinyllamas/stories260K.gguf

	# Saves only when the download step ran and succeeded; on a cache hit
	# that step is skipped, so nothing is re-uploaded. Same key as the restore.
	- name: Save HF_HOME for ${{ env.GGUF_REPO }}
	if: always() && steps.prime-hf.outcome == 'success'
	uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
	with:
	path: hf-cache
	key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-v2

	# pipefail is enabled inside the script so a failing install.sh is not
	# hidden by the exit status of tee; the output is kept in logs/install.log.
	- name: Install Studio (--local, --no-torch)
	env:
	GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	# Withheld on PR: this step runs checked-out PR code; public GGUF still downloads.
	HF_TOKEN: ${{ github.event_name != 'pull_request' && secrets.HF_TOKEN \|\| '' }}
	run: \|
	mkdir -p logs
	set -o pipefail
	bash install.sh --local --no-torch 2>&1 \| tee logs/install.log

	- name: Install OpenAI + Anthropic Python SDKs
	run: pip install 'openai>=1.50' 'anthropic>=0.40'

	# Studio is started in the background with its output in logs/studio.log;
	# the PID is exported through GITHUB_ENV so the "Stop Studio" step can kill it.
	- name: Reset auth + boot Studio (API-only)
	run: \|
	unsloth studio reset-password
	mkdir -p logs
	UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
	> logs/studio.log 2>&1 &
	echo "STUDIO_PID=$!" >> "$GITHUB_ENV"

	- name: Wait for /api/health
	  run: |
	    # Poll once per second for up to 180s. The jq check is part of the
	    # `if` condition so that a reachable-but-not-yet-healthy response is
	    # retried; as a bare command it would abort the step under `bash -e`
	    # on the first non-healthy answer. The other two jobs retry as well.
	    for i in $(seq 1 180); do
	      if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json \
	        && jq -e '.status == "healthy"' /tmp/health.json; then
	        exit 0
	      fi
	      sleep 1
	    done
	    # Timed out: surface the tail of the server log before failing.
	    echo "Studio did not become healthy in 180s"
	    tail -200 logs/studio.log
	    exit 1

	# Logs in with the bootstrap password, rotates it, checks that the old
	# password is rejected with 401 and the new one works, then exports the
	# new JWT as TOKEN for the following steps.
	- name: Password rotation (old must fail, new must work)
	run: \|
	OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
	NEW="CIRotated-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
	# Mask both passwords so they are redacted from the job log.
	echo "::add-mask::$OLD"
	echo "::add-mask::$NEW"
	# 1. Login with the bootstrap password.
	OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
	-H 'content-type: application/json' \
	-d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" \| jq -r .access_token)
	[ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] \|\| { echo "bootstrap login failed"; exit 1; }
	# 2. Rotate to a fresh random password.
	curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
	-H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
	-d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
	# 3. Old password must now be rejected (HTTP 401).
	# No -f here: the status code itself is what gets inspected.
	OLD_STATUS=$(curl -s -o /dev/null -w '%{http_code}' \
	-X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
	-H 'content-type: application/json' \
	-d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}")
	if [ "$OLD_STATUS" != "401" ]; then
	echo "::error::Login with old password returned $OLD_STATUS, expected 401"
	exit 1
	fi
	# 4. New password must succeed; capture the JWT for downstream steps.
	NEW_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
	-H 'content-type: application/json' \
	-d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" \| jq -r .access_token)
	[ -n "$NEW_TOKEN" ] && [ "$NEW_TOKEN" != "null" ] \|\| { echo "new login failed"; exit 1; }
	echo "TOKEN=$NEW_TOKEN" >> "$GITHUB_ENV"
	echo "password rotation OK (old=401, new=200)"

	- name: Load the GGUF (HF repo + variant, served from HF_HOME cache)
	  run: |
	    # The default `run` shell is `bash -e` without pipefail, so a curl
	    # failure (HTTP >= 400 via -f, or the 600s --max-time) would be
	    # hidden by jq's exit status and the step would pass with no model
	    # loaded. Enable pipefail so a failed load fails right here.
	    set -o pipefail
	    curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
	      -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
	      --max-time 600 \
	      -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
	      | jq '{status, display_name, is_gguf, context_length}'

	# Plays a four-turn conversation twice through each SDK, authenticating
	# with the JWT that the password-rotation step exported as TOKEN.
	- name: Multi-turn determinism via OpenAI + Anthropic SDKs
	env:
	# The port must match STUDIO_PORT in the job-level env.
	BASE_URL: http://127.0.0.1:18888
	run: \|
	python - <<'PY'
	import json
	import os
	from openai import OpenAI
	from anthropic import Anthropic

	BASE = os.environ["BASE_URL"]
	KEY = os.environ["TOKEN"] # JWT also accepted as Bearer on /v1/*
	# Seed sent with every request of both SDK runs.
	SEED = 3407

	# Four-turn conversation: the second and fourth turns can only be
	# answered correctly if the model sees the prior turns, so this
	# also exercises the conversation-history wiring.
	PROMPTS = [
	"What is 1+1?",
	"What did I ask before?",
	"What is the capital of France?",
	"Repeat the city name",
	]

	def run_openai():
	    """Play PROMPTS through the OpenAI SDK as one growing conversation.

	    Every request carries the full history so far, temperature 0.0,
	    the shared SEED and enable_thinking=False. Returns the assistant
	    replies in turn order; a missing message content is recorded as "".
	    """
	    sdk = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
	    fixed_args = {
	        "model": "default",
	        "temperature": 0.0,
	        "max_tokens": 80,
	        "seed": SEED,
	        "extra_body": {"enable_thinking": False},
	    }
	    transcript = []
	    answers = []
	    for question in PROMPTS:
	        transcript.append({"role": "user", "content": question})
	        completion = sdk.chat.completions.create(messages = transcript, **fixed_args)
	        answer = completion.choices[0].message.content or ""
	        answers.append(answer)
	        # Feed the reply back so the next turn sees it.
	        transcript.append({"role": "assistant", "content": answer})
	    return answers

	def run_anthropic():
	    """Play PROMPTS through the Anthropic SDK as one growing conversation.

	    Two SDK quirks vs. Studio shape the client setup:
	      1. base_url must NOT include /v1 -- the SDK appends /v1/messages
	         itself; otherwise the request hits /v1/v1/messages and 405s.
	      2. The SDK sends `x-api-key` by default, but Studio's auth layer
	         is HTTPBearer-only, so default_headers supplies
	         Authorization: Bearer ... instead.

	    Returns, per turn, the concatenation of the reply's text blocks.
	    """
	    sdk = Anthropic(
	        base_url = BASE,
	        api_key = "unused",
	        default_headers = {"Authorization": f"Bearer {KEY}"},
	    )
	    transcript = []
	    answers = []
	    for question in PROMPTS:
	        transcript.append({"role": "user", "content": question})
	        reply = sdk.messages.create(
	            model = "default",
	            max_tokens = 80,
	            messages = transcript,
	            temperature = 0.0,
	            extra_body = {"seed": SEED, "enable_thinking": False},
	        )
	        # Keep only blocks whose type is "text".
	        text_blocks = [
	            block.text
	            for block in reply.content
	            if getattr(block, "type", None) == "text"
	        ]
	        answer = "".join(text_blocks)
	        answers.append(answer)
	        transcript.append({"role": "assistant", "content": answer})
	    return answers

	# Run each SDK conversation twice and compare the two transcripts
	# turn by turn.
	for label, runner in (("openai", run_openai), ("anthropic", run_anthropic)):
	first = runner()
	second = runner()
	determinism_failures = []
	for i, (a, b) in enumerate(zip(first, second), start = 1):
	print(f"[{label} turn {i}] {a!r}")
	# Both runs must be non-empty; small-quant drift
	# across runs is WARN-only (grounding asserts below
	# are the stronger signal).
	assert a, f"{label}: empty turn {i} response in first run"
	assert b, f"{label}: empty turn {i} response in second run"
	# Compare ignoring leading/trailing whitespace only.
	if a.strip() != b.strip():
	determinism_failures.append(
	f"turn {i}: run1={a!r} run2={b!r}"
	)
	if determinism_failures:
	print(
	f"[{label}] WARN non-determinism at temperature=0.0 across "
	f"{len(determinism_failures)} of {len(first)} turn(s); "
	f"small-quant model drift, not a Studio regression. "
	f"Details: " + " \| ".join(determinism_failures)
	)
	# Sanity: the turn-1 reply must contain the digit '1', and 'paris'
	# must appear somewhere in the first run's four-turn transcript
	# (turn 3 asks for the capital, turn 4 asks to repeat it). The
	# lower-cased substring check keeps the Paris assertion robust to
	# formatting jitter.
	joined = " ".join(first).lower()
	assert "1" in first[0], f"{label}: turn-1 answer should contain '1', got {first[0]!r}"
	assert "paris" in joined, f"{label}: expected 'paris' somewhere in the four-turn transcript: {first}"
	status_word = "PASS" if not determinism_failures else "PASS (with drift)"
	print(f"[{label}] {status_word} -- 4 turns, history grounded ('paris' present)")
	PY

	# Best-effort teardown that runs even after a failed step: the kill and
	# the port listing are both allowed to fail without failing the job.
	- name: Stop Studio
	if: always()
	run: \|
	kill "${STUDIO_PID}" 2>/dev/null \|\| true
	sleep 2
	# Lists any listener still bound to the Studio port (informational).
	ss -tln \| grep ":${STUDIO_PORT}" \|\| true

	- name: Upload logs
	# Always upload so green runs are still reviewable.
	if: always()
	# Diagnostic only: a transient artifact-service drop must not fail a green job.
	continue-on-error: true
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	with:
	name: openai-anthropic-log
	path: \|
	logs/studio.log
	logs/install.log
	retention-days: 7

	# ─────────────────────────────────────────────────────────────────────
	# Job 2: Tool calling Tests
	# ─────────────────────────────────────────────────────────────────────
	tool-calling:
	name: Tool calling Tests
	runs-on: ubuntu-latest
	timeout-minutes: 25
	# No HF_HOME in this job's env: the model is cached as a flat file and
	# loaded by absolute path, as the comment below explains.
	env:
	# Tool calling is the highest-volume GGUF in this workflow
	# (Qwen3.5-2B at Q4_K_XL = ~1.28 GiB). Caching HF_HOME would
	# store xet chunks + blobs + snapshots = ~4 GiB compressed --
	# 4-5x file-size inflation, dominated by xet chunks. Use main's
	# `--local-dir gguf-cache` pattern to cache the flat .gguf only.
	# Studio's /api/inference/load accepts either a HF repo (which
	# uses HF_HOME) or an absolute file path; passing the absolute
	# path keeps the test off HF_HOME entirely so the cache size
	# tracks the GGUF file 1:1. The OpenAI/Anth and JSON+images
	# jobs still cover the gguf_variant resolution path.
	# Q4_K_XL, not IQ3_XXS: at IQ3_XXS this model emits malformed
	# tool calls that llama-server's peg-native parser rejects with a
	# 500. Mac/Windows already use Q4_K_XL for the same reason.
	GGUF_REPO: unsloth/Qwen3.5-2B-GGUF
	GGUF_FILE: Qwen3.5-2B-UD-Q4_K_XL.gguf
	STUDIO_PORT: '18889'
	steps:
	# Actions are pinned by commit SHA; the trailing comment records the tag.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	persist-credentials: false

	# jq is used by the login/load step to parse Studio's JSON responses.
	- name: Linux deps for llama.cpp prebuilt
	run: \|
	sudo apt-get update
	sudo apt-get install -y --no-install-recommends \
	libcurl4-openssl-dev libssl-dev jq

	- uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
	with:
	node-version: '22'

	# Versions are quoted so YAML keeps them as strings.
	- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
	with:
	python-version: '3.12'
	cache: 'pip'

	# Best-effort restore of the flat gguf-cache directory; a failed restore
	# falls through to the download step instead of failing the job.
	- name: Restore GGUF model file
	id: cache-gguf
	uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
	continue-on-error: true
	with:
	path: gguf-cache
	key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1

	# Runs when the restore missed the cache or did not succeed at all.
	# The third script argument is the target directory (gguf-cache).
	- name: Download GGUF if cache miss
	id: download-gguf
	if: steps.cache-gguf.outputs.cache-hit != 'true' \|\| steps.cache-gguf.outcome != 'success'
	env:
	# Withheld on PR: this step runs checked-out PR code; public GGUF still downloads.
	HF_TOKEN: ${{ github.event_name != 'pull_request' && secrets.HF_TOKEN \|\| '' }}
	run: \|
	python -m pip install --upgrade huggingface_hub
	mkdir -p gguf-cache
	bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE" gguf-cache

	# Saves only when the download step ran and succeeded; on a cache hit
	# that step is skipped, so nothing is re-uploaded. Same key as the restore.
	- name: Save GGUF model file
	if: always() && steps.download-gguf.outcome == 'success'
	uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
	with:
	path: gguf-cache
	key: ${{ runner.os }}-gguf-${{ env.GGUF_REPO }}-${{ env.GGUF_FILE }}-v1

	# pipefail is enabled inside the script so a failing install.sh is not
	# hidden by the exit status of tee; the output is kept in logs/install.log.
	- name: Install Studio (--local, --no-torch)
	env:
	GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	# Withheld on PR: this step runs checked-out PR code; public GGUF still downloads.
	HF_TOKEN: ${{ github.event_name != 'pull_request' && secrets.HF_TOKEN \|\| '' }}
	run: \|
	mkdir -p logs
	set -o pipefail
	bash install.sh --local --no-torch 2>&1 \| tee logs/install.log

	- name: Reset auth + boot Studio (API-only, default tool policy)
	# We deliberately use the API-only mode rather than
	# `unsloth studio run` because the latter calls
	# `set_tool_policy(...)` with a resolved bool: on loopback the
	# default resolves to True, which forces every request through
	# the server-side agentic loop and breaks the standard
	# function-calling test below. API-only mode leaves
	# tool_policy=None so each request's `enable_tools` field is
	# honoured.
	# Studio runs in the background; its PID is exported through
	# GITHUB_ENV so the "Stop Studio" step can kill it.
	run: \|
	unsloth studio reset-password
	mkdir -p logs
	UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
	> logs/studio.log 2>&1 &
	echo "STUDIO_PID=$!" >> "$GITHUB_ENV"

	- name: Wait for /api/health, log in, change password, load model
	  run: |
	    # The default `run` shell is `bash -e` without pipefail. Enable it so
	    # a failing curl is not masked by the jq at the end of a pipe; in
	    # particular the final model load must not pass silently.
	    set -o pipefail
	    # Poll once per second for up to 180s until Studio reports healthy.
	    for i in $(seq 1 180); do
	      if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
	        jq -e '.status == "healthy"' /tmp/health.json && break
	      fi
	      sleep 1
	    done
	    # Fails the step if the loop ran out without a healthy response.
	    jq -e '.status == "healthy"' /tmp/health.json
	    OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
	    NEW="CITool-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
	    # Mask both passwords so they are redacted from the job log.
	    echo "::add-mask::$OLD"
	    echo "::add-mask::$NEW"
	    OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
	      -H 'content-type: application/json' \
	      -d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" | jq -r .access_token)
	    # A 200 without access_token yields the literal string "null".
	    [ -n "$OLD_TOKEN" ] && [ "$OLD_TOKEN" != "null" ] || { echo "bootstrap login failed"; exit 1; }
	    curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
	      -H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
	      -d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
	    TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
	      -H 'content-type: application/json' \
	      -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
	    [ -n "$TOKEN" ] && [ "$TOKEN" != "null" ] || { echo "new login failed"; exit 1; }
	    # Exported as API_KEY for the Python step that follows.
	    echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
	    # Load by absolute path so the test stays off HF_HOME; ls fails the
	    # step early if the cached file is missing.
	    GGUF_PATH="$GITHUB_WORKSPACE/gguf-cache/${GGUF_FILE}"
	    ls -lh "$GGUF_PATH"
	    curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
	      -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
	      --max-time 600 \
	      -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
	      | jq '{status, display_name}'

	# Exercises function calling, the server-side tools and the thinking
	# toggle with the standard library only (urllib), authenticating with
	# the JWT exported as API_KEY.
	- name: Tool calling, server-side tools, thinking on/off
	env:
	# The port must match STUDIO_PORT in the job-level env.
	BASE_URL: http://127.0.0.1:18889
	run: \|
	python - <<'PY'
	import json
	import os
	import urllib.request

	BASE = os.environ["BASE_URL"]
	KEY = os.environ["API_KEY"]
	# Base seed; the tool probes add the attempt index to it on retries.
	SEED = 3407

	def post(path, body, *, timeout = 240):
	    """Send `body` as a JSON POST to BASE + path with the Bearer key.

	    Intended for requests that don't go through the server-side
	    agentic loop, where the response is a single JSON object.
	    Returns (HTTP status, decoded JSON body).
	    """
	    request = urllib.request.Request(
	        f"{BASE}{path}",
	        data = json.dumps(body).encode(),
	        method = "POST",
	        headers = {
	            "Authorization": f"Bearer {KEY}",
	            "Content-Type": "application/json",
	        },
	    )
	    with urllib.request.urlopen(request, timeout = timeout) as response:
	        http_status = response.status
	        decoded = json.loads(response.read().decode())
	    return http_status, decoded

	def post_sse(path, body, *, timeout = 600):
	    """POST `body` with stream=True and collect the SSE response.

	    The server-side agentic loop ALWAYS returns SSE regardless of
	    the request's `stream` field, so any call with enable_tools=true
	    must use this helper.

	    Returns (content, raw_payloads):
	      content      -- concatenated assistant delta.content
	      raw_payloads -- every raw "data: ..." payload (JSON strings)
	                      seen before "[DONE]". Callers asserting that
	                      a server-side tool actually ran (and not just
	                      that the model emitted some text) should
	                      inspect raw_payloads for tool invocation
	                      markers / tool output, since `delta.content`
	                      alone is not evidence that the tool path
	                      executed.
	    """
	    data_prefix = "data: "
	    request = urllib.request.Request(
	        f"{BASE}{path}",
	        data = json.dumps({**body, "stream": True}).encode(),
	        method = "POST",
	        headers = {
	            "Authorization": f"Bearer {KEY}",
	            "Content-Type": "application/json",
	        },
	    )
	    text_pieces = []
	    raw_payloads = []
	    with urllib.request.urlopen(request, timeout = timeout) as stream:
	        for raw_line in stream:
	            line = raw_line.decode().strip()
	            # Skip anything that is not an SSE data line.
	            if not line.startswith(data_prefix):
	                continue
	            payload = line[len(data_prefix):]
	            if payload == "[DONE]":
	                break
	            raw_payloads.append(payload)
	            # Payloads that are not JSON are recorded but carry no text.
	            try:
	                chunk = json.loads(payload)
	            except json.JSONDecodeError:
	                continue
	            for choice in chunk.get("choices", []):
	                piece = (choice.get("delta", {}) or {}).get("content")
	                if piece:
	                    text_pieces.append(piece)
	    return "".join(text_pieces), raw_payloads

	_STUDIO_TOOL_TYPES = {
	"tool_start", "tool_end", "tool_use", "tool_result",
	}

	def _tool_invoked(events):
	"""Structural check: True iff some SSE payload is a real
	tool envelope (Studio tool_start/tool_end, Anthropic
	tool_use/tool_result, OpenAI non-empty delta.tool_calls /
	message.tool_calls / finish_reason='tool_calls' /
	role:'tool' / function_call). tool_status is NOT
	evidence: Studio emits empty tool_status events on
	iteration boundaries even when no tool ran.
	"""
	for raw in events:
	try:
	ev = json.loads(raw)
	except (json.JSONDecodeError, TypeError):
	continue
	if not isinstance(ev, dict):
	continue
	if ev.get("type") in _STUDIO_TOOL_TYPES:
	return True
	for choice in ev.get("choices", []) or []:
	if not isinstance(choice, dict):
	continue
	if choice.get("finish_reason") == "tool_calls":
	return True
	for src_key in ("delta", "message"):
	src = choice.get(src_key) or {}
	if not isinstance(src, dict):
	continue
	tc = src.get("tool_calls")
	if isinstance(tc, list) and tc:
	return True
	if src.get("function_call"):
	return True
	if src.get("role") == "tool":
	return True
	for item in ev.get("output", []) or []:
	if isinstance(item, dict) and item.get("type") in {
	"tool_call", "function_call", "tool_use",
	}:
	return True
	content = ev.get("content")
	if isinstance(content, list):
	for blk in content:
	if isinstance(blk, dict) and blk.get("type") in {
	"tool_use", "tool_result",
	}:
	return True
	return False

	def _tool_output_contains(events, *needles):
	"""True iff any tool_end.result / tool_result.content /
	tool-role message content contains a needle. Inspects
	the tool's own output, not the model's narration."""
	for raw in events:
	try:
	ev = json.loads(raw)
	except (json.JSONDecodeError, TypeError):
	continue
	if not isinstance(ev, dict):
	continue
	if ev.get("type") == "tool_end":
	result = ev.get("result")
	if isinstance(result, str) and any(n in result for n in needles if n):
	return True
	if ev.get("type") == "tool_result":
	content = ev.get("content")
	if isinstance(content, str) and any(n in content for n in needles if n):
	return True
	if isinstance(content, list):
	for blk in content:
	if isinstance(blk, dict):
	text = blk.get("text") or blk.get("content")
	if isinstance(text, str) and any(n in text for n in needles if n):
	return True
	for choice in ev.get("choices", []) or []:
	delta = (choice or {}).get("delta") or {}
	msg = (choice or {}).get("message") or {}
	for src in (delta, msg):
	if src.get("role") == "tool":
	content = src.get("content") or ""
	if isinstance(content, str) and any(n in content for n in needles if n):
	return True
	return False

	# ── 1. Standard OpenAI function calling ──────────────────────
	# Declares one client-side tool and checks that a non-streaming
	# request comes back as a get_weather tool call with a city argument.
	weather_tool = {
	"type": "function",
	"function": {
	"name": "get_weather",
	"description": "Get current weather for a city.",
	"parameters": {
	"type": "object",
	"properties": {"city": {"type": "string"}},
	"required": ["city"],
	},
	},
	}

	# The request sets tool_choice="required" and no enable_tools field,
	# so it goes through the plain JSON helper rather than post_sse.
	status, data = post("/v1/chat/completions", {
	"messages": [{"role": "user", "content": "What is the weather in Paris?"}],
	"tools": [weather_tool],
	"tool_choice": "required",
	"stream": False,
	"temperature": 0.0,
	"seed": SEED,
	"max_tokens": 120,
	})
	assert status == 200, f"tool call status {status}: {data}"
	choice = data["choices"][0]
	assert choice["finish_reason"] == "tool_calls", f"finish_reason={choice['finish_reason']!r}"
	tc = choice["message"]["tool_calls"][0]
	assert tc["function"]["name"] == "get_weather"
	# Arguments arrive as a JSON-encoded string; decode before checking.
	args = json.loads(tc["function"]["arguments"])
	assert args.get("city"), f"missing city arg: {args}"
	print(f"[tools] PASS function calling -> {tc['function']['name']}({args})")

	# T=0 = deterministic argmax in llama.cpp; T>0 lets seed
	# rotation explore distinct trajectories on retry.
	TOOL_PROBE_TEMP = 0.4

	def _run_tool_probe(*, label, prompt, enabled, session, needles,
	                    max_attempts = 4):
	    """Drive a server-side tool with retries, one seed per attempt.

	    Returns (content, events, attempts_log) of the first attempt
	    that shows both structural invocation and an expected literal in
	    the tool output. If no attempt does, but at least one was
	    invoked, prints a WARN and returns the first invoked attempt --
	    small-quant Qwen3.5-2B can emit OpenAI tool_calls deltas without
	    Studio's GGUF agentic loop intercepting them, and that
	    GGUF-vs-OpenAI format mismatch is out of scope for #5642.

	    Raises AssertionError when no attempt has structural invocation
	    evidence at all.
	    """
	    history = []
	    fallback = None
	    for n in range(max_attempts):
	        seed = SEED + n
	        request = {
	            "messages": [{"role": "user", "content": prompt}],
	            "enable_tools": True,
	            "enabled_tools": enabled,
	            "session_id": f"{session}-att{n}",
	            "temperature": TOOL_PROBE_TEMP,
	            "seed": seed,
	            "max_tokens": 600,
	        }
	        text, payloads = post_sse("/v1/chat/completions", request)
	        was_invoked = _tool_invoked(payloads)
	        has_output = _tool_output_contains(payloads, *needles)
	        history.append({
	            "attempt": n, "seed": seed,
	            "n_events": len(payloads),
	            "tool_invoked": was_invoked, "tool_output_contains": has_output,
	            "content_len": len(text),
	        })
	        if was_invoked and has_output:
	            print(f"[tools] PASS {label} attempt {n}")
	            return text, payloads, history
	        # Remember the first invoked attempt as the WARN fallback.
	        if was_invoked and fallback is None:
	            fallback = (text, payloads)
	        print(f"[tools] retry {label} attempt {n}: invoked={was_invoked} output_ok={has_output} events={len(payloads)}")
	    if fallback is None:
	        raise AssertionError(
	            f"{label}: no structural tool-invocation evidence across "
	            f"{max_attempts} attempts. enable_tools may be silently "
	            f"ignored. Attempts: {history}"
	        )
	    print(f"[tools] WARN {label}: invoked but no tool_end.result match (small-quant flake). Attempts: {history}")
	    text, payloads = fallback
	    return text, payloads, history

	# ── 2. Server-side python tool ───────────────────────────────
	# _run_tool_probe raises only when no attempt shows structural tool
	# invocation; the narration check below is informational.
	content, _events, _attempts = _run_tool_probe(
	    label = "python tool",
	    prompt = "What is 123 * 456? Use the python tool to compute it and tell me the number.",
	    enabled = ["python"],
	    session = "ci-tool-calling-py",
	    needles = ("56088", "56,088"),
	)
	if "56088" in content or "56,088" in content:
	    print("[tools] python tool narration OK")
	else:
	    print(f"[tools] python tool narration drifted -- content={content!r}")

	# ── 3. Server-side bash (terminal) tool ──────────────────────
	content, _events, _attempts = _run_tool_probe(
	    label = "bash/terminal tool",
	    prompt = "Use the terminal tool to run `echo hello-bash-tool` and tell me the exact output.",
	    enabled = ["terminal"],
	    session = "ci-tool-calling-bash",
	    needles = ("hello-bash-tool",),
	)
	if "hello-bash-tool" in content:
	    print("[tools] bash/terminal narration OK")
	else:
	    print(f"[tools] bash/terminal narration dropped literal -- content={content!r}")

	# ── 4. Server-side web_search tool ───────────────────────────
	# DuckDuckGo is flaky from CI runners and small Qwen3.5-2B
	# may not actually search, so this probe is fully non-blocking:
	# a completed stream is reported with its sizes, and any
	# exception (HTTP error, timeout, parser failure) is downgraded
	# to a WARN line. Tool-invocation strictness is relaxed here
	# because (a) the search may legitimately return no results,
	# and (b) DuckDuckGo upstream blocks GHA IP ranges often
	# enough that requiring a tool_call marker would create
	# red-herring failures from infra rather than from Studio.
	try:
	    content, events = post_sse("/v1/chat/completions", {
	        "messages": [{"role": "user", "content": "Search the web for 'unsloth ai github' and summarise."}],
	        "enable_tools": True,
	        "enabled_tools": ["web_search"],
	        "session_id": "ci-tool-calling-web",
	        "temperature": 0.0,
	        "seed": SEED,
	        "max_tokens": 400,
	    })
	    print(
	        f"[tools] PASS web_search stream ({len(content)} chars in content, "
	        f"{len(events)} raw events)"
	    )
	except Exception as exc:
	    print(f"[tools] WARN web_search probe failed (non-blocking): {exc}")

	# ── 5. Thinking on / off ─────────────────────────────────────
	# Studio strips think blocks from message.content for tools-mode
	# responses, so we toggle plain chat (no enable_tools) and look
	# at the surfaced reasoning_content / message.thinking field.
	def thinking_call(enable):
	    """Run one non-streaming chat turn with enable_thinking=`enable`.

	    Returns message.content concatenated with
	    message.reasoning_content; a missing or null field counts as "".
	    """
	    status, data = post("/v1/chat/completions", {
	        "messages": [{"role": "user", "content": "Briefly: is 17 prime?"}],
	        "stream": False,
	        "enable_thinking": enable,
	        "temperature": 0.0,
	        "seed": SEED,
	        "max_tokens": 300,
	    })
	    assert status == 200, f"thinking call (enable_thinking={enable}) status {status}: {data}"
	    msg = data["choices"][0]["message"]
	    # Studio surfaces thinking via reasoning_content (OpenAI
	    # extension). Fall back to inline <think> markers for
	    # robustness across template versions.
	    return (msg.get("content") or "") + (msg.get("reasoning_content") or "")

	on_text = thinking_call(True)
	off_text = thinking_call(False)
	# On-mode signal: an inline <think> marker, or more than 80 chars of
	# combined content + reasoning_content.
	had_think_on = ("<think>" in on_text) or len(on_text) > 80
	assert had_think_on, (
	    f"enable_thinking=True produced no thinking signal: {on_text!r}"
	)
	# Off-mode should not contain the literal <think> marker.
	assert "<think>" not in off_text, (
	    f"enable_thinking=False but <think> still present: {off_text!r}"
	)
	print(f"[tools] PASS thinking on/off (on={len(on_text)} chars, off={len(off_text)} chars)")
	PY

	# Best-effort teardown that runs even after a failed step: the kill,
	# the port listing and the log copy are all allowed to fail.
	- name: Stop Studio
	if: always()
	run: \|
	kill "${STUDIO_PID}" 2>/dev/null \|\| true
	sleep 2
	# Lists any listener still bound to the Studio port (informational).
	ss -tln \| grep ":${STUDIO_PORT}" \|\| true
	# Capture backend + llama-server logs so a 500 has a server-side traceback.
	mkdir -p logs/server-logs
	cp -r ~/.unsloth/studio/logs/. logs/server-logs/ 2>/dev/null \|\| true

	- name: Upload logs
	# Always upload so green runs are still reviewable.
	if: always()
	# Diagnostic only: a transient artifact-service drop must not fail a green job.
	continue-on-error: true
	uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	with:
	name: tool-calling-log
	path: \|
	logs/studio.log
	logs/install.log
	logs/server-logs/
	retention-days: 7

	# ─────────────────────────────────────────────────────────────────────
	# Job 3: JSON, images
	# ─────────────────────────────────────────────────────────────────────
	json-images:
	name: JSON, images
	runs-on: ubuntu-latest
	timeout-minutes: 30
	# Job-wide settings: model + mmproj coordinates, the Studio port, and an
	# HF_HOME that points at the hf-cache directory inside the workspace.
	env:
	GGUF_REPO: unsloth/Qwen3-VL-2B-Instruct-GGUF
	# UD-Q4_K_XL, not UD-IQ2_XXS: at 2-bit the temp-0 answer to the JSON
	# step's capital-of-France probe flips with the host's SIMD kernels
	# (GitHub runners deterministically answered France while other CPUs
	# answer Paris; seeds do not rescue it, 1/5 Paris at temp 0.7). The
	# Q4 quant answered Paris 13/13 across temps and seeds on the same
	# runners, so the hard Paris assertion below stays reliable.
	GGUF_VARIANT: UD-Q4_K_XL
	GGUF_FILE: Qwen3-VL-2B-Instruct-UD-Q4_K_XL.gguf
	MMPROJ_FILE: mmproj-F16.gguf
	STUDIO_PORT: '18890'
	HF_HOME: ${{ github.workspace }}/hf-cache
	steps:
	# Actions are pinned by commit SHA; the trailing comment records the tag.
	- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	with:
	persist-credentials: false

	# jq is used by the login/load step to parse Studio's JSON responses.
	- name: Linux deps for llama.cpp prebuilt
	run: \|
	sudo apt-get update
	sudo apt-get install -y --no-install-recommends \
	libcurl4-openssl-dev libssl-dev jq

	- uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
	with:
	node-version: '22'

	# Versions are quoted so YAML keeps them as strings.
	- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
	with:
	python-version: '3.12'
	cache: 'pip'

	# Best-effort restore: continue-on-error lets a failed restore fall
	# through to the download step below. The key includes the mmproj file
	# name in addition to repo and variant.
	- name: Restore HF_HOME for ${{ env.GGUF_REPO }} (model + mmproj)
	id: cache-hf
	uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
	continue-on-error: true
	with:
	path: hf-cache
	key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v2

	# Runs when the restore missed the cache or did not succeed at all.
	- name: Prime HF_HOME with the GGUF + mmproj
	id: prime-hf
	if: steps.cache-hf.outputs.cache-hit != 'true' \|\| steps.cache-hf.outcome != 'success'
	env:
	# Withheld on PR: this step runs checked-out PR code; public GGUF still downloads.
	HF_TOKEN: ${{ github.event_name != 'pull_request' && secrets.HF_TOKEN \|\| '' }}
	run: \|
	python -m pip install --upgrade huggingface_hub
	mkdir -p hf-cache
	bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$GGUF_FILE"
	bash .github/scripts/hf-download-with-retry.sh "$GGUF_REPO" "$MMPROJ_FILE"
	bash .github/scripts/hf-download-with-retry.sh ggml-org/models tinyllamas/stories260K.gguf

	# Saves only when the download step ran and succeeded; on a cache hit
	# that step is skipped, so nothing is re-uploaded. Same key as the restore.
	- name: Save HF_HOME for ${{ env.GGUF_REPO }} (model + mmproj)
	if: always() && steps.prime-hf.outcome == 'success'
	uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
	with:
	path: hf-cache
	key: ${{ runner.os }}-hf-${{ env.GGUF_REPO }}-${{ env.GGUF_VARIANT }}-${{ env.MMPROJ_FILE }}-v2

	# pipefail is enabled inside the script so a failing install.sh is not
	# hidden by the exit status of tee; the output is kept in logs/install.log.
	- name: Install Studio (--local, --no-torch)
	env:
	GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	# Withheld on PR: this step runs checked-out PR code; public GGUF still downloads.
	HF_TOKEN: ${{ github.event_name != 'pull_request' && secrets.HF_TOKEN \|\| '' }}
	run: \|
	mkdir -p logs
	set -o pipefail
	bash install.sh --local --no-torch 2>&1 \| tee logs/install.log

	- name: Install OpenAI + Anthropic Python SDKs
	run: pip install 'openai>=1.50' 'anthropic>=0.40'

	- name: Reset auth + boot Studio (API-only)
	# See Job 2's comment: API-only mode keeps tool_policy=None so
	# response_format requests aren't routed through the agentic
	# tool loop.
	# Studio runs in the background; its PID is exported through
	# GITHUB_ENV so a later step can stop it.
	run: \|
	unsloth studio reset-password
	mkdir -p logs
	UNSLOTH_API_ONLY=1 unsloth studio -H 127.0.0.1 -p "$STUDIO_PORT" \
	> logs/studio.log 2>&1 &
	echo "STUDIO_PID=$!" >> "$GITHUB_ENV"

	# Waits for Studio, rotates the bootstrap password, exports the new JWT
	# as API_KEY, then loads the model by HF repo + variant with up to
	# three attempts.
	- name: Wait for /api/health, log in, change password, load model
	run: \|
	# Poll once per second for up to 180s until Studio reports healthy.
	for i in $(seq 1 180); do
	if curl -fs "http://127.0.0.1:${STUDIO_PORT}/api/health" > /tmp/health.json; then
	jq -e '.status == "healthy"' /tmp/health.json && break
	fi
	sleep 1
	done
	# Fails the step if the loop ran out without a healthy response.
	jq -e '.status == "healthy"' /tmp/health.json
	OLD=$(cat ~/.unsloth/studio/auth/.bootstrap_password)
	NEW="CIJson-$(python -c 'import secrets; print(secrets.token_urlsafe(12))')"
	# Mask both passwords so they are redacted from the job log.
	echo "::add-mask::$OLD"
	echo "::add-mask::$NEW"
	OLD_TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
	-H 'content-type: application/json' \
	-d "{\"username\":\"unsloth\",\"password\":\"$OLD\"}" \| jq -r .access_token)
	curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/change-password" \
	-H "Authorization: Bearer $OLD_TOKEN" -H 'content-type: application/json' \
	-d "{\"current_password\":\"$OLD\",\"new_password\":\"$NEW\"}" > /dev/null
	TOKEN=$(curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/auth/login" \
	-H 'content-type: application/json' \
	-d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" \| jq -r .access_token)
	echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
	# Retry: llama-server startup can race process teardown after a
	# failed attempt. Keep curl out of a pipe so HTTP failures are not
	# masked by jq.
	LOAD_OK=0
	for attempt in 1 2 3; do
	# No -f: the status code is captured and the body kept for diagnostics.
	HTTP=$(curl -s -o /tmp/load.json -w '%{http_code}' \
	-X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
	-H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
	--max-time 900 \
	-d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}")
	if [ "$HTTP" = "200" ]; then LOAD_OK=1; break; fi
	echo "::warning::/api/inference/load attempt $attempt returned $HTTP; response:"
	cat /tmp/load.json \|\| true
	sleep 10
	done
	[ "$LOAD_OK" = "1" ] \|\| { echo "::error::/api/inference/load failed 3 attempts"; exit 22; }
	# Print a summary of the successful load response.
	jq '{status, display_name, is_vision}' /tmp/load.json

	- name: JSON schema decoding + image input
	  # Runs an inline Python script against the running Studio: JSON mode over
	  # raw HTTP, an OpenAI image_url request, and an Anthropic base64 image
	  # request. API_KEY is the bearer token exported by an earlier step.
	  env:
	    # Built from STUDIO_PORT, the same variable the other steps use to reach
	    # Studio, rather than repeating the port number as a literal.
	    BASE_URL: "http://127.0.0.1:${{ env.STUDIO_PORT }}"
	  run: |
	    python - <<'PY'
	    import json
	    import os
	    import urllib.error
	    import urllib.request
	    from openai import OpenAI
	    from anthropic import Anthropic

	    BASE = os.environ["BASE_URL"]
	    KEY = os.environ["API_KEY"]
	    SEED = 3407

	    def post(path, body, *, timeout = 240):
	        """POST `body` as JSON to Studio and return (status, payload).

	        On a 4xx/5xx response the status code and the response body
	        (parsed as JSON when possible, raw text otherwise) are returned
	        instead of raising, so the caller's assertion can report them.
	        """
	        req = urllib.request.Request(
	            f"{BASE}{path}",
	            data = json.dumps(body).encode(),
	            method = "POST",
	            headers = {
	                "Authorization": f"Bearer {KEY}",
	                "Content-Type": "application/json",
	            },
	        )
	        try:
	            with urllib.request.urlopen(req, timeout = timeout) as resp:
	                return resp.status, json.loads(resp.read().decode())
	        except urllib.error.HTTPError as err:
	            # urlopen raises for error statuses; without this the status
	            # assertion below could never print the server's reply.
	            detail = err.read().decode(errors = "replace")
	            try:
	                detail = json.loads(detail)
	            except json.JSONDecodeError:
	                pass
	            return err.code, detail

	    # ── 1. response_format = json_object (JSON mode) ─────────────
	    # llama.cpp's HTTP server supports OpenAI-compatible JSON
	    # mode: `response_format: {"type": "json_object"}` constrains
	    # the model to emit syntactically-valid JSON. We use raw HTTP
	    # rather than the OpenAI SDK so that the field shape Studio
	    # forwards to llama-server is unambiguous (the SDK rewrites
	    # response_format depending on which variant it recognises).
	    # We deliberately do NOT pass a strict JSON schema -- on
	    # small Gemma-4 quants the GBNF-from-schema path occasionally
	    # produces empty output, and JSON mode is the surface we care
	    # about exposing through Studio.
	    status, data = post("/v1/chat/completions", {
	        "model": "default",
	        "messages": [
	            {"role": "system", "content": 'Reply with a single JSON object of the form {"city": "...", "country": "..."}. Output ONLY the JSON, nothing else.'},
	            {"role": "user", "content": "What is the capital of France?"},
	        ],
	        "temperature": 0.0,
	        "max_tokens": 200,
	        "seed": SEED,
	        "stream": False,
	        "enable_thinking": False,
	        "response_format": {"type": "json_object"},
	    }, timeout = 600)
	    assert status == 200, f"json status {status}: {data}"
	    content = (data["choices"][0]["message"].get("content") or "").strip()
	    # Some chat templates wrap JSON in ```json fences even in JSON
	    # mode -- strip those before parsing.
	    if content.startswith("```"):
	        content = content.split("```", 2)[1]
	        if content.startswith("json"):
	            content = content[4:]
	        content = content.strip("`\n ")
	    parsed = json.loads(content)
	    assert "paris" in str(parsed.get("city", "")).lower(), (
	        f"city != Paris: {parsed}"
	    )
	    print(f"[json] PASS json_object -> {parsed}")

	    # ── 2. OpenAI image_url (data URI base64) ───────────────────
	    # 64x64 solid-red PNG. stb_image (used by Studio's image
	    # normaliser at routes/inference.py:3410) rejects 4x4 or
	    # smaller PNGs as truncated, so we go up to 64x64 -- still
	    # tiny in token cost. The assertion is loose: any non-empty
	    # response from the vision path proves multimodal end-to-end
	    # wiring; small VL quants are weak at colour identification.
	    PNG_64X64_RED_B64 = (
	        "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAYklEQVR4nO3PMQ0AIADAMEAI/k"
	        "UhBhEcDcmqYJtn7/GzpQNeNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA"
	        "1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaA1oDWgNaBdCJ0BmMJ25zMAAAAASUVORK5CYII="
	    )
	    data_uri = f"data:image/png;base64,{PNG_64X64_RED_B64}"

	    client = OpenAI(base_url = f"{BASE}/v1", api_key = KEY)
	    openai_resp = client.chat.completions.create(
	        model = "default",
	        temperature = 0.0,
	        max_tokens = 80,
	        seed = SEED,
	        messages = [{
	            "role": "user",
	            "content": [
	                {"type": "image_url", "image_url": {"url": data_uri}},
	                {"type": "text", "text": "What colour dominates this image? Reply in one word."},
	            ],
	        }],
	    )
	    openai_text = (openai_resp.choices[0].message.content or "").lower()
	    print(f"[image/openai] reply: {openai_text!r}")
	    assert openai_text, "OpenAI image_url returned empty content"
	    # We do not strictly require 'red' -- some quants of small VL
	    # models are weak at colour names. Just require a non-empty
	    # answer; the vision path is the part under test.
	    print("[image/openai] PASS image_url accepted, non-empty response")

	    # ── 3. Anthropic source/base64 image ────────────────────────
	    # Two SDK quirks vs. Studio: base_url must NOT include /v1
	    # (the SDK appends it itself; otherwise /v1/v1/messages -> 405),
	    # and Studio's auth is HTTPBearer-only so the SDK's default
	    # x-api-key header is ignored -- send Authorization: Bearer
	    # via default_headers.
	    anthropic = Anthropic(
	        base_url = BASE,
	        api_key = "unused",
	        default_headers = {"Authorization": f"Bearer {KEY}"},
	    )
	    a_msg = anthropic.messages.create(
	        model = "default",
	        max_tokens = 80,
	        temperature = 0.0,
	        extra_body = {"seed": SEED},
	        messages = [{
	            "role": "user",
	            "content": [
	                {
	                    "type": "image",
	                    "source": {
	                        "type": "base64",
	                        "media_type": "image/png",
	                        "data": PNG_64X64_RED_B64,
	                    },
	                },
	                {"type": "text", "text": "Describe this image briefly."},
	            ],
	        }],
	    )
	    # Concatenate only the text blocks of the Anthropic reply.
	    a_text = "".join(b.text for b in a_msg.content if getattr(b, "type", None) == "text")
	    print(f"[image/anthropic] reply: {a_text!r}")
	    assert a_text, "Anthropic source/base64 returned empty content"
	    print("[image/anthropic] PASS source/base64 accepted, non-empty response")
	    PY

	- name: Stop Studio
	  # Cleanup runs regardless of how the earlier steps ended.
	  if: ${{ always() }}
	  run: |
	    # Best effort: the kill may fail (e.g. process already gone); ignore it.
	    kill "$STUDIO_PID" 2> /dev/null || :
	    sleep 2
	    # Print any listener still bound to the Studio port; grep finding
	    # nothing must not fail the step.
	    ss -tln | grep ":$STUDIO_PORT" || :

	- name: Upload logs
	  uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	  # Uploaded on every outcome so that passing runs can be inspected too.
	  if: ${{ always() }}
	  # Diagnostics only: an artifact-service hiccup must not turn a green job red.
	  continue-on-error: true
	  with:
	    name: json-images-log
	    retention-days: 7
	    path: |
	      logs/studio.log
	      logs/install.log

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Fix Studio custom folders on Linux external drives (#6799) #9584

Workflow file

Fix Studio custom folders on Linux external drives (#6799) #9584

Uh oh!

Workflow file for this run