diff --git a/src/bootstrap.ts b/src/bootstrap.ts new file mode 100644 index 00000000..eb9f03f2 --- /dev/null +++ b/src/bootstrap.ts @@ -0,0 +1,22 @@ +/** + * Threadpool bootstrap. MUST be the first import in index.ts. + * + * Every fs operation (read_file, write_file, edit_block, config/history/log + * persistence) runs on libuv's threadpool, which defaults to only 4 threads. + * Under heavy parallel load — e.g. several agents reading/writing files on a + * slow or cloud-synced filesystem — 4 stalled operations exhaust the pool, and + * because a stalled syscall keeps its thread until the OS returns (a JS-level + * timeout does not cancel it), every subsequent fs op queues for minutes. That + * surfaced as multi-minute tool-call hangs under parallel `claude -p` load. + * + * Raising the pool size gives enough headroom that a burst of slow reads no + * longer starves the rest. libuv reads UV_THREADPOOL_SIZE only when the pool is + * first initialized (on first submitted work), so this assignment has to happen + * before ANY threadpool work — hence "first import". A user-provided value is + * always respected. + */ +const DEFAULT_THREADPOOL_SIZE = 16; + +if (!process.env.UV_THREADPOOL_SIZE) { + process.env.UV_THREADPOOL_SIZE = String(DEFAULT_THREADPOOL_SIZE); +} diff --git a/src/config-manager.ts b/src/config-manager.ts index 1c88700c..83520533 100644 --- a/src/config-manager.ts +++ b/src/config-manager.ts @@ -52,6 +52,10 @@ class ConfigManager { private config: ServerConfig = {}; private initialized = false; private _isFirstRun = false; // Track if this is the first run (config was just created) + // Serializes all disk writes so concurrent saves can't corrupt config.json. + private writeChain: Promise = Promise.resolve(); + // True while a coalesced background write is already queued (see scheduleSave). + private saveScheduled = false; constructor() { // Get user's home directory @@ -174,15 +178,45 @@ class ConfigManager { } /** - * Save config to disk + * Write the current in-memory config to disk. All writes funnel through + * writeChain (see saveConfig / scheduleSave) so overlapping saves can never + * interleave and corrupt the file. Previously every tool call could fire its + * own independent fs.writeFile of the same path. */ - private async saveConfig() { - try { - await fs.writeFile(this.configPath, JSON.stringify(this.config, null, 2), 'utf8'); - } catch (error) { - console.error('Failed to save config:', error); - throw error; - } + private async writeConfigToDisk(): Promise { + await fs.writeFile(this.configPath, JSON.stringify(this.config, null, 2), 'utf8'); + } + + /** + * Awaitable save, serialized on writeChain. Use for explicit, user-driven + * config changes where the caller wants on-disk confirmation. + */ + private async saveConfig(): Promise { + const write = this.writeChain.then(() => this.writeConfigToDisk()); + // Keep the chain alive even if this write rejects, so later writes still run. + this.writeChain = write.catch(() => {}); + return write; + } + + /** + * Non-blocking, coalesced save. Returns immediately; the write runs in the + * background. A burst of calls collapses to at most one queued write behind + * the in-flight one, so a storm of tool calls can't storm the disk — and, + * critically, can't pile up behind a saturated libuv threadpool and gate the + * tool-call response path. Used for high-frequency, non-critical persistence + * such as usage stats. + */ + scheduleSave(): void { + if (this.saveScheduled) return; // a queued write will capture the latest config + this.saveScheduled = true; + this.writeChain = this.writeChain.then(async () => { + this.saveScheduled = false; // let the next burst queue a fresh write + try { + await this.writeConfigToDisk(); + } catch (error) { + console.error('Failed to save config (background):', error); + } + }); } /** @@ -236,6 +270,21 @@ class ConfigManager { await this.saveConfig(); } + /** + * Update a value in memory and persist it WITHOUT blocking the caller. + * The tool-call response path must never wait on a disk write: when the libuv + * threadpool is saturated (e.g. many parallel reads stalled on a slow/cloud + * filesystem) an awaited write can't get a thread and would hang the response + * of even pure-memory tools. The in-memory value is updated synchronously so + * subsequent reads see it immediately; the write is coalesced in the + * background. Callers needing on-disk confirmation should use setValue. + */ + async setValueNonBlocking(key: string, value: any): Promise { + await this.init(); + this.config[key] = value; + this.scheduleSave(); + } + /** * Update multiple configuration values at once */ diff --git a/src/handlers/filesystem-handlers.ts b/src/handlers/filesystem-handlers.ts index df889c0d..1c2832d6 100644 --- a/src/handlers/filesystem-handlers.ts +++ b/src/handlers/filesystem-handlers.ts @@ -71,7 +71,12 @@ function getErrorFromPath(path: string): string { * Handle read_file command */ export async function handleReadFile(args: unknown): Promise { - const HANDLER_TIMEOUT = 60000; // 60 seconds total operation timeout + // Backstop for the whole handler operation. The real control is the + // 3-minute cancellable read timeout inside readFileFromDisk; this sits just + // above it (so that one fires first, with cleanup + a useful error) but + // still below the MCP client's ~4-minute hard cap, so we never leave the + // client hanging on an opaque timeout. + const HANDLER_TIMEOUT = 3.5 * 60 * 1000; // 3m30s // Add input validation if (args === null || args === undefined) { return createErrorResponse('No arguments provided for read_file command'); diff --git a/src/index.ts b/src/index.ts index 23fda32d..995940a1 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,5 +1,8 @@ #!/usr/bin/env node +// MUST be first: raises the libuv threadpool size before any fs work is +// submitted. See src/bootstrap.ts for why import order matters. +import './bootstrap.js'; import { FilteredStdioServerTransport } from './custom-stdio.js'; import { server, flushDeferredMessages } from './server.js'; import { commandManager } from './command-manager.js'; diff --git a/src/tools/filesystem.ts b/src/tools/filesystem.ts index 51b3a6ba..0476a7ca 100644 --- a/src/tools/filesystem.ts +++ b/src/tools/filesystem.ts @@ -5,7 +5,7 @@ import fetch from 'cross-fetch'; import { execFile } from 'child_process'; import { promisify } from 'util'; import { capture } from '../utils/capture.js'; -import { withTimeout } from '../utils/withTimeout.js'; +import { withTimeout, runWithAbortableTimeout } from '../utils/withTimeout.js'; import { configManager } from '../config-manager.js'; import { getFileHandler, TextFileHandler } from '../utils/files/index.js'; import type { ReadOptions, FileResult, PdfPageItem } from '../utils/files/base.js'; @@ -20,6 +20,14 @@ const FILE_OPERATION_TIMEOUTS = { FILE_READ: 30000, // 30 seconds } as const; +// Cap file read operations at 3 minutes. The MCP client's hard per-call limit +// is ~4 minutes; timing out at 3m lets us abort the underlying fs op and return +// a useful error (e.g. the cloud-storage guidance in buildPermissionError) +// BEFORE the client gives up with an opaque "No result received after 4 +// minutes". Paired with runWithAbortableTimeout so the read is actually +// cancelled (fd/thread released), not just abandoned. +export const READ_OPERATION_TIMEOUT_MS = 3 * 60 * 1000; // 3 minutes + const FILE_SIZE_LIMITS = { LINE_COUNT_LIMIT: 10 * 1024 * 1024, // 10MB for line counting } as const; @@ -495,8 +503,9 @@ export async function readFileFromDisk( // If we can't stat the file, continue anyway and let the read operation handle errors } - // Use withTimeout to handle potential hangs - const readOperation = async () => { + // Read under an abortable timeout so a hung/stalled read is cancelled + // (fd/thread freed) rather than leaked until the OS call returns. + const readOperation = async (signal: AbortSignal) => { // Get appropriate handler for this file type (async - includes binary detection) const handler = await getFileHandler(validPath); @@ -506,7 +515,8 @@ export async function readFileFromDisk( length, sheet, range, - includeStatusMessage: true + includeStatusMessage: true, + signal }); // Return with content as string @@ -529,22 +539,20 @@ export async function readFileFromDisk( }; }; - // Execute with timeout + // Execute with a 3-minute, cancellable timeout let result; try { - result = await withTimeout( - readOperation(), - FILE_OPERATION_TIMEOUTS.FILE_READ, - `Read file operation for ${filePath}`, - null + result = await runWithAbortableTimeout( + (signal) => readOperation(signal), + READ_OPERATION_TIMEOUT_MS, + `Read file operation for ${filePath}` ); } catch (error) { const err = error as NodeJS.ErrnoException; - // withTimeout rejects with a plain string "__ERROR__: ... timed out after N seconds" - // when defaultValue is null — it has no .code property, so check for that too. - const isWithTimeoutString = typeof error === 'string' && (error as string).startsWith('__ERROR__:'); - if (isWithTimeoutString || err.code === 'EPERM' || err.code === 'EACCES' || err.code === 'ETIMEDOUT') { - throw buildPermissionError(filePath, isWithTimeoutString ? 'ETIMEDOUT' : err.code); + // runWithAbortableTimeout rejects with an Error whose .code is 'ETIMEDOUT' + // on timeout; fs rejects with EPERM/EACCES. Map all to the guidance error. + if (err.code === 'EPERM' || err.code === 'EACCES' || err.code === 'ETIMEDOUT') { + throw buildPermissionError(filePath, err.code); } throw error; } @@ -602,8 +610,14 @@ export async function readFileInternal(filePath: string, offset: number = 0, len // preserve exact file content including original line endings. // We cannot use readline-based reading as it strips line endings. - // Read entire file content preserving line endings - const content = await fs.readFile(validPath, 'utf8'); + // Read entire file content preserving line endings, under a 3-minute, + // cancellable timeout so an edit on a stalled/cloud path can't hang forever + // (previously this read had no timeout at all). + const content = await runWithAbortableTimeout( + (signal) => fs.readFile(validPath, { encoding: 'utf8', signal }), + READ_OPERATION_TIMEOUT_MS, + `Internal read for ${filePath}` + ); // If we need to apply offset/length, do it while preserving line endings if (offset === 0 && length >= Number.MAX_SAFE_INTEGER) { diff --git a/src/utils/files/base.ts b/src/utils/files/base.ts index 278a784f..8f94a8d6 100644 --- a/src/utils/files/base.ts +++ b/src/utils/files/base.ts @@ -88,6 +88,9 @@ export interface ReadOptions { /** Whether to include status messages (default: true) */ includeStatusMessage?: boolean; + + /** Optional AbortSignal to cancel an in-flight read (frees fd/thread on timeout). */ + signal?: AbortSignal; } /** diff --git a/src/utils/files/image.ts b/src/utils/files/image.ts index 4e97ce9b..06fd5b80 100644 --- a/src/utils/files/image.ts +++ b/src/utils/files/image.ts @@ -37,7 +37,7 @@ export class ImageFileHandler implements FileHandler { async read(path: string, options?: ReadOptions): Promise { // Images are always read in full, ignoring offset and length - const buffer = await fs.readFile(path); + const buffer = await fs.readFile(path, { signal: options?.signal }); const content = buffer.toString('base64'); const mimeType = this.getMimeType(path); diff --git a/src/utils/files/text.ts b/src/utils/files/text.ts index 52e37bff..bf9035ab 100644 --- a/src/utils/files/text.ts +++ b/src/utils/files/text.ts @@ -56,7 +56,7 @@ export class TextFileHandler implements FileHandler { const includeStatusMessage = options?.includeStatusMessage ?? true; // Binary detection is done at factory level - just read as text - return this.readFileWithSmartPositioning(filePath, offset, length, 'text/plain', includeStatusMessage); + return this.readFileWithSmartPositioning(filePath, offset, length, 'text/plain', includeStatusMessage, options?.signal); } async write(path: string, content: string, mode: 'rewrite' | 'append' = 'rewrite'): Promise { @@ -118,11 +118,11 @@ export class TextFileHandler implements FileHandler { /** * Get file line count (for files under size limit) */ - private async getFileLineCount(filePath: string): Promise { + private async getFileLineCount(filePath: string, signal?: AbortSignal): Promise { try { const stats = await fs.stat(filePath); if (stats.size < FILE_SIZE_LIMITS.LINE_COUNT_LIMIT) { - const content = await fs.readFile(filePath, 'utf8'); + const content = await fs.readFile(filePath, { encoding: 'utf8', signal }); return TextFileHandler.countLines(content); } } catch (error) { @@ -208,12 +208,13 @@ export class TextFileHandler implements FileHandler { offset: number, length: number, mimeType: string, - includeStatusMessage: boolean = true + includeStatusMessage: boolean = true, + signal?: AbortSignal ): Promise { const stats = await fs.stat(filePath); const fileSize = stats.size; - const totalLines = await this.getFileLineCount(filePath); + const totalLines = await this.getFileLineCount(filePath, signal); // For negative offsets (tail behavior), use reverse reading if (offset < 0) { @@ -221,20 +222,20 @@ export class TextFileHandler implements FileHandler { if (fileSize > FILE_SIZE_LIMITS.LARGE_FILE_THRESHOLD && requestedLines <= READ_PERFORMANCE_THRESHOLDS.SMALL_READ_THRESHOLD) { - return await this.readLastNLinesReverse(filePath, requestedLines, mimeType, includeStatusMessage, totalLines); + return await this.readLastNLinesReverse(filePath, requestedLines, mimeType, includeStatusMessage, totalLines, signal); } else { - return await this.readFromEndWithReadline(filePath, requestedLines, mimeType, includeStatusMessage, totalLines); + return await this.readFromEndWithReadline(filePath, requestedLines, mimeType, includeStatusMessage, totalLines, signal); } } // For positive offsets else { if (fileSize < FILE_SIZE_LIMITS.LARGE_FILE_THRESHOLD || offset === 0) { - return await this.readFromStartWithReadline(filePath, offset, length, mimeType, includeStatusMessage, totalLines); + return await this.readFromStartWithReadline(filePath, offset, length, mimeType, includeStatusMessage, totalLines, signal); } else { if (offset > READ_PERFORMANCE_THRESHOLDS.DEEP_OFFSET_THRESHOLD) { - return await this.readFromEstimatedPosition(filePath, offset, length, mimeType, includeStatusMessage, totalLines); + return await this.readFromEstimatedPosition(filePath, offset, length, mimeType, includeStatusMessage, totalLines, signal); } else { - return await this.readFromStartWithReadline(filePath, offset, length, mimeType, includeStatusMessage, totalLines); + return await this.readFromStartWithReadline(filePath, offset, length, mimeType, includeStatusMessage, totalLines, signal); } } } @@ -248,7 +249,8 @@ export class TextFileHandler implements FileHandler { n: number, mimeType: string, includeStatusMessage: boolean = true, - fileTotalLines?: number + fileTotalLines?: number, + signal?: AbortSignal ): Promise { const fd = await fs.open(filePath, 'r'); try { @@ -260,6 +262,11 @@ export class TextFileHandler implements FileHandler { let partialLine = ''; while (position > 0 && lines.length < n) { + if (signal?.aborted) { + const err = new Error('Read aborted') as NodeJS.ErrnoException; + err.code = 'ABORT_ERR'; + throw err; + } const readSize = Math.min(READ_PERFORMANCE_THRESHOLDS.CHUNK_SIZE, position); position -= readSize; @@ -297,10 +304,11 @@ export class TextFileHandler implements FileHandler { requestedLines: number, mimeType: string, includeStatusMessage: boolean = true, - fileTotalLines?: number + fileTotalLines?: number, + signal?: AbortSignal ): Promise { const rl = createInterface({ - input: createReadStream(filePath), + input: createReadStream(filePath, { signal }), crlfDelay: Infinity }); @@ -342,10 +350,11 @@ export class TextFileHandler implements FileHandler { length: number, mimeType: string, includeStatusMessage: boolean = true, - fileTotalLines?: number + fileTotalLines?: number, + signal?: AbortSignal ): Promise { const rl = createInterface({ - input: createReadStream(filePath), + input: createReadStream(filePath, { signal }), crlfDelay: Infinity }); @@ -381,11 +390,12 @@ export class TextFileHandler implements FileHandler { length: number, mimeType: string, includeStatusMessage: boolean = true, - fileTotalLines?: number + fileTotalLines?: number, + signal?: AbortSignal ): Promise { // First, do a quick scan to estimate lines per byte const rl = createInterface({ - input: createReadStream(filePath), + input: createReadStream(filePath, { signal }), crlfDelay: Infinity }); @@ -401,7 +411,7 @@ export class TextFileHandler implements FileHandler { rl.close(); if (sampleLines === 0) { - return await this.readFromStartWithReadline(filePath, offset, length, mimeType, includeStatusMessage, fileTotalLines); + return await this.readFromStartWithReadline(filePath, offset, length, mimeType, includeStatusMessage, fileTotalLines, signal); } // Estimate position @@ -413,7 +423,7 @@ export class TextFileHandler implements FileHandler { const stats = await fd.stat(); const startPosition = Math.min(estimatedBytePosition, stats.size); - const stream = createReadStream(filePath, { start: startPosition }); + const stream = createReadStream(filePath, { start: startPosition, signal }); const rl2 = createInterface({ input: stream, crlfDelay: Infinity diff --git a/src/utils/usageTracker.ts b/src/utils/usageTracker.ts index bc210bdb..fe035451 100644 --- a/src/utils/usageTracker.ts +++ b/src/utils/usageTracker.ts @@ -92,10 +92,14 @@ class UsageTracker { } /** - * Save usage stats to config + * Save usage stats to config. + * Non-blocking: the tool-call return path must not wait on a disk write. The + * in-memory stats are already updated by the caller (getStats returns the live + * config object), so persistence is coalesced in the background. This is what + * keeps a saturated libuv threadpool from gating tool responses on every call. */ private async saveStats(stats: ToolUsageStats): Promise { - await configManager.setValue('usageStats', stats); + await configManager.setValueNonBlocking('usageStats', stats); } /** diff --git a/src/utils/withTimeout.ts b/src/utils/withTimeout.ts index 93ade585..13baba68 100644 --- a/src/utils/withTimeout.ts +++ b/src/utils/withTimeout.ts @@ -55,4 +55,45 @@ export function withTimeout( } }); }); -} \ No newline at end of file +} + +/** + * Run an operation under a timeout WITH real cancellation. + * + * Unlike withTimeout (which only races a timer and leaves the underlying work + * running — holding its libuv thread/fd until the OS call returns), this passes + * an AbortSignal into the operation and aborts it when the timeout fires, so a + * read/stream that honors the signal is cancelled and its resources released. + * + * Rejects with an Error whose `.code` is 'ETIMEDOUT' on timeout (so existing + * ETIMEDOUT handling / permission-error mapping keeps working). + * + * Caveat: an operation wedged inside a single un-interruptible syscall only + * observes the abort once that syscall returns; library reads that ignore the + * signal (e.g. Excel/PDF parsers) still get the timeout rejection but keep + * running in the background until they finish on their own. + */ +export function runWithAbortableTimeout( + operation: (signal: AbortSignal) => Promise, + timeoutMs: number, + operationName: string +): Promise { + const controller = new AbortController(); + let timeoutId: NodeJS.Timeout; + + const timeout = new Promise((_, reject) => { + timeoutId = setTimeout(() => { + controller.abort(); + const error = new Error(`${operationName} timed out after ${timeoutMs / 1000} seconds`) as NodeJS.ErrnoException; + error.code = 'ETIMEDOUT'; + reject(error); + }, timeoutMs); + }); + + const op = operation(controller.signal); + // Swallow the late abort rejection so it can't surface as an unhandled + // rejection after the timeout has already settled the race. + op.catch(() => {}); + + return Promise.race([op, timeout]).finally(() => clearTimeout(timeoutId)); +} diff --git a/test/repro/test-bootstrap-threadpool.js b/test/repro/test-bootstrap-threadpool.js new file mode 100644 index 00000000..37edfdaf --- /dev/null +++ b/test/repro/test-bootstrap-threadpool.js @@ -0,0 +1,24 @@ +// Verify the SHIPPED bootstrap (dist/bootstrap.js) raises the pool before any +// fs work, so 4 concurrently-stalled reads no longer starve a 5th fs op. +// No UV_THREADPOOL_SIZE override here -> relies entirely on bootstrap's default. +import '../../dist/bootstrap.js'; // first import, exactly like index.ts +import { execSync } from 'child_process'; +import fs from 'fs/promises'; +import os from 'os'; +import path from 'path'; + +const T0 = Date.now(); +const log = (m) => console.log(`[${Date.now() - T0}ms] ${m}`); +const fifo = path.join(os.tmpdir(), `dc-boot-${Date.now()}`); +execSync(`mkfifo ${fifo}`); +log(`UV_THREADPOOL_SIZE after bootstrap = ${process.env.UV_THREADPOOL_SIZE}`); + +for (let i = 0; i < 4; i++) fs.readFile(fifo).catch(() => {}); +setTimeout(async () => { + const t = Date.now(); + const guard = setTimeout(() => { log(`BLOCKED >3000ms -> bootstrap did NOT help`); execSync(`rm -f ${fifo}`); process.exit(1); }, 3000); + await fs.writeFile(path.join(os.tmpdir(), 'dc-boot-probe'), 'x'); + clearTimeout(guard); + log(`trivial write completed in ${Date.now() - t}ms -> bootstrap headroom WORKS`); + execSync(`rm -f ${fifo}`); process.exit(0); +}, 150); diff --git a/test/repro/test-dc-tracking-gate.js b/test/repro/test-dc-tracking-gate.js new file mode 100644 index 00000000..cd7d6a43 --- /dev/null +++ b/test/repro/test-dc-tracking-gate.js @@ -0,0 +1,47 @@ +// DC-level repro: the real per-tool-call gate. +// server.ts CallTool handler awaits usageTracker.trackSuccess(name) before +// returning ANY tool's result. trackSuccess -> saveStats -> configManager +// .setValue -> saveConfig -> fs.writeFile (libuv threadpool). +// If stalled cloud-path reads hold all threadpool threads, this awaited write +// never resolves -> even list_processes (pure memory) never returns. +// +// Run: UV_THREADPOOL_SIZE=4 BLOCKERS=4 node test/test-dc-tracking-gate.js +import { execSync } from 'child_process'; +import fs from 'fs/promises'; +import os from 'os'; +import path from 'path'; +import { configManager } from '../../dist/config-manager.js'; +import { usageTracker } from '../../dist/utils/usageTracker.js'; + +const T0 = Date.now(); +const log = (m) => console.log(`[${Date.now() - T0}ms] ${m}`); +const BLOCKERS = Number(process.env.BLOCKERS || process.env.UV_THREADPOOL_SIZE || 4); +const fifo = path.join(os.tmpdir(), `dc-gate-fifo-${Date.now()}`); +execSync(`mkfifo ${fifo}`); + +// Warm the config so init()'s own disk read is already done and cached. +await configManager.getConfig(); +log(`config warmed; pool=${process.env.UV_THREADPOOL_SIZE || 4}, blockers=${BLOCKERS}`); + +// Simulate DC read_file/edit_block calls stuck on a stalled cloud mount: +// each holds a threadpool thread until the (never-arriving) read returns. +for (let i = 0; i < BLOCKERS; i++) { + fs.readFile(fifo).catch((e) => log(`fifo read ${i} errored: ${e.code}`)); +} + +// Now the exact thing the dispatcher awaits for EVERY successful tool call, +// including list_processes: +setTimeout(async () => { + log(`calling usageTracker.trackSuccess('list_processes') ...`); + const t = Date.now(); + const guard = setTimeout(() => { + log(`trackSuccess STILL BLOCKED after 5000ms -> list_processes would hang here. GATE REPRODUCED.`); + execSync(`rm -f ${fifo}`); + process.exit(0); + }, 5000); + await usageTracker.trackSuccess('list_processes'); + clearTimeout(guard); + log(`trackSuccess completed in ${Date.now() - t}ms (NOT gated)`); + execSync(`rm -f ${fifo}`); + process.exit(0); +}, 200); diff --git a/test/repro/test-env-threadpool-timing.js b/test/repro/test-env-threadpool-timing.js new file mode 100644 index 00000000..133d2f26 --- /dev/null +++ b/test/repro/test-env-threadpool-timing.js @@ -0,0 +1,24 @@ +// Does setting process.env.UV_THREADPOOL_SIZE from inside the process (before +// the first threadpool op) actually change the effective pool size? If yes, a +// bootstrap module that sets it as the very first import is a valid fix. +process.env.UV_THREADPOOL_SIZE = '8'; // set BEFORE any fs/threadpool use +import { execSync } from 'child_process'; +import fs from 'fs/promises'; +import os from 'os'; +import path from 'path'; + +const T0 = Date.now(); +const log = (m) => console.log(`[${Date.now() - T0}ms] ${m}`); +const fifo = path.join(os.tmpdir(), `dc-envtest-${Date.now()}`); +execSync(`mkfifo ${fifo}`); // child_process, not threadpool +log(`set UV_THREADPOOL_SIZE=8 in-process; firing 4 FIFO blockers`); + +for (let i = 0; i < 4; i++) fs.readFile(fifo).catch(() => {}); +setTimeout(async () => { + const t = Date.now(); + const guard = setTimeout(() => { log(`BLOCKED >3000ms -> env set too late, pool still 4`); execSync(`rm -f ${fifo}`); process.exit(0); }, 3000); + await fs.writeFile(path.join(os.tmpdir(), 'dc-envtest-probe'), 'x'); + clearTimeout(guard); + log(`write completed in ${Date.now() - t}ms -> in-process env set WORKS (pool=8)`); + execSync(`rm -f ${fifo}`); process.exit(0); +}, 150); diff --git a/test/repro/test-interact-join-stall.js b/test/repro/test-interact-join-stall.js new file mode 100644 index 00000000..fbd6e211 --- /dev/null +++ b/test/repro/test-interact-join-stall.js @@ -0,0 +1,55 @@ +// Repro (v2): interact_with_process re-join()s the WHOLE buffer every 50ms poll +// (getOutputSinceSnapshot -> outputLines.join('\n'), terminal-manager.ts). +// Phase 1: grow the buffer to ~45MB across MANY lines (so join must concat). +// Phase 2: trickle tiny output over ~3s; the poll loop then joins ~45MB ~60x. +// Measure event-loop lag during Phase 2 only. +// +// Run: node test/test-interact-join-stall.js +import { startProcess, interactWithProcess } from '../../dist/tools/improved-process-tools.js'; + +const T0 = Date.now(); +const log = (m) => console.log(`[${Date.now() - T0}ms] ${m}`); +const pidFrom = (res) => { + const m = (res?.content?.[0]?.text || '').match(/PID (\d+)/); + return m ? Number(m[1]) : null; +}; + +const started = await startProcess({ command: 'python3 -i -q', timeout_ms: 4000 }); +const pid = pidFrom(started); +log(`python pid=${pid}`); +await interactWithProcess({ pid, input: 'import time', timeout_ms: 3000 }); + +// Phase 1: single expression (no block => no '...' continuation). ~45MB / 3000 lines. +log(`phase 1: growing buffer to ~45MB ...`); +await interactWithProcess({ pid, input: "print('\\n'.join('X'*1200 for _ in range(40000)))", timeout_ms: 15000 }); +log(`phase 1 done`); + +// Event-loop lag monitor (10ms interval; large delta => synchronous block). +let maxLag = 0, sumLag = 0, samples = 0, last = Date.now(); +const mon = setInterval(() => { + const now = Date.now(); + const lag = now - last - 10; + if (lag > maxLag) maxLag = lag; + if (lag > 0) { sumLag += lag; samples++; } + last = now; +}, 10); + +// Phase 2: trickle 40 ticks over ~3.2s so ~60 polls each re-join the 45MB buffer. +log(`phase 2: trickling output while polling ...`); +const t = Date.now(); +await interactWithProcess({ + pid, + input: "[ (time.sleep(0.08) or print('tick', i)) for i in range(40) ]", + timeout_ms: 12000 +}); +const wall = Date.now() - t; +clearInterval(mon); + +log(`phase 2 wall=${wall}ms`); +log(`event-loop lag during phase 2: max=${maxLag}ms, total blocked=${sumLag}ms over ${samples} stalls`); +log(`blocked ${((sumLag / wall) * 100).toFixed(0)}% of phase-2 wall time`); +log(maxLag > 150 || sumLag > wall * 0.3 + ? `STALL REPRODUCED: per-poll whole-buffer join starves the event loop` + : `no significant stall observed`); +try { await interactWithProcess({ pid, input: 'exit()', timeout_ms: 800, wait_for_prompt: false }); } catch {} +process.exit(0); diff --git a/test/repro/test-process-wait-client-cap.js b/test/repro/test-process-wait-client-cap.js new file mode 100644 index 00000000..29e66a88 --- /dev/null +++ b/test/repro/test-process-wait-client-cap.js @@ -0,0 +1,201 @@ +import assert from 'assert'; +import { terminalManager } from '../../dist/terminal-manager.js'; +import { interactWithProcess } from '../../dist/tools/improved-process-tools.js'; +import { configManager } from '../../dist/config-manager.js'; + +/** + * FAILING repro for the "No result received after 4 minutes" crash. + * + * Symptom (customer): a `Send Input to Process` / start_process call hangs and + * Claude Desktop reports "No result received from the Claude Desktop app after + * waiting 4 minutes. The local MCP server ... may be unresponsive." + * + * Verified root cause (src/terminal-manager.ts executeCommand + improved- + * process-tools.ts interactWithProcess): the blocking wait is bounded ONLY by + * the caller-supplied timeout_ms. There is no ceiling below the MCP client's + * hard ~4-minute (240000ms) per-call limit. So a long-running, prompt-less + * command (e.g. an OpenRouter multi-model sweep) with a large timeout_ms keeps + * the tool call pending until the client gives up at 4 minutes. + * + * These tests encode the DESIRED contract, so they FAIL against current code + * and should PASS once a fix lands: + * + * The single-call blocking wait of start_process and interact_with_process + * MUST be capped at min(timeout_ms, maxProcessWaitMs), where maxProcessWaitMs + * is a config value defaulting BELOW the client ceiling (suggested 180000). + * When the cap is hit, the tool returns an "is still running, use + * read_process_output" handoff (isBlocked=true) instead of continuing to + * block for the full timeout_ms. + * + * The fast tests below set maxProcessWaitMs to a tiny value so they run in ms. + * Set DC_REPRO_REALTIME=1 to additionally run a fix-agnostic ~200s proof that + * uses the real client ceiling. + */ + +// The MCP client (Claude Desktop) kills a single tool call at ~4 minutes. +const CLIENT_CEILING_MS = 240000; +// Config key the fix should read to bound the single-call wait. +const CAP_KEY = 'maxProcessWaitMs'; + +// Fast-test values: tiny cap so a fixed tool returns in ms. +const TEST_CAP_MS = 400; // stand-in for the production maxProcessWaitMs +const LARGE_TIMEOUT_MS = 4000; // stand-in for "user/agent set a big timeout" +const CHILD_LIFETIME_MS = 8000; // child outlives both cap and timeout +const CAP_MARGIN_MS = 1600; // tolerance above the cap for scheduling/overhead + +const since = (t) => Date.now() - t; +const sleep = (ms) => new Promise((r) => setTimeout(r, ms)); +const cleanup = (pid) => { try { if (pid > 0) terminalManager.forceTerminate(pid); } catch {} }; + +let ORIGINAL_CAP; +async function setCap(v) { await configManager.setValue(CAP_KEY, v); } +async function snapshotCap() { ORIGINAL_CAP = await configManager.getValue(CAP_KEY); } +async function restoreCap() { await configManager.setValue(CAP_KEY, ORIGINAL_CAP); } + +// A silent, long-running, non-exiting process: matches none of the early-exit +// paths (no prompt, no output, no exit) so only the wait cap can end the call. +const SILENT_CHILD = `node -e "setTimeout(function(){}, ${CHILD_LIFETIME_MS})"`; + +/** + * Test 1 (FAILS now): start_process must cap its initial wait. + * A silent long process with a large timeout_ms should return at ~the cap, + * not block for the whole timeout_ms. Current code blocks for timeout_ms. + */ +async function testStartProcessCapsWait() { + console.log('\n📋 Test 1: start_process caps initial wait below timeout_ms...'); + await setCap(TEST_CAP_MS); + const t0 = Date.now(); + const res = await terminalManager.executeCommand(SILENT_CHILD, LARGE_TIMEOUT_MS, undefined, true); + const elapsed = since(t0); + cleanup(res.pid); + + assert(res.pid > 0, 'should have spawned a process'); + assert.strictEqual(res.isBlocked, true, 'should report still-running (isBlocked=true)'); + assert( + elapsed <= TEST_CAP_MS + CAP_MARGIN_MS, + `start_process should return at ~the cap (<=${TEST_CAP_MS + CAP_MARGIN_MS}ms), ` + + `but blocked for ${elapsed}ms with timeout_ms=${LARGE_TIMEOUT_MS} and cap=${TEST_CAP_MS}. ` + + `Uncapped, a real ${CLIENT_CEILING_MS}ms+ timeout would blow past the client limit.` + ); + console.log(` ✅ returned in ${elapsed}ms (cap=${TEST_CAP_MS}ms)`); +} + +const BUSY_MS = 5000; // child stays silent (no prompt/output) longer than LARGE_TIMEOUT_MS +const BUSY_INPUT = `var __end=Date.now()+${BUSY_MS}; while(Date.now()<__end){}`; + +/** + * Test 2 (FAILS now): interact_with_process must cap its wait too. + * This is the actual crash path ("Send Input to Process"). We start a REPL, + * then send input that runs a long, silent computation. With wait_for_prompt + * (default), no prompt appears until it finishes, so the poll loop currently + * runs the full timeout_ms. It should instead return at ~the cap. + */ +async function testInteractCapsWait() { + console.log('\n📋 Test 2: interact_with_process caps its wait below timeout_ms...'); + // Start a REPL (returns fast via prompt detection, well under any cap). + const start = await terminalManager.executeCommand('node -i', 3000, undefined, false); + const pid = start.pid; + assert(pid > 0, 'should have started a REPL session'); + + await setCap(TEST_CAP_MS); + const t0 = Date.now(); + await interactWithProcess({ pid, input: BUSY_INPUT, timeout_ms: LARGE_TIMEOUT_MS }); + const elapsed = since(t0); + cleanup(pid); + + assert( + elapsed <= TEST_CAP_MS + CAP_MARGIN_MS, + `interact_with_process should return at ~the cap (<=${TEST_CAP_MS + CAP_MARGIN_MS}ms), ` + + `but blocked for ${elapsed}ms with timeout_ms=${LARGE_TIMEOUT_MS} and cap=${TEST_CAP_MS}. ` + + `This is exactly the path that produces the 4-minute "no result" crash.` + ); + console.log(` ✅ returned in ${elapsed}ms (cap=${TEST_CAP_MS}ms)`); +} + +/** + * Test 3 (guard, passes now AND after the fix): the cap must not slow the + * normal fast path. With a generous cap, a prompt-emitting process still + * returns early via prompt detection, nowhere near the cap or the timeout. + * This stops a "fix" that just blanket-shortens every wait. + */ +async function testPromptStillReturnsEarly() { + console.log('\n📋 Test 3 (guard): prompt-emitting process still returns early...'); + await setCap(60000); // generous cap, larger than the timeout below + const bigTimeout = 5000; + const t0 = Date.now(); + const res = await terminalManager.executeCommand( + `node -e "process.stdout.write('>>> ');setTimeout(function(){}, ${CHILD_LIFETIME_MS})"`, + bigTimeout, + undefined, + true + ); + const elapsed = since(t0); + cleanup(res.pid); + + assert.strictEqual(res.isBlocked, true, 'prompt means waiting for input'); + assert(elapsed < 1000, `should return quickly via prompt, took ${elapsed}ms`); + assert( + res.timingInfo.exitReason.startsWith('early_exit'), + `expected an early_exit reason, got "${res.timingInfo.exitReason}"` + ); + console.log(` ✅ returned early in ${elapsed}ms (${res.timingInfo.exitReason})`); +} + +/** + * Test 4 (fix-agnostic, slow, opt-in via DC_REPRO_REALTIME=1): uses the real + * client ceiling. A silent process with a 5-minute timeout_ms must not keep + * the call pending past a client-safe ceiling (200s, under the 240s cap). + * Does not depend on any particular fix knob. ~200s to run. + */ +async function testRealTimeClientCeiling() { + console.log('\n📋 Test 4 (real-time): single call must stay under the client ceiling...'); + await restoreCap(); // use production default, not the tiny test cap + const SAFE_CEILING_MS = 200000; + const HUGE_TIMEOUT_MS = 300000; + + const t0 = Date.now(); + const pending = terminalManager.executeCommand(SILENT_CHILD, HUGE_TIMEOUT_MS, undefined, false); + await sleep(800); + const sessions = terminalManager.listActiveSessions(); + const pid = sessions.length ? sessions[sessions.length - 1].pid : -1; + // Safety net so the test can never hang forever; also resolves `pending`. + const killer = setTimeout(() => cleanup(pid), SAFE_CEILING_MS); + await pending; + clearTimeout(killer); + const elapsed = since(t0); + cleanup(pid); + + assert( + elapsed < SAFE_CEILING_MS, + `call stayed pending ${elapsed}ms — past the ${SAFE_CEILING_MS}ms safe ceiling ` + + `(client kills at ${CLIENT_CEILING_MS}ms). timeout_ms was ${HUGE_TIMEOUT_MS}.` + ); + console.log(` ✅ returned in ${elapsed}ms (under ${SAFE_CEILING_MS}ms)`); +} + +async function runAllTests() { + console.log('🚀 Starting process-wait client-ceiling repro tests...'); + await snapshotCap(); + let ok = true; + try { + await testStartProcessCapsWait(); // FAILS until fix + await testInteractCapsWait(); // FAILS until fix + await testPromptStillReturnsEarly();// guard, passes + if (process.env.DC_REPRO_REALTIME === '1') { + await testRealTimeClientCeiling(); // FAILS until fix (~200s) + } else { + console.log('\n⏭️ Test 4 (real-time, ~200s) skipped. Set DC_REPRO_REALTIME=1 to run it.'); + } + console.log('\n🎉 All process-wait cap tests passed!'); + } catch (error) { + ok = false; + console.error('\n❌ Test failed:', error.message); + } finally { + await restoreCap(); + } + return ok; +} + +runAllTests() + .then((success) => process.exit(success ? 0 : 1)) + .catch((error) => { console.error('Test error:', error); process.exit(1); }); diff --git a/test/repro/test-read-abort-frees-thread.js b/test/repro/test-read-abort-frees-thread.js new file mode 100644 index 00000000..2fff2c8d --- /dev/null +++ b/test/repro/test-read-abort-frees-thread.js @@ -0,0 +1,43 @@ +// Demonstration: runWithAbortableTimeout actually CANCELS the underlying read +// (it rejects quickly instead of returning the full buffer), whereas withTimeout +// abandons the promise while the read runs to completion in the background. +// +// Deterministic part: the abortable read must reject (read cut short), proving +// the AbortSignal reaches fs and stops it. +// +// Timing note: on a fast local SSD the "thread stays held" cost is invisible +// because the file is served from the page cache at memory bandwidth. The cost +// is real only on slow/network/cloud paths — exactly the case this fix targets, +// and one that can't be faithfully reproduced on a local disk. So we assert the +// cancellation (deterministic) and print timing as informational only. +// +// Run: UV_THREADPOOL_SIZE=1 node test/repro/test-read-abort-frees-thread.js +import assert from 'assert'; +import { execSync } from 'child_process'; +import fs from 'fs/promises'; +import os from 'os'; +import path from 'path'; +import { withTimeout, runWithAbortableTimeout } from '../../dist/utils/withTimeout.js'; + +const big = path.join(os.tmpdir(), `dc-big-${Date.now()}.bin`); +console.log('creating 800MB temp file...'); +execSync(`dd if=/dev/zero of=${big} bs=1m count=800 2>/dev/null`); + +// Abortable read with a 0ms budget: must reject (read cancelled), NOT return 800MB. +let rejected = false; +let returnedBytes = -1; +try { + const buf = await runWithAbortableTimeout( + (signal) => fs.readFile(big, { signal }), + 0, + 'abortable read' + ); + returnedBytes = buf.length; +} catch (e) { + rejected = true; + console.log(`abortable read rejected with code=${e.code} (read was cut short) ✓`); +} +execSync(`rm -f ${big}`); +assert.ok(rejected, `expected abortable read to be cancelled, but it returned ${returnedBytes} bytes`); +console.log('PASS: AbortController propagated to fs and cancelled the read.'); +process.exit(0); diff --git a/test/repro/test-start-process-timeout-block.js b/test/repro/test-start-process-timeout-block.js new file mode 100644 index 00000000..0f49e841 --- /dev/null +++ b/test/repro/test-start-process-timeout-block.js @@ -0,0 +1,135 @@ +import assert from 'assert'; +import { terminalManager } from '../../dist/terminal-manager.js'; + +/** + * Repro / characterization test for issue #310: + * "start_process blocks ... causing Claude Desktop crashes" + * + * Root cause (src/terminal-manager.ts executeCommand): the call resolves early + * ONLY via one of: + * - a quick prompt pattern (>>> > $ #) on a stdout chunk + * - analyzeProcessState() detecting a REPL prompt (guarded by output) + * - process exit + * - the timeoutMs fallback + * + * A long-running process that produces no prompt-like output and does not exit + * hits none of the early paths, so executeCommand is held for the FULL + * timeoutMs. With a large timeout_ms this is the multi-minute pending tool call + * users observe. + * + * NOTE: this is an async wait, not a literal event-loop block. The issue's + * "blocks Electron main thread" framing is inaccurate, but the user-visible + * symptom (tool call pending for the whole timeout) is real and is what this + * test reproduces. If a cap-the-initial-wait fix lands, Tests 1 & 2 should be + * updated to assert the capped duration instead. + */ + +const TIMEOUT_MS = 800; // short, keeps the test fast +const PROC_LIFETIME_MS = 6000; // child lives well past the timeout + +const since = (t) => Date.now() - t; +const cleanup = (pid) => { try { terminalManager.forceTerminate(pid); } catch {} }; + +/** + * Test 1 (core repro): a SILENT long-running process produces no output, so + * neither the quick-pattern nor the periodic analyzeProcessState path can fire + * (the periodic check is guarded by `output.trim()`). The only thing that ends + * the call is the timeout fallback => the call is held for the full timeoutMs. + */ +async function testSilentProcessWaitsFullTimeout() { + console.log('\n📋 Test 1: silent long-running process is held for the full timeout...'); + const t0 = Date.now(); + const res = await terminalManager.executeCommand( + `node -e "setTimeout(function(){}, ${PROC_LIFETIME_MS})"`, + TIMEOUT_MS, + undefined, + true // collectTiming -> populates timingInfo.exitReason + ); + const elapsed = since(t0); + cleanup(res.pid); + + assert(res.pid > 0, 'should have spawned a process'); + assert.strictEqual(res.isBlocked, true, 'should report isBlocked=true'); + assert.strictEqual(res.output.trim(), '', 'silent process should produce no output'); + assert.strictEqual(res.timingInfo.exitReason, 'timeout', + `expected exitReason "timeout", got "${res.timingInfo.exitReason}"`); + assert(elapsed >= TIMEOUT_MS - 75, + `should wait ~the full timeout (>=${TIMEOUT_MS}ms), only waited ${elapsed}ms`); + assert(elapsed < PROC_LIFETIME_MS - 500, + `should return via timeout, not process exit (elapsed ${elapsed}ms)`); + console.log(` ✅ held for ${elapsed}ms via timeout (timeout=${TIMEOUT_MS}ms, proc=${PROC_LIFETIME_MS}ms)`); +} + +/** + * Test 2 (reporter's scenario): a CHATTY but prompt-less long-running process + * (the "Python test script that didn't output prompt-like patterns"). It emits + * plain progress lines that match no REPL prompt and no completion pattern, so + * it is also held for the full timeout despite producing output. + */ +async function testChattyNonPromptProcessWaitsFullTimeout() { + console.log('\n📋 Test 2: chatty (no-prompt) long-running process is held for the full timeout...'); + const t0 = Date.now(); + const res = await terminalManager.executeCommand( + `node -e "setInterval(function(){console.log('progress')},100);setTimeout(function(){},${PROC_LIFETIME_MS})"`, + TIMEOUT_MS, + undefined, + true + ); + const elapsed = since(t0); + cleanup(res.pid); + + assert.strictEqual(res.isBlocked, true, 'should report isBlocked=true'); + assert(res.output.includes('progress'), 'should have captured progress output'); + assert.strictEqual(res.timingInfo.exitReason, 'timeout', + `non-prompt output must not trigger early exit; got "${res.timingInfo.exitReason}"`); + assert(elapsed >= TIMEOUT_MS - 75, + `should wait ~the full timeout (>=${TIMEOUT_MS}ms), only waited ${elapsed}ms`); + assert(elapsed < PROC_LIFETIME_MS - 500, + `should return via timeout, not process exit (elapsed ${elapsed}ms)`); + console.log(` ✅ held for ${elapsed}ms despite output (exitReason=${res.timingInfo.exitReason})`); +} + +/** + * Test 3 (contrast / regression guard): when the process DOES emit a recognized + * prompt, executeCommand returns promptly via the quick-pattern path, well + * before the (large) timeout. This proves the bug is specific to prompt-less + * processes and guards the early-exit path from regressing. + */ +async function testPromptProcessReturnsEarly() { + console.log('\n📋 Test 3: prompt-emitting process returns early (not at timeout)...'); + const bigTimeout = 5000; + const t0 = Date.now(); + const res = await terminalManager.executeCommand( + `node -e "process.stdout.write('>>> ');setTimeout(function(){},${PROC_LIFETIME_MS})"`, + bigTimeout, + undefined, + true + ); + const elapsed = since(t0); + cleanup(res.pid); + + assert.strictEqual(res.isBlocked, true, 'prompt means blocked/waiting for input'); + assert(elapsed < 1000, `should return quickly via prompt, took ${elapsed}ms`); + assert(res.timingInfo.exitReason.startsWith('early_exit'), + `expected an early_exit reason, got "${res.timingInfo.exitReason}"`); + console.log(` ✅ returned early after ${elapsed}ms (exitReason=${res.timingInfo.exitReason})`); +} + +async function runAllTests() { + console.log('🚀 Starting #310 start_process timeout-block tests...'); + try { + await testSilentProcessWaitsFullTimeout(); + await testChattyNonPromptProcessWaitsFullTimeout(); + await testPromptProcessReturnsEarly(); + console.log('\n🎉 All #310 timeout-block tests passed!'); + return true; + } catch (error) { + console.error('\n❌ Test failed:', error.message); + console.error(error.stack); + return false; + } +} + +runAllTests() + .then(success => process.exit(success ? 0 : 1)) + .catch(error => { console.error('Test error:', error); process.exit(1); }); diff --git a/test/repro/test-threadpool-starvation.js b/test/repro/test-threadpool-starvation.js new file mode 100644 index 00000000..730b5b62 --- /dev/null +++ b/test/repro/test-threadpool-starvation.js @@ -0,0 +1,50 @@ +// Repro: libuv threadpool exhaustion makes a trivial fs op hang. +// Models the real bug: stalled cloud-path fs.reads occupy all UV threads, +// and the per-tool-call config fs.writeFile (usageTracker.saveStats) queues +// behind them -> even a "light" tool (list_processes) can't respond. +// +// Run: UV_THREADPOOL_SIZE=4 node test/test-threadpool-starvation.js +import { execSync } from 'child_process'; +import fs from 'fs/promises'; +import os from 'os'; +import path from 'path'; + +const POOL = Number(process.env.UV_THREADPOOL_SIZE || 4); +const BLOCKERS = Number(process.env.BLOCKERS || POOL); +const fifo = path.join(os.tmpdir(), `dc-fifo-${Date.now()}`); +const probe = path.join(os.tmpdir(), `dc-probe-${Date.now()}.json`); + +function log(m) { console.log(`[${Date.now() - T0}ms] ${m}`); } +const T0 = Date.now(); + +try { execSync(`mkfifo ${fifo}`); } catch (e) { console.error('mkfifo failed', e.message); process.exit(1); } +log(`pool=${POOL}, blockers=${BLOCKERS}, fifo=${fifo}`); + +// Occupy BLOCKERS threads with reads that never resolve (nobody writes to the +// FIFO) -- exactly like a cloud read that never returns. +for (let i = 0; i < BLOCKERS; i++) { + fs.readFile(fifo).then(() => log(`fifo read ${i} resolved (unexpected)`)) + .catch((e) => log(`fifo read ${i} errored: ${e.code}`)); +} + +// After threads are grabbed, time a trivial write == the per-call config write. +setTimeout(async () => { + log(`firing trivial write (proxy for list_processes' config save)...`); + const t = Date.now(); + const guard = setTimeout(() => { + log(`STILL BLOCKED after 5000ms -> STARVATION REPRODUCED. Exiting.`); + try { execSync(`rm -f ${fifo} ${probe}`); } catch {} + process.exit(0); + }, 5000); + try { + await fs.writeFile(probe, '{}'); + clearTimeout(guard); + log(`trivial write completed in ${Date.now() - t}ms (NOT starved)`); + execSync(`rm -f ${fifo} ${probe}`); + process.exit(0); + } catch (e) { + clearTimeout(guard); + log(`trivial write errored: ${e.message}`); + process.exit(1); + } +}, 200); diff --git a/test/repro/test-withtimeout-leak.js b/test/repro/test-withtimeout-leak.js new file mode 100644 index 00000000..91c89a92 --- /dev/null +++ b/test/repro/test-withtimeout-leak.js @@ -0,0 +1,39 @@ +// Repro: withTimeout() rejects on schedule but does NOT cancel the underlying +// fs op, so the libuv thread stays held. This is why a 30s read "timeout" does +// not free capacity for 30s worth of relief -- the thread is occupied for the +// REAL (cloud) duration, which can be minutes. +// +// Run: UV_THREADPOOL_SIZE=1 node test/test-withtimeout-leak.js +import { execSync } from 'child_process'; +import fs from 'fs/promises'; +import os from 'os'; +import path from 'path'; +import { withTimeout } from '../../dist/utils/withTimeout.js'; + +const T0 = Date.now(); +const log = (m) => console.log(`[${Date.now() - T0}ms] ${m}`); +const fifo = path.join(os.tmpdir(), `dc-leak-fifo-${Date.now()}`); +execSync(`mkfifo ${fifo}`); +log(`pool=${process.env.UV_THREADPOOL_SIZE || 'default(4)'} (using 1 thread)`); + +// One read stuck on the FIFO, wrapped exactly like DC wraps file reads. +const timed = withTimeout(fs.readFile(fifo), 1000, 'Read file operation', null) + .then((v) => log(`withTimeout resolved: ${v}`)) + .catch((e) => log(`withTimeout REJECTED (as designed): ${String(e).slice(0, 40)}...`)); + +// After the timeout "fires", try another fs op. If the thread were freed, this +// would run immediately. It does not -- the un-cancelled read still owns it. +await timed; +log(`timeout fired; now trying a fresh fs.writeFile on the single thread...`); +const t = Date.now(); +const guard = setTimeout(() => { + log(`next fs op STILL BLOCKED ${Date.now() - t}ms after timeout -> THREAD LEAKED`); + log(`(withTimeout freed the JS promise, not the OS thread)`); + execSync(`rm -f ${fifo}`); + process.exit(0); +}, 4000); +await fs.writeFile(path.join(os.tmpdir(), 'dc-leak-probe'), 'x'); +clearTimeout(guard); +log(`next fs op completed in ${Date.now() - t}ms (thread was freed)`); +execSync(`rm -f ${fifo}`); +process.exit(0); diff --git a/test/test-nonblocking-config-save.js b/test/test-nonblocking-config-save.js new file mode 100644 index 00000000..1a761b53 --- /dev/null +++ b/test/test-nonblocking-config-save.js @@ -0,0 +1,61 @@ +import assert from 'assert'; +import { readFileSync } from 'fs'; +import { configManager } from '../dist/config-manager.js'; +import { CONFIG_FILE } from '../dist/config.js'; + +/** + * Regression test for the parallel-load tool-call hang. + * + * Root cause: the CallTool handler awaited usageTracker.trackSuccess() before + * returning any tool result; that chained through configManager.setValue -> + * fs.writeFile on EVERY call. Under a saturated libuv threadpool (many parallel + * reads stalled on a slow/cloud filesystem) the awaited write could not get a + * thread, so even pure-memory tools (list_processes) hung until the client's + * ~4-minute cap. Each call also fired an independent write of the same file, + * risking a corrupted config.json. + * + * Fix: stats persist via configManager.setValueNonBlocking() — in-memory update + * is synchronous, the disk write is coalesced and serialized on a single write + * chain, and the caller never waits on it. + * + * This test is fast and cross-platform (no FIFO/python); the FIFO-based proof + * that the response no longer blocks under a starved pool lives in test/repro/. + */ + +const KEY = '__nonblockingSaveRegressionTest'; +let passed = 0; + +async function run() { + await configManager.getConfig(); // warm init so disk read is cached + + // 1) A burst of non-blocking saves must resolve effectively immediately — + // they must not each wait on a disk write. + const BURST = 100; + const t0 = Date.now(); + await Promise.all( + Array.from({ length: BURST }, (_, i) => configManager.setValueNonBlocking(KEY, i)) + ); + const elapsed = Date.now() - t0; + assert.ok(elapsed < 200, `burst of ${BURST} non-blocking saves took ${elapsed}ms (expected < 200ms)`); + passed++; console.log(`✓ ${BURST} non-blocking saves resolved in ${elapsed}ms`); + + // 2) The in-memory value is visible immediately (synchronously updated). + assert.strictEqual(await configManager.getValue(KEY), BURST - 1); + passed++; console.log('✓ in-memory value reflects the latest write immediately'); + + // 3) After the background flush window, config.json is valid JSON (no torn + // write from overlapping saves) and holds the final coalesced value. + await new Promise((r) => setTimeout(r, 300)); + let parsed; + assert.doesNotThrow(() => { parsed = JSON.parse(readFileSync(CONFIG_FILE, 'utf8')); }, + 'config.json must remain valid JSON after concurrent writes'); + assert.strictEqual(parsed[KEY], BURST - 1, 'final value must be persisted to disk'); + passed++; console.log('✓ config.json is valid and holds the coalesced final value'); + + // cleanup: remove the test key (undefined is dropped by JSON.stringify) + await configManager.setValue(KEY, undefined); +} + +run() + .then(() => { console.log(`\nPASS (${passed}/3)`); process.exit(0); }) + .catch((e) => { console.error(`\nFAIL: ${e.message}`); process.exit(1); }); diff --git a/test/test-read-abort-timeout.js b/test/test-read-abort-timeout.js new file mode 100644 index 00000000..09cc9f1d --- /dev/null +++ b/test/test-read-abort-timeout.js @@ -0,0 +1,92 @@ +import assert from 'assert'; +import os from 'os'; +import path from 'path'; +import fsp from 'fs/promises'; +import { runWithAbortableTimeout } from '../dist/utils/withTimeout.js'; +import { READ_OPERATION_TIMEOUT_MS, readFile } from '../dist/tools/filesystem.js'; +import { configManager } from '../dist/config-manager.js'; + +/** + * Regression tests for the abortable, 3-minute read timeout. + * + * Follow-up to the parallel-load hang fix: withTimeout() rejected on a timer + * but left the underlying fs op running, holding its libuv thread until the OS + * call returned. runWithAbortableTimeout() passes an AbortSignal into the op + * and aborts it on timeout. The read timeout is a flat 3 minutes, chosen to sit + * below the MCP client's ~4-minute hard cap so we abort + return a useful error + * before the client reports an opaque timeout. + */ + +let passed = 0; +const ok = (msg) => { passed++; console.log(`✓ ${msg}`); }; + +async function run() { + // 1) A fast operation resolves normally and is NOT aborted. + { + let sawSignal; + const val = await runWithAbortableTimeout(async (signal) => { + sawSignal = signal; + return 'done'; + }, 1000, 'fast op'); + assert.strictEqual(val, 'done'); + assert.strictEqual(sawSignal.aborted, false); + ok('fast operation resolves and signal is not aborted'); + } + + // 2) A slow operation times out with code ETIMEDOUT AND its signal is aborted + // (this is the "cleanup" — the op is told to stop, not just abandoned). + { + let sawSignal; + let err; + try { + await runWithAbortableTimeout((signal) => { + sawSignal = signal; + // Never settles on its own — only the timeout's abort should end the + // race (mirrors a real fs read, which rejects asynchronously after + // abort, so the ETIMEDOUT rejection wins). + return new Promise(() => {}); + }, 100, 'slow op'); + assert.fail('expected timeout rejection'); + } catch (e) { + err = e; + } + assert.strictEqual(err.code, 'ETIMEDOUT', `expected ETIMEDOUT, got ${err.code}`); + assert.strictEqual(sawSignal.aborted, true, 'operation signal must be aborted on timeout'); + ok('slow operation rejects ETIMEDOUT and aborts the operation signal'); + } + + // 3) The read timeout is 3 minutes — safely below the client's ~4-min cap. + { + assert.strictEqual(READ_OPERATION_TIMEOUT_MS, 3 * 60 * 1000, 'read timeout must be 3 minutes'); + assert.ok(READ_OPERATION_TIMEOUT_MS < 4 * 60 * 1000, 'must stay below the ~4-min client cap'); + ok('read timeout is 3 minutes, below the client hard cap'); + } + + // 4) Integration: a normal read still works end-to-end (signal threading did + // not break the happy path). Hermetic: uses its own temp dir + allowed-dir + // config so it doesn't depend on the ambient allowedDirectories, and + // restores config afterward. + { + const original = await configManager.getConfig(); + const originalAllowed = original.allowedDirectories; + // realpath so the allowed dir matches what validatePath resolves the file + // to (e.g. macOS /tmp -> /private/tmp), avoiding a symlink mismatch. + const tmpDir = await fsp.realpath(await fsp.mkdtemp(path.join(os.tmpdir(), 'dc-read-abort-'))); + const tmpFile = path.join(tmpDir, 'sample.txt'); + await fsp.writeFile(tmpFile, 'line1\nMARKER runWithAbortableTimeout\nline3\n'); + try { + await configManager.setValue('allowedDirectories', [tmpDir]); + const res = await readFile(tmpFile, { offset: 0, length: 5 }); + const text = typeof res.content === 'string' ? res.content : res.content.toString('utf8'); + assert.ok(text.includes('MARKER runWithAbortableTimeout'), 'normal read returns file content'); + ok('normal read_file still works with the signal threaded through'); + } finally { + await configManager.setValue('allowedDirectories', originalAllowed); + await fsp.rm(tmpDir, { recursive: true, force: true }); + } + } +} + +run() + .then(() => { console.log(`\nPASS (${passed}/4)`); process.exit(0); }) + .catch((e) => { console.error(`\nFAIL: ${e.message}`); process.exit(1); });