Skip to content

Commit 305c9d2

Browse files
edgarsskore and claude committed
fix: recover device Realtime channel from half-open socket on reconnect
After idle / wifi-loss / sleep the device's Supabase Realtime socket can go half-open (conn.readyState stays OPEN but the peer is gone). recreateChannel() removed the old channel un-awaited and synchronously pushed a new one, so the channel registry never reached 0, realtime-js never tore the dead socket down, and every re-subscribe TIMED_OUT forever -- only a process restart recovered.

Fix (remote-channel.ts):
- recreateChannel(): add a re-entrancy guard, await removeChannel(), and force a fresh WebSocket via realtime.disconnect() before re-subscribing.
- checkConnectionHealth(): treat 'joining' as healthy so realtime-js's own rejoin backoff can converge instead of being torn down mid-join.

Also enrich the existing reconnect/timeout logs with a compact connState() line (socket state + readyState + channel state + attempt), turn on the previously commented-out CHANNEL_ERROR log, and add a CLOSED branch.

Add test/remote-channel-reconnect.test.ts -- a deterministic repro that fails on the old behavior (8x TIMED_OUT, dead socket reused) and passes with the fix.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent be82290 commit 305c9d2

2 files changed

Lines changed: 328 additions & 30 deletions

File tree

src/remote-device/remote-channel.ts

Lines changed: 72 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,10 @@ export class RemoteChannel {
3535
// Track last channel state for debug logging
3636
private lastChannelState: string | null = null;
3737

38+
// Reconnect diagnostics + guard (see connState() / recreateChannel())
39+
private reconnectAttempt = 0; // recreateChannel() attempts since last success
40+
private isRecreatingChannel = false; // a recreate is in flight (re-entrancy guard)
41+
3842
private _user: User | null = null;
3943
get user(): User | null { return this._user; }
4044

@@ -166,7 +170,7 @@ export class RemoteChannel {
166170

167171
// ! Ignore silently in Initialization to reconnect after
168172
await this.createChannel().catch((error) => {
169-
console.debug('[DEBUG] Failed to create channel, will retry after socket reconnect', error);
173+
console.debug(`[DEBUG] Failed to create channel, will retry after socket reconnect: ${error?.message || error}${this.connState()}`);
170174
});
171175

172176
} else {
@@ -206,10 +210,12 @@ export class RemoteChannel {
206210
)
207211
.subscribe((status: string, err: any) => {
208212
// Debug: Log all subscription status events
209-
console.debug(`[DEBUG] Channel subscription status: ${status}${err ? ' (error: ' + err + ')' : ''}`);
213+
console.debug(`[DEBUG] Channel subscription status: ${status}${err ? ' (error: ' + (err?.message || err) + ')' : ''}${this.connState()}`);
210214

211215
if (status === 'SUBSCRIBED') {
212-
console.log('✅ Channel subscribed');
216+
const recovered = this.reconnectAttempt;
217+
this.reconnectAttempt = 0;
218+
console.log(`✅ Channel subscribed${recovered > 0 ? ` (recovered after ${recovered} attempt${recovered === 1 ? '' : 's'})` : ''}`);
213219
// Update device status on successful connection
214220
if (this.deviceId) {
215221
this.setOnlineStatus(this.deviceId, 'online').catch(e => {
@@ -218,20 +224,37 @@ export class RemoteChannel {
218224
}
219225
resolve();
220226
} else if (status === 'CHANNEL_ERROR') {
221-
// console.error('❌ Channel subscription failed:', err);
227+
// CHANNEL_ERROR is the only status carrying a real error message.
228+
console.error(`❌ Channel error: ${err?.message || 'unknown'}${this.connState()}`);
222229
this.setOnlineStatus(this.deviceId!, 'offline');
223-
captureRemote('remote_channel_subscription_error', { error: err || 'Channel error' }).catch(() => { });
230+
captureRemote('remote_channel_subscription_error', { error: err?.message || 'Channel error' }).catch(() => { });
224231
reject(err || new Error('Failed to initialize tool call channel subscription'));
225232
} else if (status === 'TIMED_OUT') {
226-
console.error('⏱️ Channel subscription timed out, Reconnecting...');
233+
console.error(`⏱️ Channel subscription timed out, Reconnecting...${this.connState()}`);
227234
this.setOnlineStatus(this.deviceId!, 'offline');
228-
captureRemote('remote_channel_subscription_timeout', {}).catch(() => { });
235+
captureRemote('remote_channel_subscription_timeout', { attempt: this.reconnectAttempt }).catch(() => { });
229236
reject(new Error('Tool call channel subscription timed out'));
237+
} else if (status === 'CLOSED') {
238+
console.warn(`⚠️ Channel closed — ${this.connState()}`);
230239
}
231240
});
232241
});
233242
}
234243

244+
/**
245+
* Compact connection state for logs — e.g. "socket=open(1) ch=errored attempt=3".
246+
* readyState 1=OPEN (a 1 while joins keep failing = a half-open socket being reused),
247+
* 3=CLOSED, '-'=no socket. Reads realtime-js internals defensively; never throws.
248+
*/
249+
private connState(): string {
250+
let socket = '?';
251+
try {
252+
const rt: any = (this.client as any)?.realtime;
253+
socket = `${rt?.connectionState?.() ?? '?'}(${rt?.conn?.readyState ?? '-'})`;
254+
} catch { /* best effort */ }
255+
return `socket=${socket} ch=${this.channel?.state ?? '-'} attempt=${this.reconnectAttempt}`;
256+
}
257+
235258
/**
236259
* Check if channel is connected, recreate if not.
237260
*/
@@ -244,47 +267,66 @@ export class RemoteChannel {
244267

245268
// Debug: Log current channel state (only if changed)
246269
if (!this.lastChannelState || this.lastChannelState !== state) {
247-
console.debug(`[DEBUG] channel state: ${state}`);
270+
console.debug(`[DEBUG] channel state: ${state}${this.connState()}`);
248271
this.lastChannelState = state;
249272
}
250273

251-
// Aggressive health check: Only 'joined' is considered healthy
252-
// Any other state (joining, leaving, closed, errored, etc.) triggers recreation
253-
if (state !== 'joined') {
254-
captureRemote('remote_channel_state_health', { state });
274+
// 'joined' = healthy, 'joining' = transitional — let realtime-js's own rejoin
275+
// backoff converge instead of tearing the channel down mid-join. (FIX: previously
276+
// recreated on every non-joined state, which amputated that backoff.)
277+
if (state === 'joined' || state === 'joining') return;
255278

256-
console.debug(`[DEBUG] ⚠️ Channel in unhealthy state '${state}' - recreating...`);
257-
this.recreateChannel();
258-
}
279+
// Unhealthy: closed, errored, leaving — recreate
280+
captureRemote('remote_channel_state_health', { state, attempt: this.reconnectAttempt });
281+
console.debug(`[DEBUG] ⚠️ Channel in unhealthy state '${state}' - recreating... — ${this.connState()}`);
282+
this.recreateChannel();
259283
}
260284

261285
/**
262286
* Recreate the channel by destroying old one and creating fresh instance.
263287
*/
264-
private recreateChannel(): void {
288+
private async recreateChannel(): Promise<void> {
265289
if (!this.client || !this.user?.id || !this.onToolCall) {
266290
console.warn('Cannot recreate channel - missing parameters');
267291
console.debug('[DEBUG] recreateChannel() aborted - missing prerequisites');
268292
return;
269293
}
270294

271-
// Destroy old channel
272-
if (this.channel) {
273-
console.debug('[DEBUG] Destroying old channel');
274-
this.client.removeChannel(this.channel);
275-
this.channel = null;
295+
// FIX: re-entrancy guard so a 10s health tick can't stack a second recreate
296+
// on top of an in-flight one.
297+
if (this.isRecreatingChannel) {
298+
console.debug('[DEBUG] recreateChannel() skipped - already in progress');
299+
return;
276300
}
301+
this.isRecreatingChannel = true;
302+
this.reconnectAttempt++;
277303

278304
// Create fresh channel
279-
console.log('🔄 Recreating channel...');
280-
console.debug('[DEBUG] Calling createChannel() for recreation');
281-
this.createChannel().catch(err => {
282-
captureRemote('remote_channel_recreate_error', { err });
283-
console.debug('[DEBUG] Channel recreation failed:', err.message);
284-
285-
// TODO: enable only for debug mode
286-
// console.error('Failed to recreate channel:', err);
287-
});
305+
console.log(`🔄 Recreating channel... (attempt ${this.reconnectAttempt}) — ${this.connState()}`);
306+
307+
try {
308+
// Destroy old channel — AWAIT it so the channel registry empties before we
309+
// rebuild. (The un-awaited version raced the synchronous new-channel push, so
310+
// realtime-js never tore the socket down and a half-open one got reused.)
311+
if (this.channel) {
312+
console.debug('[DEBUG] Destroying old channel');
313+
await this.client.removeChannel(this.channel);
314+
this.channel = null;
315+
}
316+
317+
// FIX (core): force a brand-new WebSocket. After idle / wifi-loss the socket can
318+
// be HALF-OPEN (readyState OPEN but dead); reusing it made every join TIMED_OUT
319+
// forever. disconnect() drops it so the next subscribe() dials a fresh one.
320+
try { await (this.client as any).realtime?.disconnect?.(); } catch { /* best effort */ }
321+
322+
console.debug('[DEBUG] Calling createChannel() for recreation');
323+
await this.createChannel();
324+
} catch (err: any) {
325+
captureRemote('remote_channel_recreate_error', { errMsg: err?.message, attempt: this.reconnectAttempt });
326+
console.debug(`[DEBUG] Channel recreation failed: ${err?.message}${this.connState()}`);
327+
} finally {
328+
this.isRecreatingChannel = false;
329+
}
288330
}
289331

290332
async markCallExecuting(callId: string) {

0 commit comments

Comments
 (0)