fix(swarm): WindowsTerminalBackend pidFile health check + 5-state lifecycle (#1237)

* fix(swarm): WindowsTerminalBackend pidFile health check + 5-state lifecycle

修 wt.exe split-pane fire-and-forget 导致 teammate 假死、TeamDelete 卡死、
kill-while-spawn race 等多个问题。

- 加 waitForPidFile() 在 wt.exe 返回后等 powershell.exe 真启动写 pidFile
  默认 8s timeout,env CLAUDE_WT_PANE_TIMEOUT_MS 覆盖,超时 throw 含完整诊断
- 加 5 态生命周期 (registered/spawning/ready/killing/dead),sendCommandToPane
  inner Promise 包装 spawnPromise,ready 态重 spawn 直接 throw
- killPane TOCTOU 修正:await spawnPromise 后重读 status;优先用缓存 pane.pid
  避免读盘,Stop-Process 失败也清缓存 + 标 dead 防 PID 复用误杀
- pid 解析严格化:/^\d+$/ + Number.isFinite + >0;移除 dead try/catch
- 构造函数 options 对象注入 pidFileDir(兼容原位置参数)
- 清启动前陈旧 pidFile,killPane fallback 3×500ms retry 兜底

* test(swarm): 12 tests covering WindowsTerminalBackend lifecycle, race, pid validation

为 WindowsTerminalBackend 加 12 个测试覆盖 v2 全部新行为,含 5 个 v1 兼容 + 7 个
v2 新场景。配套构造函数 options 对象,测试用 pidFileDir: tempDir 隔离防泄漏到
真实 OS tmpdir。

新场景覆盖:
- unlinks stale pidFile so a stale pid is not adopted
- rejects re-spawn on a ready pane
- throws on unknown paneId in sendCommandToPane
- rejects corrupted pidFile content ("123abc") and times out
- killPane awaits in-flight spawn before killing (kill-while-spawn race)
- Stop-Process failure clears cached pid and marks pane dead
- killPane uses cached pid and returns false when pane is unknown

createBackend helper 改用 options 对象 + simulatePidWrite 模拟 powershell 写
pidFile,pidFileDir 注入 tempDir,env CLAUDE_WT_PANE_TIMEOUT_MS beforeEach 设置
afterEach 清理。

---------

Co-authored-by: unraid <local@unraid.local>
This commit is contained in:
Dosion
2026-05-22 21:06:47 +08:00
committed by GitHub
parent 9d17597e58
commit f91060836f
2 changed files with 462 additions and 51 deletions

View File

@@ -1,5 +1,5 @@
import { randomUUID } from 'crypto'
import { readFile } from 'fs/promises'
import { readFile, unlink } from 'fs/promises'
import { join } from 'path'
import { tmpdir } from 'os'
import type { AgentColorName } from '@claude-code-best/builtin-tools/tools/AgentTool/agentColorManager.js'
@@ -13,10 +13,15 @@ import type { CreatePaneResult, PaneBackend, PaneId } from './types.js'
/** Outcome of one external command: captured output streams plus the exit code (non-zero is treated as failure by callers in this file). */
type CommandResult = { stdout: string; stderr: string; code: number }
/** Function used to run external programs; this file invokes it with 'wt.exe' and 'powershell.exe'. */
type CommandRunner = (command: string, args: string[]) => Promise<CommandResult>
/**
 * Lifecycle state of a pane:
 * - 'registered': set when the pane record is created, before anything is launched
 * - 'spawning':   set by sendCommandToPane while the launch is in flight
 * - 'ready':      set once the pid has been read from the pidFile
 * - 'killing':    set by killPane just before it stops the process
 * - 'dead':       set after a failed launch or after a kill attempt
 */
type PaneStatus = 'registered' | 'spawning' | 'ready' | 'killing' | 'dead'
/** Bookkeeping record kept for every pane this backend has created. */
type WindowsTerminalPane = {
// Passed to wt.exe as the --title argument.
title: string
// 'pane' launches via `split-pane`, 'window' launches via `new-tab`.
mode: 'pane' | 'window'
// Path of the file polled for the launched shell's process id.
pidFile: string
// Current lifecycle state (see PaneStatus).
status: PaneStatus
// Process id cached after a successful launch; cleared on launch failure and after a kill attempt.
pid?: number
// Set while a launch is in flight so killPane can await it; cleared when the launch settles.
spawnPromise?: Promise<void>
}
function quotePowerShellString(value: string): string {
@@ -39,8 +44,42 @@ function wrapPowerShellCommand(command: string, pidFile: string): string {
].join('; ')
}
function makePidFile(paneId: string): string {
return join(tmpdir(), `${paneId.replace(/[^a-zA-Z0-9_-]/g, '-')}.pid`)
/** Default time (ms) to wait for a freshly launched pane to write its pidFile. */
const WT_PANE_TIMEOUT_DEFAULT_MS = 8000
/** Delay (ms) between successive pidFile reads while waiting for a pane to start. */
const WT_PANE_POLL_INTERVAL_MS = 200

/**
 * Resolves the pane start-up timeout in milliseconds.
 *
 * Reads the `CLAUDE_WT_PANE_TIMEOUT_MS` environment variable and falls back to
 * `WT_PANE_TIMEOUT_DEFAULT_MS` when it is unset, empty, or not a positive
 * decimal integer.
 *
 * @returns The timeout to use, always a positive integer.
 */
function getWtPaneTimeoutMs(): number {
  const raw = process.env.CLAUDE_WT_PANE_TIMEOUT_MS?.trim()
  if (!raw) return WT_PANE_TIMEOUT_DEFAULT_MS
  // Require digits only. Number.parseInt would accept a numeric prefix, so a
  // value such as "8s" or "1e4" would silently become an 8 ms / 1 ms timeout.
  if (!/^\d+$/.test(raw)) return WT_PANE_TIMEOUT_DEFAULT_MS
  const parsed = Number(raw)
  return Number.isSafeInteger(parsed) && parsed > 0
    ? parsed
    : WT_PANE_TIMEOUT_DEFAULT_MS
}
/**
 * Polls `pidFile` until it holds a valid process id or `timeoutMs` elapses.
 *
 * The file content is trimmed and must consist of decimal digits only that
 * parse to a finite number greater than zero; anything else is treated as
 * "not ready yet" and retried.
 *
 * The file is always read at least once, even when `timeoutMs` is zero or
 * negative, and the pause between reads is clamped to the time remaining so
 * the call does not run past its deadline by a whole poll interval.
 *
 * @param pidFile - Path of the file expected to contain the pid.
 * @param timeoutMs - Maximum time to keep polling, in milliseconds.
 * @returns The pid read from the file.
 * @throws The most recent read or validation error once the deadline passes.
 */
async function waitForPidFile(
  pidFile: string,
  timeoutMs: number,
): Promise<number> {
  const deadline = Date.now() + timeoutMs
  let lastErr: unknown
  for (;;) {
    try {
      const content = (await readFile(pidFile, 'utf-8')).trim()
      if (!/^\d+$/.test(content)) {
        lastErr = new Error(
          `pidFile content not a valid pid: ${JSON.stringify(content)}`,
        )
      } else {
        const pid = Number.parseInt(content, 10)
        if (Number.isFinite(pid) && pid > 0) return pid
        lastErr = new Error(`pidFile content parsed to invalid pid: ${pid}`)
      }
    } catch (err) {
      // Usually the file does not exist yet; remember the error and retry.
      lastErr = err
    }
    const remainingMs = deadline - Date.now()
    // Written as a negated `>` so a NaN budget also ends the loop.
    if (!(remainingMs > 0)) break
    const pauseMs = Math.min(WT_PANE_POLL_INTERVAL_MS, remainingMs)
    await new Promise(r => setTimeout(r, pauseMs))
  }
  throw lastErr ?? new Error('pidFile never appeared')
}
/**
@@ -58,10 +97,40 @@ export class WindowsTerminalBackend implements PaneBackend {
private panes = new Map<PaneId, WindowsTerminalPane>()
private readonly runCommand: CommandRunner
private readonly getPlatformValue: () => Platform
private readonly pidFileDir: string
constructor(
private readonly runCommand: CommandRunner = execFileNoThrow,
private readonly getPlatformValue: () => Platform = getPlatform,
) {}
runCommandOrOptions?:
| CommandRunner
| {
runCommand?: CommandRunner
getPlatform?: () => Platform
pidFileDir?: string
},
getPlatformValue?: () => Platform,
) {
if (
typeof runCommandOrOptions === 'function' ||
runCommandOrOptions === undefined
) {
this.runCommand = runCommandOrOptions ?? execFileNoThrow
this.getPlatformValue = getPlatformValue ?? getPlatform
this.pidFileDir = tmpdir()
} else {
this.runCommand = runCommandOrOptions.runCommand ?? execFileNoThrow
this.getPlatformValue = runCommandOrOptions.getPlatform ?? getPlatform
this.pidFileDir = runCommandOrOptions.pidFileDir ?? tmpdir()
}
}
/**
 * Builds the pidFile path for a pane inside `this.pidFileDir`.
 *
 * Every character of `paneId` other than ASCII letters, digits, `_` and `-`
 * is replaced with `-`, and `.pid` is appended to form the file name.
 */
private makePidFile(paneId: string): string {
  const sanitizedId = paneId.replace(/[^a-zA-Z0-9_-]/g, '-')
  const fileName = `${sanitizedId}.pid`
  return join(this.pidFileDir, fileName)
}
async isAvailable(): Promise<boolean> {
if (this.getPlatformValue() !== 'windows') {
@@ -92,7 +161,8 @@ export class WindowsTerminalBackend implements PaneBackend {
this.panes.set(paneId, {
title: name,
mode: 'pane',
pidFile: makePidFile(paneId),
pidFile: this.makePidFile(paneId),
status: 'registered',
})
return { paneId, isFirstTeammate }
}
@@ -106,7 +176,8 @@ export class WindowsTerminalBackend implements PaneBackend {
this.panes.set(paneId, {
title: name,
mode: 'window',
pidFile: makePidFile(paneId),
pidFile: this.makePidFile(paneId),
status: 'registered',
})
return { paneId, isFirstTeammate: false, windowName }
}
@@ -121,32 +192,95 @@ export class WindowsTerminalBackend implements PaneBackend {
throw new Error(`Unknown Windows Terminal pane id: ${paneId}`)
}
const launcher = wrapPowerShellCommand(command, pane.pidFile)
// wt.exe treats ';' as its own command separator, which breaks
// multi-statement PowerShell commands passed via -Command. Encode the
// entire script as Base64 UTF-16LE and use -EncodedCommand instead.
const encoded = Buffer.from(launcher, 'utf16le').toString('base64')
const args =
pane.mode === 'window'
? ['-w', '-1', 'new-tab', '--title', pane.title]
: ['-w', '0', 'split-pane', '--vertical', '--title', pane.title]
const result = await this.runCommand('wt.exe', [
...args,
'powershell.exe',
'-NoLogo',
'-NoProfile',
'-ExecutionPolicy',
'Bypass',
'-EncodedCommand',
encoded,
])
if (result.code !== 0) {
// Reject re-spawning an already-spawned pane (avoids two processes racing on the same pidFile)
if (pane.status === 'ready' || pane.status === 'killing') {
throw new Error(
`Failed to launch Windows Terminal teammate ${paneId}: ${result.stderr}`,
`Pane ${paneId} already spawned (status=${pane.status}); create a new pane to re-launch`,
)
}
if (pane.status === 'spawning') {
throw new Error(
`Pane ${paneId} is currently spawning; wait for the in-flight launch to complete`,
)
}
if (pane.status === 'dead') {
throw new Error(`Pane ${paneId} is dead; create a new pane`)
}
// pane.status === 'registered' → proceed
// Assign spawnPromise before any await (wrapped in an inner Promise)
// Attach a no-op .catch() immediately to prevent unhandled rejection warnings
// in case killPane never awaits spawnPromise (e.g. sendCommandToPane fails
// before killPane is called).
let resolveSpawn!: () => void
let rejectSpawn!: (err: unknown) => void
const spawnPromise = new Promise<void>((res, rej) => {
resolveSpawn = res
rejectSpawn = rej
})
// Silence unhandled-rejection: killPane may .catch() this later, but if
// the pane dies before any kill is attempted, the rejection must not leak.
spawnPromise.catch(() => {})
pane.status = 'spawning'
pane.spawnPromise = spawnPromise
try {
const launcher = wrapPowerShellCommand(command, pane.pidFile)
// wt.exe treats ';' as its own command separator, which breaks
// multi-statement PowerShell commands passed via -Command. Encode the
// entire script as Base64 UTF-16LE and use -EncodedCommand instead.
const encoded = Buffer.from(launcher, 'utf16le').toString('base64')
const args =
pane.mode === 'window'
? ['-w', '-1', 'new-tab', '--title', pane.title]
: ['-w', '0', 'split-pane', '--vertical', '--title', pane.title]
await unlink(pane.pidFile).catch(() => {})
const result = await this.runCommand('wt.exe', [
...args,
'powershell.exe',
'-NoLogo',
'-NoProfile',
'-ExecutionPolicy',
'Bypass',
'-EncodedCommand',
encoded,
])
if (result.code !== 0) {
throw new Error(
`Failed to launch Windows Terminal teammate ${paneId}: ${result.stderr}`,
)
}
const timeoutMs = getWtPaneTimeoutMs()
let pid: number
try {
pid = await waitForPidFile(pane.pidFile, timeoutMs)
} catch (err) {
throw new Error(
`Windows Terminal pane failed to launch within ${timeoutMs}ms\n` +
` paneId: ${paneId}\n` +
` pidFile: ${pane.pidFile}\n` +
` wt.exe stdout: ${result.stdout || '(empty)'}\n` +
` wt.exe stderr: ${result.stderr || '(empty)'}\n` +
` underlying: ${err instanceof Error ? err.message : String(err)}\n` +
` override timeout via env CLAUDE_WT_PANE_TIMEOUT_MS`,
)
}
pane.pid = pid
pane.status = 'ready'
resolveSpawn()
} catch (err) {
pane.status = 'dead'
pane.pid = undefined
rejectSpawn(err)
throw err
} finally {
pane.spawnPromise = undefined
}
}
async setPaneBorderColor(
@@ -189,26 +323,69 @@ export class WindowsTerminalBackend implements PaneBackend {
return false
}
let pid: number
try {
pid = Number.parseInt((await readFile(pane.pidFile, 'utf-8')).trim(), 10)
} catch {
// 1. Resolve the kill-while-spawn race: await the in-flight spawn (whether it succeeds or fails)
if (pane.status === 'spawning' && pane.spawnPromise) {
await pane.spawnPromise.catch(() => {})
}
// 2. TOCTOU 修正:重读 status/pid
if (pane.status === 'dead') {
this.panes.delete(paneId)
return false
}
if (!Number.isFinite(pid)) {
this.panes.delete(paneId)
if (pane.status !== 'ready') {
// 还在其它非终态(理论不可达,保险)
return false
}
pane.status = 'killing'
// 3. 优先用缓存 pid
let pid: number | undefined = pane.pid
// 4. Fallback: no cached pid, so read the pidFile from disk (up to 3 attempts, 500ms apart)
if (pid === undefined) {
let pidContent: string | null = null
for (let attempt = 0; attempt < 3; attempt++) {
try {
pidContent = (await readFile(pane.pidFile, 'utf-8')).trim()
break
} catch {
if (attempt === 2) {
pane.status = 'dead'
this.panes.delete(paneId)
return false
}
await new Promise(r => setTimeout(r, 500))
}
}
if (!pidContent || !/^\d+$/.test(pidContent)) {
pane.status = 'dead'
this.panes.delete(paneId)
return false
}
const parsed = Number.parseInt(pidContent, 10)
if (!Number.isFinite(parsed) || parsed <= 0) {
pane.status = 'dead'
this.panes.delete(paneId)
return false
}
pid = parsed
}
// 5. 执行 Stop-Process
const result = await this.runCommand('powershell.exe', [
'-NoLogo',
'-NoProfile',
'-Command',
`Stop-Process -Id ${pid} -Force -ErrorAction Stop`,
])
// 6. 不管成功失败都清缓存 + 标 dead + 从 map 删(防 PID 复用误杀)
pane.pid = undefined
pane.status = 'dead'
this.panes.delete(paneId)
logForDebugging(
`[WindowsTerminalBackend] killPane ${paneId} pid=${pid} code=${result.code}`,
)

View File

@@ -14,20 +14,43 @@ beforeEach(async () => {
`windows-terminal-backend-${Date.now()}-${Math.random().toString(16).slice(2)}`,
)
await mkdir(tempDir, { recursive: true })
process.env.CLAUDE_WT_PANE_TIMEOUT_MS = '2000'
})
// Per-test teardown: remove the temporary pidFile directory and drop the
// timeout override so neither leaks into the next test.
afterEach(async () => {
  try {
    await rm(tempDir, { recursive: true, force: true })
  } finally {
    // Clear the override even if directory removal rejects (e.g. a file in it
    // is still locked); otherwise a short per-test timeout would carry over.
    delete process.env.CLAUDE_WT_PANE_TIMEOUT_MS
  }
})
function createBackend(calls: Call[]): WindowsTerminalBackend {
return new WindowsTerminalBackend(
async (command, args) => {
function createBackend(
calls: Call[],
opts: { simulatePidWrite?: boolean | number } = {},
): WindowsTerminalBackend {
const simulate = opts.simulatePidWrite !== false
const delayMs =
typeof opts.simulatePidWrite === 'number' ? opts.simulatePidWrite : 30
return new WindowsTerminalBackend({
runCommand: async (command, args) => {
calls.push({ command, args })
if (simulate && command === 'wt.exe') {
const encIdx = args.indexOf('-EncodedCommand')
if (encIdx >= 0) {
const decoded = Buffer.from(args[encIdx + 1]!, 'base64').toString(
'utf16le',
)
const match = decoded.match(/Set-Content -LiteralPath '([^']+)'/)
if (match) {
setTimeout(() => {
writeFile(match[1]!, '54321', 'utf-8').catch(() => {})
}, delayMs)
}
}
}
return { stdout: 'ok', stderr: '', code: 0 }
},
() => 'windows',
)
getPlatform: () => 'windows',
pidFileDir: tempDir,
})
}
function decodeEncodedCommand(call: Call): {
@@ -78,25 +101,236 @@ describe('WindowsTerminalBackend', () => {
expect(args.join(' ')).toContain('-w -1 new-tab --title')
})
test('force kills the recorded teammate shell pid when available', async () => {
test('force kills the cached pid from sendCommandToPane without reading pidFile', async () => {
const calls: Call[] = []
const backend = createBackend(calls)
const pane = await backend.createTeammatePaneInSwarmView('killer', 'red')
// sendCommandToPane resolves — simulate writes '54321' to pidFile, which
// becomes pane.pid. killPane should use the cached pid, not re-read the file.
await backend.sendCommandToPane(pane.paneId, "Write-Output 'running'")
const { decodedLauncher } = decodeEncodedCommand(calls[0]!)
const pidFile = decodedLauncher.match(
/Set-Content -LiteralPath '([^']+)'/,
)?.[1]
expect(pidFile).toBeString()
await writeFile(pidFile!, '12345', 'utf-8')
const killed = await backend.killPane(pane.paneId)
expect(killed).toBe(true)
expect(calls[calls.length - 1]!.command).toBe('powershell.exe')
expect(calls[calls.length - 1]!.args.join(' ')).toContain(
'Stop-Process -Id 12345',
'Stop-Process -Id 54321',
)
})
test('throws a diagnostic error when pidFile never appears within timeout', async () => {
  // Short timeout keeps the test quick; with simulatePidWrite disabled nothing
  // ever writes the pidFile, so the launch has to time out.
  process.env.CLAUDE_WT_PANE_TIMEOUT_MS = '300'
  const recorded: Call[] = []
  const wt = createBackend(recorded, { simulatePidWrite: false })
  const { paneId } = await wt.createTeammatePaneInSwarmView('slowpane', 'blue')

  // Capture the rejection reason; stays undefined if the launch succeeds.
  const failure: unknown = await wt
    .sendCommandToPane(paneId, "Write-Output 'x'")
    .then(
      () => undefined,
      (reason: unknown) => reason,
    )

  expect(failure).toBeInstanceOf(Error)
  const { message } = failure as Error
  expect(message).toMatch(
    /Windows Terminal pane failed to launch within 300ms/,
  )
})
test('error message includes paneId pidFile and override hint', async () => {
  // Nothing writes the pidFile, so the launch times out after 250ms and the
  // diagnostic error is produced.
  process.env.CLAUDE_WT_PANE_TIMEOUT_MS = '250'
  const calls: Call[] = []
  const backend = createBackend(calls, { simulatePidWrite: false })
  const pane = await backend.createTeammatePaneInSwarmView(
    'diagpane',
    'green',
  )
  let caught: unknown
  try {
    await backend.sendCommandToPane(pane.paneId, "Write-Output 'x'")
  } catch (err) {
    caught = err
  }
  expect(caught).toBeInstanceOf(Error)
  const msg = (caught as Error).message
  // pidFile path is deterministic: <tempDir>/<sanitized paneId>.pid
  const expectedPidFile = join(
    tempDir,
    `${pane.paneId.replace(/[^a-zA-Z0-9_-]/g, '-')}.pid`,
  )
  expect(msg).toContain(pane.paneId)
  // The title promises the pidFile path is reported, so pin that as well.
  expect(msg).toContain(expectedPidFile)
  expect(msg).toContain('CLAUDE_WT_PANE_TIMEOUT_MS')
})
test('unlinks stale pidFile so a stale pid is not adopted', async () => {
  const recorded: Call[] = []
  const wt = createBackend(recorded, { simulatePidWrite: 30 })
  const { paneId } = await wt.createTeammatePaneInSwarmView('stale', 'pink')

  // The backend derives the path as <tempDir>/<sanitized paneId>.pid.
  const sanitizedId = paneId.replace(/[^a-zA-Z0-9_-]/g, '-')
  const leftoverPidFile = join(tempDir, `${sanitizedId}.pid`)

  // Seed a leftover pid. Without the unlink in sendCommandToPane the poll
  // would accept '99999' straight away and cache it; with the unlink, the
  // simulated '54321' is the pid that killPane ends up using.
  await writeFile(leftoverPidFile, '99999', 'utf-8')
  await wt.sendCommandToPane(paneId, "Write-Output 'x'")

  const wasKilled = await wt.killPane(paneId)
  expect(wasKilled).toBe(true)

  const lastCall = recorded[recorded.length - 1]!
  expect(lastCall.args.join(' ')).toContain('Stop-Process -Id 54321')
})
test('rejects re-spawn on a ready pane', async () => {
  const recorded: Call[] = []
  const wt = createBackend(recorded)
  const { paneId } = await wt.createTeammatePaneInSwarmView('reentry', 'cyan')

  // The first launch succeeds and leaves the pane in the 'ready' state.
  await wt.sendCommandToPane(paneId, "Write-Output 'first'")

  // A second launch on the same pane has to be refused.
  const failure: unknown = await wt
    .sendCommandToPane(paneId, "Write-Output 'second'")
    .then(
      () => undefined,
      (reason: unknown) => reason,
    )

  expect(failure).toBeInstanceOf(Error)
  const { message } = failure as Error
  expect(message).toMatch(/already spawned/)
})
test('throws on unknown paneId in sendCommandToPane', async () => {
  const recorded: Call[] = []
  const wt = createBackend(recorded)

  // No pane was ever created under this id, so the call has to fail.
  const failure: unknown = await wt
    .sendCommandToPane('wt-nonexistent', "Write-Output 'x'")
    .then(
      () => undefined,
      (reason: unknown) => reason,
    )

  expect(failure).toBeInstanceOf(Error)
  const { message } = failure as Error
  expect(message).toContain('Unknown Windows Terminal pane')
})
test('rejects corrupted pidFile content ("123abc") and times out', async () => {
  process.env.CLAUDE_WT_PANE_TIMEOUT_MS = '400'
  const recorded: Call[] = []

  // Runner that behaves like a successful wt.exe launch but, 30ms later,
  // writes a pid that is not purely digits.
  const wt = new WindowsTerminalBackend({
    getPlatform: () => 'windows',
    pidFileDir: tempDir,
    runCommand: async (command, args) => {
      recorded.push({ command, args })
      const flagIndex =
        command === 'wt.exe' ? args.indexOf('-EncodedCommand') : -1
      if (flagIndex >= 0) {
        const script = Buffer.from(args[flagIndex + 1]!, 'base64').toString(
          'utf16le',
        )
        const pidFilePath = script.match(
          /Set-Content -LiteralPath '([^']+)'/,
        )?.[1]
        if (pidFilePath !== undefined) {
          setTimeout(() => {
            writeFile(pidFilePath, '123abc', 'utf-8').catch(() => {})
          }, 30)
        }
      }
      return { stdout: 'ok', stderr: '', code: 0 }
    },
  })
  const { paneId } = await wt.createTeammatePaneInSwarmView('corrupt', 'red')

  const failure: unknown = await wt
    .sendCommandToPane(paneId, "Write-Output 'x'")
    .then(
      () => undefined,
      (reason: unknown) => reason,
    )

  expect(failure).toBeInstanceOf(Error)
  // The validation error from the pidFile poll has to surface inside the
  // wrapped diagnostic message.
  const { message } = failure as Error
  expect(message).toMatch(/failed to launch within 400ms/)
  expect(message).toMatch(/not a valid pid|invalid pid|123abc/)
})
// Regression test for the kill-while-spawn race: killPane is invoked while
// sendCommandToPane is still polling for the pidFile. The statement order and
// the delays below are what create the overlap, so they must stay as written.
test('killPane awaits in-flight spawn before killing (kill-while-spawn race)', async () => {
// simulatePidWrite: 800ms — sendCommandToPane stays in waitForPidFile for ~800ms.
// The 3000ms timeout leaves ample headroom above that 800ms delay.
process.env.CLAUDE_WT_PANE_TIMEOUT_MS = '3000'
const calls: Call[] = []
const backend = createBackend(calls, { simulatePidWrite: 800 })
const pane = await backend.createTeammatePaneInSwarmView('racy', 'blue')
// Start spawn but don't await it yet.
const spawnP = backend.sendCommandToPane(pane.paneId, "Write-Output 'x'")
// 50ms later, call killPane — pane is still 'spawning', killPane must
// await spawnPromise (which resolves at ~800ms when simulate writes pid 54321),
// then kill using the cached pid.
await new Promise(r => setTimeout(r, 50))
const killP = backend.killPane(pane.paneId)
// Both must resolve cleanly.
await spawnP
const killed = await killP
expect(killed).toBe(true)
// The kill must target the freshly-spawned pid (54321), not have used a
// stale-or-missing fallback path.
const killCall = calls[calls.length - 1]!
expect(killCall.command).toBe('powershell.exe')
expect(killCall.args.join(' ')).toContain('Stop-Process -Id 54321')
})
test('Stop-Process failure clears cached pid and marks pane dead', async () => {
  const recorded: Call[] = []

  // wt.exe launches succeed (and pid 54321 is written 30ms later); every
  // other command — here the powershell.exe Stop-Process call — exits with 1.
  const wt = new WindowsTerminalBackend({
    getPlatform: () => 'windows',
    pidFileDir: tempDir,
    runCommand: async (command, args) => {
      recorded.push({ command, args })
      if (command !== 'wt.exe') {
        return { stdout: '', stderr: 'access denied', code: 1 }
      }
      const flagIndex = args.indexOf('-EncodedCommand')
      if (flagIndex >= 0) {
        const script = Buffer.from(args[flagIndex + 1]!, 'base64').toString(
          'utf16le',
        )
        const pidFilePath = script.match(
          /Set-Content -LiteralPath '([^']+)'/,
        )?.[1]
        if (pidFilePath !== undefined) {
          setTimeout(() => {
            writeFile(pidFilePath, '54321', 'utf-8').catch(() => {})
          }, 30)
        }
      }
      return { stdout: 'ok', stderr: '', code: 0 }
    },
  })
  const { paneId } = await wt.createTeammatePaneInSwarmView('dier', 'orange')
  await wt.sendCommandToPane(paneId, "Write-Output 'x'")

  // Stop-Process exits with 1, so the kill is reported as unsuccessful.
  const firstKill = await wt.killPane(paneId)
  expect(firstKill).toBe(false)

  // The pane is dropped from the map after the failed kill, so a repeat call
  // returns false as well.
  const secondKill = await wt.killPane(paneId)
  expect(secondKill).toBe(false)

  // Exactly one Stop-Process attempt was made: the repeat call bailed out on
  // the unknown pane instead of invoking powershell.exe again.
  const powershellCalls = recorded.filter(c => c.command === 'powershell.exe')
  expect(powershellCalls.length).toBe(1)
})
test('killPane uses cached pid and returns false when pane is unknown', async () => {
  const calls: Call[] = []
  const backend = createBackend(calls, { simulatePidWrite: 30 })
  const pane = await backend.createTeammatePaneInSwarmView('cached', 'yellow')
  await backend.sendCommandToPane(pane.paneId, "Write-Output 'x'")
  // After sendCommandToPane, pane.pid = 54321 (from simulate). Overwrite the
  // on-disk value so that a killPane which re-read the pidFile would target
  // 11111 instead; without this the test passes whether or not the cache is
  // used. pidFile path is deterministic: <tempDir>/<sanitized paneId>.pid
  const pidFile = join(
    tempDir,
    `${pane.paneId.replace(/[^a-zA-Z0-9_-]/g, '-')}.pid`,
  )
  await writeFile(pidFile, '11111', 'utf-8')
  const killed = await backend.killPane(pane.paneId)
  expect(killed).toBe(true)
  const killArgs = calls[calls.length - 1]!.args.join(' ')
  expect(killArgs).toContain('Stop-Process -Id 54321')
  expect(killArgs).not.toContain('11111')
  // After kill, pane is removed — a second killPane must return false.
  const killedAgain = await backend.killPane(pane.paneId)
  expect(killedAgain).toBe(false)
})
})